Local Variable Caching
Recently, I’ve seen a lot of performance-critical code that makes heavy use of field variables. For example, an expensive loop might look like this: `for (var i:int = 0; i < this.numObjects; ++i)`. I've recommended to some of the programmers writing such code that they modify it to cache the field variable as a local variable to improve performance. Was I right to recommend this? In today's article I'll examine the read and write times to see if caching field variables locally really improves performance.
First, let's get an idea of what I'm talking about when I draw a distinction between "field variable" and "local variable":
// Minimal illustration of the two kinds of variables compared in this article.
class Example {
    // Field variable: declared on the class and accessed through "this"
    private var fieldVariable:int;

    private function example(): void {
        // Local variable: declared inside the function body
        var localVariable:int;

        // NOTE: "x" here can be any value
        x = localVariable; // local variable read
        localVariable = x; // local variable write
        x = this.fieldVariable; // field variable read
        this.fieldVariable = x; // field variable write
    }
}
With this in mind, I've devised a simple performance test. I've taken Skyboy's advice and reduced the impact of the test loop by doing 100 tests (variable reads or writes) per loop. To do this and preserve some element of brevity, I've reduced the variable names to two letters. Please do not interpret this as a variable naming convention recommendation. :)
Also, the test checks two kinds of field variables and two kinds of local variables. For fields, non-static variables are checked in addition to static variables. For local variables, ordering is important, so I checked one declared at the beginning of the function and one declared at the end of the function.
// Benchmark app: times reads and writes of two local variables, a non-static
// field and a static field, and prints the results as CSV rows into an
// on-screen TextField.
package {
    import flash.display.*;
    import flash.utils.*;
    import flash.text.*;

    public class LocalVsField extends Sprite {
        // On-screen text field that receives the benchmark output
        private var __logger:TextField = new TextField();

        // Appends msg followed by a newline to the on-screen logger.
        private function log(msg:*): void {
            __logger.appendText(msg + "\n");
        }

        private var of:int; // of = object field
        private static var cf:int; // cf = class field

        // Runs all four read/write benchmarks and logs one CSV row per variable kind.
        // Every timed loop runs REPS iterations of 100 unrolled statements
        // (10 rows of 10) so that loop overhead is a small share of each measurement.
        public function LocalVsField() {
            __logger.autoSize = TextFieldAutoSize.LEFT;
            addChild(__logger);

            // Declaration order matters for this test: "lf" is declared before
            // the other locals and "ll" after the bookkeeping locals.
            var lf:int; // lf = local first
            var i:int;
            const REPS:int = 100000000;
            var beforeTime:int;
            var readTime:int;
            var writeTime:int;
            var ll:int; // ll = local last
            var t:int; // scratch local: destination of reads, source of writes

            // CSV header
            log("Variable,Read Time,Write Time");

            // --- Local First: read ---
            beforeTime = getTimer();
            for (i = 0; i < REPS; ++i) {
                t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;
                t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;
                t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;
                t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;
                t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;
                t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;
                t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;
                t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;
                t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;
                t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;t=lf;
            }
            readTime = getTimer() - beforeTime;

            // --- Local First: write ---
            beforeTime = getTimer();
            for (i = 0; i < REPS; ++i) {
                lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;
                lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;
                lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;
                lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;
                lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;
                lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;
                lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;
                lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;
                lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;
                lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;lf=t;
            }
            writeTime = getTimer() - beforeTime;
            log("Local First," + readTime + "," + writeTime);

            // --- Local Last: read ---
            beforeTime = getTimer();
            for (i = 0; i < REPS; ++i) {
                t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;
                t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;
                t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;
                t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;
                t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;
                t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;
                t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;
                t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;
                t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;
                t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;t=ll;
            }
            readTime = getTimer() - beforeTime;

            // --- Local Last: write ---
            beforeTime = getTimer();
            for (i = 0; i < REPS; ++i) {
                ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;
                ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;
                ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;
                ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;
                ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;
                ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;
                ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;
                ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;
                ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;
                ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;ll=t;
            }
            writeTime = getTimer() - beforeTime;
            log("Local Last," + readTime + "," + writeTime);

            // --- Object (non-static) field: read ---
            beforeTime = getTimer();
            for (i = 0; i < REPS; ++i) {
                t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;
                t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;
                t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;
                t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;
                t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;
                t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;
                t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;
                t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;
                t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;
                t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;t=of;
            }
            readTime = getTimer() - beforeTime;

            // --- Object (non-static) field: write ---
            beforeTime = getTimer();
            for (i = 0; i < REPS; ++i) {
                of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;
                of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;
                of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;
                of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;
                of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;
                of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;
                of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;
                of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;
                of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;
                of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;of=t;
            }
            writeTime = getTimer() - beforeTime;
            log("Object Field," + readTime + "," + writeTime);

            // --- Class (static) field: read ---
            beforeTime = getTimer();
            for (i = 0; i < REPS; ++i) {
                t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;
                t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;
                t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;
                t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;
                t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;
                t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;
                t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;
                t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;
                t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;
                t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;t=cf;
            }
            readTime = getTimer() - beforeTime;

            // --- Class (static) field: write ---
            beforeTime = getTimer();
            for (i = 0; i < REPS; ++i) {
                cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;
                cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;
                cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;
                cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;
                cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;
                cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;
                cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;
                cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;
                cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;
                cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;cf=t;
            }
            writeTime = getTimer() - beforeTime;
            log("Class Field," + readTime + "," + writeTime);
        }
    }
}
Here is the test environment I ran the above app on:
- Flex SDK (MXMLC) 4.1.0.16076, compiling in release mode (no debugging or verbose stack traces)
- Release version of Flash Player 10.2.152.26
- 2.4 GHz Intel Core i5
- Mac OS X 10.6.6
And here are the results I got:
Variable | Read Time | Write Time |
---|---|---|
Local First | 216 | 210 |
Local Last | 215 | 207 |
Object Field | 242 | 3744 |
Class Field | 273 | 3965 |
Some of the differences are smaller than I would have guessed. Non-static "Object" fields are only 13% slower than local variables when it comes to reading and static "Class" fields are only 27% slower. However, the difference becomes dramatic when we look at write speed. Writing to a non-static field variable is 18x slower than writing to a local variable and a static field variable is 19x slower when written to. This is obviously huge, but especially striking because the write times for local variables are actually lower than the read times, not hugely larger.
Now, what will your reaction be next time you see a loop like this?
// Holds the sum of a list of numbers, accumulating directly into a field.
class NumberHolder {
    // NOTE: never assigned in this class; the constructor parameter of the
    // same name shadows it inside the constructor
    public var numbers:Vector.<Number>;
    // Sum of the numbers passed to the constructor
    public var total:Number;

    // Sums the given numbers straight into the "total" field, so every
    // element costs one field read and one field write.
    public function NumberHolder(numbers:Vector.<Number>) {
        total = 0;
        for each (var number:Number in numbers) {
            total += number; // field variable read and write on every iteration
        }
    }
}
This code is like a ticking time bomb. Just imagine the performance when someone innocently passes in a `Vector` with a million elements. Consider this simple alternative using local variable caching:
// Holds the sum of a list of numbers, accumulating in a local variable and
// copying the result to the field once at the end.
class NumberHolder {
    // NOTE: never assigned in this class; the constructor parameter of the
    // same name shadows it inside the constructor
    public var numbers:Vector.<Number>;
    // Sum of the numbers passed to the constructor
    public var total:Number;

    // Sums the given numbers into a local variable, then performs a single
    // field write when the loop is done.
    public function NumberHolder(numbers:Vector.<Number>) {
        // Local accumulator; shadows the "total" field inside this function
        var total:Number = 0;
        for each (var number:Number in numbers) {
            total += number; // local variable read and write
        }
        this.total = total; // the only field variable write
    }
}
You may want to name your local variable something other than `total`, but the point remains: do your heavy work on a local variable and then copy the result to a field variable when you're done. The result will be an 18-19x speedup in your variable writes and may well result in a sizable performance increase.
#1 by as3isolib on March 14th, 2011 ·
Curious to know if this holds true for more complex object structures. In the past I have always had non-Top Level variable (int, uint, Number, Bool, String) instances reside outside as cached “scratch” objects. Aside from Array and Vector, do your tests conclude the same for say an object such as:
#2 by jackson on March 14th, 2011 ·
I tried using your `CustomClass` instead of `int`. I didn’t even instantiate your class and instead just read/wrote a bunch of `null` values. The result was the same for everything except writing field variables, which was 10x slower than in the article! So, it would seem that local variable caching is even more important when dealing with object instances than it is with basic types (e.g. `int`).
Thanks for the tip!
#3 by Jonnie on March 14th, 2011 ·
Yamma hamma… This is quite the find. I’ve been using field variables in large loops to try to improve performance, instead of calculating the variable each time. I’m glad to know this is still fine, considering I’m only using read access in the loop. I think it’s time you/someone write a PMD rule that finds writing to field vars in loops.
#4 by jackson on March 14th, 2011 ·
Glad to be of service. :)
You might still keep your pre-calculated field variables, but add local variable caching. For example, this code:
Could easily be changed to this code:
I guess this is just the “read” equivalent of the “write” example at the bottom of the article…
#5 by Mims H. Wright on March 14th, 2011 ·
Hey, great post! I was wondering how the results changed when you compare closed or “final” classes with open “dynamic” classes?
#6 by jackson on March 14th, 2011 ·
Thanks. I followed your suggestion and tried using the `final` and `dynamic` keywords (one at a time, of course) on both the `CustomClass` (suggested by as3isolib above) and the `LocalVsField` test app (i.e. document) class. In none of these four scenarios did the results change. I figured there might be a change when making the document class dynamic, but it turns out that there isn’t.
#7 by skyboy on March 14th, 2011 ·
This certainly explains the reason I noticed a drop in performance when I cached it locally in some instances and not others.
However, there is one interesting note from a test I did just a few minutes before reading this article. Accessing the field of a field.
This test took 6,000 ms on my machine. Strictly typed, it only takes 10 ms.
As I mentioned earlier, increasing the tests per loop could increase the gap, or it could already be at its minimum time. Personally, I don’t need to know the results further than this because of the difference it’s at already.
#8 by jackson on March 14th, 2011 ·
I think your test is just showing how slow dynamic access is and isn’t really related to accessing field variables. For example, `lookup` could be a local variable and you’d still see the huge 6000ms spike (or close to it) when the variable is typed `*`. I wrote an article that kind of covered this a couple years back, but perhaps I should do a modern one that really explicitly shows the performance downsides of dynamic accesses. Thanks for yet-another idea! :)
#9 by skyboy on March 15th, 2011 ·
While writing some internal package-level functions and consts, I realized that access of these wasn’t included in your test.
In the short test I did using this same method, I got an average result of it being around 0-30% slower, but still incredibly fast (9-11ms vs 10-14ms). For my use, this is fine; However, I did not test write speed.
#10 by Jeremy Rudd on July 28th, 2012 ·
Your test could be wrong – constructors are not JIT compiled by the AVM2 and behave differently from normal functions.
#11 by jackson on July 28th, 2012 ·
I moved everything into an `init` function and got the same results. There may be some case where the constructor runs slower than non-constructor functions, but I haven’t seen one proven yet. For more on this, see "Constructor Slowdown?". If you have any test cases that show a slowdown in the constructor, I’d love to see them.