IL2CPP can really slow our code down, and not just for esoteric features. Even calls to common math and string functions can be dramatically slower than you might expect. Today's article shows how to work around this and speed them back up.

Min and Max

Computing the minimum or maximum of two numbers is extremely common. Unfortunately, it's surprisingly slow under IL2CPP, as we're about to see. Here's some example code that computes the minimum of two int values using Math.Min(int, int):

public static class TestClass
{
	public static int MathMinInt(int a, int b)
	{
		return Math.Min(a, b);
	}
}

Now here’s the C++ that IL2CPP generates for this function in Unity 2017.3:

extern "C"  int32_t TestClass_MathMinInt_m2631826738 (RuntimeObject * __this /* static, unused */, int32_t ___a0, int32_t ___b1, const RuntimeMethod* method)
{
	{
		int32_t L_0 = ___a0;
		int32_t L_1 = ___b1;
		int32_t L_2 = Math_Min_m3468062251(NULL /*static, unused*/, L_0, L_1, /*hidden argument*/NULL);
		return L_2;
	}
}

Aside from unnecessarily copying the parameters to local variables, this is a literal translation of the C#. Let’s see what ARM machine code Xcode 9.2 compiles this to for iOS:

	movs	r0, #0
	movs	r3, #0
	b.w	_Math_Min_m3468062251

Again, we have a literal translation to just a function call in assembly. So what does that function look like? Let’s see:

extern "C"  int32_t Math_Min_m3468062251 (RuntimeObject * __this /* static, unused */, int32_t ___val10, int32_t ___val21, const RuntimeMethod* method)
{
	int32_t G_B3_0 = 0;
	{
		int32_t L_0 = ___val10;
		int32_t L_1 = ___val21;
		if ((((int32_t)L_0) >= ((int32_t)L_1)))
		{
			goto IL_000d;
		}
	}
	{
		int32_t L_2 = ___val10;
		G_B3_0 = L_2;
		goto IL_000e;
	}
 
IL_000d:
	{
		int32_t L_3 = ___val21;
		G_B3_0 = L_3;
	}
 
IL_000e:
	{
		return G_B3_0;
	}
}

There are a lot of pointless local variable copies, goto is used instead of more idiomatic control flow, and extra code blocks ({}) are added to no effect, but this is essentially just return a >= b ? b : a. Let’s see what it compiles to in assembly:

	cmp	r1, r2
	it	lt
	movlt	r2, r1
	mov	r0, r2
	bx	lr

So with Math.Min(int, int) we’re essentially just getting an unnecessary function call and the associated overhead. Let’s take a look at a version using Mathf.Min(int, int):

public static class TestClass
{
	public static int MathfMinInt(int a, int b)
	{
		return Mathf.Min(a, b);
	}
}

Here’s the C++ from IL2CPP:

extern "C"  int32_t TestClass_MathfMinInt_m2489671107 (RuntimeObject * __this /* static, unused */, int32_t ___a0, int32_t ___b1, const RuntimeMethod* method)
{
	static bool s_Il2CppMethodInitialized;
	if (!s_Il2CppMethodInitialized)
	{
		il2cpp_codegen_initialize_method (TestClass_MathfMinInt_m2489671107_MetadataUsageId);
		s_Il2CppMethodInitialized = true;
	}
	{
		int32_t L_0 = ___a0;
		int32_t L_1 = ___b1;
		IL2CPP_RUNTIME_CLASS_INIT(Mathf_t3464937446_il2cpp_TypeInfo_var);
		int32_t L_2 = Mathf_Min_m18103608(NULL /*static, unused*/, L_0, L_1, /*hidden argument*/NULL);
		return L_2;
	}
}

Class initialization overhead has been added to this version! Why? Because Mathf has a static constructor to set exactly one static readonly field: Epsilon. IL2CPP makes sure that the static constructor has been called before the class is used, and it does so at every single call site of a Mathf static function in our whole codebase. So every function that calls a Mathf static function carries this overhead.
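
To make the cause concrete, here's a minimal sketch of the difference; WithCctor and WithoutCctor are made-up names and the field values are arbitrary:

public static class WithCctor
{
	// An inline-initialized static readonly field gives this class a
	// compiler-generated static constructor, so IL2CPP guards every
	// call site of its static functions with initialization checks.
	public static readonly float DefaultTolerance = 1e-5f;

	public static int Min(int a, int b)
	{
		return a < b ? a : b;
	}
}

public static class WithoutCctor
{
	// A const is baked into call sites at compile time, so no static
	// constructor is generated and no initialization check is needed.
	public const float DefaultTolerance = 1e-5f;

	public static int Min(int a, int b)
	{
		return a < b ? a : b;
	}
}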

Other than that class initialization code, it’s just calling Mathf.Min. Let’s see the assembly for this:

	push	{r4, r5, r6, r7, lr}
	add	r7, sp, #12
	movw	r6, :lower16:(__ZZ33TestClass_MathfMinInt_m2489671107E25s_Il2CppMethodInitialized-(LPC4_0+4))
	mov	r4, r2
	movt	r6, :upper16:(__ZZ33TestClass_MathfMinInt_m2489671107E25s_Il2CppMethodInitialized-(LPC4_0+4))
	mov	r5, r1
LPC4_0:
	add	r6, pc
	ldrb	r0, [r6]
	cbnz	r0, LBB4_2
	movw	r0, :lower16:(L_TestClass_MathfMinInt_m2489671107_MetadataUsageId$non_lazy_ptr-(LPC4_1+4))
	movt	r0, :upper16:(L_TestClass_MathfMinInt_m2489671107_MetadataUsageId$non_lazy_ptr-(LPC4_1+4))
LPC4_1:
	add	r0, pc
	ldr	r0, [r0]
	ldr	r0, [r0]
	bl	__ZN6il2cpp2vm13MetadataCache24InitializeMethodMetadataEj
	movs	r0, #1
	strb	r0, [r6]
LBB4_2:
	movw	r0, :lower16:(L_Mathf_t3464937446_il2cpp_TypeInfo_var$non_lazy_ptr-(LPC4_2+4))
	movt	r0, :upper16:(L_Mathf_t3464937446_il2cpp_TypeInfo_var$non_lazy_ptr-(LPC4_2+4))
LPC4_2:
	add	r0, pc
	ldr	r0, [r0]
	ldr	r0, [r0]
	ldrb.w	r1, [r0, #178]
	lsls	r1, r1, #31
	beq	LBB4_5
	ldr	r1, [r0, #96]
	cbnz	r1, LBB4_5
	bl	__ZN6il2cpp2vm7Runtime9ClassInitEP11Il2CppClass
LBB4_5:
	movs	r0, #0
	mov	r1, r5
	mov	r2, r4
	movs	r3, #0
	pop.w	{r4, r5, r6, r7, lr}
	b.w	_Mathf_Min_m18103608

Almost all of that is the class initialization overhead. Only at the very end do we see the call to Mathf.Min. So let’s see what the C++ for it looks like:

extern "C"  int32_t Mathf_Min_m18103608 (RuntimeObject * __this /* static, unused */, int32_t ___a0, int32_t ___b1, const RuntimeMethod* method)
{
	int32_t V_0 = 0;
	int32_t G_B3_0 = 0;
	{
		int32_t L_0 = ___a0;
		int32_t L_1 = ___b1;
		if ((((int32_t)L_0) >= ((int32_t)L_1)))
		{
			goto IL_000e;
		}
	}
	{
		int32_t L_2 = ___a0;
		G_B3_0 = L_2;
		goto IL_000f;
	}
 
IL_000e:
	{
		int32_t L_3 = ___b1;
		G_B3_0 = L_3;
	}
 
IL_000f:
	{
		V_0 = G_B3_0;
		goto IL_0015;
	}
 
IL_0015:
	{
		int32_t L_4 = V_0;
		return L_4;
	}
}

This is basically the same code as with Math.Min, so we should expect the same assembly. Let’s verify by looking:

	cmp	r1, r2
	it	lt
	movlt	r2, r1
	mov	r0, r2
	bx	lr

This is exactly the same sequence of five instructions as with Math.Min. So from these examples we can conclude that calling Math.Min is strictly faster than calling Mathf.Min. Let's see if we can beat them both, though, by writing the trivial amount of code required directly rather than calling either function:

public static class TestClass
{
	public static int ManualMinInt(int a, int b)
	{
		return a < b ? a : b;
	}
}

Here’s the C++ for this:

extern "C"  int32_t TestClass_ManualMinInt_m171002159 (RuntimeObject * __this /* static, unused */, int32_t ___a0, int32_t ___b1, const RuntimeMethod* method)
{
	int32_t G_B3_0 = 0;
	{
		int32_t L_0 = ___a0;
		int32_t L_1 = ___b1;
		if ((((int32_t)L_0) >= ((int32_t)L_1)))
		{
			goto IL_000d;
		}
	}
	{
		int32_t L_2 = ___a0;
		G_B3_0 = L_2;
		goto IL_000e;
	}
 
IL_000d:
	{
		int32_t L_3 = ___b1;
		G_B3_0 = L_3;
	}
 
IL_000e:
	{
		return G_B3_0;
	}
}

Again, this looks just like the C++ for Math.Min and Mathf.Min. Let’s make sure we get the same assembly output from the C++ compiler:

	cmp	r1, r2
	it	lt
	movlt	r2, r1
	mov	r0, r2
	bx	lr

Once again we have the same five instructions. So we can effectively avoid the function call overhead of Math.Min and the class initialization overhead of Mathf.Min by writing the code directly ourselves.
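
If you want to measure the difference on your own device, a rough timing harness like the following sketch can help. MinBenchmark is a made-up MonoBehaviour, the iteration count is arbitrary, and this is a sanity check rather than a rigorous benchmark:

using System.Diagnostics;
using UnityEngine;

public class MinBenchmark : MonoBehaviour
{
	const int Iterations = 10000000;

	void Start()
	{
		int result = 0;

		// Time Math.Min
		var sw = Stopwatch.StartNew();
		for (int i = 0; i < Iterations; ++i)
		{
			result += System.Math.Min(i, Iterations - i);
		}
		long mathTime = sw.ElapsedMilliseconds;

		// Time Mathf.Min
		sw.Reset();
		sw.Start();
		for (int i = 0; i < Iterations; ++i)
		{
			result += Mathf.Min(i, Iterations - i);
		}
		long mathfTime = sw.ElapsedMilliseconds;

		// Time the manual ternary
		sw.Reset();
		sw.Start();
		for (int i = 0; i < Iterations; ++i)
		{
			int a = i;
			int b = Iterations - i;
			result += a < b ? a : b;
		}
		long manualTime = sw.ElapsedMilliseconds;

		// 'result' is logged so the loops can't be optimized away
		UnityEngine.Debug.LogFormat(
			"Math.Min: {0} ms, Mathf.Min: {1} ms, manual: {2} ms ({3})",
			mathTime,
			mathfTime,
			manualTime,
			result);
	}
}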

The code for Max is essentially the same as the code for Min, so I won't repeat it here. When using float instead of int, Mathf.Min is simply return a < b ? a : b, but Math.Min calls float.IsNaN(float) so that it can return a quiet NaN, which is of dubious value. Writing the code directly is still the fastest approach with float, especially if the NaN handling isn't needed.
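
Here's a minimal sketch of what the manual float versions might look like in that case; note that if either input is NaN, the comparisons below simply return one of the operands rather than guaranteeing a NaN result the way Math.Min does:

public static class TestClass
{
	public static float ManualMinFloat(float a, float b)
	{
		// No float.IsNaN check: NaN inputs aren't handled specially
		return a < b ? a : b;
	}

	public static float ManualMaxFloat(float a, float b)
	{
		return a > b ? a : b;
	}
}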

Vector3

Next, let's look at some Vector3 functionality starting with the sqrMagnitude property:

public static class TestClass
{
	public static float Vector3SqrMagnitude(Vector3 v)
	{
		return v.sqrMagnitude;
	}
}

Here's the C++ that IL2CPP produces:

extern "C"  float TestClass_Vector3SqrMagnitude_m2846473366 (RuntimeObject * __this /* static, unused */, Vector3_t3722313464  ___v0, const RuntimeMethod* method)
{
	{
		float L_0 = Vector3_get_sqrMagnitude_m1474274574((&___v0), /*hidden argument*/NULL);
		return L_0;
	}
}

This is pretty much just a call to the sqrMagnitude property, so let's see how the assembly code looks once this is compiled:

	push	{r7, lr}
	mov	r7, sp
	push	{r1, r2, r3}
	mov	r0, sp
	movs	r1, #0
	bl	_Vector3_get_sqrMagnitude_m1474274574
	add	sp, #12
	pop	{r7, pc}

Again, this is just a call to sqrMagnitude. So let's go see what it's doing:

extern "C"  float Vector3_get_sqrMagnitude_m1474274574 (Vector3_t3722313464 * __this, const RuntimeMethod* method)
{
	float V_0 = 0.0f;
	{
		float L_0 = __this->get_x_0();
		float L_1 = __this->get_x_0();
		float L_2 = __this->get_y_1();
		float L_3 = __this->get_y_1();
		float L_4 = __this->get_z_2();
		float L_5 = __this->get_z_2();
		V_0 = ((float)il2cpp_codegen_add((float)((float)il2cpp_codegen_add((float)((float)il2cpp_codegen_multiply((float)L_0, (float)L_1)), (float)((float)il2cpp_codegen_multiply((float)L_2, (float)L_3)))), (float)((float)il2cpp_codegen_multiply((float)L_4, (float)L_5))));
		goto IL_0030;
	}
 
IL_0030:
	{
		float L_6 = V_0;
		return L_6;
	}
}

The il2cpp_codegen_add and il2cpp_codegen_multiply calls really get in the way here, but this is basically just return v.x * v.x + v.y * v.y + v.z * v.z. Let's see the assembly this compiles to:

	vldr	s2, [r0, #4]
	vldr	s0, [r0]
	vmul.f32	s2, s2, s2
	vldr	s4, [r0, #8]
	vmla.f32	s2, s0, s0
	vmla.f32	s2, s4, s4
	vmov	r0, s2
	bx	lr

All those il2cpp_codegen_add and il2cpp_codegen_multiply calls, goto, extra code blocks, and pointless local variable copies have been removed by the C++ compiler and we're left with instructions for the minimal math.

Now let's try writing our own sqrMagnitude instead of calling the property:

public static class TestClass
{
	public static float ManualSqrMagnitude(Vector3 v)
	{
		return v.x * v.x + v.y * v.y + v.z * v.z;
	}
}

Here's the C++ out of IL2CPP:

extern "C"  float TestClass_ManualSqrMagnitude_m3717072405 (RuntimeObject * __this /* static, unused */, Vector3_t3722313464  ___v0, const RuntimeMethod* method)
{
	{
		float L_0 = (&___v0)->get_x_0();
		float L_1 = (&___v0)->get_x_0();
		float L_2 = (&___v0)->get_y_1();
		float L_3 = (&___v0)->get_y_1();
		float L_4 = (&___v0)->get_z_2();
		float L_5 = (&___v0)->get_z_2();
		return ((float)il2cpp_codegen_add((float)((float)il2cpp_codegen_add((float)((float)il2cpp_codegen_multiply((float)L_0, (float)L_1)), (float)((float)il2cpp_codegen_multiply((float)L_2, (float)L_3)))), (float)((float)il2cpp_codegen_multiply((float)L_4, (float)L_5))));
	}
}

This looks just like sqrMagnitude, so let's confirm that it compiles to the same assembly code:

	vmov	s0, r2
	vmov	s2, r1
	vmul.f32	s0, s0, s0
	vmla.f32	s0, s2, s2
	vmov	s2, r3
	vmla.f32	s0, s2, s2
	vmov	r0, s0
	bx	lr

Indeed, this is identical to the sqrMagnitude code. So we can save some time in performance-critical code by writing the contents of sqrMagnitude directly instead of calling the property.
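
For example, a hypothetical nearest-point search might inline the squared distance rather than computing it through Vector3 operators and the sqrMagnitude property. FindNearest and its parameter names are made up for illustration:

public static class TestClass
{
	// Returns the index of the point closest to 'target', or -1 if the
	// array is empty. The squared distance is computed inline so the
	// loop makes no property or operator calls.
	public static int FindNearest(Vector3[] points, Vector3 target)
	{
		int nearestIndex = -1;
		float nearestSqrDist = float.MaxValue;
		for (int i = 0; i < points.Length; ++i)
		{
			float dx = points[i].x - target.x;
			float dy = points[i].y - target.y;
			float dz = points[i].z - target.z;
			float sqrDist = dx * dx + dy * dy + dz * dz;
			if (sqrDist < nearestSqrDist)
			{
				nearestSqrDist = sqrDist;
				nearestIndex = i;
			}
		}
		return nearestIndex;
	}
}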

Most of Vector3's functionality actually lives in static functions, so let's look at a call to Dot:

public static class TestClass
{
	public static float Vector3Dot(Vector3 a, Vector3 b)
	{
		return Vector3.Dot(a, b);
	}
}

Here's what IL2CPP outputs:

extern "C"  float TestClass_Vector3Dot_m3633618062 (RuntimeObject * __this /* static, unused */, Vector3_t3722313464  ___a0, Vector3_t3722313464  ___b1, const RuntimeMethod* method)
{
	static bool s_Il2CppMethodInitialized;
	if (!s_Il2CppMethodInitialized)
	{
		il2cpp_codegen_initialize_method (TestClass_Vector3Dot_m3633618062_MetadataUsageId);
		s_Il2CppMethodInitialized = true;
	}
	{
		Vector3_t3722313464  L_0 = ___a0;
		Vector3_t3722313464  L_1 = ___b1;
		IL2CPP_RUNTIME_CLASS_INIT(Vector3_t3722313464_il2cpp_TypeInfo_var);
		float L_2 = Vector3_Dot_m606404487(NULL /*static, unused*/, L_0, L_1, /*hidden argument*/NULL);
		return L_2;
	}
}

Type initialization code is back! This time it's there so the static constructor can create cached values for Vector3.up and the other static properties. Let's see what this compiles to with that overhead included:

	push	{r4, r5, r6, r7, lr}
	add	r7, sp, #12
	push.w	{r8, r10, r11}
	sub	sp, #16
	movw	r4, :lower16:(__ZZ32TestClass_Vector3Dot_m3633618062E25s_Il2CppMethodInitialized-(LPC8_0+4))
	mov	r8, r3
	movt	r4, :upper16:(__ZZ32TestClass_Vector3Dot_m3633618062E25s_Il2CppMethodInitialized-(LPC8_0+4))
	mov	r5, r2
LPC8_0:
	add	r4, pc
	mov	r6, r1
	ldrb	r0, [r4]
	cbnz	r0, LBB8_2
	movw	r0, :lower16:(L_TestClass_Vector3Dot_m3633618062_MetadataUsageId$non_lazy_ptr-(LPC8_1+4))
	movt	r0, :upper16:(L_TestClass_Vector3Dot_m3633618062_MetadataUsageId$non_lazy_ptr-(LPC8_1+4))
LPC8_1:
	add	r0, pc
	ldr	r0, [r0]
	ldr	r0, [r0]
	bl	__ZN6il2cpp2vm13MetadataCache24InitializeMethodMetadataEj
	movs	r0, #1
	strb	r0, [r4]
LBB8_2:
	movw	r0, :lower16:(L_Vector3_t3722313464_il2cpp_TypeInfo_var$non_lazy_ptr-(LPC8_2+4))
	ldr	r4, [r7, #16]
	movt	r0, :upper16:(L_Vector3_t3722313464_il2cpp_TypeInfo_var$non_lazy_ptr-(LPC8_2+4))
	ldr.w	r11, [r7, #12]
LPC8_2:
	add	r0, pc
	ldr.w	r10, [r7, #8]
	ldr	r0, [r0]
	ldr	r0, [r0]
	ldrb.w	r1, [r0, #178]
	lsls	r1, r1, #31
	beq	LBB8_5
	ldr	r1, [r0, #96]
	cbnz	r1, LBB8_5
	bl	__ZN6il2cpp2vm7Runtime9ClassInitEP11Il2CppClass
	movs	r0, #0
	strd	r10, r11, [sp]
	strd	r4, r0, [sp, #8]
	movs	r0, #0
	mov	r1, r6
	mov	r2, r5
	mov	r3, r8
	bl	_Vector3_Dot_m606404487
	add	sp, #16
	pop.w	{r8, r10, r11}
	pop	{r4, r5, r6, r7, pc}

This should look familiar from the Mathf.Min call earlier: a huge amount of type initialization code with the call to Vector3.Dot at the very end. Let's go look at Vector3.Dot:

extern "C"  float Vector3_Dot_m606404487 (RuntimeObject * __this /* static, unused */, Vector3_t3722313464  ___lhs0, Vector3_t3722313464  ___rhs1, const RuntimeMethod* method)
{
	float V_0 = 0.0f;
	{
		float L_0 = (&___lhs0)->get_x_0();
		float L_1 = (&___rhs1)->get_x_0();
		float L_2 = (&___lhs0)->get_y_1();
		float L_3 = (&___rhs1)->get_y_1();
		float L_4 = (&___lhs0)->get_z_2();
		float L_5 = (&___rhs1)->get_z_2();
		V_0 = ((float)il2cpp_codegen_add((float)((float)il2cpp_codegen_add((float)((float)il2cpp_codegen_multiply((float)L_0, (float)L_1)), (float)((float)il2cpp_codegen_multiply((float)L_2, (float)L_3)))), (float)((float)il2cpp_codegen_multiply((float)L_4, (float)L_5))));
		goto IL_0036;
	}
 
IL_0036:
	{
		float L_6 = V_0;
		return L_6;
	}
}

Since sqrMagnitude is essentially just Dot(this, this), this code should look very familiar. Here's the assembly it compiles to:

	vldr	s0, [sp, #4]
	vmov	s2, r2
	vmov	s4, r1
	vmul.f32	s0, s2, s0
	vldr	s2, [sp]
	vmla.f32	s0, s4, s2
	vldr	s2, [sp, #8]
	vmov	s4, r3
	vmla.f32	s0, s4, s2
	vmov	r0, s0
	bx	lr

This is a bit longer than sqrMagnitude since it has to deal with two vectors, but still just the straightforward code you'd expect.

Now let's see if we can beat this with our own version of Dot written directly into the function:

public static class TestClass
{
	public static float ManualDot(Vector3 a, Vector3 b)
	{
		return a.x * b.x + a.y * b.y + a.z * b.z;
	}
}

Here's the IL2CPP output:

extern "C"  float TestClass_ManualDot_m558301483 (RuntimeObject * __this /* static, unused */, Vector3_t3722313464  ___a0, Vector3_t3722313464  ___b1, const RuntimeMethod* method)
{
	{
		float L_0 = (&___a0)->get_x_0();
		float L_1 = (&___b1)->get_x_0();
		float L_2 = (&___a0)->get_y_1();
		float L_3 = (&___b1)->get_y_1();
		float L_4 = (&___a0)->get_z_2();
		float L_5 = (&___b1)->get_z_2();
		return ((float)il2cpp_codegen_add((float)((float)il2cpp_codegen_add((float)((float)il2cpp_codegen_multiply((float)L_0, (float)L_1)), (float)((float)il2cpp_codegen_multiply((float)L_2, (float)L_3)))), (float)((float)il2cpp_codegen_multiply((float)L_4, (float)L_5))));
	}
}

This looks just like Vector3.Dot, so let's confirm the assembly also looks the same:

	vldr	s0, [sp, #4]
	vmov	s2, r2
	vmov	s4, r1
	vmul.f32	s0, s2, s0
	vldr	s2, [sp]
	vmla.f32	s0, s4, s2
	vldr	s2, [sp, #8]
	vmov	s4, r3
	vmla.f32	s0, s4, s2
	vmov	r0, s0
	bx	lr

We've successfully removed both the function call and type initialization overhead from our code that takes the dot product of two vectors. While this is somewhat less clear code, the tradeoff may be worthwhile in performance hotspots.
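
As a usage sketch, here's a hypothetical plane-side test written the same way; IsInFront and its parameter names are made up for illustration:

public static class TestClass
{
	// Returns true if 'point' is on the side of the plane (defined by
	// 'origin' and 'normal') that the normal points toward, using an
	// inline dot product instead of Vector3.Dot and operator-.
	public static bool IsInFront(Vector3 point, Vector3 origin, Vector3 normal)
	{
		float dx = point.x - origin.x;
		float dy = point.y - origin.y;
		float dz = point.z - origin.z;
		return dx * normal.x + dy * normal.y + dz * normal.z > 0.0f;
	}
}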

Strings

One very common string operation is to call string.IsNullOrEmpty to check for, well, a string that is either null or empty (Length == 0):

public static class TestClass
{
	public static bool StringIsNullOrEmpty(string s)
	{
		return string.IsNullOrEmpty(s);
	}
}

Here's what IL2CPP outputs:

extern "C"  bool TestClass_StringIsNullOrEmpty_m285996536 (RuntimeObject * __this /* static, unused */, String_t* ___s0, const RuntimeMethod* method)
{
	static bool s_Il2CppMethodInitialized;
	if (!s_Il2CppMethodInitialized)
	{
		il2cpp_codegen_initialize_method (TestClass_StringIsNullOrEmpty_m285996536_MetadataUsageId);
		s_Il2CppMethodInitialized = true;
	}
	{
		String_t* L_0 = ___s0;
		IL2CPP_RUNTIME_CLASS_INIT(String_t_il2cpp_TypeInfo_var);
		bool L_1 = String_IsNullOrEmpty_m2969720369(NULL /*static, unused*/, L_0, /*hidden argument*/NULL);
		return L_1;
	}
}

Again we have the dreaded type initialization overhead. This time it supports string.Empty, which is static readonly rather than const. The pattern should be clear at this point, but let's look at the assembly this compiles to anyway:

	push	{r4, r5, r7, lr}
	add	r7, sp, #8
	movw	r5, :lower16:(__ZZ40TestClass_StringIsNullOrEmpty_m285996536E25s_Il2CppMethodInitialized-(LPC10_0+4))
	mov	r4, r1
	movt	r5, :upper16:(__ZZ40TestClass_StringIsNullOrEmpty_m285996536E25s_Il2CppMethodInitialized-(LPC10_0+4))
	add	r5, pc
	ldrb	r0, [r5]
	cbnz	r0, LBB10_2
	movw	r0, :lower16:(L_TestClass_StringIsNullOrEmpty_m285996536_MetadataUsageId$non_lazy_ptr-(LPC10_1+4))
	movt	r0, :upper16:(L_TestClass_StringIsNullOrEmpty_m285996536_MetadataUsageId$non_lazy_ptr-(LPC10_1+4))
LPC10_1:
	add	r0, pc
	ldr	r0, [r0]
	ldr	r0, [r0]
	bl	__ZN6il2cpp2vm13MetadataCache24InitializeMethodMetadataEj
	movs	r0, #1
	strb	r0, [r5]
LBB10_2:
	movw	r0, :lower16:(L_String_t_il2cpp_TypeInfo_var$non_lazy_ptr-(LPC10_2+4))
	movt	r0, :upper16:(L_String_t_il2cpp_TypeInfo_var$non_lazy_ptr-(LPC10_2+4))
LPC10_2:
	add	r0, pc
	ldr	r0, [r0]
	ldr	r0, [r0]
	ldrb.w	r1, [r0, #178]
	lsls	r1, r1, #31
	beq	LBB10_5
	ldr	r1, [r0, #96]
	cbnz	r1, LBB10_5
	bl	__ZN6il2cpp2vm7Runtime9ClassInitEP11Il2CppClass
LBB10_5:
	movs	r0, #0
	mov	r1, r4
	movs	r2, #0
	pop.w	{r4, r5, r7, lr}
	b.w	_String_IsNullOrEmpty_m2969720369

Predictably, this is all type initialization code followed by the actual call to string.IsNullOrEmpty. Let's look at its implementation:

extern "C"  bool String_IsNullOrEmpty_m2969720369 (RuntimeObject * __this /* static, unused */, String_t* ___value0, const RuntimeMethod* method)
{
	int32_t G_B3_0 = 0;
	{
		String_t* L_0 = ___value0;
		if (!L_0)
		{
			goto IL_0011;
		}
	}
	{
		String_t* L_1 = ___value0;
		NullCheck(L_1);
		int32_t L_2 = String_get_Length_m3847582255(L_1, /*hidden argument*/NULL);
		G_B3_0 = ((((int32_t)L_2) == ((int32_t)0))? 1 : 0);
		goto IL_0012;
	}
 
IL_0011:
	{
		G_B3_0 = 1;
	}
 
IL_0012:
	{
		return (bool)G_B3_0;
	}
}

IL2CPP has made this tough to read, but it's basically just checking for null and Length == 0. Now let's see what it compiles to:

	cbz	r1, LBB478_2
	ldr	r1, [r1, #8]
	movs	r0, #0
	cmp	r1, #0
	it	eq
	moveq	r0, #1
	bx	lr
LBB478_2:
	movs	r0, #1
	bx	lr

The C++ compiler has once again removed all the syntactic noise and left us with code that simply performs the two checks and returns the result. Note that the redundant NullCheck call has also been eliminated as the C++ compiler correctly determined that it could never take effect.

Let's see how well we can do by performing our own checks directly, in combination with disabling IL2CPP's null checks since we're already performing our own:

public static class TestClass
{
	public static bool ManualIsNullOrEmpty(string s)
	{
		return s == null || s.Length == 0;
	}
}

IL2CPP turns this into the following C++:

extern "C"  bool TestClass_ManualIsNullOrEmpty_m2282295196 (RuntimeObject * __this /* static, unused */, String_t* ___s0, const RuntimeMethod* method)
{
	int32_t G_B3_0 = 0;
	{
		String_t* L_0 = ___s0;
		if (!L_0)
		{
			goto IL_0011;
		}
	}
	{
		String_t* L_1 = ___s0;
		NullCheck(L_1);
		int32_t L_2 = String_get_Length_m3847582255(L_1, /*hidden argument*/NULL);
		G_B3_0 = ((((int32_t)L_2) == ((int32_t)0))? 1 : 0);
		goto IL_0012;
	}
 
IL_0011:
	{
		G_B3_0 = 1;
	}
 
IL_0012:
	{
		return (bool)G_B3_0;
	}
}

This looks exactly the same as string.IsNullOrEmpty. Let's confirm that it compiles to the same machine code:

	push	{r4, r7, lr}
	add	r7, sp, #4
	cbz	r1, LBB11_2
	mov	r0, r1
	movs	r1, #0
	movs	r4, #0
	bl	_String_get_Length_m3847582255
	cmp	r0, #0
	it	eq
	moveq	r4, #1
	b	LBB11_3
LBB11_2:
	movs	r4, #1
LBB11_3:
	mov	r0, r4
	pop	{r4, r7, pc}

Unfortunately, the call to get the Length property is not inlined here, unlike in string.IsNullOrEmpty. So while we're able to remove the type initialization overhead, we can't remove the function call overhead.
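
As for disabling IL2CPP's automatic null checks, Unity ships an Il2CppSetOption attribute for exactly that purpose; to use it, you copy Il2CppSetOptionAttribute.cs from the IL2CPP directory of the Unity installation into the project. Here's a sketch of how it might be applied to this method, assuming that attribute source has been added to your project:

using Unity.IL2CPP.CompilerServices;

public static class TestClass
{
	// We already check for null ourselves, so IL2CPP's generated null
	// check before the Length access is redundant and can be disabled.
	[Il2CppSetOption(Option.NullChecks, false)]
	public static bool ManualIsNullOrEmpty(string s)
	{
		return s == null || s.Length == 0;
	}
}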

Conclusion

Adding a static constructor to a class adds type initialization overhead to every call site of its static functions. This is true whether the static constructor is written explicitly or generated implicitly by setting static field values inline. Unfortunately, this applies to common classes like Mathf, Vector3, and string. We can work around it in one of two ways. First, in trivial cases like Mathf.Min, Vector3.Dot, and string.IsNullOrEmpty, we can simply write the code to perform the operation rather than calling the function. Second, in non-trivial cases, we can implement our own version in a class that doesn't have a static constructor, as sketched below. While it's a shame to have to rewrite code that has already been provided to us, sometimes this is a worthy price to pay for improved performance in a critical area of the codebase.
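
Here's a minimal sketch of what such a helper class might look like; FastUtil is a made-up name, and the point is simply that it has no static constructor, so callers pay for the function call but not for the initialization checks:

public static class FastUtil
{
	// No static constructor and no inline-initialized static readonly
	// fields, so IL2CPP adds no initialization checks at call sites.
	public static int Min(int a, int b)
	{
		return a < b ? a : b;
	}

	public static float Dot(Vector3 a, Vector3 b)
	{
		return a.x * b.x + a.y * b.y + a.z * b.z;
	}

	public static bool IsNullOrEmpty(string s)
	{
		return s == null || s.Length == 0;
	}
}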