Burst 1.0.1 is a patch-level update to the original 1.0.0 release, but it actually contains a useful new feature: we’re now able to force methods to be inlined. Read on to see how!

Update: A Russian translation of this article is available.

First, let’s create a Burst-compiled job that has a big method:

// Burst-compiled job whose helper method is deliberately long, so that the
// difference between a called helper and an inlined helper is visible in the
// generated assembly.
[BurstCompile]
struct BigMethodJob : IJob
{
    // Inputs: Execute passes each of these to PointlessAddition once.
    [ReadOnly] public int Val1;
    [ReadOnly] public int Val2;
    // Output: Execute writes its single result to element 0.
    [WriteOnly] public NativeArray<float> Result;
 
    // Job entry point: adds PointlessAddition(Val1) and PointlessAddition(Val2)
    // and stores the total in Result[0].
    public void Execute()
    {
        float f = PointlessAddition(Val1);
        f += PointlessAddition(Val2);
        Result[0] = f;
    }
 
    // Returns the sum of (val + k) for k = 1..40, accumulated in a float.
    // Each (val + k) is an int expression that is converted to float when it
    // is added to the accumulator.
    // The 40 additions are intentionally written out one statement at a time:
    // the method only needs to contain enough code that Burst does not inline
    // it on its own.
    float PointlessAddition(int val)
    {
        float f = 0;
        f += val+1;
        f += val+2;
        f += val+3;
        f += val+4;
        f += val+5;
        f += val+6;
        f += val+7;
        f += val+8;
        f += val+9;
        f += val+10;
        f += val+11;
        f += val+12;
        f += val+13;
        f += val+14;
        f += val+15;
        f += val+16;
        f += val+17;
        f += val+18;
        f += val+19;
        f += val+20;
        f += val+21;
        f += val+22;
        f += val+23;
        f += val+24;
        f += val+25;
        f += val+26;
        f += val+27;
        f += val+28;
        f += val+29;
        f += val+30;
        f += val+31;
        f += val+32;
        f += val+33;
        f += val+34;
        f += val+35;
        f += val+36;
        f += val+37;
        f += val+38;
        f += val+39;
        f += val+40;
        return f;
    }
}

The job is pretty pointless, but has enough code to prevent Burst from inlining the body of PointlessAddition into the two call sites in Execute. Let’s prove that by looking at Burst Inspector:

; Execute
; Non-inlined build: PointlessAddition is reached through two indirect calls
; (x86-64, Intel syntax, as displayed by Burst Inspector).
; r14 and rbx are saved because they are overwritten below.
push    r14
push    rbx
; Reserves 8 bytes of stack; [rsp + 4] is used further down as a spill slot
; and the space is released by the "add rsp, 8" before the pops.
push    rax
; Keep the incoming pointer in rbx so it is still available after the calls.
mov     rbx, rdi
; Argument for the first call: the 32-bit value at offset 0 (presumably Val1).
mov     edi, dword ptr [rbx]
; The call target is loaded once into r14 and reused for both calls.
movabs  r14, offset ".LBigMethodJob.PointlessAddition(BigMethodJob* this, int val)_4CCC87A9ADCAE58C"
call    r14
; Spill the first result (returned in xmm0) across the second call.
movss   dword ptr [rsp + 4], xmm0
; Argument for the second call: the 32-bit value at offset 4 (presumably Val2).
mov     edi, dword ptr [rbx + 4]
call    r14
; xmm0 = second result + spilled first result.
addss   xmm0, dword ptr [rsp + 4]
; Load the pointer stored at offset 8 (presumably Result's buffer) and write
; the sum to the float it points at.
mov     rax, qword ptr [rbx + 8]
movss   dword ptr [rax], xmm0
add     rsp, 8
pop     rbx
pop     r14
ret
 
; PointlessAddition
; Stand-alone (non-inlined) body, vectorised with SSE: the argument arrives in
; edi and the float result is left in the low lane of xmm0.
; Broadcast the argument into all four 32-bit lanes of xmm0.
movd    xmm0, edi
pshufd  xmm0, xmm0, 0
; Load eight 128-bit constants (.LCPI1_0 .. .LCPI1_7) and add the broadcast
; argument to each with a packed 32-bit integer add. The constants' contents
; are not shown in this listing; presumably they hold the +1..+40 offsets
; (ten vectors of four lanes in total, counting .LCPI1_8 and .LCPI1_9 below).
movabs  rax, offset .LCPI1_0
movdqa  xmm8, xmmword ptr [rax]
paddd   xmm8, xmm0
movabs  rax, offset .LCPI1_1
movdqa  xmm2, xmmword ptr [rax]
paddd   xmm2, xmm0
movabs  rax, offset .LCPI1_2
movdqa  xmm3, xmmword ptr [rax]
paddd   xmm3, xmm0
movabs  rax, offset .LCPI1_3
movdqa  xmm4, xmmword ptr [rax]
paddd   xmm4, xmm0
movabs  rax, offset .LCPI1_4
movdqa  xmm5, xmmword ptr [rax]
paddd   xmm5, xmm0
movabs  rax, offset .LCPI1_5
movdqa  xmm6, xmmword ptr [rax]
paddd   xmm6, xmm0
movabs  rax, offset .LCPI1_6
movdqa  xmm7, xmmword ptr [rax]
paddd   xmm7, xmm0
movabs  rax, offset .LCPI1_7
movdqa  xmm1, xmmword ptr [rax]
paddd   xmm1, xmm0
; Convert the integer vectors to packed floats and add them together.
; xmm5 collects the vectors built from .LCPI1_4 .. .LCPI1_7.
cvtdq2ps        xmm1, xmm1
cvtdq2ps        xmm7, xmm7
cvtdq2ps        xmm6, xmm6
cvtdq2ps        xmm5, xmm5
addps   xmm5, xmm6
addps   xmm5, xmm1
addps   xmm5, xmm7
; xmm4 collects the vectors built from .LCPI1_0 .. .LCPI1_3 plus xmm5, so it
; ends up holding the lane-wise sum of the first eight vectors.
cvtdq2ps        xmm1, xmm4
cvtdq2ps        xmm3, xmm3
cvtdq2ps        xmm2, xmm2
cvtdq2ps        xmm4, xmm8
addps   xmm4, xmm2
addps   xmm4, xmm1
addps   xmm4, xmm5
addps   xmm4, xmm3
; Last two constants: xmm1 = float(.LCPI1_8 + val) + float(.LCPI1_9 + val).
movabs  rax, offset .LCPI1_8
movdqa  xmm1, xmmword ptr [rax]
paddd   xmm1, xmm0
movabs  rax, offset .LCPI1_9
paddd   xmm0, xmmword ptr [rax]
cvtdq2ps        xmm0, xmm0
cvtdq2ps        xmm1, xmm1
addps   xmm1, xmm0
; Horizontal reduction of xmm4: fold the high half onto the low half, then a
; horizontal add leaves the sum of all four lanes in the low lane of xmm2.
movaps  xmm2, xmm4
movhlps xmm2, xmm2
addps   xmm2, xmm4
haddps  xmm2, xmm2
; Same reduction for xmm1, leaving its four-lane sum in the low lane of xmm0.
movaps  xmm0, xmm1
movhlps xmm0, xmm0
addps   xmm0, xmm1
haddps  xmm0, xmm0
; Combine the two partial sums; the result is returned in xmm0.
addss   xmm0, xmm2
ret

The meaning of most of this assembly doesn’t really matter. The important part is right at the top in Execute where we see two call instructions. These call the PointlessAddition method twice. Then we see the rest of the instructions are for PointlessAddition. This shows that the method is being called rather than having its instructions directly pasted into Execute.

Now let’s try a new feature from Burst 1.0.1 and enable inlining of the method with [MethodImpl]:

// MethodImplAttribute and MethodImplOptions are declared in the
// System.Runtime.CompilerServices namespace.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
float PointlessAddition(int val)

Now when we look in Burst Inspector we see no call instructions and a lot more total instructions:

; Execute with PointlessAddition inlined at both call sites: there are no
; call instructions and no stack traffic, only two long scalar chains.
; ecx = 32-bit value at offset 0 and eax = 32-bit value at offset 4 of the
; structure pointed to by rdi (presumably Val1 and Val2).
mov     ecx, dword ptr [rdi]
mov     eax, dword ptr [rdi + 4]
; xmm0 = float(ecx + 1). This term is set aside and only added at the very end.
lea     edx, [rcx + 1]
cvtsi2ss        xmm0, edx
; First chain: running sum of float(ecx + k) for k = 2..40. The accumulator
; alternates between xmm1 and xmm2; each step zeroes the destination with
; xorps before cvtsi2ss writes the converted value into it.
lea     edx, [rcx + 2]
cvtsi2ss        xmm1, edx
lea     edx, [rcx + 3]
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 4]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 5]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 6]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 7]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 8]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 9]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 10]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 11]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 12]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 13]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 14]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 15]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 16]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 17]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 18]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 19]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 20]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 21]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 22]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 23]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 24]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 25]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 26]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 27]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 28]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 29]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 30]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 31]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 32]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 33]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 34]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 35]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 36]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 37]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
lea     edx, [rcx + 38]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, edx
addss   xmm1, xmm2
lea     edx, [rcx + 39]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, edx
addss   xmm2, xmm1
; Last term of the first chain: ecx itself is bumped to ecx + 40 because the
; first value is not needed afterwards. xmm1 then holds the k = 2..40 sum.
add     ecx, 40
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
; Second chain: the same pattern over eax, now using ecx as the scratch
; register. xmm3 = float(eax + 2) + (first chain's sum) + float(eax + 1), so
; the accumulation simply continues on top of the first chain's total.
lea     ecx, [rax + 1]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
lea     ecx, [rax + 2]
cvtsi2ss        xmm3, ecx
addss   xmm3, xmm1
addss   xmm3, xmm2
lea     ecx, [rax + 3]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm3
lea     ecx, [rax + 4]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 5]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 6]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 7]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 8]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 9]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 10]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 11]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 12]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 13]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 14]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 15]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 16]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 17]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 18]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 19]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 20]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 21]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 22]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 23]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 24]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 25]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 26]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 27]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 28]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 29]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 30]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 31]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 32]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 33]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 34]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 35]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 36]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 37]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
lea     ecx, [rax + 38]
xorps   xmm2, xmm2
cvtsi2ss        xmm2, ecx
addss   xmm2, xmm1
lea     ecx, [rax + 39]
xorps   xmm1, xmm1
cvtsi2ss        xmm1, ecx
addss   xmm1, xmm2
; Last term of the second chain (eax + 40), accumulated into xmm2.
add     eax, 40
xorps   xmm2, xmm2
cvtsi2ss        xmm2, eax
addss   xmm2, xmm1
; Add the float(first value + 1) term that was set aside in xmm0 at the top.
addss   xmm2, xmm0
; Load the pointer stored at offset 8 (presumably Result's buffer) and write
; the total to the float it points at.
mov     rax, qword ptr [rdi + 8]
movss   dword ptr [rax], xmm2
ret

This has effectively removed the function calls and duplicated the function within Execute.

Now that we’ve seen method inlining work, the question becomes whether we should use it or not and under what circumstances. As an anecdote, I needed to add a lot of code to the method in order for Burst to not automatically inline it. This means that it’s already aggressively inlining methods, even without the attribute. So [MethodImpl] should only be necessary when Burst isn’t inlining a short method that it really should have inlined. Large methods generally don’t need to be inlined because the cost of calling them is relatively low compared to the cost of executing their body. This leads to a rule of thumb: force method inlining sparingly.