SIMD Bit-Gather: X86 AVX2 Edition

2025-06-22

Here is a small micro-optimization I implemented for the new version of my Unity terrain generator. It is a simple bit-gather, vectorized using x86 AVX2 intrinsics.
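The core trick: the corner flags are packed one per bit into an array of uints, so testing corner index i means reading uint i / 32 and extracting bit i % 32. A minimal scalar sketch of one such lookup (my illustration, not code from the generator; bits stands in for the packed array):

bool IsSet(uint[] bits, int index) {
    int component = index >> 5; // index / 32: which uint holds the bit
    int shift = index & 31;     // index % 32: which bit inside that uint
    return ((bits[component] >> shift) & 1u) == 1;
}
// e.g. index 70 lands in bits[2] (70 / 32 == 2), at bit 6 (70 % 32 == 6)

The AVX2 version below performs eight of these lookups at once.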

For some reason, the Burst compiler did not want to vectorize the inner loop of Load4, so I wanted to see if manually placed intrinsics would be faster. They were: 3ms faster on the median time for a chunk of volume 66^3, though for a chunk of volume 34^3 the improvement is most likely marginal. I will probably remove this for the sake of code cleanliness and upgradeability, since raw intrinsics like these make modifying the code far more convoluted and annoying for such a small speedup. But I will leave this here on my website as a way to archive it, because I still feel proud of the optimization, even if it is not that great.

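// Requires: using System.Runtime.CompilerServices; using Unity.Burst.Intrinsics;
//           using Unity.Collections.LowLevel.Unsafe; using Unity.Mathematics;
// "offsets" (presumably two int4s of corner index offsets) and "bits" (presumably a
// NativeArray<uint> bitfield, given the GetUnsafeReadOnlyPtr call) are fields of the
// containing class, not shown here.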
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe int CalculateMarchingCubesCode(uint3 position, int baseIndex) {
    // https://docs.unity3d.com/Packages/com.unity.burst@1.4/manual/docs/CSharpLanguageSupport_BurstIntrinsics.html
    // I LOVE MICROOPTIMIZATIONS!!! I LOVE DOING THIS ON A WHIM WITHOUT ACTUALLY TRUSTING PROFILER DATA!!!!
    // I actually profiled this and it is actually faster. Saved 3ms on the median time. Pretty good desu
    if (X86.Avx2.IsAvx2Supported) {
        int4 indices = offsets[0] + new int4(baseIndex);
        int4 indices2 = offsets[1] + new int4(baseIndex);

        v256 indices_v256 = new v256(indices.x, indices.y, indices.z, indices.w, indices2.x, indices2.y, indices2.z, indices2.w);

        // divide by 32 to find which uint holds each bit
        v256 component_v256 = X86.Avx2.mm256_srli_epi32(indices_v256, 5);

        // modulo by 32 to find the bit position inside that uint
        v256 shift_v256 = X86.Avx2.mm256_and_si256(indices_v256, new v256(31u));

        // fetch the eight uints in a single gather (scale of 4 bytes per element)
        v256 uints_v256 = X86.Avx2.mm256_i32gather_epi32(bits.GetUnsafeReadOnlyPtr(), component_v256, 4);

        // shift each lane right by its own amount, then mask off everything but the lowest bit
        v256 shifted_right_v256 = X86.Avx2.mm256_srlv_epi32(uints_v256, shift_v256);
        v256 anded_v256 = X86.Avx2.mm256_and_si256(shifted_right_v256, new v256(1u));

        // check if the bits are set
        uint4 sets1 = new uint4(anded_v256.UInt0, anded_v256.UInt1, anded_v256.UInt2, anded_v256.UInt3);
        uint4 sets2 = new uint4(anded_v256.UInt4, anded_v256.UInt5, anded_v256.UInt6, anded_v256.UInt7);
        // a lane counts as set when the extracted bit is 1, matching the Load4 fallback below
        return math.bitmask(sets1 == 1u) | (math.bitmask(sets2 == 1u) << 4);
    } else {
        bool4 test = Load4(position, 0, baseIndex);
        bool4 test2 = Load4(position, 1, baseIndex);
        return math.bitmask(test) | (math.bitmask(test2) << 4);
    }
}
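
Each math.bitmask call packs a bool4 into the low four bits of an int (lane x goes to bit 0), which is how the eight per-corner tests collapse into a single 8-bit marching cubes case index:

// math.bitmask(new bool4(true, false, true, true)) == 0b1101 == 13
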
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool4 Load4(uint3 position, int selector, int baseIndex) {
    uint4 indices = (uint4)(offsets[selector] + new int4(baseIndex));

    bool4 hits = false;

    // the scalar loop that Burst refused to vectorize, hence the intrinsics above
    for (int i = 0; i < 4; i++) {
        int index = (int)indices[i];

        int component = index / 32;
        int shift = index % 32;

        uint batch = bits[component];

        hits[i] = ((batch >> shift) & 1U) == 1;
    }

    return hits;
}
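
A nice property of this pattern, per the Burst docs linked above: X86.Avx2.IsAvx2Supported is treated as a compile-time constant during Burst compilation, so the untaken branch is stripped out entirely and targets without AVX2 only ever compile the scalar Load4 path.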