SIMD Bit-Gather: X86 AVX2 Edition
Here is a small micro-optimization that I implemented for the new version of the terrain generator in Unity. It is a simple bit-gather, but vectorized using X86 AVX2 intrinsics.
For some reason, Burst compiler did not want to vectorize the Load4 internal loop, so I wanted to see if using manually placed intrinsics would be faster. It was faster by 3ms on the median time for a chunk of volume 66^3, but for a chunk with volume 34^3, this is most probably only a marginal improvement. I will most probably remove this for the sake of code cleanliness and upgradeability, since having raw intrinsics like that make modifying the code extremely more convoluted and annoying for such a small speedup. But, I will leave this here on my website just as a way to archive it because I still feel proud of such an optimization, even though it's not that good.
| 1 | [MethodImpl(MethodImplOptions.AggressiveInlining)] | 2 | private unsafe int CalculateMarchingCubesCode(uint3 position, int baseIndex) { | 3 |  |  | 4 |  |  | 5 |  |  | 6 | if (X86.Avx2.IsAvx2Supported) { | 7 | int4 indices = (offsets[0] + new int4(baseIndex)); |  | 8 | int4 indices2 = (offsets[1] + new int4(baseIndex)); |  | 9 |  |  | 10 | v256 indices_v256 = new v256(indices.x, indices.y, indices.z, indices.w, indices2.x, indices2.y, indices2.z, indices2.w); |  | 11 |  |  | 12 |  |  | 13 | v256 component_v256 = X86.Avx2.mm256_srli_epi32(indices_v256, 5); |  | 14 |  |  | 15 |  |  | 16 | v256 shift_v256 = X86.Avx2.mm256_and_si256(indices_v256, new v256(31u)); |  | 17 |  |  | 18 |  |  | 19 | v256 uints_v256 = X86.Avx2.mm256_i32gather_epi32(bits.GetUnsafeReadOnlyPtr(), component_v256, 4); |  | 20 |  |  | 21 |  |  | 22 | v256 shifted_right_v256 = X86.Avx2.mm256_srlv_epi32(uints_v256, shift_v256); |  | 23 | v256 anded_v256 = X86.Avx2.mm256_and_si256(shifted_right_v256, new v256(1u)); |  | 24 |  |  | 25 |  |  | 26 | uint4 sets1 = new uint4(anded_v256.UInt0, anded_v256.UInt1, anded_v256.UInt2, anded_v256.UInt3); |  | 27 | uint4 sets2 = new uint4(anded_v256.UInt4, anded_v256.UInt5, anded_v256.UInt6, anded_v256.UInt7); |  | 28 | return math.bitmask(sets1 == 0) | (math.bitmask(sets2 == 0) << 4); |  | 29 | } else { | 30 | bool4 test = Load4(position, 0, baseIndex); |  | 31 | bool4 test2 = Load4(position, 1, baseIndex); |  | 32 | return math.bitmask(test) | (math.bitmask(test2) << 4); |  | 33 | } |  | 34 | } |  | 35 |  |  | 36 |  |  | 37 | [MethodImpl(MethodImplOptions.AggressiveInlining)] |  | 38 | private bool4 Load4(uint3 position, int selector, int baseIndex) { |  | 39 | uint4 indices = (uint4)(offsets[selector] + new int4(baseIndex)); |  | 40 |  |  | 41 | bool4 hits = false; |  | 42 |  |  | 43 | for (int i = 0; i < 4; i++) { | 44 | int index = (int)indices[i]; |  | 45 |  |  | 46 | int component = index / 32; |  | 47 | int shift = index % 32; |  | 48 |  |  | 49 | uint batch = bits[component]; |  | 50 |  |  | 51 | hits[i] = ((batch >> shift) & 1U) == 1; |  | 52 | } |  | 53 |  |  | 54 | return hits; |  | 55 | } |  |  |  |  |  |