SIMD Bit-Gather: X86 AVX2 Edition
Here is a small micro-optimization that I implemented for the new version of my Unity terrain generator. It is a simple bit-gather, but vectorized by hand using X86 AVX2 intrinsics.
For some reason, the Burst compiler did not want to vectorize the inner loop of Load4, so I wanted to see if manually placed intrinsics would be faster. They were, by 3 ms on the median time for a chunk of volume 66^3; for a chunk of volume 34^3, though, it is most probably only a marginal improvement. I will most likely remove this for the sake of code cleanliness and upgradeability, since raw intrinsics like these make modifying the code far more convoluted and annoying for such a small speedup. But I will leave it here on my website as a way to archive it, because I still feel proud of the optimization, even though it is not that big of a win.
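For context, the two methods below lean on a few fields that live elsewhere in the class: offsets holds the linear offsets of the eight cell corners (split into two int4s), and bits is the packed per-voxel bitfield that gets tested per corner. The sketch below is only an illustration of how such fields could look, assuming a NativeArray<uint> with one bit per voxel and an x + y * size + z * size * size layout; the actual strides and corner order in my generator may differ.

// Hypothetical setup, for illustration only; not the actual fields from my generator.
// Assumes 1 bit per voxel packed into a NativeArray<uint> and a linear
// x + y * size + z * size * size volume layout.
private NativeArray<uint> bits;    // packed voxel bits, 32 per uint
private NativeArray<int4> offsets; // [0] = bottom 4 corners, [1] = top 4 corners

private void InitCornerOffsets(int size) {
    // offsets is assumed to be an already allocated NativeArray<int4> of length 2.
    int dx = 1, dy = size, dz = size * size;
    offsets[0] = new int4(0, dx, dx + dy, dy);                  // bottom face of the cell
    offsets[1] = new int4(dz, dx + dz, dx + dy + dz, dy + dz);  // top face of the cell
}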
// Needs Unity.Burst.Intrinsics (X86, v256), Unity.Mathematics, and
// Unity.Collections.LowLevel.Unsafe (for GetUnsafeReadOnlyPtr).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe int CalculateMarchingCubesCode(uint3 position, int baseIndex) {
    if (X86.Avx2.IsAvx2Supported) {
        // Linear indices of the 8 cell corners, 4 per int4.
        int4 indices = offsets[0] + new int4(baseIndex);
        int4 indices2 = offsets[1] + new int4(baseIndex);

        v256 indices_v256 = new v256(indices.x, indices.y, indices.z, indices.w, indices2.x, indices2.y, indices2.z, indices2.w);

        // index / 32: which packed uint holds each bit.
        v256 component_v256 = X86.Avx2.mm256_srli_epi32(indices_v256, 5);

        // index % 32: bit position inside that uint.
        v256 shift_v256 = X86.Avx2.mm256_and_si256(indices_v256, new v256(31u));

        // Gather the 8 packed uints in a single instruction.
        v256 uints_v256 = X86.Avx2.mm256_i32gather_epi32(bits.GetUnsafeReadOnlyPtr(), component_v256, 4);

        // Shift each lane by its own amount and isolate the bit.
        v256 shifted_right_v256 = X86.Avx2.mm256_srlv_epi32(uints_v256, shift_v256);
        v256 anded_v256 = X86.Avx2.mm256_and_si256(shifted_right_v256, new v256(1u));

        // Pack the 8 per-corner results into the marching cubes case code.
        uint4 sets1 = new uint4(anded_v256.UInt0, anded_v256.UInt1, anded_v256.UInt2, anded_v256.UInt3);
        uint4 sets2 = new uint4(anded_v256.UInt4, anded_v256.UInt5, anded_v256.UInt6, anded_v256.UInt7);
        return math.bitmask(sets1 == 0) | (math.bitmask(sets2 == 0) << 4);
    } else {
        // Fallback used when AVX2 is not available.
        bool4 test = Load4(position, 0, baseIndex);
        bool4 test2 = Load4(position, 1, baseIndex);
        return math.bitmask(test) | (math.bitmask(test2) << 4);
    }
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool4 Load4(uint3 position, int selector, int baseIndex) {
    uint4 indices = (uint4)(offsets[selector] + new int4(baseIndex));

    bool4 hits = false;

    for (int i = 0; i < 4; i++) {
        int index = (int)indices[i];

        // Same decomposition as the AVX2 path: which uint, which bit.
        int component = index / 32;
        int shift = index % 32;

        uint batch = bits[component];

        hits[i] = ((batch >> shift) & 1U) == 1;
    }

    return hits;
}
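The AVX2 path is really just the usual bitfield index math done eight lanes at once: the right shift by 5 stands in for the division by 32, the AND with 31 stands in for the modulo, mm256_i32gather_epi32 fetches the eight packed uints, and mm256_srlv_epi32 applies a per-lane shift so each corner extracts its own bit. For reference, here is the same per-index bit test written out in plain scalar C# (the TestBit name is just for illustration):

// Scalar version of the bit test each gather lane performs; illustration only.
static bool TestBit(uint[] bits, int index) {
    int component = index >> 5; // index / 32: which packed uint holds the bit
    int shift = index & 31;     // index % 32: which bit inside that uint
    return ((bits[component] >> shift) & 1u) == 1;
}

The variable-shift mm256_srlv_epi32 is what makes this work; the immediate-shift variants shift every lane by the same amount.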