| |
| |
| |
|
|
| #include "go_asm.h" |
| #include "textflag.h" |
|
|
| |
| // ExpandAVX512 runs the AVX-512 expander selected by sizeClass on the |
| // data at packed and stores the 128-byte result at unpacked. |
| // |
| // Arguments (read from the frame below): |
| //   sizeClass+0(FP):  index into the gcExpandersAVX512 table |
| //   packed+8(FP):     passed to the expander in AX |
| //   unpacked+16(FP):  destination; bytes [0, 128) are written |
| // |
| // The expander's register contract is not visible in this file. Judging |
| // from its use here, it is expected to take its input pointer in AX and |
| // to leave its result in Z1 (first 64 bytes) and Z2 (second 64 bytes). |
| // |
| // NOTE(review): the "路" prefix on the symbol names looks like a |
| // mis-decoded middle dot ("·", U+00B7), which Go assembly uses for |
| // package-local symbols -- confirm the file's encoding. |
| TEXT 路ExpandAVX512(SB), NOSPLIT, $0-24 |
| MOVQ sizeClass+0(FP), CX |
| MOVQ packed+8(FP), AX |
|
|
| // Indirect call through the 8-byte table entry for this size class. |
| LEAQ 路gcExpandersAVX512(SB), BX |
| CALL (BX)(CX*8) |
|
|
| // The destination pointer is loaded only after the call; Z1 and Z2 are |
| // stored exactly as the expander left them. |
| MOVQ unpacked+16(FP), DI |
| VMOVDQU64 Z1, 0(DI) |
| VMOVDQU64 Z2, 64(DI) |
| // Zero the upper halves of the vector registers before returning. |
| VZEROUPPER |
| RET |
|
|
| // scanSpanPackedAVX512 gathers selected 8-byte words from the memory at |
| // mem into the buffer at bufp and returns the number of words gathered. |
| // |
| // Arguments and result (read from the frame below): |
| //   mem+0(FP):        base of the memory to scan; it is read with an |
| //                     aligned-load instruction (VMOVDQA64), so it must be |
| //                     64-byte aligned |
| //   bufp+8(FP):       base of the output buffer of 8-byte words |
| //   objMarks+16(FP):  passed to the size-class expander in AX |
| //   sizeClass+24(FP): index into the gcExpandersAVX512 table |
| //   ptrMask+32(FP):   128-byte bitmap ANDed with the expander's output |
| //   count+40(FP):     32-bit result, number of words gathered into bufp |
| // |
| // The 256-byte frame holds the 128-byte scan mask at 0(SP) and the 128 |
| // per-byte popcounts of that mask at 128(SP). |
| // |
| // The loop always runs 128 iterations, one 64-byte frame each, so it |
| // covers 8 KiB starting at mem. |
| // |
| // NOTE(review): every store to bufp writes a full 64 bytes even when |
| // fewer words are selected, so the buffer presumably needs slack past |
| // the last counted word -- confirm against the caller. |
| // |
| // NOTE(review): the "路" prefix on the symbol names looks like a |
| // mis-decoded middle dot ("·", U+00B7), which Go assembly uses for |
| // package-local symbols -- confirm the file's encoding. |
| TEXT 路scanSpanPackedAVX512(SB), NOSPLIT, $256-44 |
| // Z1+Z2 = output of the size-class expander applied to objMarks. |
| MOVQ objMarks+16(FP), AX |
| MOVQ sizeClass+24(FP), CX |
| LEAQ 路gcExpandersAVX512(SB), BX |
| CALL (BX)(CX*8) |
|
|
| // Z3+Z4 = the 128-byte pointer mask. |
| MOVQ ptrMask+32(FP), AX |
| VMOVDQU64 0(AX), Z3 |
| VMOVDQU64 64(AX), Z4 |
|
|
| // Z1+Z2 = scan mask = expander output AND pointer mask. |
| VPANDQ Z1, Z3, Z1 |
| VPANDQ Z2, Z4, Z2 |
|
|
| // Each bit of Z1+Z2 selects one 8-byte word (see the loop below), so |
| // each mask byte covers one 64-byte frame, exactly one Z register. |
| // |
| // Z3+Z4 = [128]uint8 of per-byte popcounts of the scan mask, i.e. the |
| // number of words selected in each 64-byte frame. |
| // |
| // VPOPCNTB requires the AVX512_BITALG extension. |
| VPOPCNTB Z1, Z3 |
| VPOPCNTB Z2, Z4 |
|
|
| // Spill the scan mask to 0(SP)..127(SP) and the popcounts to |
| // 128(SP)..255(SP) so the loop can read both one byte at a time |
| // through a single pointer (0(AX) and 128(AX)). |
| VMOVDQU64 Z1, 0(SP) |
| VMOVDQU64 Z2, 64(SP) |
| VMOVDQU64 Z3, 128(SP) |
| VMOVDQU64 Z4, 192(SP) |
|
|
| // SI = address of the current 64-byte frame in mem. |
| MOVQ mem+0(FP), SI |
| // DI = output buffer base. |
| MOVQ bufp+8(FP), DI |
| // DX = output index in words; (DI)(DX*8) is the next free slot. |
| MOVQ $0, DX |
|
|
| // AX = address of the current scan mask byte; 128(AX) is its popcount. |
| LEAQ 0(SP), AX |
|
|
| // BX = one past the last scan mask byte; the loop ends when AX |
| // reaches it. |
| LEAQ 128(SP), BX |
|
|
| // Align the loop entry to 64 bytes (one cache line on current x86-64 |
| // parts), presumably so that the performance of this tight loop does |
| // not depend on where the function happens to be placed in memory. |
| // |
| PCALIGN $64 |
| loop: |
| // CX = mask of the words to gather from this frame (one bit per word). |
| MOVBQZX 0(AX), CX |
| // Skip frames with no selected words. |
| TESTQ CX, CX |
| JZ skip |
|
|
| // K1 = word mask for this frame; Z1 = the 64-byte frame (aligned load). |
| KMOVB CX, K1 |
| VMOVDQA64 0(SI), Z1 |
|
|
| // Collect just the pointers from the greyed objects into the scan buffer, |
| // i.e., copy the word indices in the mask from Z1 into contiguous memory. |
| // |
| // The compress is merge-masked, so the lanes of Z2 above the selected |
| // count keep their previous contents. They are still written by the |
| // full-width store below but are not counted in DX. |
| // |
| // N.B. VPCOMPRESSQ supports a memory destination. Unfortunately, on |
| // AMD Genoa / Zen 4, using VPCOMPRESSQ with a memory destination |
| // imposes a severe performance penalty of around an order of magnitude |
| // compared to a register destination. |
| // |
| // This workaround is unfortunate on other microarchitectures, where a |
| // memory destination is slightly faster than adding an additional move |
| // instruction, but nowhere near an order of magnitude. It would be |
| // nice to have a Genoa-only variant here. |
| // |
| // AMD Turin / Zen 5 fixes this issue. |
| // |
| // See |
| // https://lemire.me/blog/2025/02/14/avx-512-gotcha-avoid-compressing-words-to-memory-with-amd-zen-4-processors/. |
| VPCOMPRESSQ Z1, K1, Z2 |
| VMOVDQU64 Z2, (DI)(DX*8) |
|
|
| // Advance the scan buffer position by the number of pointers gathered: |
| // 128(AX) is the popcount of the mask byte at 0(AX). |
| MOVBQZX 128(AX), CX |
| ADDQ CX, DX |
|
|
| skip: |
| // Move to the next 64-byte frame and the next mask byte; continue while |
| // AX is still below the end of the scan mask (unsigned compare). |
| ADDQ $64, SI |
| ADDQ $1, AX |
| CMPQ AX, BX |
| JB loop |
|
|
| end: |
| // Reached by falling out of the loop; nothing in this function jumps |
| // here. Return the low 32 bits of DX as the word count. |
| MOVL DX, count+40(FP) |
| VZEROUPPER |
| RET |
|
|