// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"
// func Count(b []byte, c byte) int
//
// Counts occurrences of the byte c in the slice b.
// Requires POPCNT; when the build does not guarantee it
// (hasPOPCNT undefined), falls back to the generic Go
// implementation after a runtime CPU-feature check.
TEXT ·Count(SB),NOSPLIT,$0-40
#ifndef hasPOPCNT
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)
	JMP	·countGeneric(SB)
#endif
	MOVQ	b_base+0(FP), SI	// SI = data pointer
	MOVQ	b_len+8(FP), BX		// BX = data length
	MOVB	c+24(FP), AL		// AL = byte to count
	LEAQ	ret+32(FP), R8		// R8 = address of result slot
	JMP	countbody<>(SB)		// tail call into the shared body
// func CountString(s string, c byte) int
//
// Counts occurrences of the byte c in the string s.
// Mirrors Count but with string-layout frame offsets
// (string header is 16 bytes, so c is at +16 and the
// result at +24). Same POPCNT requirement and fallback.
TEXT ·CountString(SB),NOSPLIT,$0-32
#ifndef hasPOPCNT
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)
	JMP	·countGenericString(SB)
#endif
	MOVQ	s_base+0(FP), SI	// SI = data pointer
	MOVQ	s_len+8(FP), BX		// BX = data length
	MOVB	c+16(FP), AL		// AL = byte to count
	LEAQ	ret+24(FP), R8		// R8 = address of result slot
	JMP	countbody<>(SB)		// tail call into the shared body
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// This function requires the POPCNT instruction.
TEXT countbody<>(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	MOVQ $0, R12 // Accumulator
	MOVQ SI, DI

	CMPQ BX, $64
	JAE avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	PCALIGN $16
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Count number of matching bytes
	POPCNTL DX, DX
	// Accumulate into R12
	ADDQ DX, R12
	// Advance to next block.
	ADDQ $16, DI
sseloopentry:
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ $15, BX
	// BUGFIX: the jump was missing its target. If the length is an
	// exact multiple of 16 the tail is empty and we must skip the
	// masked-tail pass (the mask math below would be wrong for BX=0).
	JZ end

	// Create mask to ignore overlap between previous 16 byte block
	// and the next.
	MOVQ $16,CX
	SUBQ BX, CX
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	ADDQ DX, R12
end:
	MOVQ R12, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask.
	MOVB BX, CX
	MOVQ $1, R10
	SALQ CL, R10
	SUBQ $1, R10

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ	$0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	MOVQ $16,CX
	SUBQ BX, CX
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10

	// Load data into the high end of X1.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
#ifndef hasAVX2
	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE sse
#endif
	MOVD AX, X0
	LEAQ -64(SI)(BX*1), R11	// R11 = address of last 64 bytes
	LEAQ (SI)(BX*1), R13	// R13 = one past the end of the data
	VPBROADCASTB  X0, Y1

	PCALIGN $32
avx2_loop:
	VMOVDQU (DI), Y2
	VMOVDQU 32(DI), Y4
	VPCMPEQB Y1, Y2, Y3
	VPCMPEQB Y1, Y4, Y5
	VPMOVMSKB Y3, DX
	VPMOVMSKB Y5, CX
	POPCNTL DX, DX
	POPCNTL CX, CX
	ADDQ DX, R12
	ADDQ CX, R12
	ADDQ $64, DI
	CMPQ DI, R11
	JLE avx2_loop

	// If last block is already processed,
	// skip to the end.
	//
	// This check is NOT an optimization; if the input length is a
	// multiple of 64, we must not go through the last leg of the
	// function because the bit shift count passed to SALQ below would
	// be 64, which is outside of the 0-63 range supported by those
	// instructions.
	//
	// Tests in the bytes and strings packages with input lengths that
	// are multiples of 64 will break if this condition were removed.
	CMPQ DI, R13
	JEQ endavx

	// Load address of the last 64 bytes.
	// There is an overlap with the previous block.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VMOVDQU 32(DI), Y4
	VPCMPEQB Y1, Y2, Y3
	VPCMPEQB Y1, Y4, Y5
	VPMOVMSKB Y3, DX
	VPMOVMSKB Y5, CX
	// Exit AVX mode.
	VZEROUPPER
	SALQ $32, CX
	ORQ CX, DX

	// Create mask to ignore overlap between previous 64 byte block
	// and the next.
	ANDQ $63, BX
	MOVQ $64, CX
	SUBQ BX, CX
	MOVQ $0xFFFFFFFFFFFFFFFF, R10
	SALQ CL, R10
	// Apply mask
	ANDQ R10, DX
	POPCNTQ DX, DX
	ADDQ DX, R12
	MOVQ R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ R12, (R8)
	RET