| // Copyright 2018 The Go Authors. All rights reserved. | |
| // Use of this source code is governed by a BSD-style | |
| // license that can be found in the LICENSE file. | |
| #include "go_asm.h" | |
| #include "textflag.h" | |
| // func Count(b []byte, c byte) int | |
| // input: | |
| // R0: b ptr | |
| // R1: b len | |
| // R2: b cap | |
| // R3: c byte to search | |
| // return: | |
| // R0: result | |
| TEXT 路Count<ABIInternal>(SB),NOSPLIT,$0-40 | |
| MOVD R3, R2 | |
| B 路CountString<ABIInternal>(SB) | |
| // func CountString(s string, c byte) int | |
| // input: | |
| // R0: s ptr | |
| // R1: s len | |
| // R2: c byte to search (due to ABIInternal upper bits can contain junk) | |
| // return: | |
| // R0: result | |
| TEXT 路CountString<ABIInternal>(SB),NOSPLIT,$0-32 | |
| // R11 = count of byte to search | |
| MOVD $0, R11 | |
| // short path to handle 0-byte case | |
| CBZ R1, done | |
| CMP $0x20, R1 | |
| // jump directly to head if length >= 32 | |
| BHS head | |
| tail: | |
| // Work with tail shorter than 32 bytes | |
| MOVBU.P 1(R0), R5 | |
| SUB $1, R1, R1 | |
| CMP R2.UXTB, R5 | |
| CINC EQ, R11, R11 | |
| CBNZ R1, tail | |
| done: | |
| MOVD R11, R0 | |
| RET | |
| PCALIGN $16 | |
| head: | |
| ANDS $0x1f, R0, R9 | |
| BEQ chunk | |
| // Work with not 32-byte aligned head | |
| BIC $0x1f, R0, R3 | |
| ADD $0x20, R3 | |
| PCALIGN $16 | |
| head_loop: | |
| MOVBU.P 1(R0), R5 | |
| CMP R2.UXTB, R5 | |
| CINC EQ, R11, R11 | |
| SUB $1, R1, R1 | |
| CMP R0, R3 | |
| BNE head_loop | |
| chunk: | |
| BIC $0x1f, R1, R9 | |
| // The first chunk can also be the last | |
| CBZ R9, tail | |
| // R3 = end of 32-byte chunks | |
| ADD R0, R9, R3 | |
| MOVD $1, R5 | |
| VMOV R5, V5.B16 | |
| // R1 = length of tail | |
| SUB R9, R1, R1 | |
| // Duplicate R2 (byte to search) to 16 1-byte elements of V0 | |
| VMOV R2, V0.B16 | |
| // Clear the low 64-bit element of V7 and V8 | |
| VEOR V7.B8, V7.B8, V7.B8 | |
| VEOR V8.B8, V8.B8, V8.B8 | |
| PCALIGN $16 | |
| // Count the target byte in 32-byte chunk | |
| chunk_loop: | |
| VLD1.P (R0), [V1.B16, V2.B16] | |
| CMP R0, R3 | |
| VCMEQ V0.B16, V1.B16, V3.B16 | |
| VCMEQ V0.B16, V2.B16, V4.B16 | |
| // Clear the higher 7 bits | |
| VAND V5.B16, V3.B16, V3.B16 | |
| VAND V5.B16, V4.B16, V4.B16 | |
| // Count lanes match the requested byte | |
| VADDP V4.B16, V3.B16, V6.B16 // 32B->16B | |
| VUADDLV V6.B16, V7 | |
| // Accumulate the count in low 64-bit element of V8 when inside the loop | |
| VADD V7, V8 | |
| BNE chunk_loop | |
| VMOV V8.D[0], R6 | |
| ADD R6, R11, R11 | |
| CBZ R1, done | |
| B tail | |