// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| |
#include "go_asm.h"
#include "textflag.h"
// func Compare(a, b []byte) int
// ABIInternal entry thunk: shuffles the slice-header registers into the
// layout cmpbody expects (SI=a_base, BX=a_len, DI=b_base, DX=b_len) and
// tail-jumps. NOTE: the symbol separator is the Unicode middle dot U+00B7,
// required by the Go assembler.
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = a_cap  (unused)
	// DI = b_base (want in DI)
	// SI = b_len  (want in DX)
	// R8 = b_cap  (unused)
	MOVQ	SI, DX		// b_len -> DX before SI is overwritten
	MOVQ	AX, SI		// a_base -> SI
	JMP	cmpbody<>(SB)
|
// func cmpstring(a, b string) int
// ABIInternal entry thunk for string comparison; same register shuffle as
// Compare but for two string headers (no cap words). NOTE: the symbol
// separator is the Unicode middle dot U+00B7, required by the Go assembler.
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = b_base (want in DI)
	// DI = b_len  (want in DX)
	MOVQ	AX, SI		// a_base -> SI
	MOVQ	DI, DX		// b_len -> DX before DI is overwritten
	MOVQ	CX, DI		// b_base -> DI
	JMP	cmpbody<>(SB)
|
// cmpbody compares the first min(alen, blen) bytes of a and b, then breaks
// ties by length, returning the usual three-way result.
//
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = output (-1/0/1)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame		// identical pointers: bytes are equal, compare lengths
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8		// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	// >= 64 bytes: choose the AVX2 path when the CPU supports it,
	// otherwise the unrolled SSE path.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop

	// 8..63 bytes: compare 16 bytes per iteration with SSE2.
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16		// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// Entry points from big_loop: advance to the 16-byte chunk that
	// contains the difference, then fall into diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX		// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX		// AX = 1 if a's byte > b's byte (unsigned)
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Overlapping load of the final 8 bytes; safe because R8 >= 8 here.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX		// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX		// index of highest bit difference
	SHRQ	CX, AX		// move a's bit to bottom
	ANDQ	$1, AX		// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
	JEQ	allsame		// zero bytes to compare: decide on lengths alone

	// load bytes of a into high bytes of SI
	CMPB	SI, $0xf8	// would an 8-byte load cross a page boundary?
	JA	si_high
	MOVQ	(SI), SI	// safe forward load
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI // backward load avoids touching the next page
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI		// keep only the R8 valid bytes, at the top

	// load bytes of b into high bytes of DI
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI		// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI		// find bit differences
	JEQ	allsame
	BSRQ	DI, CX		// index of highest bit difference
	SHRQ	CX, SI		// move a's bit to bottom
	ANDQ	$1, SI		// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	RET

	// All compared bytes equal: result is determined by the lengths.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX		// 1 if alen > blen
	SETEQ	CX		// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX // 1,0,-1 result
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop		// < 64 bytes left: finish in the 16-byte loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX	// convert EQ to NE
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX	// convert EQ to NE
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop
|