diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4eb64ffef42045c2050fbbd616ab6ef185f1a6e1 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,1184 @@ +# Stockfish, a UCI chess playing engine derived from Glaurung 2.1 +# Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) +# +# Stockfish is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Stockfish is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + +### ========================================================================== +### Section 1. General Configuration +### ========================================================================== +### Establish the operating system name +KERNEL := $(shell uname -s) +ifeq ($(KERNEL),Linux) + OS := $(shell uname -o) +endif + +### Command prefix to run the built executable (e.g. wine, sde, qemu) +### Backward compatible alias: WINE_PATH (deprecated) +ifneq ($(strip $(WINE_PATH)),) +ifeq ($(strip $(RUN_PREFIX)),) +RUN_PREFIX := $(WINE_PATH) +endif +ifeq ($(MAKELEVEL),0) +ifneq ($(strip $(RUN_PREFIX)),$(strip $(WINE_PATH))) +$(warning *** Both RUN_PREFIX and WINE_PATH are set; ignoring WINE_PATH. ***) +else +$(warning *** WINE_PATH is deprecated; use RUN_PREFIX instead. 
***) +endif +endif +endif + +### Target Windows OS +ifeq ($(OS),Windows_NT) + ifneq ($(COMP),ndk) + target_windows = yes + endif +else ifeq ($(COMP),mingw) + target_windows = yes + ifeq ($(RUN_PREFIX),) + RUN_PREFIX := $(shell which wine) + endif +endif + +### Executable name +ifeq ($(target_windows),yes) + EXE = stockfish.exe +else + EXE = stockfish +endif + +### Installation dir definitions +PREFIX = /usr/local +BINDIR = $(PREFIX)/bin + +### Built-in benchmark for pgo-builds +PGOBENCH = $(RUN_PREFIX) ./$(EXE) bench + +### Source and object files +SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \ + misc.cpp movegen.cpp movepick.cpp position.cpp \ + search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \ + nnue/nnue_accumulator.cpp nnue/nnue_misc.cpp nnue/network.cpp \ + nnue/features/half_ka_v2_hm.cpp nnue/features/full_threats.cpp \ + engine.cpp score.cpp memory.cpp + +HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h history.h \ + nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/features/full_threats.h \ + nnue/layers/affine_transform.h nnue/layers/affine_transform_sparse_input.h \ + nnue/layers/clipped_relu.h nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h \ + nnue/nnue_architecture.h nnue/nnue_common.h nnue/nnue_feature_transformer.h nnue/simd.h \ + position.h search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \ + tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h memory.h shm.h shm_linux.h + +OBJS = $(notdir $(SRCS:.cpp=.o)) + +VPATH = syzygy:nnue:nnue/features + +### ========================================================================== +### Section 2. 
High-level Configuration +### ========================================================================== +# +# flag --- Comp switch --- Description +# ---------------------------------------------------------------------------- +# +# debug = yes/no --- -DNDEBUG --- Enable/Disable debug mode +# sanitize = none/ ... (-fsanitize ) +# --- ( undefined ) --- enable undefined behavior checks +# --- ( thread ) --- enable threading error checks +# --- ( address ) --- enable memory access checks +# --- ...etc... --- see compiler documentation for supported sanitizers +# optimize = yes/no --- (-O3/-fast etc.) --- Enable/Disable optimizations +# arch = (name) --- (-arch) --- Target architecture +# bits = 64/32 --- -DIS_64BIT --- 64-/32-bit operating system +# prefetch = yes/no --- -DUSE_PREFETCH --- Use prefetch asm-instruction +# popcnt = yes/no --- -DUSE_POPCNT --- Use popcnt asm-instruction +# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction +# sse = yes/no --- -msse --- Use Intel Streaming SIMD Extensions +# mmx = yes/no --- -mmmx --- Use Intel MMX instructions +# sse2 = yes/no --- -msse2 --- Use Intel Streaming SIMD Extensions 2 +# ssse3 = yes/no --- -mssse3 --- Use Intel Supplemental Streaming SIMD Extensions 3 +# sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1 +# avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2 +# avxvnni = yes/no --- -mavxvnni --- Use Intel Vector Neural Network Instructions AVX +# avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512 +# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512 +# avx512icl = yes/no --- ... multiple ... 
--- Use All AVX-512 features available on both Intel Ice Lake and AMD Zen 4 +# altivec = yes/no --- -maltivec --- Use PowerPC Altivec SIMD extension +# vsx = yes/no --- -mvsx --- Use POWER VSX SIMD extension +# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture +# dotprod = yes/no --- -DUSE_NEON_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions +# lsx = yes/no --- -mlsx --- Use Loongson SIMD eXtension +# lasx = yes/no --- -mlasx --- use Loongson Advanced SIMD eXtension +# +# Note that Makefile is space sensitive, so when adding new architectures +# or modifying existing flags, you have to make sure there are no extra spaces +# at the end of the line for flag values. +# +# Example of use for these flags: +# make build ARCH=x86-64-avx512 debug=yes sanitize="address undefined" + + +### 2.1. General and architecture defaults + +ifeq ($(ARCH),) + ARCH = native +endif + +ifeq ($(ARCH), native) + override ARCH := $(shell $(SHELL) ../scripts/get_native_properties.sh | cut -d " " -f 1) +endif + +# explicitly check for the list of supported architectures (as listed with make help), +# the user can override with `make ARCH=x86-64-avx512icl SUPPORTED_ARCH=true` +ifeq ($(ARCH), $(filter $(ARCH), \ + x86-64-avx512icl x86-64-vnni512 x86-64-avx512 x86-64-avxvnni \ + x86-64-bmi2 x86-64-avx2 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \ + x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-64-altivec ppc-64-vsx ppc-32 e2k \ + armv7 armv7-neon armv8 armv8-dotprod apple-silicon general-64 general-32 riscv64 \ + loongarch64 loongarch64-lsx loongarch64-lasx)) + SUPPORTED_ARCH=true +else + SUPPORTED_ARCH=false +endif + +optimize = yes +debug = no +sanitize = none +bits = 64 +prefetch = no +popcnt = no +pext = no +sse = no +mmx = no +sse2 = no +ssse3 = no +sse41 = no +avx2 = no +avxvnni = no +avx512 = no +vnni512 = no +avx512icl = no +altivec = no +vsx = no +neon = no +dotprod = no +arm_version = 0 +lsx = no +lasx = no +STRIP = strip + 
+ifneq ($(shell which clang-format-20 2> /dev/null),) + CLANG-FORMAT = clang-format-20 +else + CLANG-FORMAT = clang-format +endif + +### 2.2 Architecture specific + +ifeq ($(findstring x86,$(ARCH)),x86) + +# x86-32/64 + +ifeq ($(findstring x86-32,$(ARCH)),x86-32) + arch = i386 + bits = 32 + sse = no + mmx = yes +else + arch = x86_64 + sse = yes + sse2 = yes +endif + +ifeq ($(findstring -sse,$(ARCH)),-sse) + sse = yes +endif + +ifeq ($(findstring -popcnt,$(ARCH)),-popcnt) + popcnt = yes +endif + +ifeq ($(findstring -mmx,$(ARCH)),-mmx) + mmx = yes +endif + +ifeq ($(findstring -sse2,$(ARCH)),-sse2) + sse = yes + sse2 = yes +endif + +ifeq ($(findstring -ssse3,$(ARCH)),-ssse3) + sse = yes + sse2 = yes + ssse3 = yes +endif + +ifeq ($(findstring -sse41,$(ARCH)),-sse41) + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes +endif + +ifeq ($(findstring -modern,$(ARCH)),-modern) + $(warning *** ARCH=$(ARCH) is deprecated, defaulting to ARCH=x86-64-sse41-popcnt. Execute `make help` for a list of available architectures. 
***) + $(shell sleep 5) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes +endif + +ifeq ($(findstring -avx2,$(ARCH)),-avx2) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes +endif + +ifeq ($(findstring -avxvnni,$(ARCH)),-avxvnni) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes + avxvnni = yes + pext = yes +endif + +ifeq ($(findstring -bmi2,$(ARCH)),-bmi2) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes + pext = yes +endif + +ifeq ($(findstring -avx512,$(ARCH)),-avx512) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes + pext = yes + avx512 = yes +endif + +ifeq ($(findstring -vnni512,$(ARCH)),-vnni512) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes + pext = yes + avx512 = yes + vnni512 = yes +endif + +ifeq ($(findstring -avx512icl,$(ARCH)),-avx512icl) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes + pext = yes + avx512 = yes + vnni512 = yes + avx512icl = yes +endif + +ifeq ($(sse),yes) + prefetch = yes +endif + +# 64-bit pext is not available on x86-32 +ifeq ($(bits),32) + pext = no +endif + +else + +# all other architectures + +ifeq ($(ARCH),general-32) + arch = any + bits = 32 +endif + +ifeq ($(ARCH),general-64) + arch = any +endif + +ifeq ($(ARCH),armv7) + arch = armv7 + prefetch = yes + bits = 32 + arm_version = 7 +endif + +ifeq ($(ARCH),armv7-neon) + arch = armv7 + prefetch = yes + popcnt = yes + neon = yes + bits = 32 + arm_version = 7 +endif + +ifeq ($(ARCH),armv8) + arch = armv8 + prefetch = yes + popcnt = yes + neon = yes + arm_version = 8 +endif + +ifeq ($(ARCH),armv8-dotprod) + arch = armv8 + prefetch = yes + popcnt = yes + neon = yes + dotprod = yes + arm_version = 8 +endif + +ifeq ($(ARCH),apple-silicon) + arch = arm64 + prefetch = yes + popcnt = yes + neon = yes + dotprod = yes + arm_version = 8 +endif + +ifeq ($(ARCH),ppc-32) + arch = ppc 
+ bits = 32 +endif + +ifeq ($(ARCH),ppc-64) + arch = ppc64 + popcnt = yes + prefetch = yes +endif + +ifeq ($(ARCH),ppc-64-altivec) + arch = ppc64 + popcnt = yes + prefetch = yes + altivec = yes +endif + +ifeq ($(ARCH),ppc-64-vsx) + arch = ppc64 + popcnt = yes + prefetch = yes + vsx = yes +endif + +ifeq ($(findstring e2k,$(ARCH)),e2k) + arch = e2k + mmx = yes + bits = 64 + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + popcnt = yes +endif + +ifeq ($(ARCH),riscv64) + arch = riscv64 +endif + +ifeq ($(findstring loongarch64,$(ARCH)),loongarch64) + arch = loongarch64 + prefetch = yes + +ifeq ($(findstring -lasx,$(ARCH)),-lasx) + lsx = yes + lasx = yes +endif + +ifeq ($(findstring -lsx,$(ARCH)),-lsx) + lsx = yes +endif + +endif +endif + + +### ========================================================================== +### Section 3. Low-level Configuration +### ========================================================================== + +### 3.1 Selecting compiler (default = gcc) +ifeq ($(MAKELEVEL),0) + export ENV_CXXFLAGS := $(CXXFLAGS) + export ENV_DEPENDFLAGS := $(DEPENDFLAGS) + export ENV_LDFLAGS := $(LDFLAGS) +endif + +CXXFLAGS = $(ENV_CXXFLAGS) -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) +DEPENDFLAGS = $(ENV_DEPENDFLAGS) -std=c++17 +LDFLAGS = $(ENV_LDFLAGS) $(EXTRALDFLAGS) + +ifeq ($(COMP),) + COMP=gcc +endif + +ifeq ($(COMP),gcc) + comp=gcc + CXX=g++ + CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-declarations + + ifeq ($(arch),$(filter $(arch),armv7 armv8 riscv64)) + ifeq ($(OS),Android) + CXXFLAGS += -m$(bits) + LDFLAGS += -m$(bits) + endif + ifeq ($(ARCH),riscv64) + CXXFLAGS += -latomic + endif + else ifeq ($(arch),loongarch64) + CXXFLAGS += -latomic + else + CXXFLAGS += -m$(bits) + LDFLAGS += -m$(bits) + endif + + ifeq ($(arch),$(filter $(arch),armv7)) + LDFLAGS += -latomic + endif + + ifneq ($(KERNEL),Darwin) + LDFLAGS += -Wl,--no-as-needed + endif +endif + +ifeq ($(target_windows),yes) + LDFLAGS += -static +endif + +ifeq 
($(COMP),mingw) + comp=mingw + + ifeq ($(bits),64) + ifeq ($(shell which x86_64-w64-mingw32-c++-posix 2> /dev/null),) + CXX=x86_64-w64-mingw32-c++ + else + CXX=x86_64-w64-mingw32-c++-posix + endif + else + ifeq ($(shell which i686-w64-mingw32-c++-posix 2> /dev/null),) + CXX=i686-w64-mingw32-c++ + else + CXX=i686-w64-mingw32-c++-posix + endif + endif + CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-declarations +endif + +ifeq ($(COMP),icx) + comp=icx + CXX=icpx + CXXFLAGS += --intel -pedantic -Wextra -Wshadow -Wmissing-prototypes \ + -Wconditional-uninitialized -Wabi -Wdeprecated +endif + +ifeq ($(COMP),clang) + comp=clang + CXX=clang++ + ifeq ($(target_windows),yes) + CXX=x86_64-w64-mingw32-clang++ + endif + + CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-prototypes \ + -Wconditional-uninitialized -flax-vector-conversions=none + + ifeq ($(filter $(KERNEL),Darwin OpenBSD FreeBSD),) + ifeq ($(target_windows),) + ifneq ($(RTLIB),compiler-rt) + LDFLAGS += -latomic + endif + endif + endif + + ifeq ($(arch),$(filter $(arch),armv7 armv8 riscv64)) + ifeq ($(OS),Android) + CXXFLAGS += -m$(bits) + LDFLAGS += -m$(bits) + endif + ifeq ($(ARCH),riscv64) + CXXFLAGS += -latomic + endif + else ifeq ($(arch),loongarch64) + CXXFLAGS += -latomic + else + CXXFLAGS += -m$(bits) + LDFLAGS += -m$(bits) + endif +endif + +ifeq ($(KERNEL),Darwin) + CXXFLAGS += -mmacosx-version-min=10.15 + LDFLAGS += -mmacosx-version-min=10.15 + ifneq ($(arch),any) + CXXFLAGS += -arch $(arch) + LDFLAGS += -arch $(arch) + endif + XCRUN = xcrun +endif + +# To cross-compile for Android, use NDK version r27c or later. 
+ifeq ($(COMP),ndk) + CXXFLAGS += -stdlib=libc++ + comp=clang + ifeq ($(arch),armv7) + CXX=armv7a-linux-androideabi29-clang++ + CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon + ifneq ($(shell which arm-linux-androideabi-strip 2>/dev/null),) + STRIP=arm-linux-androideabi-strip + else + STRIP=llvm-strip + endif + endif + ifeq ($(arch),armv8) + CXX=aarch64-linux-android29-clang++ + ifneq ($(shell which aarch64-linux-android-strip 2>/dev/null),) + STRIP=aarch64-linux-android-strip + else + STRIP=llvm-strip + endif + endif + ifeq ($(arch),x86_64) + CXX=x86_64-linux-android29-clang++ + ifneq ($(shell which x86_64-linux-android-strip 2>/dev/null),) + STRIP=x86_64-linux-android-strip + else + STRIP=llvm-strip + endif + endif + LDFLAGS += -static-libstdc++ +endif + +### Allow overwriting CXX from command line +ifdef COMPCXX + CXX=$(COMPCXX) +endif + +# llvm-profdata must be version compatible with the specified CXX (be it clang, or the gcc alias) +# make -j profile-build CXX=clang++-20 COMP=clang +# Locate the version in the same directory as the compiler used, +# with fallback to a generic one if it can't be located + LLVM_PROFDATA := $(dir $(realpath $(shell which $(CXX) 2> /dev/null)))llvm-profdata +# for icx +ifeq ($(wildcard $(LLVM_PROFDATA)),) + LLVM_PROFDATA := $(dir $(realpath $(shell which $(CXX) 2> /dev/null)))/compiler/llvm-profdata +endif +ifeq ($(wildcard $(LLVM_PROFDATA)),) + LLVM_PROFDATA := llvm-profdata +endif + +ifeq ($(comp),icx) + profile_make = icx-profile-make + profile_use = icx-profile-use +else ifeq ($(comp),clang) + profile_make = clang-profile-make + profile_use = clang-profile-use +else + profile_make = gcc-profile-make + profile_use = gcc-profile-use + ifeq ($(KERNEL),Darwin) + EXTRAPROFILEFLAGS = -fvisibility=hidden + endif +endif + +### Sometimes gcc is really clang +ifeq ($(COMP),gcc) + gccversion := $(shell $(CXX) --version 2>/dev/null) + gccisclang := $(findstring clang,$(gccversion)) + ifneq ($(gccisclang),) + 
profile_make = clang-profile-make + profile_use = clang-profile-use + else + CXXFLAGS += -Wstack-usage=128000 + endif +endif + +### On mingw use Windows threads, otherwise POSIX +ifneq ($(comp),mingw) + CXXFLAGS += -DUSE_PTHREADS + # On Android Bionic's C library comes with its own pthread implementation bundled in + ifneq ($(OS),Android) + # Haiku has pthreads in its libroot, so only link it in on other platforms + ifneq ($(KERNEL),Haiku) + ifneq ($(COMP),ndk) + LDFLAGS += -lpthread + + add_lrt = yes + ifeq ($(target_windows),yes) + add_lrt = no + endif + + ifeq ($(KERNEL),Darwin) + add_lrt = no + endif + + ifeq ($(add_lrt),yes) + LDFLAGS += -lrt + endif + endif + endif + endif +endif + +### 3.2.1 Debugging +ifeq ($(debug),no) + CXXFLAGS += -DNDEBUG +else + CXXFLAGS += -g + CXXFLAGS += -D_GLIBCXX_ASSERTIONS -D_GLIBCXX_DEBUG +endif + +### 3.2.2 Debugging with undefined behavior sanitizers +ifneq ($(sanitize),none) + CXXFLAGS += -g3 $(addprefix -fsanitize=,$(sanitize)) + LDFLAGS += $(addprefix -fsanitize=,$(sanitize)) +endif + +### 3.3 Optimization +ifeq ($(optimize),yes) + + CXXFLAGS += -O3 -funroll-loops + + ifeq ($(comp),gcc) + ifeq ($(OS), Android) + CXXFLAGS += -fno-gcse -mthumb -march=armv7-a -mfloat-abi=softfp + endif + endif + + ifeq ($(KERNEL),Darwin) + ifeq ($(comp),$(filter $(comp),clang icx)) + CXXFLAGS += -mdynamic-no-pic + endif + + ifeq ($(comp),gcc) + ifneq ($(arch),arm64) + CXXFLAGS += -mdynamic-no-pic + endif + endif + endif + + ifeq ($(comp),clang) + clangmajorversion := $(shell $(CXX) -dumpversion 2>/dev/null | cut -f1 -d.) 
+ ifeq ($(shell expr $(clangmajorversion) \< 16),1) + CXXFLAGS += -fexperimental-new-pass-manager + endif + endif +endif + +### 3.4 Bits +ifeq ($(bits),64) + CXXFLAGS += -DIS_64BIT +endif + +### 3.5 prefetch and popcount +ifeq ($(prefetch),yes) + ifeq ($(sse),yes) + CXXFLAGS += -msse + endif +else + CXXFLAGS += -DNO_PREFETCH +endif + +ifeq ($(popcnt),yes) + ifeq ($(arch),$(filter $(arch),ppc64 ppc64-altivec ppc64-vsx armv7 armv8 arm64)) + CXXFLAGS += -DUSE_POPCNT + else + CXXFLAGS += -msse3 -mpopcnt -DUSE_POPCNT + endif +endif + +### 3.6 SIMD architectures +ifeq ($(avx2),yes) + CXXFLAGS += -DUSE_AVX2 + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mavx2 -mbmi + endif +endif + +ifeq ($(avxvnni),yes) + CXXFLAGS += -DUSE_VNNI -DUSE_AVXVNNI + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mavxvnni + endif +endif + +ifeq ($(avx512),yes) + CXXFLAGS += -DUSE_AVX512 + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mavx512f -mavx512bw -mavx512dq -mavx512vl + endif +endif + +ifeq ($(vnni512),yes) + CXXFLAGS += -DUSE_VNNI + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl + endif +endif + +ifeq ($(avx512icl),yes) + CXXFLAGS += -DUSE_AVX512 -DUSE_VNNI -DUSE_AVX512ICL + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx512vpopcntdq -mavx512bitalg -mavx512vnni -mvpclmulqdq -mgfni -mvaes + endif +endif + +ifeq ($(sse41),yes) + CXXFLAGS += -DUSE_SSE41 + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -msse4.1 + endif +endif + +ifeq ($(ssse3),yes) + CXXFLAGS += -DUSE_SSSE3 + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mssse3 + endif +endif + +ifeq ($(sse2),yes) + CXXFLAGS += -DUSE_SSE2 + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -msse2 + endif +endif + +ifeq 
($(mmx),yes) + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mmmx + endif +endif + +ifeq ($(altivec),yes) + CXXFLAGS += -maltivec + ifeq ($(COMP),gcc) + CXXFLAGS += -mabi=altivec + endif +endif + +ifeq ($(vsx),yes) + CXXFLAGS += -mvsx + ifeq ($(COMP),gcc) + CXXFLAGS += -DNO_WARN_X86_INTRINSICS -DUSE_SSE2 + endif +endif + +ifeq ($(neon),yes) + CXXFLAGS += -DUSE_NEON=$(arm_version) + ifeq ($(KERNEL),Linux) + ifneq ($(COMP),ndk) + ifneq ($(arch),armv8) + CXXFLAGS += -mfpu=neon + endif + endif + endif +endif + +ifeq ($(dotprod),yes) + CXXFLAGS += -march=armv8.2-a+dotprod -DUSE_NEON_DOTPROD +endif + +ifeq ($(lasx),yes) + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mlasx + endif +endif + +ifeq ($(lsx),yes) + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mlsx + endif +endif + +### 3.7 pext +ifeq ($(pext),yes) + CXXFLAGS += -DUSE_PEXT + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mbmi2 + endif +endif + +### 3.8.1 Try to include git commit sha for versioning +GIT_SHA := $(shell git rev-parse HEAD 2>/dev/null | cut -c 1-8) +ifneq ($(GIT_SHA), ) + CXXFLAGS += -DGIT_SHA=$(GIT_SHA) +endif + +### 3.8.2 Try to include git commit date for versioning +GIT_DATE := $(shell git show -s --date=format:'%Y%m%d' --format=%cd HEAD 2>/dev/null) +ifneq ($(GIT_DATE), ) + CXXFLAGS += -DGIT_DATE=$(GIT_DATE) +endif + +### 3.8.3 Try to include architecture +ifneq ($(ARCH), ) + CXXFLAGS += -DARCH=$(ARCH) +endif + +### 3.9 Link Time Optimization +### This is a mix of compile and link time options because the lto link phase +### needs access to the optimization flags. 
+ifeq ($(optimize),yes) +ifeq ($(debug),no) + ifneq ($(KERNEL),Darwin) + LLD_BIN := $(shell command -v ld.lld 2>/dev/null) + ifeq ($(LLD_BIN),) + LLD_BIN := $(shell command -v lld 2>/dev/null) + endif + ifneq ($(LLD_BIN),) + ifeq ($(comp),clang) + LDFLAGS += -fuse-ld=lld + else ifeq ($(comp),gcc) + ifneq ($(gccisclang),) + LDFLAGS += -fuse-ld=lld + endif + endif + endif + endif + + ifeq ($(comp),$(filter $(comp),clang icx)) + CXXFLAGS += -flto=full + ifeq ($(comp),icx) + CXXFLAGS += -fwhole-program-vtables + endif + LDFLAGS += $(CXXFLAGS) + +# GCC and CLANG use different methods for parallelizing LTO and CLANG pretends to be +# GCC on some systems. + else ifeq ($(comp),gcc) + ifeq ($(gccisclang),) + CXXFLAGS += -flto -flto-partition=one + LDFLAGS += $(CXXFLAGS) -flto=jobserver + else + CXXFLAGS += -flto=full + LDFLAGS += $(CXXFLAGS) + endif + +# To use LTO and static linking on Windows, +# the tool chain requires gcc version 10.1 or later. + else ifeq ($(comp),mingw) + CXXFLAGS += -flto -flto-partition=one + LDFLAGS += $(CXXFLAGS) -save-temps + endif +endif +endif + +### 3.10 Android 5 can only run position independent executables. Note that this +### breaks Android 4.0 and earlier. +ifeq ($(OS), Android) + CXXFLAGS += -fPIE + LDFLAGS += -fPIE -pie +endif + +### 3.11 Inline settings +ifeq ($(optimize), yes) + ifeq ($(comp), clang) + CXXFLAGS += -Xclang -mllvm -Xclang -inline-threshold=500 + endif +endif + +### ========================================================================== +### Section 4. 
Public Targets +### ========================================================================== + +help: + @echo "" && \ + echo "To compile stockfish, type: " && \ + echo "" && \ + echo "make -j target [ARCH=arch] [COMP=compiler] [COMPCXX=cxx]" && \ + echo "" && \ + echo "Supported targets:" && \ + echo "" && \ + echo "help > Display architecture details" && \ + echo "profile-build > standard build with profile-guided optimization" && \ + echo "build > skip profile-guided optimization" && \ + echo "net > Download the default nnue nets" && \ + echo "strip > Strip executable" && \ + echo "install > Install executable" && \ + echo "clean > Clean up" && \ + echo "" && \ + echo "Supported archs:" && \ + echo "" && \ + echo "native > select the best architecture for the host processor (default)" && \ + echo "x86-64-avx512icl > x86 64-bit with minimum avx512 support of Intel Ice Lake or AMD Zen 4" && \ + echo "x86-64-vnni512 > x86 64-bit with vnni 512bit support" && \ + echo "x86-64-avx512 > x86 64-bit with avx512 support" && \ + echo "x86-64-avxvnni > x86 64-bit with vnni 256bit support" && \ + echo "x86-64-bmi2 > x86 64-bit with bmi2 support" && \ + echo "x86-64-avx2 > x86 64-bit with avx2 support" && \ + echo "x86-64-sse41-popcnt > x86 64-bit with sse41 and popcnt support" && \ + echo "x86-64-modern > deprecated, currently x86-64-sse41-popcnt" && \ + echo "x86-64-ssse3 > x86 64-bit with ssse3 support" && \ + echo "x86-64-sse3-popcnt > x86 64-bit with sse3 compile and popcnt support" && \ + echo "x86-64 > x86 64-bit generic (with sse2 support)" && \ + echo "x86-32-sse41-popcnt > x86 32-bit with sse41 and popcnt support" && \ + echo "x86-32-sse2 > x86 32-bit with sse2 support" && \ + echo "x86-32 > x86 32-bit generic (with mmx compile support)" && \ + echo "ppc-64 > PPC 64-bit" && \ + echo "ppc-64-altivec > PPC 64-bit with altivec support" && \ + echo "ppc-64-vsx > PPC 64-bit with vsx support" && \ + echo "ppc-32 > PPC 32-bit" && \ + echo "armv7 > ARMv7 32-bit" && \ + 
echo "armv7-neon > ARMv7 32-bit with popcnt and neon" && \ + echo "armv8 > ARMv8 64-bit with popcnt and neon" && \ + echo "armv8-dotprod > ARMv8 64-bit with popcnt, neon and dot product support" && \ + echo "e2k > Elbrus 2000" && \ + echo "apple-silicon > Apple silicon ARM64" && \ + echo "general-64 > unspecified 64-bit" && \ + echo "general-32 > unspecified 32-bit" && \ + echo "riscv64 > RISC-V 64-bit" && \ + echo "loongarch64 > LoongArch 64-bit" && \ + echo "loongarch64-lsx > LoongArch 64-bit with SIMD eXtension" && \ + echo "loongarch64-lasx > LoongArch 64-bit with Advanced SIMD eXtension" && \ + echo "" && \ + echo "Supported compilers:" && \ + echo "" && \ + echo "gcc > GNU compiler (default)" && \ + echo "mingw > GNU compiler with MinGW under Windows" && \ + echo "clang > LLVM Clang compiler" && \ + echo "icx > Intel oneAPI DPC++/C++ Compiler" && \ + echo "ndk > Google NDK to cross-compile for Android" && \ + echo "" && \ + echo "Simple examples. If you don't know what to do, you likely want to run one of: " && \ + echo "" && \ + echo "make -j profile-build ARCH=x86-64-avx2 # typically a fast compile for common systems " && \ + echo "make -j profile-build ARCH=x86-64-sse41-popcnt # A more portable compile for 64-bit systems " && \ + echo "make -j profile-build ARCH=x86-64 # A portable compile for 64-bit systems " && \ + echo "" && \ + echo "Advanced examples, for experienced users: " && \ + echo "" && \ + echo "make -j profile-build ARCH=x86-64-avxvnni" && \ + echo "make -j profile-build ARCH=x86-64-avxvnni COMP=gcc COMPCXX=g++-12.0" && \ + echo "make -j build ARCH=x86-64-ssse3 COMP=clang" && \ + echo "" +ifneq ($(SUPPORTED_ARCH), true) + @echo "Specify a supported architecture with the ARCH option for more details" + @echo "" +endif + + +.PHONY: help analyze build profile-build strip install clean net \ + objclean profileclean config-sanity \ + icx-profile-use icx-profile-make \ + gcc-profile-use gcc-profile-make \ + clang-profile-use clang-profile-make 
FORCE \ + format analyze + +analyze: net config-sanity objclean + $(MAKE) -k ARCH=$(ARCH) COMP=$(COMP) $(OBJS) + +build: net config-sanity + $(MAKE) ARCH=$(ARCH) COMP=$(COMP) all + +profile-build: net config-sanity objclean profileclean + @echo "" + @echo "Step 1/4. Building instrumented executable ..." + $(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) + @echo "" + @echo "Step 2/4. Running benchmark for pgo-build ..." + $(PGOBENCH) > PGOBENCH.out 2>&1 + tail -n 4 PGOBENCH.out + @echo "" + @echo "Step 3/4. Building optimized executable ..." + $(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean + $(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) + @echo "" + @echo "Step 4/4. Deleting profile data ..." + $(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean + +strip: + $(STRIP) $(EXE) + +install: + -mkdir -p -m 755 $(BINDIR) + -cp $(EXE) $(BINDIR) + $(STRIP) $(BINDIR)/$(EXE) + +# clean all +clean: objclean profileclean + @rm -f .depend *~ core + +# clean binaries and objects +objclean: + @rm -f stockfish stockfish.exe *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o + +# clean auxiliary profiling files +profileclean: + @rm -rf profdir + @rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s PGOBENCH.out + @rm -f stockfish.profdata *.profraw + @rm -f stockfish.*args* + @rm -f stockfish.*lt* + @rm -f stockfish.res + @rm -f ./-lstdc++.res + +# evaluation network (nnue) +net: + @$(SHELL) ../scripts/net.sh + +format: + $(CLANG-FORMAT) -i $(SRCS) $(HEADERS) -style=file + +### ========================================================================== +### Section 5. 
Private Targets +### ========================================================================== + +all: $(EXE) .depend + +config-sanity: net + @echo "" + @echo "Config:" && \ + echo "debug: '$(debug)'" && \ + echo "sanitize: '$(sanitize)'" && \ + echo "optimize: '$(optimize)'" && \ + echo "arch: '$(arch)'" && \ + echo "bits: '$(bits)'" && \ + echo "kernel: '$(KERNEL)'" && \ + echo "os: '$(OS)'" && \ + echo "prefetch: '$(prefetch)'" && \ + echo "popcnt: '$(popcnt)'" && \ + echo "pext: '$(pext)'" && \ + echo "sse: '$(sse)'" && \ + echo "mmx: '$(mmx)'" && \ + echo "sse2: '$(sse2)'" && \ + echo "ssse3: '$(ssse3)'" && \ + echo "sse41: '$(sse41)'" && \ + echo "avx2: '$(avx2)'" && \ + echo "avxvnni: '$(avxvnni)'" && \ + echo "avx512: '$(avx512)'" && \ + echo "vnni512: '$(vnni512)'" && \ + echo "avx512icl: '$(avx512icl)'" && \ + echo "altivec: '$(altivec)'" && \ + echo "vsx: '$(vsx)'" && \ + echo "neon: '$(neon)'" && \ + echo "dotprod: '$(dotprod)'" && \ + echo "arm_version: '$(arm_version)'" && \ + echo "lsx: '$(lsx)'" && \ + echo "lasx: '$(lasx)'" && \ + echo "target_windows: '$(target_windows)'" && \ + echo "" && \ + echo "Flags:" && \ + echo "CXX: $(CXX)" && \ + echo "CXXFLAGS: $(CXXFLAGS)" && \ + echo "LDFLAGS: $(LDFLAGS)" && \ + echo "" && \ + echo "Testing config sanity. If this fails, try 'make help' ..." 
&& \ + echo "" && \ + (test "$(debug)" = "yes" || test "$(debug)" = "no") && \ + (test "$(optimize)" = "yes" || test "$(optimize)" = "no") && \ + (test "$(SUPPORTED_ARCH)" = "true") && \ + (test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \ + test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || test "$(arch)" = "e2k" || \ + test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64" || \ + test "$(arch)" = "riscv64" || test "$(arch)" = "loongarch64") && \ + (test "$(bits)" = "32" || test "$(bits)" = "64") && \ + (test "$(prefetch)" = "yes" || test "$(prefetch)" = "no") && \ + (test "$(popcnt)" = "yes" || test "$(popcnt)" = "no") && \ + (test "$(pext)" = "yes" || test "$(pext)" = "no") && \ + (test "$(sse)" = "yes" || test "$(sse)" = "no") && \ + (test "$(mmx)" = "yes" || test "$(mmx)" = "no") && \ + (test "$(sse2)" = "yes" || test "$(sse2)" = "no") && \ + (test "$(ssse3)" = "yes" || test "$(ssse3)" = "no") && \ + (test "$(sse41)" = "yes" || test "$(sse41)" = "no") && \ + (test "$(avx2)" = "yes" || test "$(avx2)" = "no") && \ + (test "$(avx512)" = "yes" || test "$(avx512)" = "no") && \ + (test "$(vnni512)" = "yes" || test "$(vnni512)" = "no") && \ + (test "$(avx512icl)" = "yes" || test "$(avx512icl)" = "no") && \ + (test "$(altivec)" = "yes" || test "$(altivec)" = "no") && \ + (test "$(vsx)" = "yes" || test "$(vsx)" = "no") && \ + (test "$(neon)" = "yes" || test "$(neon)" = "no") && \ + (test "$(lsx)" = "yes" || test "$(lsx)" = "no") && \ + (test "$(lasx)" = "yes" || test "$(lasx)" = "no") && \ + (test "$(comp)" = "gcc" || test "$(comp)" = "icx" || test "$(comp)" = "mingw" || \ + test "$(comp)" = "clang" || test "$(comp)" = "armv7a-linux-androideabi16-clang" || \ + test "$(comp)" = "aarch64-linux-android21-clang") + +$(EXE): $(OBJS) + +$(CXX) -o $@ $(OBJS) $(LDFLAGS) + +# Force recompilation to ensure version info is up-to-date +misc.o: FORCE +FORCE: + +clang-profile-make: + $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ + 
EXTRACXXFLAGS='-fprofile-generate ' \ + EXTRALDFLAGS=' -fprofile-generate' \ + all + +clang-profile-use: + $(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw + $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ + EXTRACXXFLAGS='-fprofile-use=stockfish.profdata' \ + EXTRALDFLAGS='-fprofile-use ' \ + all + +gcc-profile-make: + @mkdir -p profdir + $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ + EXTRACXXFLAGS='-fprofile-generate=profdir' \ + EXTRACXXFLAGS+=$(EXTRAPROFILEFLAGS) \ + EXTRALDFLAGS='-lgcov' \ + all + +gcc-profile-use: + $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ + EXTRACXXFLAGS='-fprofile-use=profdir -fno-peel-loops -fno-tracer' \ + EXTRACXXFLAGS+=$(EXTRAPROFILEFLAGS) \ + EXTRALDFLAGS='-lgcov' \ + all + +icx-profile-make: + $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ + EXTRACXXFLAGS='-fprofile-instr-generate ' \ + EXTRALDFLAGS=' -fprofile-instr-generate' \ + all + +icx-profile-use: + $(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw + $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ + EXTRACXXFLAGS='-fprofile-instr-use=stockfish.profdata' \ + EXTRALDFLAGS='-fprofile-use ' \ + all + +.depend: $(SRCS) + -@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null + +ifeq (, $(filter $(MAKECMDGOALS), help strip install clean net objclean profileclean format config-sanity)) +-include .depend +endif diff --git a/src/benchmark.cpp b/src/benchmark.cpp new file mode 100644 index 0000000000000000000000000000000000000000..03bf10ae1cb3ffd95c8e087d22e5deb0f4674fdc --- /dev/null +++ b/src/benchmark.cpp @@ -0,0 +1,516 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "benchmark.h" +#include "numa.h" + +#include +#include +#include +#include + +namespace { + +// clang-format off +const std::vector Defaults = { + "setoption name UCI_Chess960 value false", + "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1", + "r3k2r/p1ppqpb1/bn2pnp1/3PN3/1p2P3/2N2Q1p/PPPBBPPP/R3K2R w KQkq - 0 10", + "8/2p5/3p4/KP5r/1R3p1k/8/4P1P1/8 w - - 0 11", + "4rrk1/pp1n3p/3q2pQ/2p1pb2/2PP4/2P3N1/P2B2PP/4RRK1 b - - 7 19", + "rq3rk1/ppp2ppp/1bnpb3/3N2B1/3NP3/7P/PPPQ1PP1/2KR3R w - - 7 14 moves d4e6", + "r1bq1r1k/1pp1n1pp/1p1p4/4p2Q/4Pp2/1BNP4/PPP2PPP/3R1RK1 w - - 2 14 moves g2g4", + "r3r1k1/2p2ppp/p1p1bn2/8/1q2P3/2NPQN2/PPP3PP/R4RK1 b - - 2 15", + "r1bbk1nr/pp3p1p/2n5/1N4p1/2Np1B2/8/PPP2PPP/2KR1B1R w kq - 0 13", + "r1bq1rk1/ppp1nppp/4n3/3p3Q/3P4/1BP1B3/PP1N2PP/R4RK1 w - - 1 16", + "4r1k1/r1q2ppp/ppp2n2/4P3/5Rb1/1N1BQ3/PPP3PP/R5K1 w - - 1 17", + "2rqkb1r/ppp2p2/2npb1p1/1N1Nn2p/2P1PP2/8/PP2B1PP/R1BQK2R b KQ - 0 11", + "r1bq1r1k/b1p1npp1/p2p3p/1p6/3PP3/1B2NN2/PP3PPP/R2Q1RK1 w - - 1 16", + "3r1rk1/p5pp/bpp1pp2/8/q1PP1P2/b3P3/P2NQRPP/1R2B1K1 b - - 6 22", + "r1q2rk1/2p1bppp/2Pp4/p6b/Q1PNp3/4B3/PP1R1PPP/2K4R w - - 2 18", + "4k2r/1pb2ppp/1p2p3/1R1p4/3P4/2r1PN2/P4PPP/1R4K1 b - - 3 22", + "3q2k1/pb3p1p/4pbp1/2r5/PpN2N2/1P2P2P/5PP1/Q2R2K1 b - - 4 26", + "6k1/6p1/6Pp/ppp5/3pn2P/1P3K2/1PP2P2/3N4 b - - 0 1", + "3b4/5kp1/1p1p1p1p/pP1PpP1P/P1P1P3/3KN3/8/8 w - - 0 1", + "2K5/p7/7P/5pR1/8/5k2/r7/8 w - - 0 1 moves g5g6 f3e3 g6g5 e3f3", + "8/6pk/1p6/8/PP3p1p/5P2/4KP1q/3Q4 w - - 0 1", + "7k/3p2pp/4q3/8/4Q3/5Kp1/P6b/8 w - - 0 1", + "8/2p5/8/2kPKp1p/2p4P/2P5/3P4/8 w - - 0 1", + "8/1p3pp1/7p/5P1P/2k3P1/8/2K2P2/8 w - 
- 0 1", + "8/pp2r1k1/2p1p3/3pP2p/1P1P1P1P/P5KR/8/8 w - - 0 1", + "8/3p4/p1bk3p/Pp6/1Kp1PpPp/2P2P1P/2P5/5B2 b - - 0 1", + "5k2/7R/4P2p/5K2/p1r2P1p/8/8/8 b - - 0 1", + "6k1/6p1/P6p/r1N5/5p2/7P/1b3PP1/4R1K1 w - - 0 1", + "1r3k2/4q3/2Pp3b/3Bp3/2Q2p2/1p1P2P1/1P2KP2/3N4 w - - 0 1", + "6k1/4pp1p/3p2p1/P1pPb3/R7/1r2P1PP/3B1P2/6K1 w - - 0 1", + "8/3p3B/5p2/5P2/p7/PP5b/k7/6K1 w - - 0 1", + "5rk1/q6p/2p3bR/1pPp1rP1/1P1Pp3/P3B1Q1/1K3P2/R7 w - - 93 90", + "4rrk1/1p1nq3/p7/2p1P1pp/3P2bp/3Q1Bn1/PPPB4/1K2R1NR w - - 40 21", + "r3k2r/3nnpbp/q2pp1p1/p7/Pp1PPPP1/4BNN1/1P5P/R2Q1RK1 w kq - 0 16", + "3Qb1k1/1r2ppb1/pN1n2q1/Pp1Pp1Pr/4P2p/4BP2/4B1R1/1R5K b - - 11 40", + "4k3/3q1r2/1N2r1b1/3ppN2/2nPP3/1B1R2n1/2R1Q3/3K4 w - - 5 1", + "1r6/1P4bk/3qr1p1/N6p/3pp2P/6R1/3Q1PP1/1R4K1 w - - 1 42", + + // Positions with high numbers of changed threats + "k7/2n1n3/1nbNbn2/2NbRBn1/1nbRQR2/2NBRBN1/3N1N2/7K w - - 0 1", + "K7/8/8/BNQNQNB1/N5N1/R1Q1q2r/n5n1/bnqnqnbk w - - 0 1", + + // 5-man positions + "8/8/8/8/5kp1/P7/8/1K1N4 w - - 0 1", // Kc2 - mate + "8/8/8/5N2/8/p7/8/2NK3k w - - 0 1", // Na2 - mate + "8/3k4/8/8/8/4B3/4KB2/2B5 w - - 0 1", // draw + + // 6-man positions + "8/8/1P6/5pr1/8/4R3/7k/2K5 w - - 0 1", // Re5 - mate + "8/2p4P/8/kr6/6R1/8/8/1K6 w - - 0 1", // Ka2 - mate + "8/8/3P3k/8/1p6/8/1P6/1K3n2 b - - 0 1", // Nd2 - draw + + // 7-man positions + "8/R7/2q5/8/6k1/8/1P5p/K6R w - - 0 124", // Draw + + // Mate and stalemate positions + "6k1/3b3r/1p1p4/p1n2p2/1PPNpP1q/P3Q1p1/1R1RB1P1/5K2 b - - 0 1", + "r2r1n2/pp2bk2/2p1p2p/3q4/3PN1QP/2P3R1/P4PP1/5RK1 w - - 0 1", + "8/8/8/8/8/6k1/6p1/6K1 w - -", + "7k/7P/6K1/8/3B4/8/8/8 b - -", + + // Chess 960 + "setoption name UCI_Chess960 value true", + "bbqnnrkr/pppppppp/8/8/8/8/PPPPPPPP/BBQNNRKR w HFhf - 0 1 moves g2g3 d7d5 d2d4 c8h3 c1g5 e8d6 g5e7 f7f6", + "nqbnrkrb/pppppppp/8/8/8/8/PPPPPPPP/NQBNRKRB w KQkq - 0 1", + "setoption name UCI_Chess960 value false" +}; +// clang-format on + +// clang-format off +// human-randomly picked 5 games with <60 moves from 
+// https://tests.stockfishchess.org/tests/view/665c71f9fd45fb0f907c21e0 +// only moves for one side +const std::vector> BenchmarkPositions = { + { + "rnbq1k1r/ppp1bppp/4pn2/8/2B5/2NP1N2/PPP2PPP/R1BQR1K1 b - - 2 8", + "rnbq1k1r/pp2bppp/4pn2/2p5/2B2B2/2NP1N2/PPP2PPP/R2QR1K1 b - - 1 9", + "r1bq1k1r/pp2bppp/2n1pn2/2p5/2B1NB2/3P1N2/PPP2PPP/R2QR1K1 b - - 3 10", + "r1bq1k1r/pp2bppp/2n1p3/2p5/2B1PB2/5N2/PPP2PPP/R2QR1K1 b - - 0 11", + "r1b2k1r/pp2bppp/2n1p3/2p5/2B1PB2/5N2/PPP2PPP/3RR1K1 b - - 0 12", + "r1b1k2r/pp2bppp/2n1p3/2p5/2B1PB2/2P2N2/PP3PPP/3RR1K1 b - - 0 13", + "r1b1k2r/1p2bppp/p1n1p3/2p5/4PB2/2P2N2/PP2BPPP/3RR1K1 b - - 1 14", + "r1b1k2r/4bppp/p1n1p3/1pp5/P3PB2/2P2N2/1P2BPPP/3RR1K1 b - - 0 15", + "r1b1k2r/4bppp/p1n1p3/1P6/2p1PB2/2P2N2/1P2BPPP/3RR1K1 b - - 0 16", + "r1b1k2r/4bppp/2n1p3/1p6/2p1PB2/1PP2N2/4BPPP/3RR1K1 b - - 0 17", + "r3k2r/3bbppp/2n1p3/1p6/2P1PB2/2P2N2/4BPPP/3RR1K1 b - - 0 18", + "r3k2r/3bbppp/2n1p3/8/1pP1P3/2P2N2/3BBPPP/3RR1K1 b - - 1 19", + "1r2k2r/3bbppp/2n1p3/8/1pPNP3/2P5/3BBPPP/3RR1K1 b - - 3 20", + "1r2k2r/3bbppp/2n1p3/8/2PNP3/2B5/4BPPP/3RR1K1 b - - 0 21", + "1r2k2r/3bb1pp/2n1pp2/1N6/2P1P3/2B5/4BPPP/3RR1K1 b - - 1 22", + "1r2k2r/3b2pp/2n1pp2/1N6/1BP1P3/8/4BPPP/3RR1K1 b - - 0 23", + "1r2k2r/3b2pp/4pp2/1N6/1nP1P3/8/3RBPPP/4R1K1 b - - 1 24", + "1r5r/3bk1pp/4pp2/1N6/1nP1PP2/8/3RB1PP/4R1K1 b - - 0 25", + "1r5r/3bk1pp/2n1pp2/1N6/2P1PP2/8/3RBKPP/4R3 b - - 2 26", + "1r5r/3bk1pp/2n2p2/1N2p3/2P1PP2/6P1/3RBK1P/4R3 b - - 0 27", + "1r1r4/3bk1pp/2n2p2/1N2p3/2P1PP2/6P1/3RBK1P/R7 b - - 2 28", + "1r1r4/N3k1pp/2n1bp2/4p3/2P1PP2/6P1/3RBK1P/R7 b - - 4 29", + "1r1r4/3bk1pp/2N2p2/4p3/2P1PP2/6P1/3RBK1P/R7 b - - 0 30", + "1r1R4/4k1pp/2b2p2/4p3/2P1PP2/6P1/4BK1P/R7 b - - 0 31", + "3r4/4k1pp/2b2p2/4P3/2P1P3/6P1/4BK1P/R7 b - - 0 32", + "3r4/R3k1pp/2b5/4p3/2P1P3/6P1/4BK1P/8 b - - 1 33", + "8/3rk1pp/2b5/R3p3/2P1P3/6P1/4BK1P/8 b - - 3 34", + "8/3r2pp/2bk4/R1P1p3/4P3/6P1/4BK1P/8 b - - 0 35", + "8/2kr2pp/2b5/R1P1p3/4P3/4K1P1/4B2P/8 b - - 2 36", + 
"1k6/3r2pp/2b5/RBP1p3/4P3/4K1P1/7P/8 b - - 4 37", + "8/1k1r2pp/2b5/R1P1p3/4P3/3BK1P1/7P/8 b - - 6 38", + "1k6/3r2pp/2b5/2P1p3/4P3/3BK1P1/7P/R7 b - - 8 39", + "1k6/r5pp/2b5/2P1p3/4P3/3BK1P1/7P/5R2 b - - 10 40", + "1k3R2/6pp/2b5/2P1p3/4P3/r2BK1P1/7P/8 b - - 12 41", + "5R2/2k3pp/2b5/2P1p3/4P3/r2B2P1/3K3P/8 b - - 14 42", + "5R2/2k3pp/2b5/2P1p3/4P3/3BK1P1/r6P/8 b - - 16 43", + "5R2/2k3pp/2b5/2P1p3/4P3/r2B2P1/4K2P/8 b - - 18 44", + "5R2/2k3pp/2b5/2P1p3/4P3/3B1KP1/r6P/8 b - - 20 45", + "8/2k2Rpp/2b5/2P1p3/4P3/r2B1KP1/7P/8 b - - 22 46", + "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/4K2P/8 b - - 24 47", + "3k4/5Rpp/2b5/2P1p3/4P3/3B1KP1/r6P/8 b - - 26 48", + "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/4K2P/8 b - - 28 49", + "3k4/5Rpp/2b5/2P1p3/4P3/3BK1P1/r6P/8 b - - 30 50", + "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/3K3P/8 b - - 32 51", + "3k4/5Rpp/2b5/2P1p3/4P3/2KB2P1/r6P/8 b - - 34 52", + "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/2K4P/8 b - - 36 53", + "3k4/5Rpp/2b5/2P1p3/4P3/1K1B2P1/r6P/8 b - - 38 54", + "3k4/6Rp/2b5/2P1p3/4P3/1K1B2P1/7r/8 b - - 0 55", + "3k4/8/2b3Rp/2P1p3/4P3/1K1B2P1/7r/8 b - - 1 56", + "8/2k3R1/2b4p/2P1p3/4P3/1K1B2P1/7r/8 b - - 3 57", + "3k4/8/2b3Rp/2P1p3/4P3/1K1B2P1/7r/8 b - - 5 58", + "8/2k5/2b3Rp/2P1p3/1K2P3/3B2P1/7r/8 b - - 7 59", + "8/2k5/2b3Rp/2P1p3/4P3/2KB2P1/3r4/8 b - - 9 60", + "8/2k5/2b3Rp/2P1p3/1K2P3/3B2P1/6r1/8 b - - 11 61", + "8/2k5/2b3Rp/2P1p3/4P3/2KB2P1/3r4/8 b - - 13 62", + "8/2k5/2b3Rp/2P1p3/2K1P3/3B2P1/6r1/8 b - - 15 63", + "4b3/2k3R1/7p/2P1p3/2K1P3/3B2P1/6r1/8 b - - 17 64", + }, + { + "r1bqkbnr/npp1pppp/p7/3P4/4pB2/2N5/PPP2PPP/R2QKBNR w KQkq - 1 6", + "r1bqkb1r/npp1pppp/p4n2/3P4/4pB2/2N5/PPP1QPPP/R3KBNR w KQkq - 3 7", + "r2qkb1r/npp1pppp/p4n2/3P1b2/4pB2/2N5/PPP1QPPP/2KR1BNR w kq - 5 8", + "r2qkb1r/1pp1pppp/p4n2/1n1P1b2/4pB2/2N4P/PPP1QPP1/2KR1BNR w kq - 1 9", + "r2qkb1r/1pp1pppp/5n2/1p1P1b2/4pB2/7P/PPP1QPP1/2KR1BNR w kq - 0 10", + "r2qkb1r/1ppbpppp/5n2/1Q1P4/4pB2/7P/PPP2PP1/2KR1BNR w kq - 1 11", + "3qkb1r/1Qpbpppp/5n2/3P4/4pB2/7P/rPP2PP1/2KR1BNR w k - 0 12", + 
"q3kb1r/1Qpbpppp/5n2/3P4/4pB2/7P/rPP2PP1/1K1R1BNR w k - 2 13", + "r3kb1r/2pbpppp/5n2/3P4/4pB2/7P/1PP2PP1/1K1R1BNR w k - 0 14", + "r3kb1r/2Bb1ppp/4pn2/3P4/4p3/7P/1PP2PP1/1K1R1BNR w k - 0 15", + "r3kb1r/2Bb2pp/4pn2/8/4p3/7P/1PP2PP1/1K1R1BNR w k - 0 16", + "r3k2r/2Bb2pp/4pn2/2b5/4p3/7P/1PP1NPP1/1K1R1B1R w k - 2 17", + "r6r/2Bbk1pp/4pn2/2b5/3Np3/7P/1PP2PP1/1K1R1B1R w - - 4 18", + "r6r/b2bk1pp/4pn2/4B3/3Np3/7P/1PP2PP1/1K1R1B1R w - - 6 19", + "r1r5/b2bk1pp/4pn2/4B3/2BNp3/7P/1PP2PP1/1K1R3R w - - 8 20", + "r7/b2bk1pp/4pn2/2r1B3/2BNp3/1P5P/2P2PP1/1K1R3R w - - 1 21", + "rb6/3bk1pp/4pn2/2r1B3/2BNpP2/1P5P/2P3P1/1K1R3R w - - 1 22", + "1r6/3bk1pp/4pn2/2r5/2BNpP2/1P5P/2P3P1/1K1R3R w - - 0 23", + "1r6/3bk1p1/4pn1p/2r5/2BNpP2/1P5P/2P3P1/2KR3R w - - 0 24", + "8/3bk1p1/1r2pn1p/2r5/2BNpP1P/1P6/2P3P1/2KR3R w - - 1 25", + "8/3bk3/1r2pnpp/2r5/2BNpP1P/1P6/2P3P1/2K1R2R w - - 0 26", + "2b5/4k3/1r2pnpp/2r5/2BNpP1P/1P4P1/2P5/2K1R2R w - - 1 27", + "8/1b2k3/1r2pnpp/2r5/2BNpP1P/1P4P1/2P5/2K1R1R1 w - - 3 28", + "8/1b1nk3/1r2p1pp/2r5/2BNpPPP/1P6/2P5/2K1R1R1 w - - 1 29", + "8/1b2k3/1r2p1pp/2r1nP2/2BNp1PP/1P6/2P5/2K1R1R1 w - - 1 30", + "8/1b2k3/1r2p1p1/2r1nPp1/2BNp2P/1P6/2P5/2K1R1R1 w - - 0 31", + "8/1b2k3/1r2p1n1/2r3p1/2BNp2P/1P6/2P5/2K1R1R1 w - - 0 32", + "8/1b2k3/1r2p1n1/6r1/2BNp2P/1P6/2P5/2K1R3 w - - 0 33", + "8/1b2k3/1r2p3/4n1P1/2BNp3/1P6/2P5/2K1R3 w - - 1 34", + "8/1b2k3/1r2p3/4n1P1/2BN4/1P2p3/2P5/2K4R w - - 0 35", + "8/1b2k3/1r2p2R/6P1/2nN4/1P2p3/2P5/2K5 w - - 0 36", + "8/1b2k3/3rp2R/6P1/2PN4/4p3/2P5/2K5 w - - 1 37", + "8/4k3/3rp2R/6P1/2PN4/2P1p3/6b1/2K5 w - - 1 38", + "8/4k3/r3p2R/2P3P1/3N4/2P1p3/6b1/2K5 w - - 1 39", + "8/3k4/r3p2R/2P2NP1/8/2P1p3/6b1/2K5 w - - 3 40", + "8/3k4/4p2R/2P3P1/8/2P1N3/6b1/r1K5 w - - 1 41", + "8/3k4/4p2R/2P3P1/8/2P1N3/3K2b1/6r1 w - - 3 42", + "8/3k4/4p2R/2P3P1/8/2PKNb2/8/6r1 w - - 5 43", + "8/4k3/4p1R1/2P3P1/8/2PKNb2/8/6r1 w - - 7 44", + "8/4k3/4p1R1/2P3P1/3K4/2P1N3/8/6rb w - - 9 45", + "8/3k4/4p1R1/2P1K1P1/8/2P1N3/8/6rb w - - 11 46", + 
"8/3k4/4p1R1/2P3P1/5K2/2P1N3/8/4r2b w - - 13 47", + "8/3k4/2b1p2R/2P3P1/5K2/2P1N3/8/4r3 w - - 15 48", + "8/3k4/2b1p3/2P3P1/5K2/2P1N2R/8/6r1 w - - 17 49", + "2k5/7R/2b1p3/2P3P1/5K2/2P1N3/8/6r1 w - - 19 50", + "2k5/7R/4p3/2P3P1/b1P2K2/4N3/8/6r1 w - - 1 51", + "2k5/3bR3/4p3/2P3P1/2P2K2/4N3/8/6r1 w - - 3 52", + "3k4/3b2R1/4p3/2P3P1/2P2K2/4N3/8/6r1 w - - 5 53", + "3kb3/6R1/4p1P1/2P5/2P2K2/4N3/8/6r1 w - - 1 54", + "3kb3/6R1/4p1P1/2P5/2P2KN1/8/8/2r5 w - - 3 55", + "3kb3/6R1/4p1P1/2P1N3/2P2K2/8/8/5r2 w - - 5 56", + "3kb3/6R1/4p1P1/2P1N3/2P5/4K3/8/4r3 w - - 7 57", + }, + { + "rnbq1rk1/ppp1npb1/4p1p1/3P3p/3PP3/2N2N2/PP2BPPP/R1BQ1RK1 b - - 0 8", + "rnbq1rk1/ppp1npb1/6p1/3pP2p/3P4/2N2N2/PP2BPPP/R1BQ1RK1 b - - 0 9", + "rn1q1rk1/ppp1npb1/6p1/3pP2p/3P2b1/2N2N2/PP2BPPP/R1BQR1K1 b - - 2 10", + "r2q1rk1/ppp1npb1/2n3p1/3pP2p/3P2bN/2N5/PP2BPPP/R1BQR1K1 b - - 4 11", + "r4rk1/pppqnpb1/2n3p1/3pP2p/3P2bN/2N4P/PP2BPP1/R1BQR1K1 b - - 0 12", + "r4rk1/pppqnpb1/2n3p1/3pP2p/3P3N/7P/PP2NPP1/R1BQR1K1 b - - 0 13", + "r4rk1/pppq1pb1/2n3p1/3pPN1p/3P4/7P/PP2NPP1/R1BQR1K1 b - - 0 14", + "r4rk1/ppp2pb1/2n3p1/3pPq1p/3P1N2/7P/PP3PP1/R1BQR1K1 b - - 1 15", + "r4rk1/pppq1pb1/2n3p1/3pP2p/P2P1N2/7P/1P3PP1/R1BQR1K1 b - - 0 16", + "r2n1rk1/pppq1pb1/6p1/3pP2p/P2P1N2/R6P/1P3PP1/2BQR1K1 b - - 2 17", + "r4rk1/pppq1pb1/4N1p1/3pP2p/P2P4/R6P/1P3PP1/2BQR1K1 b - - 0 18", + "r4rk1/ppp2pb1/4q1p1/3pP1Bp/P2P4/R6P/1P3PP1/3QR1K1 b - - 1 19", + "r3r1k1/ppp2pb1/4q1p1/3pP1Bp/P2P1P2/R6P/1P4P1/3QR1K1 b - - 0 20", + "r3r1k1/ppp3b1/4qpp1/3pP2p/P2P1P1B/R6P/1P4P1/3QR1K1 b - - 1 21", + "r3r1k1/ppp3b1/4q1p1/3pP2p/P4P1B/R6P/1P4P1/3QR1K1 b - - 0 22", + "r4rk1/ppp3b1/4q1p1/3pP1Bp/P4P2/R6P/1P4P1/3QR1K1 b - - 2 23", + "r4rk1/pp4b1/4q1p1/2ppP1Bp/P4P2/3R3P/1P4P1/3QR1K1 b - - 1 24", + "r4rk1/pp4b1/4q1p1/2p1P1Bp/P2p1PP1/3R3P/1P6/3QR1K1 b - - 0 25", + "r4rk1/pp4b1/4q1p1/2p1P1B1/P2p1PP1/3R4/1P6/3QR1K1 b - - 0 26", + "r5k1/pp3rb1/4q1p1/2p1P1B1/P2p1PP1/6R1/1P6/3QR1K1 b - - 2 27", + "5rk1/pp3rb1/4q1p1/2p1P1B1/P2pRPP1/6R1/1P6/3Q2K1 b - - 4 28", + 
"5rk1/1p3rb1/p3q1p1/P1p1P1B1/3pRPP1/6R1/1P6/3Q2K1 b - - 0 29", + "4r1k1/1p3rb1/p3q1p1/P1p1P1B1/3pRPP1/1P4R1/8/3Q2K1 b - - 0 30", + "4r1k1/5rb1/pP2q1p1/2p1P1B1/3pRPP1/1P4R1/8/3Q2K1 b - - 0 31", + "4r1k1/5rb1/pq4p1/2p1P1B1/3pRPP1/1P4R1/4Q3/6K1 b - - 1 32", + "4r1k1/1r4b1/pq4p1/2p1P1B1/3pRPP1/1P4R1/2Q5/6K1 b - - 3 33", + "4r1k1/1r4b1/1q4p1/p1p1P1B1/3p1PP1/1P4R1/2Q5/4R1K1 b - - 1 34", + "4r1k1/3r2b1/1q4p1/p1p1P1B1/2Qp1PP1/1P4R1/8/4R1K1 b - - 3 35", + "4r1k1/3r2b1/4q1p1/p1p1P1B1/2Qp1PP1/1P4R1/5K2/4R3 b - - 5 36", + "4r1k1/3r2b1/6p1/p1p1P1B1/2Pp1PP1/6R1/5K2/4R3 b - - 0 37", + "4r1k1/3r2b1/6p1/p1p1P1B1/2P2PP1/3p2R1/5K2/3R4 b - - 1 38", + "5rk1/3r2b1/6p1/p1p1P1B1/2P2PP1/3p2R1/8/3RK3 b - - 3 39", + "5rk1/6b1/6p1/p1p1P1B1/2Pr1PP1/3R4/8/3RK3 b - - 0 40", + "5rk1/3R2b1/6p1/p1p1P1B1/2r2PP1/8/8/3RK3 b - - 1 41", + "5rk1/3R2b1/6p1/p1p1P1B1/4rPP1/8/3K4/3R4 b - - 3 42", + "1r4k1/3R2b1/6p1/p1p1P1B1/4rPP1/2K5/8/3R4 b - - 5 43", + "1r4k1/3R2b1/6p1/p1p1P1B1/2K2PP1/4r3/8/3R4 b - - 7 44", + "1r3bk1/8/3R2p1/p1p1P1B1/2K2PP1/4r3/8/3R4 b - - 9 45", + "1r3bk1/8/6R1/2p1P1B1/p1K2PP1/4r3/8/3R4 b - - 0 46", + "1r3b2/5k2/R7/2p1P1B1/p1K2PP1/4r3/8/3R4 b - - 2 47", + "5b2/1r3k2/R7/2p1P1B1/p1K2PP1/4r3/8/7R b - - 4 48", + "5b2/5k2/R7/2pKP1B1/pr3PP1/4r3/8/7R b - - 6 49", + "5b2/5k2/R1K5/2p1P1B1/p2r1PP1/4r3/8/7R b - - 8 50", + "8/R4kb1/2K5/2p1P1B1/p2r1PP1/4r3/8/7R b - - 10 51", + "8/R5b1/2K3k1/2p1PPB1/p2r2P1/4r3/8/7R b - - 0 52", + "8/6R1/2K5/2p1PPk1/p2r2P1/4r3/8/7R b - - 0 53", + "8/6R1/2K5/2p1PP2/p2r1kP1/4r3/8/5R2 b - - 2 54", + "8/6R1/2K2P2/2p1P3/p2r2P1/4r1k1/8/5R2 b - - 0 55", + "8/5PR1/2K5/2p1P3/p2r2P1/4r3/6k1/5R2 b - - 0 56", + }, + { + "rn1qkb1r/p1pbpppp/5n2/8/2pP4/2N5/1PQ1PPPP/R1B1KBNR w KQkq - 0 7", + "r2qkb1r/p1pbpppp/2n2n2/8/2pP4/2N2N2/1PQ1PPPP/R1B1KB1R w KQkq - 2 8", + "r2qkb1r/p1pbpppp/5n2/8/1npPP3/2N2N2/1PQ2PPP/R1B1KB1R w KQkq - 1 9", + "r2qkb1r/p1pb1ppp/4pn2/8/1npPP3/2N2N2/1P3PPP/R1BQKB1R w KQkq - 0 10", + "r2qk2r/p1pbbppp/4pn2/8/1nBPP3/2N2N2/1P3PPP/R1BQK2R w KQkq - 1 11", + 
"r2q1rk1/p1pbbppp/4pn2/8/1nBPP3/2N2N2/1P3PPP/R1BQ1RK1 w - - 3 12", + "r2q1rk1/2pbbppp/p3pn2/8/1nBPPB2/2N2N2/1P3PPP/R2Q1RK1 w - - 0 13", + "r2q1rk1/2p1bppp/p3pn2/1b6/1nBPPB2/2N2N2/1P3PPP/R2QR1K1 w - - 2 14", + "r2q1rk1/4bppp/p1p1pn2/1b6/1nBPPB2/1PN2N2/5PPP/R2QR1K1 w - - 0 15", + "r4rk1/3qbppp/p1p1pn2/1b6/1nBPPB2/1PN2N2/3Q1PPP/R3R1K1 w - - 2 16", + "r4rk1/1q2bppp/p1p1pn2/1b6/1nBPPB2/1PN2N1P/3Q1PP1/R3R1K1 w - - 1 17", + "r3r1k1/1q2bppp/p1p1pn2/1b6/1nBPPB2/1PN2N1P/4QPP1/R3R1K1 w - - 3 18", + "r3r1k1/1q1nbppp/p1p1p3/1b6/1nBPPB2/1PN2N1P/4QPP1/3RR1K1 w - - 5 19", + "r3rbk1/1q1n1ppp/p1p1p3/1b6/1nBPPB2/1PN2N1P/3RQPP1/4R1K1 w - - 7 20", + "r3rbk1/1q3ppp/pnp1p3/1b6/1nBPPB2/1PN2N1P/3RQPP1/4R2K w - - 9 21", + "2r1rbk1/1q3ppp/pnp1p3/1b6/1nBPPB2/1PN2N1P/3RQPP1/1R5K w - - 11 22", + "2r1rbk1/1q4pp/pnp1pp2/1b6/1nBPPB2/1PN2N1P/4QPP1/1R1R3K w - - 0 23", + "2r1rbk1/5qpp/pnp1pp2/1b6/1nBPP3/1PN1BN1P/4QPP1/1R1R3K w - - 2 24", + "2r1rbk1/5qp1/pnp1pp1p/1b6/1nBPP3/1PN1BN1P/4QPP1/1R1R2K1 w - - 0 25", + "2r1rbk1/5qp1/pnp1pp1p/1b6/2BPP3/1P2BN1P/n3QPP1/1R1R2K1 w - - 0 26", + "r3rbk1/5qp1/pnp1pp1p/1b6/2BPP3/1P2BN1P/Q4PP1/1R1R2K1 w - - 1 27", + "rr3bk1/5qp1/pnp1pp1p/1b6/2BPP3/1P2BN1P/Q4PP1/R2R2K1 w - - 3 28", + "rr2qbk1/6p1/pnp1pp1p/1b6/2BPP3/1P2BN1P/4QPP1/R2R2K1 w - - 5 29", + "rr2qbk1/6p1/1np1pp1p/pb6/2BPP3/1P1QBN1P/5PP1/R2R2K1 w - - 0 30", + "rr2qbk1/6p1/1n2pp1p/pp6/3PP3/1P1QBN1P/5PP1/R2R2K1 w - - 0 31", + "rr2qbk1/6p1/1n2pp1p/1p1P4/p3P3/1P1QBN1P/5PP1/R2R2K1 w - - 0 32", + "rr2qbk1/3n2p1/3Ppp1p/1p6/p3P3/1P1QBN1P/5PP1/R2R2K1 w - - 1 33", + "rr3bk1/3n2p1/3Ppp1p/1p5q/pP2P3/3QBN1P/5PP1/R2R2K1 w - - 1 34", + "rr3bk1/3n2p1/3Ppp1p/1p5q/1P2P3/p2QBN1P/5PP1/2RR2K1 w - - 0 35", + "1r3bk1/3n2p1/r2Ppp1p/1p5q/1P2P3/pQ2BN1P/5PP1/2RR2K1 w - - 2 36", + "1r2qbk1/2Rn2p1/r2Ppp1p/1p6/1P2P3/pQ2BN1P/5PP1/3R2K1 w - - 4 37", + "1r2qbk1/2Rn2p1/r2Ppp1p/1pB5/1P2P3/1Q3N1P/p4PP1/3R2K1 w - - 0 38", + "1r2q1k1/2Rn2p1/r2bpp1p/1pB5/1P2P3/1Q3N1P/p4PP1/R5K1 w - - 0 39", + "1r2q1k1/2Rn2p1/3rpp1p/1p6/1P2P3/1Q3N1P/p4PP1/R5K1 w - - 0 
40", + "2r1q1k1/2Rn2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 1 41", + "1r2q1k1/1R1n2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 3 42", + "2r1q1k1/2Rn2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 5 43", + "1r2q1k1/1R1n2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 7 44", + "1rq3k1/R2n2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 9 45", + "2q3k1/Rr1n2p1/3rpp1p/1p6/1P2P3/5N1P/4QPP1/R5K1 w - - 11 46", + "Rrq3k1/3n2p1/3rpp1p/1p6/1P2P3/5N1P/4QPP1/R5K1 w - - 13 47", + }, + { + "rn1qkb1r/1pp2ppp/p4p2/3p1b2/5P2/1P2PN2/P1PP2PP/RN1QKB1R b KQkq - 1 6", + "r2qkb1r/1pp2ppp/p1n2p2/3p1b2/3P1P2/1P2PN2/P1P3PP/RN1QKB1R b KQkq - 0 7", + "r2qkb1r/1pp2ppp/p4p2/3p1b2/1n1P1P2/1P1BPN2/P1P3PP/RN1QK2R b KQkq - 2 8", + "r2qkb1r/1pp2ppp/p4p2/3p1b2/3P1P2/1P1PPN2/P5PP/RN1QK2R b KQkq - 0 9", + "r2qk2r/1pp2ppp/p2b1p2/3p1b2/3P1P2/1PNPPN2/P5PP/R2QK2R b KQkq - 2 10", + "r2qk2r/1p3ppp/p1pb1p2/3p1b2/3P1P2/1PNPPN2/P5PP/R2Q1RK1 b kq - 1 11", + "r2q1rk1/1p3ppp/p1pb1p2/3p1b2/3P1P2/1PNPPN2/P2Q2PP/R4RK1 b - - 3 12", + "r2qr1k1/1p3ppp/p1pb1p2/3p1b2/3P1P2/1P1PPN2/P2QN1PP/R4RK1 b - - 5 13", + "r3r1k1/1p3ppp/pqpb1p2/3p1b2/3P1P2/1P1PPNN1/P2Q2PP/R4RK1 b - - 7 14", + "r3r1k1/1p3ppp/pqp2p2/3p1b2/1b1P1P2/1P1PPNN1/P1Q3PP/R4RK1 b - - 9 15", + "r3r1k1/1p1b1ppp/pqp2p2/3p4/1b1P1P2/1P1PPNN1/P4QPP/R4RK1 b - - 11 16", + "2r1r1k1/1p1b1ppp/pqp2p2/3p4/1b1PPP2/1P1P1NN1/P4QPP/R4RK1 b - - 0 17", + "2r1r1k1/1p1b1ppp/pq3p2/2pp4/1b1PPP2/PP1P1NN1/5QPP/R4RK1 b - - 0 18", + "2r1r1k1/1p1b1ppp/pq3p2/2Pp4/4PP2/PPbP1NN1/5QPP/R4RK1 b - - 0 19", + "2r1r1k1/1p1b1ppp/p4p2/2Pp4/4PP2/PqbP1NN1/5QPP/RR4K1 b - - 1 20", + "2r1r1k1/1p1b1ppp/p4p2/2Pp4/q3PP2/P1bP1NN1/R4QPP/1R4K1 b - - 3 21", + "2r1r1k1/1p3ppp/p4p2/1bPP4/q4P2/P1bP1NN1/R4QPP/1R4K1 b - - 0 22", + "2r1r1k1/1p3ppp/p4p2/2PP4/q4P2/P1bb1NN1/R4QPP/2R3K1 b - - 1 23", + "2r1r1k1/1p3ppp/p2P1p2/2P5/2q2P2/P1bb1NN1/R4QPP/2R3K1 b - - 0 24", + "2rr2k1/1p3ppp/p2P1p2/2P5/2q2P2/P1bb1NN1/R4QPP/2R4K b - - 2 25", + "2rr2k1/1p3ppp/p2P1p2/2Q5/5P2/P1bb1NN1/R5PP/2R4K b - - 0 26", + 
"3r2k1/1p3ppp/p2P1p2/2r5/5P2/P1bb1N2/R3N1PP/2R4K b - - 1 27", + "3r2k1/1p3ppp/p2P1p2/2r5/5P2/P1b2N2/4R1PP/2R4K b - - 0 28", + "3r2k1/1p3ppp/p2P1p2/2r5/1b3P2/P4N2/4R1PP/3R3K b - - 2 29", + "3r2k1/1p2Rppp/p2P1p2/b1r5/5P2/P4N2/6PP/3R3K b - - 4 30", + "3r2k1/1R3ppp/p1rP1p2/b7/5P2/P4N2/6PP/3R3K b - - 0 31", + "3r2k1/1R3ppp/p2R1p2/b7/5P2/P4N2/6PP/7K b - - 0 32", + "6k1/1R3ppp/p2r1p2/b7/5P2/P4NP1/7P/7K b - - 0 33", + "6k1/1R3p1p/p2r1pp1/b7/5P1P/P4NP1/8/7K b - - 0 34", + "6k1/3R1p1p/pr3pp1/b7/5P1P/P4NP1/8/7K b - - 2 35", + "6k1/5p2/pr3pp1/b2R3p/5P1P/P4NP1/8/7K b - - 1 36", + "6k1/5p2/pr3pp1/7p/5P1P/P1bR1NP1/8/7K b - - 3 37", + "6k1/5p2/p1r2pp1/7p/5P1P/P1bR1NP1/6K1/8 b - - 5 38", + "6k1/5p2/p1r2pp1/b2R3p/5P1P/P4NP1/6K1/8 b - - 7 39", + "6k1/5p2/p4pp1/b2R3p/5P1P/P4NPK/2r5/8 b - - 9 40", + "6k1/2b2p2/p4pp1/7p/5P1P/P2R1NPK/2r5/8 b - - 11 41", + "6k1/2b2p2/5pp1/p6p/3N1P1P/P2R2PK/2r5/8 b - - 1 42", + "6k1/2b2p2/5pp1/p6p/3N1P1P/P1R3PK/r7/8 b - - 3 43", + "6k1/5p2/1b3pp1/p6p/5P1P/P1R3PK/r1N5/8 b - - 5 44", + "8/5pk1/1bR2pp1/p6p/5P1P/P5PK/r1N5/8 b - - 7 45", + "3b4/5pk1/2R2pp1/p4P1p/7P/P5PK/r1N5/8 b - - 0 46", + "8/4bpk1/2R2pp1/p4P1p/6PP/P6K/r1N5/8 b - - 0 47", + "8/5pk1/2R2pP1/p6p/6PP/b6K/r1N5/8 b - - 0 48", + "8/6k1/2R2pp1/p6P/7P/b6K/r1N5/8 b - - 0 49", + "8/6k1/2R2p2/p6p/7P/b5K1/r1N5/8 b - - 1 50", + "8/8/2R2pk1/p6p/7P/b4K2/r1N5/8 b - - 3 51", + "8/8/2R2pk1/p6p/7P/4NK2/rb6/8 b - - 5 52", + "2R5/8/5pk1/7p/p6P/4NK2/rb6/8 b - - 1 53", + "6R1/8/5pk1/7p/p6P/4NK2/1b6/r7 b - - 3 54", + "R7/5k2/5p2/7p/p6P/4NK2/1b6/r7 b - - 5 55", + "R7/5k2/5p2/7p/7P/p3N3/1b2K3/r7 b - - 1 56", + "8/R4k2/5p2/7p/7P/p3N3/1b2K3/7r b - - 3 57", + "8/8/5pk1/7p/R6P/p3N3/1b2K3/7r b - - 5 58", + "8/8/5pk1/7p/R6P/p7/4K3/2bN3r b - - 7 59", + "8/8/5pk1/7p/R6P/p7/4KN1r/2b5 b - - 9 60", + "8/8/5pk1/7p/R6P/p3K3/1b3N1r/8 b - - 11 61", + "8/8/R4pk1/7p/7P/p1b1K3/5N1r/8 b - - 13 62", + "8/8/5pk1/7p/7P/2b1K3/R4N1r/8 b - - 0 63", + "8/8/5pk1/7p/3K3P/8/R4N1r/4b3 b - - 2 64", + } +}; +// clang-format on + +} // namespace + 
+namespace Stockfish::Benchmark { + +// Builds a list of UCI commands to be run by bench. There +// are five parameters: TT size in MB, number of search threads that +// should be used, the limit value spent for each position, a file name +// where to look for positions in FEN format, and the type of the limit: +// depth, perft, nodes and movetime (in milliseconds). Examples: +// +// bench : search default positions up to depth 13 +// bench 64 1 15 : search default positions up to depth 15 (TT = 64MB) +// bench 64 1 100000 default nodes : search default positions for 100K nodes each +// bench 64 4 5000 current movetime : search current position with 4 threads for 5 sec +// bench 16 1 5 blah perft : run a perft 5 on positions in file "blah" +std::vector setup_bench(const std::string& currentFen, std::istream& is) { + + std::vector fens, list; + std::string go, token; + + // Assign default values to missing arguments + std::string ttSize = (is >> token) ? token : "16"; + std::string threads = (is >> token) ? token : "1"; + std::string limit = (is >> token) ? token : "13"; + std::string fenFile = (is >> token) ? token : "default"; + std::string limitType = (is >> token) ? token : "depth"; + + go = limitType == "eval" ? 
"eval" : "go " + limitType + " " + limit; + + if (fenFile == "default") + fens = Defaults; + + else if (fenFile == "current") + fens.push_back(currentFen); + + else + { + std::string fen; + std::ifstream file(fenFile); + + if (!file.is_open()) + { + std::cerr << "Unable to open file " << fenFile << std::endl; + exit(EXIT_FAILURE); + } + + while (getline(file, fen)) + if (!fen.empty()) + fens.push_back(fen); + + file.close(); + } + + list.emplace_back("setoption name Threads value " + threads); + list.emplace_back("setoption name Hash value " + ttSize); + list.emplace_back("ucinewgame"); + + for (const std::string& fen : fens) + if (fen.find("setoption") != std::string::npos) + list.emplace_back(fen); + else + { + list.emplace_back("position fen " + fen); + list.emplace_back(go); + } + + return list; +} + +BenchmarkSetup setup_benchmark(std::istream& is) { + // TT_SIZE_PER_THREAD is chosen such that roughly half of the hash is used all positions + // for the current sequence have been searched. 
+ static constexpr int TT_SIZE_PER_THREAD = 128; + + static constexpr int DEFAULT_DURATION_S = 150; + + BenchmarkSetup setup{}; + + // Assign default values to missing arguments + int desiredTimeS; + + if (!(is >> setup.threads)) + setup.threads = int(get_hardware_concurrency()); + else + setup.originalInvocation += std::to_string(setup.threads); + + if (!(is >> setup.ttSize)) + setup.ttSize = TT_SIZE_PER_THREAD * setup.threads; + else + setup.originalInvocation += " " + std::to_string(setup.ttSize); + + if (!(is >> desiredTimeS)) + desiredTimeS = DEFAULT_DURATION_S; + else + setup.originalInvocation += " " + std::to_string(desiredTimeS); + + setup.filledInvocation += std::to_string(setup.threads) + " " + std::to_string(setup.ttSize) + + " " + std::to_string(desiredTimeS); + + auto getCorrectedTime = [&](int ply) { + // time per move is fit roughly based on LTC games + // seconds = 50/{ply+15} + // ms = 50000/{ply+15} + // with this fit 10th move gets 2000ms + // adjust for desired 10th move time + return 50000.0 / (static_cast(ply) + 15.0); + }; + + float totalTime = 0; + for (const auto& game : BenchmarkPositions) + { + int ply = 1; + for (int i = 0; i < static_cast(game.size()); ++i) + { + const float correctedTime = float(getCorrectedTime(ply)); + totalTime += correctedTime; + ply += 1; + } + } + + float timeScaleFactor = static_cast(desiredTimeS * 1000) / totalTime; + + for (const auto& game : BenchmarkPositions) + { + setup.commands.emplace_back("ucinewgame"); + int ply = 1; + for (const std::string& fen : game) + { + setup.commands.emplace_back("position fen " + fen); + + const int correctedTime = static_cast(getCorrectedTime(ply) * timeScaleFactor); + setup.commands.emplace_back("go movetime " + std::to_string(correctedTime)); + + ply += 1; + } + } + + return setup; +} + +} // namespace Stockfish diff --git a/src/benchmark.h b/src/benchmark.h new file mode 100644 index 0000000000000000000000000000000000000000..a6606e78cad6feb990fc4b84ddc7fe7e31d1dc5f --- 
/dev/null +++ b/src/benchmark.h @@ -0,0 +1,42 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef BENCHMARK_H_INCLUDED +#define BENCHMARK_H_INCLUDED + +#include +#include +#include + +namespace Stockfish::Benchmark { + +std::vector setup_bench(const std::string&, std::istream&); + +struct BenchmarkSetup { + int ttSize; + int threads; + std::vector commands; + std::string originalInvocation; + std::string filledInvocation; +}; + +BenchmarkSetup setup_benchmark(std::istream&); + +} // namespace Stockfish + +#endif // #ifndef BENCHMARK_H_INCLUDED diff --git a/src/bitboard.cpp b/src/bitboard.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0861222cf0e108380ceb296f0449c37a49ce9d8b --- /dev/null +++ b/src/bitboard.cpp @@ -0,0 +1,189 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "bitboard.h" + +#include +#include +#include + +#include "misc.h" + +namespace Stockfish { + +uint8_t PopCnt16[1 << 16]; +uint8_t SquareDistance[SQUARE_NB][SQUARE_NB]; + +Bitboard LineBB[SQUARE_NB][SQUARE_NB]; +Bitboard BetweenBB[SQUARE_NB][SQUARE_NB]; +Bitboard RayPassBB[SQUARE_NB][SQUARE_NB]; + +alignas(64) Magic Magics[SQUARE_NB][2]; + +namespace { + +Bitboard RookTable[0x19000]; // To store rook attacks +Bitboard BishopTable[0x1480]; // To store bishop attacks + +void init_magics(PieceType pt, Bitboard table[], Magic magics[][2]); +} + +// Returns an ASCII representation of a bitboard suitable +// to be printed to standard output. Useful for debugging. +std::string Bitboards::pretty(Bitboard b) { + + std::string s = "+---+---+---+---+---+---+---+---+\n"; + + for (Rank r = RANK_8;; --r) + { + for (File f = FILE_A; f <= FILE_H; ++f) + s += b & make_square(f, r) ? "| X " : "| "; + + s += "| " + std::to_string(1 + r) + "\n+---+---+---+---+---+---+---+---+\n"; + + if (r == RANK_1) + break; + } + s += " a b c d e f g h\n"; + + return s; +} + + +// Initializes various bitboard tables. It is called at +// startup and relies on global objects to be already zero-initialized. 
+void Bitboards::init() { + + for (unsigned i = 0; i < (1 << 16); ++i) + PopCnt16[i] = uint8_t(std::bitset<16>(i).count()); + + for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1) + for (Square s2 = SQ_A1; s2 <= SQ_H8; ++s2) + SquareDistance[s1][s2] = std::max(distance(s1, s2), distance(s1, s2)); + + init_magics(ROOK, RookTable, Magics); + init_magics(BISHOP, BishopTable, Magics); + + for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1) + { + for (PieceType pt : {BISHOP, ROOK}) + for (Square s2 = SQ_A1; s2 <= SQ_H8; ++s2) + { + if (PseudoAttacks[pt][s1] & s2) + { + LineBB[s1][s2] = (attacks_bb(pt, s1, 0) & attacks_bb(pt, s2, 0)) | s1 | s2; + BetweenBB[s1][s2] = + (attacks_bb(pt, s1, square_bb(s2)) & attacks_bb(pt, s2, square_bb(s1))); + RayPassBB[s1][s2] = + attacks_bb(pt, s1, 0) & (attacks_bb(pt, s2, square_bb(s1)) | s2); + } + BetweenBB[s1][s2] |= s2; + } + } +} + +namespace { +// Computes all rook and bishop attacks at startup. Magic +// bitboards are used to look up attacks of sliding pieces. As a reference see +// https://www.chessprogramming.org/Magic_Bitboards. In particular, here we use +// the so called "fancy" approach. +void init_magics(PieceType pt, Bitboard table[], Magic magics[][2]) { + +#ifndef USE_PEXT + // Optimal PRNG seeds to pick the correct magics in the shortest time + int seeds[][RANK_NB] = {{8977, 44560, 54343, 38998, 5731, 95205, 104912, 17020}, + {728, 10316, 55013, 32803, 12281, 15100, 16645, 255}}; + + Bitboard occupancy[4096]; + int epoch[4096] = {}, cnt = 0; +#endif + Bitboard reference[4096]; + int size = 0; + + for (Square s = SQ_A1; s <= SQ_H8; ++s) + { + // Board edges are not considered in the relevant occupancies + Bitboard edges = ((Rank1BB | Rank8BB) & ~rank_bb(s)) | ((FileABB | FileHBB) & ~file_bb(s)); + + // Given a square 's', the mask is the bitboard of sliding attacks from + // 's' computed on an empty board. 
The index must be big enough to contain + // all the attacks for each possible subset of the mask and so is 2 power + // the number of 1s of the mask. Hence we deduce the size of the shift to + // apply to the 64 or 32 bits word to get the index. + Magic& m = magics[s][pt - BISHOP]; + m.mask = Bitboards::sliding_attack(pt, s, 0) & ~edges; +#ifndef USE_PEXT + m.shift = (Is64Bit ? 64 : 32) - popcount(m.mask); +#endif + // Set the offset for the attacks table of the square. We have individual + // table sizes for each square with "Fancy Magic Bitboards". + m.attacks = s == SQ_A1 ? table : magics[s - 1][pt - BISHOP].attacks + size; + size = 0; + + // Use Carry-Rippler trick to enumerate all subsets of masks[s] and + // store the corresponding sliding attack bitboard in reference[]. + Bitboard b = 0; + do + { +#ifndef USE_PEXT + occupancy[size] = b; +#endif + reference[size] = Bitboards::sliding_attack(pt, s, b); + + if (HasPext) + m.attacks[pext(b, m.mask)] = reference[size]; + + size++; + b = (b - m.mask) & m.mask; + } while (b); + +#ifndef USE_PEXT + PRNG rng(seeds[Is64Bit][rank_of(s)]); + + // Find a magic for square 's' picking up an (almost) random number + // until we find the one that passes the verification test. + for (int i = 0; i < size;) + { + for (m.magic = 0; popcount((m.magic * m.mask) >> 56) < 6;) + m.magic = rng.sparse_rand(); + + // A good magic must map every possible occupancy to an index that + // looks up the correct sliding attack in the attacks[s] database. + // Note that we build up the database for square 's' as a side + // effect of verifying the magic. Keep track of the attempt count + // and save it in epoch[], little speed-up trick to avoid resetting + // m.attacks[] after every failed attempt. 
+ for (++cnt, i = 0; i < size; ++i) + { + unsigned idx = m.index(occupancy[i]); + + if (epoch[idx] < cnt) + { + epoch[idx] = cnt; + m.attacks[idx] = reference[i]; + } + else if (m.attacks[idx] != reference[i]) + break; + } + } +#endif + } +} +} + +} // namespace Stockfish diff --git a/src/bitboard.h b/src/bitboard.h new file mode 100644 index 0000000000000000000000000000000000000000..7d36b0a62ed65ee7982e11f378c518b1a8616e2b --- /dev/null +++ b/src/bitboard.h @@ -0,0 +1,458 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef BITBOARD_H_INCLUDED +#define BITBOARD_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" + +namespace Stockfish { + +namespace Bitboards { + +void init(); +std::string pretty(Bitboard b); + +} // namespace Stockfish::Bitboards + +constexpr Bitboard FileABB = 0x0101010101010101ULL; +constexpr Bitboard FileBBB = FileABB << 1; +constexpr Bitboard FileCBB = FileABB << 2; +constexpr Bitboard FileDBB = FileABB << 3; +constexpr Bitboard FileEBB = FileABB << 4; +constexpr Bitboard FileFBB = FileABB << 5; +constexpr Bitboard FileGBB = FileABB << 6; +constexpr Bitboard FileHBB = FileABB << 7; + +constexpr Bitboard Rank1BB = 0xFF; +constexpr Bitboard Rank2BB = Rank1BB << (8 * 1); +constexpr Bitboard Rank3BB = Rank1BB << (8 * 2); +constexpr Bitboard Rank4BB = Rank1BB << (8 * 3); +constexpr Bitboard Rank5BB = Rank1BB << (8 * 4); +constexpr Bitboard Rank6BB = Rank1BB << (8 * 5); +constexpr Bitboard Rank7BB = Rank1BB << (8 * 6); +constexpr Bitboard Rank8BB = Rank1BB << (8 * 7); + +extern uint8_t PopCnt16[1 << 16]; +extern uint8_t SquareDistance[SQUARE_NB][SQUARE_NB]; + +extern Bitboard BetweenBB[SQUARE_NB][SQUARE_NB]; +extern Bitboard LineBB[SQUARE_NB][SQUARE_NB]; +extern Bitboard RayPassBB[SQUARE_NB][SQUARE_NB]; + +// Magic holds all magic bitboards relevant data for a single square +struct Magic { + Bitboard mask; + Bitboard* attacks; +#ifndef USE_PEXT + Bitboard magic; + unsigned shift; +#endif + + // Compute the attack's index using the 'magic bitboards' approach + unsigned index(Bitboard occupied) const { + +#ifdef USE_PEXT + return unsigned(pext(occupied, mask)); +#else + if (Is64Bit) + return unsigned(((occupied & mask) * magic) >> shift); + + unsigned lo = unsigned(occupied) & unsigned(mask); + unsigned hi = unsigned(occupied >> 32) & unsigned(mask >> 32); + return (lo * unsigned(magic) ^ hi * unsigned(magic >> 32)) >> shift; +#endif + } + + Bitboard attacks_bb(Bitboard occupied) 
const { return attacks[index(occupied)]; } +}; + +extern Magic Magics[SQUARE_NB][2]; + +constexpr Bitboard square_bb(Square s) { + assert(is_ok(s)); + return 1ULL << s; +} + + +// Overloads of bitwise operators between a Bitboard and a Square for testing +// whether a given bit is set in a bitboard, and for setting and clearing bits. + +constexpr Bitboard operator&(Bitboard b, Square s) { return b & square_bb(s); } +constexpr Bitboard operator|(Bitboard b, Square s) { return b | square_bb(s); } +constexpr Bitboard operator^(Bitboard b, Square s) { return b ^ square_bb(s); } +constexpr Bitboard& operator|=(Bitboard& b, Square s) { return b |= square_bb(s); } +constexpr Bitboard& operator^=(Bitboard& b, Square s) { return b ^= square_bb(s); } + +constexpr Bitboard operator&(Square s, Bitboard b) { return b & s; } +constexpr Bitboard operator|(Square s, Bitboard b) { return b | s; } +constexpr Bitboard operator^(Square s, Bitboard b) { return b ^ s; } + +constexpr Bitboard operator|(Square s1, Square s2) { return square_bb(s1) | s2; } + +constexpr bool more_than_one(Bitboard b) { return b & (b - 1); } + + +// rank_bb() and file_bb() return a bitboard representing all the squares on +// the given file or rank. + +constexpr Bitboard rank_bb(Rank r) { return Rank1BB << (8 * r); } + +constexpr Bitboard rank_bb(Square s) { return rank_bb(rank_of(s)); } + +constexpr Bitboard file_bb(File f) { return FileABB << f; } + +constexpr Bitboard file_bb(Square s) { return file_bb(file_of(s)); } + + +// Moves a bitboard one or two steps as specified by the direction D +template +constexpr Bitboard shift(Bitboard b) { + return D == NORTH ? b << 8 + : D == SOUTH ? b >> 8 + : D == NORTH + NORTH ? b << 16 + : D == SOUTH + SOUTH ? b >> 16 + : D == EAST ? (b & ~FileHBB) << 1 + : D == WEST ? (b & ~FileABB) >> 1 + : D == NORTH_EAST ? (b & ~FileHBB) << 9 + : D == NORTH_WEST ? (b & ~FileABB) << 7 + : D == SOUTH_EAST ? (b & ~FileHBB) >> 7 + : D == SOUTH_WEST ? 
(b & ~FileABB) >> 9 + : 0; +} + + +// Returns the squares attacked by pawns of the given color +// from the squares in the given bitboard. +template +constexpr Bitboard pawn_attacks_bb(Bitboard b) { + return C == WHITE ? shift(b) | shift(b) + : shift(b) | shift(b); +} + + +// Returns a bitboard representing an entire line (from board edge +// to board edge) that intersects the two given squares. If the given squares +// are not on a same file/rank/diagonal, the function returns 0. For instance, +// line_bb(SQ_C4, SQ_F7) will return a bitboard with the A2-G8 diagonal. +inline Bitboard line_bb(Square s1, Square s2) { + + assert(is_ok(s1) && is_ok(s2)); + return LineBB[s1][s2]; +} + + +// Returns a bitboard representing the squares in the semi-open +// segment between the squares s1 and s2 (excluding s1 but including s2). If the +// given squares are not on a same file/rank/diagonal, it returns s2. For instance, +// between_bb(SQ_C4, SQ_F7) will return a bitboard with squares D5, E6 and F7, but +// between_bb(SQ_E6, SQ_F8) will return a bitboard with the square F8. This trick +// allows to generate non-king evasion moves faster: the defending piece must either +// interpose itself to cover the check or capture the checking piece. +inline Bitboard between_bb(Square s1, Square s2) { + + assert(is_ok(s1) && is_ok(s2)); + return BetweenBB[s1][s2]; +} + +// distance() functions return the distance between x and y, defined as the +// number of steps for a king in x to reach y. 
+ +template +inline int distance(Square x, Square y); + +template<> +inline int distance(Square x, Square y) { + return std::abs(file_of(x) - file_of(y)); +} + +template<> +inline int distance(Square x, Square y) { + return std::abs(rank_of(x) - rank_of(y)); +} + +template<> +inline int distance(Square x, Square y) { + return SquareDistance[x][y]; +} + +inline int edge_distance(File f) { return std::min(f, File(FILE_H - f)); } + + +constexpr int constexpr_popcount(Bitboard b) { + b = b - ((b >> 1) & 0x5555555555555555ULL); + b = (b & 0x3333333333333333ULL) + ((b >> 2) & 0x3333333333333333ULL); + b = (b + (b >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return static_cast((b * 0x0101010101010101ULL) >> 56); +} + +// Counts the number of non-zero bits in a bitboard. +inline int popcount(Bitboard b) { + +#ifndef USE_POPCNT + + std::uint16_t indices[4]; + std::memcpy(indices, &b, sizeof(b)); + return PopCnt16[indices[0]] + PopCnt16[indices[1]] + PopCnt16[indices[2]] + + PopCnt16[indices[3]]; + +#elif defined(_MSC_VER) + + return int(_mm_popcnt_u64(b)); + +#else // Assumed gcc or compatible compiler + + return __builtin_popcountll(b); + +#endif +} + +// Returns the least significant bit in a non-zero bitboard. +inline Square lsb(Bitboard b) { + assert(b); + +#if defined(__GNUC__) // GCC, Clang, ICX + + return Square(__builtin_ctzll(b)); + +#elif defined(_MSC_VER) + #ifdef _WIN64 // MSVC, WIN64 + + unsigned long idx; + _BitScanForward64(&idx, b); + return Square(idx); + + #else // MSVC, WIN32 + unsigned long idx; + + if (b & 0xffffffff) + { + _BitScanForward(&idx, int32_t(b)); + return Square(idx); + } + else + { + _BitScanForward(&idx, int32_t(b >> 32)); + return Square(idx + 32); + } + #endif +#else // Compiler is neither GCC nor MSVC compatible + #error "Compiler not supported." +#endif +} + +// Returns the most significant bit in a non-zero bitboard. 
+inline Square msb(Bitboard b) { + assert(b); + +#if defined(__GNUC__) // GCC, Clang, ICX + + return Square(63 ^ __builtin_clzll(b)); + +#elif defined(_MSC_VER) + #ifdef _WIN64 // MSVC, WIN64 + + unsigned long idx; + _BitScanReverse64(&idx, b); + return Square(idx); + + #else // MSVC, WIN32 + + unsigned long idx; + + if (b >> 32) + { + _BitScanReverse(&idx, int32_t(b >> 32)); + return Square(idx + 32); + } + else + { + _BitScanReverse(&idx, int32_t(b)); + return Square(idx); + } + #endif +#else // Compiler is neither GCC nor MSVC compatible + #error "Compiler not supported." +#endif +} + +// Returns the bitboard of the least significant +// square of a non-zero bitboard. It is equivalent to square_bb(lsb(bb)). +inline Bitboard least_significant_square_bb(Bitboard b) { + assert(b); + return b & -b; +} + +// Finds and clears the least significant bit in a non-zero bitboard. +inline Square pop_lsb(Bitboard& b) { + assert(b); + const Square s = lsb(b); + b &= b - 1; + return s; +} + +namespace Bitboards { +// Returns the bitboard of target square for the given step +// from the given square. If the step is off the board, returns empty bitboard. +constexpr Bitboard safe_destination(Square s, int step) { + constexpr auto abs = [](int v) { return v < 0 ? -v : v; }; + Square to = Square(s + step); + return is_ok(to) && abs(file_of(s) - file_of(to)) <= 2 ? square_bb(to) : Bitboard(0); +} + +constexpr Bitboard sliding_attack(PieceType pt, Square sq, Bitboard occupied) { + Bitboard attacks = 0; + Direction RookDirections[4] = {NORTH, SOUTH, EAST, WEST}; + Direction BishopDirections[4] = {NORTH_EAST, SOUTH_EAST, SOUTH_WEST, NORTH_WEST}; + + for (Direction d : (pt == ROOK ? 
RookDirections : BishopDirections)) + { + Square s = sq; + while (safe_destination(s, d)) + { + attacks |= (s += d); + if (occupied & s) + { + break; + } + } + } + + return attacks; +} + +constexpr Bitboard knight_attack(Square sq) { + Bitboard b = {}; + for (int step : {-17, -15, -10, -6, 6, 10, 15, 17}) + b |= safe_destination(sq, step); + return b; +} + +constexpr Bitboard king_attack(Square sq) { + Bitboard b = {}; + for (int step : {-9, -8, -7, -1, 1, 7, 8, 9}) + b |= safe_destination(sq, step); + return b; +} + +constexpr Bitboard pseudo_attacks(PieceType pt, Square sq) { + switch (pt) + { + case PieceType::ROOK : + case PieceType::BISHOP : + return sliding_attack(pt, sq, 0); + case PieceType::QUEEN : + return sliding_attack(PieceType::ROOK, sq, 0) | sliding_attack(PieceType::BISHOP, sq, 0); + case PieceType::KNIGHT : + return knight_attack(sq); + case PieceType::KING : + return king_attack(sq); + default : + assert(false); + return 0; + } +} + +} + +inline constexpr auto PseudoAttacks = []() constexpr { + std::array, PIECE_TYPE_NB> attacks{}; + + for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1) + { + attacks[WHITE][s1] = pawn_attacks_bb(square_bb(s1)); + attacks[BLACK][s1] = pawn_attacks_bb(square_bb(s1)); + + attacks[KING][s1] = Bitboards::pseudo_attacks(KING, s1); + attacks[KNIGHT][s1] = Bitboards::pseudo_attacks(KNIGHT, s1); + attacks[QUEEN][s1] = attacks[BISHOP][s1] = Bitboards::pseudo_attacks(BISHOP, s1); + attacks[QUEEN][s1] |= attacks[ROOK][s1] = Bitboards::pseudo_attacks(ROOK, s1); + } + + return attacks; +}(); + + +// Returns the pseudo attacks of the given piece type +// assuming an empty board. +template +inline Bitboard attacks_bb(Square s, Color c = COLOR_NB) { + + assert((Pt != PAWN || c < COLOR_NB) && is_ok(s)); + return Pt == PAWN ? PseudoAttacks[c][s] : PseudoAttacks[Pt][s]; +} + + +// Returns the attacks by the given piece +// assuming the board is occupied according to the passed Bitboard. 
+// Sliding piece attacks do not continue passed an occupied square. +template +inline Bitboard attacks_bb(Square s, Bitboard occupied) { + + assert(Pt != PAWN && is_ok(s)); + + switch (Pt) + { + case BISHOP : + case ROOK : + return Magics[s][Pt - BISHOP].attacks_bb(occupied); + case QUEEN : + return attacks_bb(s, occupied) | attacks_bb(s, occupied); + default : + return PseudoAttacks[Pt][s]; + } +} + +// Returns the attacks by the given piece +// assuming the board is occupied according to the passed Bitboard. +// Sliding piece attacks do not continue passed an occupied square. +inline Bitboard attacks_bb(PieceType pt, Square s, Bitboard occupied) { + + assert(pt != PAWN && is_ok(s)); + + switch (pt) + { + case BISHOP : + return attacks_bb(s, occupied); + case ROOK : + return attacks_bb(s, occupied); + case QUEEN : + return attacks_bb(s, occupied) | attacks_bb(s, occupied); + default : + return PseudoAttacks[pt][s]; + } +} + +inline Bitboard attacks_bb(Piece pc, Square s, Bitboard occupied) { + return type_of(pc) == PAWN ? PseudoAttacks[color_of(pc)][s] + : attacks_bb(type_of(pc), s, occupied); +} + +} // namespace Stockfish + +#endif // #ifndef BITBOARD_H_INCLUDED diff --git a/src/engine.cpp b/src/engine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..be0fe3c4086e9aeede5707a97503679ef803d947 --- /dev/null +++ b/src/engine.cpp @@ -0,0 +1,411 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "engine.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "evaluate.h" +#include "misc.h" +#include "nnue/network.h" +#include "nnue/nnue_common.h" +#include "nnue/nnue_misc.h" +#include "numa.h" +#include "perft.h" +#include "position.h" +#include "search.h" +#include "shm.h" +#include "syzygy/tbprobe.h" +#include "types.h" +#include "uci.h" +#include "ucioption.h" + +namespace Stockfish { + +namespace NN = Eval::NNUE; + +constexpr auto StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"; +constexpr int MaxHashMB = Is64Bit ? 33554432 : 2048; +int MaxThreads = std::max(1024, 4 * int(get_hardware_concurrency())); + +// The default configuration will attempt to group L3 domains up to 32 threads. +// This size was found to be a good balance between the Elo gain of increased +// history sharing and the speed loss from more cross-cache accesses (see +// PR#6526). The user can always explicitly override this behavior. +constexpr NumaAutoPolicy DefaultNumaPolicy = BundledL3Policy{32}; + +Engine::Engine(std::optional path) : + binaryDirectory(path ? 
CommandLine::get_binary_directory(*path) : ""), + numaContext(NumaConfig::from_system(DefaultNumaPolicy)), + states(new std::deque(1)), + threads(), + networks(numaContext, get_default_networks()) { + + pos.set(StartFEN, false, &states->back()); + + options.add( // + "Debug Log File", Option("", [](const Option& o) { + start_logger(o); + return std::nullopt; + })); + + options.add( // + "NumaPolicy", Option("auto", [this](const Option& o) { + set_numa_config_from_option(o); + return numa_config_information_as_string() + "\n" + + thread_allocation_information_as_string(); + })); + + options.add( // + "Threads", Option(1, 1, MaxThreads, [this](const Option&) { + resize_threads(); + return thread_allocation_information_as_string(); + })); + + options.add( // + "Hash", Option(16, 1, MaxHashMB, [this](const Option& o) { + set_tt_size(o); + return std::nullopt; + })); + + options.add( // + "Clear Hash", Option([this](const Option&) { + search_clear(); + return std::nullopt; + })); + + options.add( // + "Ponder", Option(false)); + + options.add( // + "MultiPV", Option(1, 1, MAX_MOVES)); + + options.add("Skill Level", Option(20, 0, 20)); + + options.add("Move Overhead", Option(10, 0, 5000)); + + options.add("nodestime", Option(0, 0, 10000)); + + options.add("UCI_Chess960", Option(false)); + + options.add("UCI_LimitStrength", Option(false)); + + options.add("UCI_Elo", + Option(Stockfish::Search::Skill::LowestElo, Stockfish::Search::Skill::LowestElo, + Stockfish::Search::Skill::HighestElo)); + + options.add("UCI_ShowWDL", Option(false)); + + options.add( // + "SyzygyPath", Option("", [](const Option& o) { + Tablebases::init(o); + return std::nullopt; + })); + + options.add("SyzygyProbeDepth", Option(1, 1, 100)); + + options.add("Syzygy50MoveRule", Option(true)); + + options.add("SyzygyProbeLimit", Option(7, 0, 7)); + + options.add( // + "EvalFile", Option(EvalFileDefaultNameBig, [this](const Option& o) { + load_big_network(o); + return std::nullopt; + })); + + options.add( 
// + "EvalFileSmall", Option(EvalFileDefaultNameSmall, [this](const Option& o) { + load_small_network(o); + return std::nullopt; + })); + + threads.clear(); + threads.ensure_network_replicated(); + resize_threads(); +} + +std::uint64_t Engine::perft(const std::string& fen, Depth depth, bool isChess960) { + verify_networks(); + + return Benchmark::perft(fen, depth, isChess960); +} + +void Engine::go(Search::LimitsType& limits) { + assert(limits.perft == 0); + verify_networks(); + + threads.start_thinking(options, pos, states, limits); +} +void Engine::stop() { threads.stop = true; } + +void Engine::search_clear() { + wait_for_search_finished(); + + tt.clear(threads); + threads.clear(); + + // @TODO wont work with multiple instances + Tablebases::init(options["SyzygyPath"]); // Free mapped files +} + +void Engine::set_on_update_no_moves(std::function&& f) { + updateContext.onUpdateNoMoves = std::move(f); +} + +void Engine::set_on_update_full(std::function&& f) { + updateContext.onUpdateFull = std::move(f); +} + +void Engine::set_on_iter(std::function&& f) { + updateContext.onIter = std::move(f); +} + +void Engine::set_on_bestmove(std::function&& f) { + updateContext.onBestmove = std::move(f); +} + +void Engine::set_on_verify_networks(std::function&& f) { + onVerifyNetworks = std::move(f); +} + +void Engine::wait_for_search_finished() { threads.main_thread()->wait_for_search_finished(); } + +void Engine::set_position(const std::string& fen, const std::vector& moves) { + // Drop the old state and create a new one + states = StateListPtr(new std::deque(1)); + pos.set(fen, options["UCI_Chess960"], &states->back()); + + for (const auto& move : moves) + { + auto m = UCIEngine::to_move(pos, move); + + if (m == Move::none()) + break; + + states->emplace_back(); + pos.do_move(m, states->back()); + } +} + +// modifiers + +void Engine::set_numa_config_from_option(const std::string& o) { + if (o == "auto" || o == "system") + { + 
numaContext.set_numa_config(NumaConfig::from_system(DefaultNumaPolicy)); + } + else if (o == "hardware") + { + // Don't respect affinity set in the system. + numaContext.set_numa_config(NumaConfig::from_system(DefaultNumaPolicy, false)); + } + else if (o == "none") + { + numaContext.set_numa_config(NumaConfig{}); + } + else + { + numaContext.set_numa_config(NumaConfig::from_string(o)); + } + + // Force reallocation of threads in case affinities need to change. + resize_threads(); + threads.ensure_network_replicated(); +} + +void Engine::resize_threads() { + threads.wait_for_search_finished(); + threads.set(numaContext.get_numa_config(), {options, threads, tt, sharedHists, networks}, + updateContext); + + // Reallocate the hash with the new threadpool size + set_tt_size(options["Hash"]); + threads.ensure_network_replicated(); +} + +void Engine::set_tt_size(size_t mb) { + wait_for_search_finished(); + tt.resize(mb, threads); +} + +void Engine::set_ponderhit(bool b) { threads.main_manager()->ponder = b; } + +// network related + +void Engine::verify_networks() const { + networks->big.verify(options["EvalFile"], onVerifyNetworks); + networks->small.verify(options["EvalFileSmall"], onVerifyNetworks); + + auto statuses = networks.get_status_and_errors(); + for (size_t i = 0; i < statuses.size(); ++i) + { + const auto [status, error] = statuses[i]; + std::string message = "Network replica " + std::to_string(i + 1) + ": "; + if (status == SystemWideSharedConstantAllocationStatus::NoAllocation) + { + message += "No allocation."; + } + else if (status == SystemWideSharedConstantAllocationStatus::LocalMemory) + { + message += "Local memory."; + } + else if (status == SystemWideSharedConstantAllocationStatus::SharedMemory) + { + message += "Shared memory."; + } + else + { + message += "Unknown status."; + } + + if (error.has_value()) + { + message += " " + *error; + } + + onVerifyNetworks(message); + } +} + +std::unique_ptr Engine::get_default_networks() const { + + auto 
networks_ = + std::make_unique(NN::EvalFile{EvalFileDefaultNameBig, "None", ""}, + NN::EvalFile{EvalFileDefaultNameSmall, "None", ""}); + + networks_->big.load(binaryDirectory, ""); + networks_->small.load(binaryDirectory, ""); + + return networks_; +} + +void Engine::load_big_network(const std::string& file) { + networks.modify_and_replicate( + [this, &file](NN::Networks& networks_) { networks_.big.load(binaryDirectory, file); }); + threads.clear(); + threads.ensure_network_replicated(); +} + +void Engine::load_small_network(const std::string& file) { + networks.modify_and_replicate( + [this, &file](NN::Networks& networks_) { networks_.small.load(binaryDirectory, file); }); + threads.clear(); + threads.ensure_network_replicated(); +} + +void Engine::save_network(const std::pair, std::string> files[2]) { + networks.modify_and_replicate([&files](NN::Networks& networks_) { + networks_.big.save(files[0].first); + networks_.small.save(files[1].first); + }); +} + +// utility functions + +void Engine::trace_eval() const { + StateListPtr trace_states(new std::deque(1)); + Position p; + p.set(pos.fen(), options["UCI_Chess960"], &trace_states->back()); + + verify_networks(); + + sync_cout << "\n" << Eval::trace(p, *networks) << sync_endl; +} + +const OptionsMap& Engine::get_options() const { return options; } +OptionsMap& Engine::get_options() { return options; } + +std::string Engine::fen() const { return pos.fen(); } + +void Engine::flip() { pos.flip(); } + +std::string Engine::visualize() const { + std::stringstream ss; + ss << pos; + return ss.str(); +} + +int Engine::get_hashfull(int maxAge) const { return tt.hashfull(maxAge); } + +std::vector> Engine::get_bound_thread_count_by_numa_node() const { + auto counts = threads.get_bound_thread_count_by_numa_node(); + const NumaConfig& cfg = numaContext.get_numa_config(); + std::vector> ratios; + NumaIndex n = 0; + for (; n < counts.size(); ++n) + ratios.emplace_back(counts[n], cfg.num_cpus_in_numa_node(n)); + if 
(!counts.empty()) + for (; n < cfg.num_numa_nodes(); ++n) + ratios.emplace_back(0, cfg.num_cpus_in_numa_node(n)); + return ratios; +} + +std::string Engine::get_numa_config_as_string() const { + return numaContext.get_numa_config().to_string(); +} + +std::string Engine::numa_config_information_as_string() const { + auto cfgStr = get_numa_config_as_string(); + return "Available processors: " + cfgStr; +} + +std::string Engine::thread_binding_information_as_string() const { + auto boundThreadsByNode = get_bound_thread_count_by_numa_node(); + std::stringstream ss; + if (boundThreadsByNode.empty()) + return ss.str(); + + bool isFirst = true; + + for (auto&& [current, total] : boundThreadsByNode) + { + if (!isFirst) + ss << ":"; + ss << current << "/" << total; + isFirst = false; + } + + return ss.str(); +} + +std::string Engine::thread_allocation_information_as_string() const { + std::stringstream ss; + + size_t threadsSize = threads.size(); + ss << "Using " << threadsSize << (threadsSize > 1 ? " threads" : " thread"); + + auto boundThreadsByNodeStr = thread_binding_information_as_string(); + if (boundThreadsByNodeStr.empty()) + return ss.str(); + + ss << " with NUMA node thread binding: "; + ss << boundThreadsByNodeStr; + + return ss.str(); +} +} diff --git a/src/engine.h b/src/engine.h new file mode 100644 index 0000000000000000000000000000000000000000..92d6282dcd6fb56dffd59279e1bd1ee4be283b86 --- /dev/null +++ b/src/engine.h @@ -0,0 +1,134 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ENGINE_H_INCLUDED +#define ENGINE_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "history.h" +#include "nnue/network.h" +#include "numa.h" +#include "position.h" +#include "search.h" +#include "syzygy/tbprobe.h" // for Stockfish::Depth +#include "thread.h" +#include "tt.h" +#include "ucioption.h" + +namespace Stockfish { + +class Engine { + public: + using InfoShort = Search::InfoShort; + using InfoFull = Search::InfoFull; + using InfoIter = Search::InfoIteration; + + Engine(std::optional path = std::nullopt); + + // Cannot be movable due to components holding backreferences to fields + Engine(const Engine&) = delete; + Engine(Engine&&) = delete; + Engine& operator=(const Engine&) = delete; + Engine& operator=(Engine&&) = delete; + + ~Engine() { wait_for_search_finished(); } + + std::uint64_t perft(const std::string& fen, Depth depth, bool isChess960); + + // non blocking call to start searching + void go(Search::LimitsType&); + // non blocking call to stop searching + void stop(); + + // blocking call to wait for search to finish + void wait_for_search_finished(); + // set a new position, moves are in UCI format + void set_position(const std::string& fen, const std::vector& moves); + + // modifiers + + void set_numa_config_from_option(const std::string& o); + void resize_threads(); + void set_tt_size(size_t mb); + void set_ponderhit(bool); + void search_clear(); + + void set_on_update_no_moves(std::function&&); + void set_on_update_full(std::function&&); + void set_on_iter(std::function&&); + void 
set_on_bestmove(std::function&&); + void set_on_verify_networks(std::function&&); + + // network related + + void verify_networks() const; + std::unique_ptr get_default_networks() const; + void load_big_network(const std::string& file); + void load_small_network(const std::string& file); + void save_network(const std::pair, std::string> files[2]); + + // utility functions + + void trace_eval() const; + + const OptionsMap& get_options() const; + OptionsMap& get_options(); + + int get_hashfull(int maxAge = 0) const; + + std::string fen() const; + void flip(); + std::string visualize() const; + std::vector> get_bound_thread_count_by_numa_node() const; + std::string get_numa_config_as_string() const; + std::string numa_config_information_as_string() const; + std::string thread_allocation_information_as_string() const; + std::string thread_binding_information_as_string() const; + + private: + const std::string binaryDirectory; + + NumaReplicationContext numaContext; + + Position pos; + StateListPtr states; + + OptionsMap options; + ThreadPool threads; + TranspositionTable tt; + LazyNumaReplicatedSystemWide networks; + + Search::SearchManager::UpdateContext updateContext; + std::function onVerifyNetworks; + std::map sharedHists; +}; + +} // namespace Stockfish + + +#endif // #ifndef ENGINE_H_INCLUDED diff --git a/src/evaluate.cpp b/src/evaluate.cpp new file mode 100644 index 0000000000000000000000000000000000000000..745bd3e4d56f114737a726f967f5942727f3fe9b --- /dev/null +++ b/src/evaluate.cpp @@ -0,0 +1,124 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "evaluate.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nnue/network.h" +#include "nnue/nnue_misc.h" +#include "position.h" +#include "types.h" +#include "uci.h" +#include "nnue/nnue_accumulator.h" + +namespace Stockfish { + +// Returns a static, purely materialistic evaluation of the position from +// the point of view of the side to move. It can be divided by PawnValue to get +// an approximation of the material advantage on the board in terms of pawns. +int Eval::simple_eval(const Position& pos) { + Color c = pos.side_to_move(); + return PawnValue * (pos.count(c) - pos.count(~c)) + pos.non_pawn_material(c) + - pos.non_pawn_material(~c); +} + +bool Eval::use_smallnet(const Position& pos) { return std::abs(simple_eval(pos)) > 962; } + +// Evaluate is the evaluator for the outer world. It returns a static evaluation +// of the position from the point of view of the side to move. +Value Eval::evaluate(const Eval::NNUE::Networks& networks, + const Position& pos, + Eval::NNUE::AccumulatorStack& accumulators, + Eval::NNUE::AccumulatorCaches& caches, + int optimism) { + + assert(!pos.checkers()); + + bool smallNet = use_smallnet(pos); + auto [psqt, positional] = smallNet ? 
networks.small.evaluate(pos, accumulators, caches.small) + : networks.big.evaluate(pos, accumulators, caches.big); + + Value nnue = (125 * psqt + 131 * positional) / 128; + + // Re-evaluate the position when higher eval accuracy is worth the time spent + if (smallNet && (std::abs(nnue) < 277)) + { + std::tie(psqt, positional) = networks.big.evaluate(pos, accumulators, caches.big); + nnue = (125 * psqt + 131 * positional) / 128; + smallNet = false; + } + + // Blend optimism and eval with nnue complexity + int nnueComplexity = std::abs(psqt - positional); + optimism += optimism * nnueComplexity / 476; + nnue -= nnue * nnueComplexity / 18236; + + int material = 534 * pos.count() + pos.non_pawn_material(); + int v = (nnue * (77871 + material) + optimism * (7191 + material)) / 77871; + + // Damp down the evaluation linearly when shuffling + v -= v * pos.rule50_count() / 199; + + // Guarantee evaluation does not hit the tablebase range + v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1); + + return v; +} + +// Like evaluate(), but instead of returning a value, it returns +// a string (suitable for outputting to stdout) that contains the detailed +// descriptions and values of each evaluation term. Useful for debugging. +// Trace scores are from white's point of view +std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) { + + if (pos.checkers()) + return "Final evaluation: none (in check)"; + + auto accumulators = std::make_unique(); + auto caches = std::make_unique(networks); + + std::stringstream ss; + ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2); + ss << '\n' << NNUE::trace(pos, networks, *caches) << '\n'; + + ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15); + + auto [psqt, positional] = networks.big.evaluate(pos, *accumulators, caches->big); + Value v = psqt + positional; + v = pos.side_to_move() == WHITE ? 
v : -v; + ss << "NNUE evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)\n"; + + v = evaluate(networks, pos, *accumulators, *caches, VALUE_ZERO); + v = pos.side_to_move() == WHITE ? v : -v; + ss << "Final evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)"; + ss << " [with scaled NNUE, ...]"; + ss << "\n"; + + return ss.str(); +} + +} // namespace Stockfish diff --git a/src/evaluate.h b/src/evaluate.h new file mode 100644 index 0000000000000000000000000000000000000000..4af7093e0ac7737546bbac45fff2bb0bfd7811c8 --- /dev/null +++ b/src/evaluate.h @@ -0,0 +1,58 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef EVALUATE_H_INCLUDED +#define EVALUATE_H_INCLUDED + +#include + +#include "types.h" + +namespace Stockfish { + +class Position; + +namespace Eval { + +// The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue +// for the build process (profile-build and fishtest) to work. Do not change the +// name of the macro or the location where this macro is defined, as it is used +// in the Makefile/Fishtest. 
+#define EvalFileDefaultNameBig "nn-9a0cc2a62c52.nnue" +#define EvalFileDefaultNameSmall "nn-47fc8b7fff06.nnue" + +namespace NNUE { +struct Networks; +struct AccumulatorCaches; +class AccumulatorStack; +} + +std::string trace(Position& pos, const Eval::NNUE::Networks& networks); + +int simple_eval(const Position& pos); +bool use_smallnet(const Position& pos); +Value evaluate(const NNUE::Networks& networks, + const Position& pos, + Eval::NNUE::AccumulatorStack& accumulators, + Eval::NNUE::AccumulatorCaches& caches, + int optimism); +} // namespace Eval + +} // namespace Stockfish + +#endif // #ifndef EVALUATE_H_INCLUDED diff --git a/src/history.h b/src/history.h new file mode 100644 index 0000000000000000000000000000000000000000..c98a7ee223b108fabfe066a6d5dcba6ab17441c6 --- /dev/null +++ b/src/history.h @@ -0,0 +1,273 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef HISTORY_H_INCLUDED +#define HISTORY_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: keep + +#include "memory.h" +#include "misc.h" +#include "position.h" + +namespace Stockfish { + +constexpr int PAWN_HISTORY_BASE_SIZE = 8192; // has to be a power of 2 +constexpr int UINT_16_HISTORY_SIZE = std::numeric_limits::max() + 1; +constexpr int CORRHIST_BASE_SIZE = UINT_16_HISTORY_SIZE; +constexpr int CORRECTION_HISTORY_LIMIT = 1024; +constexpr int LOW_PLY_HISTORY_SIZE = 5; + +static_assert((PAWN_HISTORY_BASE_SIZE & (PAWN_HISTORY_BASE_SIZE - 1)) == 0, + "PAWN_HISTORY_BASE_SIZE has to be a power of 2"); + +static_assert((CORRHIST_BASE_SIZE & (CORRHIST_BASE_SIZE - 1)) == 0, + "CORRHIST_BASE_SIZE has to be a power of 2"); + +// StatsEntry is the container of various numerical statistics. We use a class +// instead of a naked value to directly call history update operator<<() on +// the entry. The first template parameter T is the base type of the array, +// and the second template parameter D limits the range of updates in [-D, D] +// when we update values with the << operator +template +struct StatsEntry { + static_assert(std::is_arithmetic_v, "Not an arithmetic type"); + + private: + std::conditional_t, T> entry; + + public: + void operator=(const T& v) { + if constexpr (Atomic) + entry.store(v, std::memory_order_relaxed); + else + entry = v; + } + + operator T() const { + if constexpr (Atomic) + return entry.load(std::memory_order_relaxed); + else + return entry; + } + + void operator<<(int bonus) { + // Make sure that bonus is in range [-D, D] + int clampedBonus = std::clamp(bonus, -D, D); + T val = *this; + *this = val + clampedBonus - val * std::abs(clampedBonus) / D; + + assert(std::abs(T(*this)) <= D); + } +}; + +enum StatsType { + NoCaptures, + Captures +}; + +template +using Stats = MultiArray, Sizes...>; + +template +using AtomicStats = MultiArray, Sizes...>; + +// DynStats is a 
dynamically sized array of Stats, used for thread-shared histories +// which should scale with the total number of threads. The SizeMultiplier gives +// the per-thread allocation count of T. +template +struct DynStats { + explicit DynStats(size_t s) { + size = s * SizeMultiplier; + data = make_unique_large_page(size); + } + // Sets all values in the range to 0 + void clear_range(int value, size_t threadIdx, size_t numaTotal) { + size_t start = uint64_t(threadIdx) * size / numaTotal; + assert(start < size); + size_t end = threadIdx + 1 == numaTotal ? size : uint64_t(threadIdx + 1) * size / numaTotal; + + while (start < end) + data[start++].fill(value); + } + size_t get_size() const { return size; } + T& operator[](size_t index) { + assert(index < size); + return data.get()[index]; + } + const T& operator[](size_t index) const { + assert(index < size); + return data.get()[index]; + } + + private: + size_t size; + LargePagePtr data; +}; + +// ButterflyHistory records how often quiet moves have been successful or unsuccessful +// during the current search, and is used for reduction and move ordering decisions. +// It uses 2 tables (one for each color) indexed by the move's from and to squares, +// see https://www.chessprogramming.org/Butterfly_Boards +using ButterflyHistory = Stats; + +// LowPlyHistory is addressed by ply and move's from and to squares, used +// to improve move ordering near the root +using LowPlyHistory = Stats; + +// CapturePieceToHistory is addressed by a move's [piece][to][captured piece type] +using CapturePieceToHistory = Stats; + +// PieceToHistory is like ButterflyHistory but is addressed by a move's [piece][to] +using PieceToHistory = Stats; + +// ContinuationHistory is the combined history of a given pair of moves, usually +// the current one given a previous one. The nested history table is based on +// PieceToHistory instead of ButterflyBoards. 
+using ContinuationHistory = MultiArray; + +// PawnHistory is addressed by the pawn structure and a move's [piece][to] +using PawnHistory = + DynStats, PAWN_HISTORY_BASE_SIZE>; + +// Correction histories record differences between the static evaluation of +// positions and their search score. It is used to improve the static evaluation +// used by some search heuristics. +// see https://www.chessprogramming.org/Static_Evaluation_Correction_History +enum CorrHistType { + Pawn, // By color and pawn structure + Minor, // By color and positions of minor pieces (Knight, Bishop) + NonPawn, // By non-pawn material positions and color + PieceTo, // By [piece][to] move + Continuation, // Combined history of move pairs +}; + +template +struct CorrectionBundle { + StatsEntry pawn; + StatsEntry minor; + StatsEntry nonPawnWhite; + StatsEntry nonPawnBlack; + + void operator=(T val) { + pawn = val; + minor = val; + nonPawnWhite = val; + nonPawnBlack = val; + } +}; + +namespace Detail { + +template +struct CorrHistTypedef { + using type = + DynStats, CORRHIST_BASE_SIZE>; +}; + +template<> +struct CorrHistTypedef { + using type = Stats; +}; + +template<> +struct CorrHistTypedef { + using type = MultiArray::type, PIECE_NB, SQUARE_NB>; +}; + +template<> +struct CorrHistTypedef { + using type = DynStats, + CORRHIST_BASE_SIZE>; +}; + +} + +using UnifiedCorrectionHistory = + DynStats, COLOR_NB>, + CORRHIST_BASE_SIZE>; + +template +using CorrectionHistory = typename Detail::CorrHistTypedef::type; + +using TTMoveHistory = StatsEntry; + +// Set of histories shared between groups of threads. To avoid excessive +// cross-node data transfer, histories are shared only between threads +// on a given NUMA node. The passed size must be a power of two to make +// the indexing more efficient. 
+struct SharedHistories { + SharedHistories(size_t threadCount) : + correctionHistory(threadCount), + pawnHistory(threadCount) { + assert((threadCount & (threadCount - 1)) == 0 && threadCount != 0); + sizeMinus1 = correctionHistory.get_size() - 1; + pawnHistSizeMinus1 = pawnHistory.get_size() - 1; + } + + size_t get_size() const { return sizeMinus1 + 1; } + + auto& pawn_entry(const Position& pos) { + return pawnHistory[pos.pawn_key() & pawnHistSizeMinus1]; + } + const auto& pawn_entry(const Position& pos) const { + return pawnHistory[pos.pawn_key() & pawnHistSizeMinus1]; + } + + auto& pawn_correction_entry(const Position& pos) { + return correctionHistory[pos.pawn_key() & sizeMinus1]; + } + const auto& pawn_correction_entry(const Position& pos) const { + return correctionHistory[pos.pawn_key() & sizeMinus1]; + } + + auto& minor_piece_correction_entry(const Position& pos) { + return correctionHistory[pos.minor_piece_key() & sizeMinus1]; + } + const auto& minor_piece_correction_entry(const Position& pos) const { + return correctionHistory[pos.minor_piece_key() & sizeMinus1]; + } + + template + auto& nonpawn_correction_entry(const Position& pos) { + return correctionHistory[pos.non_pawn_key(c) & sizeMinus1]; + } + template + const auto& nonpawn_correction_entry(const Position& pos) const { + return correctionHistory[pos.non_pawn_key(c) & sizeMinus1]; + } + + UnifiedCorrectionHistory correctionHistory; + PawnHistory pawnHistory; + + + private: + size_t sizeMinus1, pawnHistSizeMinus1; +}; + +} // namespace Stockfish + +#endif // #ifndef HISTORY_H_INCLUDED diff --git a/src/incbin/UNLICENCE b/src/incbin/UNLICENCE new file mode 100644 index 0000000000000000000000000000000000000000..32484ab5e7026f9a1f15c2f8c08b1418802e02a8 --- /dev/null +++ b/src/incbin/UNLICENCE @@ -0,0 +1,26 @@ +The file "incbin.h" is free and unencumbered software released into +the public domain by Dale Weiler, see: + + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this 
software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to diff --git a/src/incbin/incbin.h b/src/incbin/incbin.h new file mode 100644 index 0000000000000000000000000000000000000000..3f662e15dad1864dc6031c6491a21a6815994f1a --- /dev/null +++ b/src/incbin/incbin.h @@ -0,0 +1,476 @@ +/** + * @file incbin.h + * @author Dale Weiler + * @brief Utility for including binary files + * + * Facilities for including binary files into the current translation unit and + * making use from them externally in other translation units. 
+ */ +#ifndef INCBIN_HDR +#define INCBIN_HDR +#include +#if defined(__AVX512BW__) || \ + defined(__AVX512CD__) || \ + defined(__AVX512DQ__) || \ + defined(__AVX512ER__) || \ + defined(__AVX512PF__) || \ + defined(__AVX512VL__) || \ + defined(__AVX512F__) +# define INCBIN_ALIGNMENT_INDEX 6 +#elif defined(__AVX__) || \ + defined(__AVX2__) +# define INCBIN_ALIGNMENT_INDEX 5 +#elif defined(__SSE__) || \ + defined(__SSE2__) || \ + defined(__SSE3__) || \ + defined(__SSSE3__) || \ + defined(__SSE4_1__) || \ + defined(__SSE4_2__) || \ + defined(__neon__) || \ + defined(__ARM_NEON) || \ + defined(__ALTIVEC__) +# define INCBIN_ALIGNMENT_INDEX 4 +#elif ULONG_MAX != 0xffffffffu +# define INCBIN_ALIGNMENT_INDEX 3 +# else +# define INCBIN_ALIGNMENT_INDEX 2 +#endif + +/* Lookup table of (1 << n) where `n' is `INCBIN_ALIGNMENT_INDEX' */ +#define INCBIN_ALIGN_SHIFT_0 1 +#define INCBIN_ALIGN_SHIFT_1 2 +#define INCBIN_ALIGN_SHIFT_2 4 +#define INCBIN_ALIGN_SHIFT_3 8 +#define INCBIN_ALIGN_SHIFT_4 16 +#define INCBIN_ALIGN_SHIFT_5 32 +#define INCBIN_ALIGN_SHIFT_6 64 + +/* Actual alignment value */ +#define INCBIN_ALIGNMENT \ + INCBIN_CONCATENATE( \ + INCBIN_CONCATENATE(INCBIN_ALIGN_SHIFT, _), \ + INCBIN_ALIGNMENT_INDEX) + +/* Stringize */ +#define INCBIN_STR(X) \ + #X +#define INCBIN_STRINGIZE(X) \ + INCBIN_STR(X) +/* Concatenate */ +#define INCBIN_CAT(X, Y) \ + X ## Y +#define INCBIN_CONCATENATE(X, Y) \ + INCBIN_CAT(X, Y) +/* Deferred macro expansion */ +#define INCBIN_EVAL(X) \ + X +#define INCBIN_INVOKE(N, ...) \ + INCBIN_EVAL(N(__VA_ARGS__)) +/* Variable argument count for overloading by arity */ +#define INCBIN_VA_ARG_COUNTER(_1, _2, _3, N, ...) N +#define INCBIN_VA_ARGC(...) 
INCBIN_VA_ARG_COUNTER(__VA_ARGS__, 3, 2, 1, 0) + +/* Green Hills uses a different directive for including binary data */ +#if defined(__ghs__) +# if (__ghs_asm == 2) +# define INCBIN_MACRO ".file" +/* Or consider the ".myrawdata" entry in the ld file */ +# else +# define INCBIN_MACRO "\tINCBIN" +# endif +#else +# define INCBIN_MACRO ".incbin" +#endif + +#ifndef _MSC_VER +# define INCBIN_ALIGN \ + __attribute__((aligned(INCBIN_ALIGNMENT))) +#else +# define INCBIN_ALIGN __declspec(align(INCBIN_ALIGNMENT)) +#endif + +#if defined(__arm__) || /* GNU C and RealView */ \ + defined(__arm) || /* Diab */ \ + defined(_ARM) /* ImageCraft */ +# define INCBIN_ARM +#endif + +#ifdef __GNUC__ +/* Utilize .balign where supported */ +# define INCBIN_ALIGN_HOST ".balign " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n" +# define INCBIN_ALIGN_BYTE ".balign 1\n" +#elif defined(INCBIN_ARM) +/* + * On arm assemblers, the alignment value is calculated as (1 << n) where `n' is + * the shift count. This is the value passed to `.align' + */ +# define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT_INDEX) "\n" +# define INCBIN_ALIGN_BYTE ".align 0\n" +#else +/* We assume other inline assembler's treat `.align' as `.balign' */ +# define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n" +# define INCBIN_ALIGN_BYTE ".align 1\n" +#endif + +/* INCBIN_CONST is used by incbin.c generated files */ +#if defined(__cplusplus) +# define INCBIN_EXTERNAL extern "C" +# define INCBIN_CONST extern const +#else +# define INCBIN_EXTERNAL extern +# define INCBIN_CONST const +#endif + +/** + * @brief Optionally override the linker section into which size and data is + * emitted. + * + * @warning If you use this facility, you might have to deal with + * platform-specific linker output section naming on your own. 
+ */ +#if !defined(INCBIN_OUTPUT_SECTION) +# if defined(__APPLE__) +# define INCBIN_OUTPUT_SECTION ".const_data" +# else +# define INCBIN_OUTPUT_SECTION ".rodata" +# endif +#endif + +/** + * @brief Optionally override the linker section into which data is emitted. + * + * @warning If you use this facility, you might have to deal with + * platform-specific linker output section naming on your own. + */ +#if !defined(INCBIN_OUTPUT_DATA_SECTION) +# define INCBIN_OUTPUT_DATA_SECTION INCBIN_OUTPUT_SECTION +#endif + +/** + * @brief Optionally override the linker section into which size is emitted. + * + * @warning If you use this facility, you might have to deal with + * platform-specific linker output section naming on your own. + * + * @note This is useful for Harvard architectures where program memory cannot + * be directly read from the program without special instructions. With this you + * can chose to put the size variable in RAM rather than ROM. + */ +#if !defined(INCBIN_OUTPUT_SIZE_SECTION) +# define INCBIN_OUTPUT_SIZE_SECTION INCBIN_OUTPUT_SECTION +#endif + +#if defined(__APPLE__) +# include "TargetConditionals.h" +# if defined(TARGET_OS_IPHONE) && !defined(INCBIN_SILENCE_BITCODE_WARNING) +# warning "incbin is incompatible with bitcode. Using the library will break upload to App Store if you have bitcode enabled. Add `#define INCBIN_SILENCE_BITCODE_WARNING` before including this header to silence this warning." +# endif +/* The directives are different for Apple branded compilers */ +# define INCBIN_SECTION INCBIN_OUTPUT_SECTION "\n" +# define INCBIN_GLOBAL(NAME) ".globl " INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n" +# define INCBIN_INT ".long " +# define INCBIN_MANGLE "_" +# define INCBIN_BYTE ".byte " +# define INCBIN_TYPE(...) 
+#else +# define INCBIN_SECTION ".section " INCBIN_OUTPUT_SECTION "\n" +# define INCBIN_GLOBAL(NAME) ".global " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n" +# if defined(__ghs__) +# define INCBIN_INT ".word " +# else +# define INCBIN_INT ".int " +# endif +# if defined(__USER_LABEL_PREFIX__) +# define INCBIN_MANGLE INCBIN_STRINGIZE(__USER_LABEL_PREFIX__) +# else +# define INCBIN_MANGLE "" +# endif +# if defined(INCBIN_ARM) +/* On arm assemblers, `@' is used as a line comment token */ +# define INCBIN_TYPE(NAME) ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", %object\n" +# elif defined(__MINGW32__) || defined(__MINGW64__) +/* Mingw doesn't support this directive either */ +# define INCBIN_TYPE(NAME) +# else +/* It's safe to use `@' on other architectures */ +# define INCBIN_TYPE(NAME) ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", @object\n" +# endif +# define INCBIN_BYTE ".byte " +#endif + +/* List of style types used for symbol names */ +#define INCBIN_STYLE_CAMEL 0 +#define INCBIN_STYLE_SNAKE 1 + +/** + * @brief Specify the prefix to use for symbol names. + * + * @note By default this is "g". + * + * @code + * #define INCBIN_PREFIX incbin + * #include "incbin.h" + * INCBIN(Foo, "foo.txt"); + * + * // Now you have the following symbols instead: + * // const unsigned char incbinFoo[]; + * // const unsigned char *const incbinFoo; + * // const unsigned int incbinFoo; + * @endcode + */ +#if !defined(INCBIN_PREFIX) +# define INCBIN_PREFIX g +#endif + +/** + * @brief Specify the style used for symbol names. 
+ * + * Possible options are + * - INCBIN_STYLE_CAMEL "CamelCase" + * - INCBIN_STYLE_SNAKE "snake_case" + * + * @note By default this is INCBIN_STYLE_CAMEL + * + * @code + * #define INCBIN_STYLE INCBIN_STYLE_SNAKE + * #include "incbin.h" + * INCBIN(foo, "foo.txt"); + * + * // Now you have the following symbols: + * // const unsigned char foo_data[]; + * // const unsigned char *const foo_end; + * // const unsigned int foo_size; + * @endcode + */ +#if !defined(INCBIN_STYLE) +# define INCBIN_STYLE INCBIN_STYLE_CAMEL +#endif + +/* Style lookup tables */ +#define INCBIN_STYLE_0_DATA Data +#define INCBIN_STYLE_0_END End +#define INCBIN_STYLE_0_SIZE Size +#define INCBIN_STYLE_1_DATA _data +#define INCBIN_STYLE_1_END _end +#define INCBIN_STYLE_1_SIZE _size + +/* Style lookup: returning identifier */ +#define INCBIN_STYLE_IDENT(TYPE) \ + INCBIN_CONCATENATE( \ + INCBIN_STYLE_, \ + INCBIN_CONCATENATE( \ + INCBIN_EVAL(INCBIN_STYLE), \ + INCBIN_CONCATENATE(_, TYPE))) + +/* Style lookup: returning string literal */ +#define INCBIN_STYLE_STRING(TYPE) \ + INCBIN_STRINGIZE( \ + INCBIN_STYLE_IDENT(TYPE)) \ + +/* Generate the global labels by indirectly invoking the macro with our style + * type and concatenating the name against them. */ +#define INCBIN_GLOBAL_LABELS(NAME, TYPE) \ + INCBIN_INVOKE( \ + INCBIN_GLOBAL, \ + INCBIN_CONCATENATE( \ + NAME, \ + INCBIN_INVOKE( \ + INCBIN_STYLE_IDENT, \ + TYPE))) \ + INCBIN_INVOKE( \ + INCBIN_TYPE, \ + INCBIN_CONCATENATE( \ + NAME, \ + INCBIN_INVOKE( \ + INCBIN_STYLE_IDENT, \ + TYPE))) + +/** + * @brief Externally reference binary data included in another translation unit. + * + * Produces three external symbols that reference the binary data included in + * another translation unit. + * + * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with + * "Data", as well as "End" and "Size" after. An example is provided below. + * + * @param TYPE Optional array type. Omitting this picks a default of `unsigned char`. 
+ * @param NAME The name given for the binary data + * + * @code + * INCBIN_EXTERN(Foo); + * + * // Now you have the following symbols: + * // extern const unsigned char Foo[]; + * // extern const unsigned char *const Foo; + * // extern const unsigned int Foo; + * @endcode + * + * You may specify a custom optional data type as well as the first argument. + * @code + * INCBIN_EXTERN(custom_type, Foo); + * + * // Now you have the following symbols: + * // extern const custom_type Foo[]; + * // extern const custom_type *const Foo; + * // extern const unsigned int Foo; + * @endcode + */ +#define INCBIN_EXTERN(...) \ + INCBIN_CONCATENATE(INCBIN_EXTERN_, INCBIN_VA_ARGC(__VA_ARGS__))(__VA_ARGS__) +#define INCBIN_EXTERN_1(NAME, ...) \ + INCBIN_EXTERN_2(unsigned char, NAME) +#define INCBIN_EXTERN_2(TYPE, NAME) \ + INCBIN_EXTERNAL const INCBIN_ALIGN TYPE \ + INCBIN_CONCATENATE( \ + INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \ + INCBIN_STYLE_IDENT(DATA))[]; \ + INCBIN_EXTERNAL const INCBIN_ALIGN TYPE *const \ + INCBIN_CONCATENATE( \ + INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \ + INCBIN_STYLE_IDENT(END)); \ + INCBIN_EXTERNAL const unsigned int \ + INCBIN_CONCATENATE( \ + INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \ + INCBIN_STYLE_IDENT(SIZE)) + +/** + * @brief Externally reference textual data included in another translation unit. + * + * Produces three external symbols that reference the textual data included in + * another translation unit. + * + * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with + * "Data", as well as "End" and "Size" after. An example is provided below. 
+ * + * @param NAME The name given for the textual data + * + * @code + * INCBIN_EXTERN(Foo); + * + * // Now you have the following symbols: + * // extern const char Foo[]; + * // extern const char *const Foo; + * // extern const unsigned int Foo; + * @endcode + */ +#define INCTXT_EXTERN(NAME) \ + INCBIN_EXTERN_2(char, NAME) + +/** + * @brief Include a binary file into the current translation unit. + * + * Includes a binary file into the current translation unit, producing three symbols + * for objects that encode the data and size respectively. + * + * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with + * "Data", as well as "End" and "Size" after. An example is provided below. + * + * @param TYPE Optional array type. Omitting this picks a default of `unsigned char`. + * @param NAME The name to associate with this binary data (as an identifier.) + * @param FILENAME The file to include (as a string literal.) + * + * @code + * INCBIN(Icon, "icon.png"); + * + * // Now you have the following symbols: + * // const unsigned char Icon[]; + * // const unsigned char *const Icon; + * // const unsigned int Icon; + * @endcode + * + * You may specify a custom optional data type as well as the first argument. + * These macros are specialized by arity. + * @code + * INCBIN(custom_type, Icon, "icon.png"); + * + * // Now you have the following symbols: + * // const custom_type Icon[]; + * // const custom_type *const Icon; + * // const unsigned int Icon; + * @endcode + * + * @warning This must be used in global scope + * @warning The identifiers may be different if INCBIN_STYLE is not default + * + * To externally reference the data included by this in another translation unit + * please @see INCBIN_EXTERN. + */ +#ifdef _MSC_VER +# define INCBIN(NAME, FILENAME) \ + INCBIN_EXTERN(NAME) +#else +# define INCBIN(...) \ + INCBIN_CONCATENATE(INCBIN_, INCBIN_VA_ARGC(__VA_ARGS__))(__VA_ARGS__) +# if defined(__GNUC__) +# define INCBIN_1(...) 
_Pragma("GCC error \"Single argument INCBIN not allowed\"") +# elif defined(__clang__) +# define INCBIN_1(...) _Pragma("clang error \"Single argument INCBIN not allowed\"") +# else +# define INCBIN_1(...) /* Cannot do anything here */ +# endif +# define INCBIN_2(NAME, FILENAME) \ + INCBIN_3(unsigned char, NAME, FILENAME) +# define INCBIN_3(TYPE, NAME, FILENAME) INCBIN_COMMON(TYPE, NAME, FILENAME, /* No terminator for binary data */) +# define INCBIN_COMMON(TYPE, NAME, FILENAME, TERMINATOR) \ + __asm__(INCBIN_SECTION \ + INCBIN_GLOBAL_LABELS(NAME, DATA) \ + INCBIN_ALIGN_HOST \ + INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) ":\n" \ + INCBIN_MACRO " \"" FILENAME "\"\n" \ + TERMINATOR \ + INCBIN_GLOBAL_LABELS(NAME, END) \ + INCBIN_ALIGN_BYTE \ + INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) ":\n" \ + INCBIN_BYTE "1\n" \ + INCBIN_GLOBAL_LABELS(NAME, SIZE) \ + INCBIN_ALIGN_HOST \ + INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(SIZE) ":\n" \ + INCBIN_INT INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) " - " \ + INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) "\n" \ + INCBIN_ALIGN_HOST \ + ".text\n" \ + ); \ + INCBIN_EXTERN(TYPE, NAME) +#endif + +/** + * @brief Include a textual file into the current translation unit. + * + * This behaves the same as INCBIN except it produces char compatible arrays + * and implicitly adds a null-terminator byte, thus the size of data included + * by this is one byte larger than that of INCBIN. + * + * Includes a textual file into the current translation unit, producing three + * symbols for objects that encode the data and size respectively. + * + * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with + * "Data", as well as "End" and "Size" after. An example is provided below. + * + * @param NAME The name to associate with this binary data (as an identifier.) 
+ * @param FILENAME The file to include (as a string literal.) + * + * @code + * INCTXT(Readme, "readme.txt"); + * + * // Now you have the following symbols: + * // const char Readme[]; + * // const char *const Readme; + * // const unsigned int Readme; + * @endcode + * + * @warning This must be used in global scope + * @warning The identifiers may be different if INCBIN_STYLE is not default + * + * To externally reference the data included by this in another translation unit + * please @see INCBIN_EXTERN. + */ +#if defined(_MSC_VER) +# define INCTXT(NAME, FILENAME) \ + INCBIN_EXTERN(NAME) +#else +# define INCTXT(NAME, FILENAME) \ + INCBIN_COMMON(char, NAME, FILENAME, INCBIN_BYTE "0\n") +#endif + +#endif \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9a7376efbaf3d7fd420ff215400b08047e603ff3 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,43 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include +#include + +#include "bitboard.h" +#include "misc.h" +#include "position.h" +#include "tune.h" +#include "uci.h" + +using namespace Stockfish; + +int main(int argc, char* argv[]) { + std::cout << engine_info() << std::endl; + + Bitboards::init(); + Position::init(); + + auto uci = std::make_unique(argc, argv); + + Tune::init(uci->engine_options()); + + uci->loop(); + + return 0; +} diff --git a/src/memory.cpp b/src/memory.cpp new file mode 100644 index 0000000000000000000000000000000000000000..94a5993991b63685ca65b09d09e87f07af5cf9c1 --- /dev/null +++ b/src/memory.cpp @@ -0,0 +1,199 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "memory.h" + +#include + +#if __has_include("features.h") + #include +#endif + +#if defined(__linux__) && !defined(__ANDROID__) + #include +#endif + +#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) \ + || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) \ + || defined(__e2k__) + #define POSIXALIGNEDALLOC + #include +#endif + +#ifdef _WIN32 + #if _WIN32_WINNT < 0x0601 + #undef _WIN32_WINNT + #define _WIN32_WINNT 0x0601 // Force to include needed API prototypes + #endif + + #ifndef NOMINMAX + #define NOMINMAX + #endif + + #include // std::hex, std::dec + #include // std::cerr + #include // std::endl + #include + +// The needed Windows API for processor groups could be missed from old Windows +// versions, so instead of calling them directly (forcing the linker to resolve +// the calls at compile time), try to load them at runtime. To do this we need +// first to define the corresponding function pointers. + +#endif + + +namespace Stockfish { + +// Wrappers for systems where the c++17 implementation does not guarantee the +// availability of aligned_alloc(). Memory allocated with std_aligned_alloc() +// must be freed with std_aligned_free(). 
+ +void* std_aligned_alloc(size_t alignment, size_t size) { +#if defined(_ISOC11_SOURCE) + return aligned_alloc(alignment, size); +#elif defined(POSIXALIGNEDALLOC) + void* mem = nullptr; + posix_memalign(&mem, alignment, size); + return mem; +#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64) + return _mm_malloc(size, alignment); +#elif defined(_WIN32) + return _aligned_malloc(size, alignment); +#else + return std::aligned_alloc(alignment, size); +#endif +} + +void std_aligned_free(void* ptr) { + +#if defined(POSIXALIGNEDALLOC) + free(ptr); +#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64) + _mm_free(ptr); +#elif defined(_WIN32) + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +// aligned_large_pages_alloc() will return suitably aligned memory, +// if possible using large pages. + +#if defined(_WIN32) + +static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize) { + + return windows_try_with_large_page_priviliges( + [&](size_t largePageSize) { + // Round up size to full pages and allocate + allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1); + return VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, + PAGE_READWRITE); + }, + []() { return (void*) nullptr; }); +} + +void* aligned_large_pages_alloc(size_t allocSize) { + + // Try to allocate large pages + void* mem = aligned_large_pages_alloc_windows(allocSize); + + // Fall back to regular, page-aligned, allocation if necessary + if (!mem) + mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + + return mem; +} + +#else + +void* aligned_large_pages_alloc(size_t allocSize) { + + #if defined(__linux__) + constexpr size_t alignment = 2 * 1024 * 1024; // 2MB page size assumed + #else + constexpr size_t alignment = 4096; // small page size assumed + #endif + + // Round up to multiples of alignment + size_t size = ((allocSize + alignment - 1) / alignment) * alignment; + void* mem = 
std_aligned_alloc(alignment, size); + #if defined(MADV_HUGEPAGE) + madvise(mem, size, MADV_HUGEPAGE); + #endif + return mem; +} + +#endif + +bool has_large_pages() { + +#if defined(_WIN32) + + constexpr size_t page_size = 2 * 1024 * 1024; // 2MB page size assumed + void* mem = aligned_large_pages_alloc_windows(page_size); + if (mem == nullptr) + { + return false; + } + else + { + aligned_large_pages_free(mem); + return true; + } + +#elif defined(__linux__) + + #if defined(MADV_HUGEPAGE) + return true; + #else + return false; + #endif + +#else + + return false; + +#endif +} + + +// aligned_large_pages_free() will free the previously memory allocated +// by aligned_large_pages_alloc(). The effect is a nop if mem == nullptr. + +#if defined(_WIN32) + +void aligned_large_pages_free(void* mem) { + + if (mem && !VirtualFree(mem, 0, MEM_RELEASE)) + { + DWORD err = GetLastError(); + std::cerr << "Failed to free large page memory. Error code: 0x" << std::hex << err + << std::dec << std::endl; + exit(EXIT_FAILURE); + } +} + +#else + +void aligned_large_pages_free(void* mem) { std_aligned_free(mem); } + +#endif +} // namespace Stockfish diff --git a/src/memory.h b/src/memory.h new file mode 100644 index 0000000000000000000000000000000000000000..056b07c6d16593a634e53a6621a343bd43aea9f6 --- /dev/null +++ b/src/memory.h @@ -0,0 +1,333 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef MEMORY_H_INCLUDED +#define MEMORY_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" + +#if defined(_WIN64) + + #if _WIN32_WINNT < 0x0601 + #undef _WIN32_WINNT + #define _WIN32_WINNT 0x0601 // Force to include needed API prototypes + #endif + + #if !defined(NOMINMAX) + #define NOMINMAX + #endif + #include + + // Some Windows headers (RPC/old headers) define short macros such + // as 'small' expanding to 'char', which breaks identifiers in the code. + // Undefine those macros immediately after including . + #ifdef small + #undef small + #endif + + #include + +extern "C" { +using OpenProcessToken_t = bool (*)(HANDLE, DWORD, PHANDLE); +using LookupPrivilegeValueA_t = bool (*)(LPCSTR, LPCSTR, PLUID); +using AdjustTokenPrivileges_t = + bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD); +} +#endif + + +namespace Stockfish { + +void* std_aligned_alloc(size_t alignment, size_t size); +void std_aligned_free(void* ptr); + +// Memory aligned by page size, min alignment: 4096 bytes +void* aligned_large_pages_alloc(size_t size); +void aligned_large_pages_free(void* mem); + +bool has_large_pages(); + +// Frees memory which was placed there with placement new. +// Works for both single objects and arrays of unknown bound. +template +void memory_deleter(T* ptr, FREE_FUNC free_func) { + if (!ptr) + return; + + // Explicitly needed to call the destructor + if constexpr (!std::is_trivially_destructible_v) + ptr->~T(); + + free_func(ptr); +} + +// Frees memory which was placed there with placement new. +// Works for both single objects and arrays of unknown bound. 
+template +void memory_deleter_array(T* ptr, FREE_FUNC free_func) { + if (!ptr) + return; + + + // Move back on the pointer to where the size is allocated + const size_t array_offset = std::max(sizeof(size_t), alignof(T)); + char* raw_memory = reinterpret_cast(ptr) - array_offset; + + if constexpr (!std::is_trivially_destructible_v) + { + const size_t size = *reinterpret_cast(raw_memory); + + // Explicitly call the destructor for each element in reverse order + for (size_t i = size; i-- > 0;) + ptr[i].~T(); + } + + free_func(raw_memory); +} + +// Allocates memory for a single object and places it there with placement new +template +inline std::enable_if_t, T*> memory_allocator(ALLOC_FUNC alloc_func, + Args&&... args) { + void* raw_memory = alloc_func(sizeof(T)); + ASSERT_ALIGNED(raw_memory, alignof(T)); + return new (raw_memory) T(std::forward(args)...); +} + +// Allocates memory for an array of unknown bound and places it there with placement new +template +inline std::enable_if_t, std::remove_extent_t*> +memory_allocator(ALLOC_FUNC alloc_func, size_t num) { + using ElementType = std::remove_extent_t; + + const size_t array_offset = std::max(sizeof(size_t), alignof(ElementType)); + + // Save the array size in the memory location + char* raw_memory = + reinterpret_cast(alloc_func(array_offset + num * sizeof(ElementType))); + ASSERT_ALIGNED(raw_memory, alignof(T)); + + new (raw_memory) size_t(num); + + for (size_t i = 0; i < num; ++i) + new (raw_memory + array_offset + i * sizeof(ElementType)) ElementType(); + + // Need to return the pointer at the start of the array so that + // the indexing in unique_ptr works. 
+ return reinterpret_cast(raw_memory + array_offset); +} + +// +// +// aligned large page unique ptr +// +// + +template +struct LargePageDeleter { + void operator()(T* ptr) const { return memory_deleter(ptr, aligned_large_pages_free); } +}; + +template +struct LargePageArrayDeleter { + void operator()(T* ptr) const { return memory_deleter_array(ptr, aligned_large_pages_free); } +}; + +template +using LargePagePtr = + std::conditional_t, + std::unique_ptr>>, + std::unique_ptr>>; + +// make_unique_large_page for single objects +template +std::enable_if_t, LargePagePtr> make_unique_large_page(Args&&... args) { + static_assert(alignof(T) <= 4096, + "aligned_large_pages_alloc() may fail for such a big alignment requirement of T"); + + T* obj = memory_allocator(aligned_large_pages_alloc, std::forward(args)...); + + return LargePagePtr(obj); +} + +// make_unique_large_page for arrays of unknown bound +template +std::enable_if_t, LargePagePtr> make_unique_large_page(size_t num) { + using ElementType = std::remove_extent_t; + + static_assert(alignof(ElementType) <= 4096, + "aligned_large_pages_alloc() may fail for such a big alignment requirement of T"); + + ElementType* memory = memory_allocator(aligned_large_pages_alloc, num); + + return LargePagePtr(memory); +} + +// +// +// aligned unique ptr +// +// + +template +struct AlignedDeleter { + void operator()(T* ptr) const { return memory_deleter(ptr, std_aligned_free); } +}; + +template +struct AlignedArrayDeleter { + void operator()(T* ptr) const { return memory_deleter_array(ptr, std_aligned_free); } +}; + +template +using AlignedPtr = + std::conditional_t, + std::unique_ptr>>, + std::unique_ptr>>; + +// make_unique_aligned for single objects +template +std::enable_if_t, AlignedPtr> make_unique_aligned(Args&&... 
args) { + const auto func = [](size_t size) { return std_aligned_alloc(alignof(T), size); }; + T* obj = memory_allocator(func, std::forward(args)...); + + return AlignedPtr(obj); +} + +// make_unique_aligned for arrays of unknown bound +template +std::enable_if_t, AlignedPtr> make_unique_aligned(size_t num) { + using ElementType = std::remove_extent_t; + + const auto func = [](size_t size) { return std_aligned_alloc(alignof(ElementType), size); }; + ElementType* memory = memory_allocator(func, num); + + return AlignedPtr(memory); +} + + +// Get the first aligned element of an array. +// ptr must point to an array of size at least `sizeof(T) * N + alignment` bytes, +// where N is the number of elements in the array. +template +T* align_ptr_up(T* ptr) { + static_assert(alignof(T) < Alignment); + + const uintptr_t ptrint = reinterpret_cast(reinterpret_cast(ptr)); + return reinterpret_cast( + reinterpret_cast((ptrint + (Alignment - 1)) / Alignment * Alignment)); +} + +#if defined(_WIN32) + +template +auto windows_try_with_large_page_priviliges([[maybe_unused]] FuncYesT&& fyes, FuncNoT&& fno) { + + #if !defined(_WIN64) + return fno(); + #else + + HANDLE hProcessToken{}; + LUID luid{}; + + const size_t largePageSize = GetLargePageMinimum(); + if (!largePageSize) + return fno(); + + // Dynamically link OpenProcessToken, LookupPrivilegeValue and AdjustTokenPrivileges + + HMODULE hAdvapi32 = GetModuleHandle(TEXT("advapi32.dll")); + + if (!hAdvapi32) + hAdvapi32 = LoadLibrary(TEXT("advapi32.dll")); + + auto OpenProcessToken_f = + OpenProcessToken_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken")); + if (!OpenProcessToken_f) + return fno(); + auto LookupPrivilegeValueA_f = + LookupPrivilegeValueA_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA")); + if (!LookupPrivilegeValueA_f) + return fno(); + auto AdjustTokenPrivileges_f = + AdjustTokenPrivileges_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges")); + if (!AdjustTokenPrivileges_f) 
+ return fno(); + + // We need SeLockMemoryPrivilege, so try to enable it for the process + + if (!OpenProcessToken_f( // OpenProcessToken() + GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken)) + return fno(); + + if (!LookupPrivilegeValueA_f(nullptr, "SeLockMemoryPrivilege", &luid)) + return fno(); + + TOKEN_PRIVILEGES tp{}; + TOKEN_PRIVILEGES prevTp{}; + DWORD prevTpLen = 0; + + tp.PrivilegeCount = 1; + tp.Privileges[0].Luid = luid; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() + // succeeds, we still need to query GetLastError() to ensure that the privileges + // were actually obtained. + + if (!AdjustTokenPrivileges_f(hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, + &prevTpLen) + || GetLastError() != ERROR_SUCCESS) + return fno(); + + auto&& ret = fyes(largePageSize); + + // Privilege no longer needed, restore previous state + AdjustTokenPrivileges_f(hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr); + + CloseHandle(hProcessToken); + + return std::forward(ret); + + #endif +} + +#endif + +template +T load_as(const ByteT* buffer) { + static_assert(std::is_trivially_copyable::value, "Type must be trivially copyable"); + static_assert(sizeof(ByteT) == 1); + + T value; + std::memcpy(&value, buffer, sizeof(T)); + + return value; +} + +} // namespace Stockfish + +#endif // #ifndef MEMORY_H_INCLUDED diff --git a/src/misc.cpp b/src/misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2cb3ce5d492d981dc3da5cab9ff1a2fc18cf08d2 --- /dev/null +++ b/src/misc.cpp @@ -0,0 +1,549 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the 
License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "misc.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" + +namespace Stockfish { + +namespace { + +// Version number or dev. +constexpr std::string_view version = "dev"; + +// Our fancy logging facility. The trick here is to replace cin.rdbuf() and +// cout.rdbuf() with two Tie objects that tie cin and cout to a file stream. We +// can toggle the logging of std::cout and std::cin at runtime whilst preserving +// usual I/O functionality, all without changing a single line of code! 
+// Idea from http://groups.google.com/group/comp.lang.c++/msg/1d941c0f26ea0d81 + +struct Tie: public std::streambuf { // MSVC requires split streambuf for cin and cout + + Tie(std::streambuf* b, std::streambuf* l) : + buf(b), + logBuf(l) {} + + int sync() override { return logBuf->pubsync(), buf->pubsync(); } + int overflow(int c) override { return log(buf->sputc(char(c)), "<< "); } + int underflow() override { return buf->sgetc(); } + int uflow() override { return log(buf->sbumpc(), ">> "); } + + std::streambuf *buf, *logBuf; + + int log(int c, const char* prefix) { + + static int last = '\n'; // Single log file + + if (last == '\n') + logBuf->sputn(prefix, 3); + + return last = logBuf->sputc(char(c)); + } +}; + +class Logger { + + Logger() : + in(std::cin.rdbuf(), file.rdbuf()), + out(std::cout.rdbuf(), file.rdbuf()) {} + ~Logger() { start(""); } + + std::ofstream file; + Tie in, out; + + public: + static void start(const std::string& fname) { + + static Logger l; + + if (l.file.is_open()) + { + std::cout.rdbuf(l.out.buf); + std::cin.rdbuf(l.in.buf); + l.file.close(); + } + + if (!fname.empty()) + { + l.file.open(fname, std::ifstream::out); + + if (!l.file.is_open()) + { + std::cerr << "Unable to open debug log file " << fname << std::endl; + exit(EXIT_FAILURE); + } + + std::cin.rdbuf(&l.in); + std::cout.rdbuf(&l.out); + } + } +}; + +} // namespace + + +// Returns the full name of the current Stockfish version. +// +// For local dev compiles we try to append the commit SHA and +// commit date from git. 
If that fails only the local compilation +// date is set and "nogit" is specified: +// Stockfish dev-YYYYMMDD-SHA +// or +// Stockfish dev-YYYYMMDD-nogit +// +// For releases (non-dev builds) we only include the version number: +// Stockfish version +std::string engine_version_info() { + std::stringstream ss; + ss << "Stockfish " << version << std::setfill('0'); + + if constexpr (version == "dev") + { + ss << "-"; +#ifdef GIT_DATE + ss << stringify(GIT_DATE); +#else + constexpr std::string_view months("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec"); + + std::string month, day, year; + std::stringstream date(__DATE__); // From compiler, format is "Sep 21 2008" + + date >> month >> day >> year; + ss << year << std::setw(2) << std::setfill('0') << (1 + months.find(month) / 4) + << std::setw(2) << std::setfill('0') << day; +#endif + + ss << "-"; + +#ifdef GIT_SHA + ss << stringify(GIT_SHA); +#else + ss << "nogit"; +#endif + } + + return ss.str(); +} + +std::string engine_info(bool to_uci) { + return engine_version_info() + (to_uci ? "\nid author " : " by ") + + "the Stockfish developers (see AUTHORS file)"; +} + + +// Returns a string trying to describe the compiler we use +std::string compiler_info() { + +#define make_version_string(major, minor, patch) \ + stringify(major) "." stringify(minor) "." 
stringify(patch) + + // Predefined macros hell: + // + // __GNUC__ Compiler is GCC, Clang or ICX + // __clang__ Compiler is Clang or ICX + // __INTEL_LLVM_COMPILER Compiler is ICX + // _MSC_VER Compiler is MSVC + // _WIN32 Building on Windows (any) + // _WIN64 Building on Windows 64 bit + + std::string compiler = "\nCompiled by : "; + +#if defined(__INTEL_LLVM_COMPILER) + compiler += "ICX "; + compiler += stringify(__INTEL_LLVM_COMPILER); +#elif defined(__clang__) + compiler += "clang++ "; + compiler += make_version_string(__clang_major__, __clang_minor__, __clang_patchlevel__); +#elif _MSC_VER + compiler += "MSVC "; + compiler += "(version "; + compiler += stringify(_MSC_FULL_VER) "." stringify(_MSC_BUILD); + compiler += ")"; +#elif defined(__e2k__) && defined(__LCC__) + #define dot_ver2(n) \ + compiler += char('.'); \ + compiler += char('0' + (n) / 10); \ + compiler += char('0' + (n) % 10); + + compiler += "MCST LCC "; + compiler += "(version "; + compiler += std::to_string(__LCC__ / 100); + dot_ver2(__LCC__ % 100) dot_ver2(__LCC_MINOR__) compiler += ")"; +#elif __GNUC__ + compiler += "g++ (GNUC) "; + compiler += make_version_string(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); +#else + compiler += "Unknown compiler "; + compiler += "(unknown version)"; +#endif + +#if defined(__APPLE__) + compiler += " on Apple"; +#elif defined(__CYGWIN__) + compiler += " on Cygwin"; +#elif defined(__MINGW64__) + compiler += " on MinGW64"; +#elif defined(__MINGW32__) + compiler += " on MinGW32"; +#elif defined(__ANDROID__) + compiler += " on Android"; +#elif defined(__linux__) + compiler += " on Linux"; +#elif defined(_WIN64) + compiler += " on Microsoft Windows 64-bit"; +#elif defined(_WIN32) + compiler += " on Microsoft Windows 32-bit"; +#else + compiler += " on unknown system"; +#endif + + compiler += "\nCompilation architecture : "; +#if defined(ARCH) + compiler += stringify(ARCH); +#else + compiler += "(undefined architecture)"; +#endif + + compiler += "\nCompilation 
settings : "; + compiler += (Is64Bit ? "64bit" : "32bit"); +#if defined(USE_AVX512ICL) + compiler += " AVX512ICL"; +#endif +#if defined(USE_VNNI) + compiler += " VNNI"; +#endif +#if defined(USE_AVX512) + compiler += " AVX512"; +#endif + compiler += (HasPext ? " BMI2" : ""); +#if defined(USE_AVX2) + compiler += " AVX2"; +#endif +#if defined(USE_SSE41) + compiler += " SSE41"; +#endif +#if defined(USE_SSSE3) + compiler += " SSSE3"; +#endif +#if defined(USE_SSE2) + compiler += " SSE2"; +#endif +#if defined(USE_NEON_DOTPROD) + compiler += " NEON_DOTPROD"; +#elif defined(USE_NEON) + compiler += " NEON"; +#endif + compiler += (HasPopCnt ? " POPCNT" : ""); + +#if !defined(NDEBUG) + compiler += " DEBUG"; +#endif + + compiler += "\nCompiler __VERSION__ macro : "; +#ifdef __VERSION__ + compiler += __VERSION__; +#else + compiler += "(undefined macro)"; +#endif + + compiler += "\n"; + + return compiler; +} + + +// Debug functions used mainly to collect run-time statistics +constexpr int MaxDebugSlots = 32; + +namespace { + +template +struct DebugInfo { + std::array, N> data = {0}; + + [[nodiscard]] constexpr std::atomic& operator[](size_t index) { + assert(index < N); + return data[index]; + } + + constexpr DebugInfo& operator=(const DebugInfo& other) { + for (size_t i = 0; i < N; i++) + data[i].store(other.data[i].load()); + return *this; + } +}; + +struct DebugExtremes: public DebugInfo<3> { + DebugExtremes() { + data[1] = std::numeric_limits::min(); + data[2] = std::numeric_limits::max(); + } +}; + +std::array, MaxDebugSlots> hit; +std::array, MaxDebugSlots> mean; +std::array, MaxDebugSlots> stdev; +std::array, MaxDebugSlots> correl; +std::array extremes; + +} // namespace + +void dbg_hit_on(bool cond, int slot) { + + ++hit.at(slot)[0]; + if (cond) + ++hit.at(slot)[1]; +} + +void dbg_mean_of(int64_t value, int slot) { + + ++mean.at(slot)[0]; + mean.at(slot)[1] += value; +} + +void dbg_stdev_of(int64_t value, int slot) { + + ++stdev.at(slot)[0]; + stdev.at(slot)[1] += value; 
+ stdev.at(slot)[2] += value * value; +} + +void dbg_extremes_of(int64_t value, int slot) { + ++extremes.at(slot)[0]; + + int64_t current_max = extremes.at(slot)[1].load(); + while (current_max < value && !extremes.at(slot)[1].compare_exchange_weak(current_max, value)) + {} + + int64_t current_min = extremes.at(slot)[2].load(); + while (current_min > value && !extremes.at(slot)[2].compare_exchange_weak(current_min, value)) + {} +} + +void dbg_correl_of(int64_t value1, int64_t value2, int slot) { + + ++correl.at(slot)[0]; + correl.at(slot)[1] += value1; + correl.at(slot)[2] += value1 * value1; + correl.at(slot)[3] += value2; + correl.at(slot)[4] += value2 * value2; + correl.at(slot)[5] += value1 * value2; +} + +void dbg_print() { + + int64_t n; + auto E = [&n](int64_t x) { return double(x) / n; }; + auto sqr = [](double x) { return x * x; }; + + for (int i = 0; i < MaxDebugSlots; ++i) + if ((n = hit[i][0])) + std::cerr << "Hit #" << i << ": Total " << n << " Hits " << hit[i][1] + << " Hit Rate (%) " << 100.0 * E(hit[i][1]) << std::endl; + + for (int i = 0; i < MaxDebugSlots; ++i) + if ((n = mean[i][0])) + { + std::cerr << "Mean #" << i << ": Total " << n << " Mean " << E(mean[i][1]) << std::endl; + } + + for (int i = 0; i < MaxDebugSlots; ++i) + if ((n = stdev[i][0])) + { + double r = sqrt(E(stdev[i][2]) - sqr(E(stdev[i][1]))); + std::cerr << "Stdev #" << i << ": Total " << n << " Stdev " << r << std::endl; + } + + for (int i = 0; i < MaxDebugSlots; ++i) + if ((n = extremes[i][0])) + { + std::cerr << "Extremity #" << i << ": Total " << n << " Min " << extremes[i][2] + << " Max " << extremes[i][1] << std::endl; + } + + for (int i = 0; i < MaxDebugSlots; ++i) + if ((n = correl[i][0])) + { + double r = (E(correl[i][5]) - E(correl[i][1]) * E(correl[i][3])) + / (sqrt(E(correl[i][2]) - sqr(E(correl[i][1]))) + * sqrt(E(correl[i][4]) - sqr(E(correl[i][3])))); + std::cerr << "Correl. 
#" << i << ": Total " << n << " Coefficient " << r << std::endl; + } +} + +void dbg_clear() { + hit.fill({}); + mean.fill({}); + stdev.fill({}); + correl.fill({}); + extremes.fill({}); +} + +// Used to serialize access to std::cout +// to avoid multiple threads writing at the same time. +std::ostream& operator<<(std::ostream& os, SyncCout sc) { + + static std::mutex m; + + if (sc == IO_LOCK) + m.lock(); + + if (sc == IO_UNLOCK) + m.unlock(); + + return os; +} + +void sync_cout_start() { std::cout << IO_LOCK; } +void sync_cout_end() { std::cout << IO_UNLOCK; } + +// Hash function based on public domain MurmurHash64A, by Austin Appleby. +uint64_t hash_bytes(const char* data, size_t size) { + const uint64_t m = 0xc6a4a7935bd1e995ull; + const int r = 47; + + uint64_t h = size * m; + + const char* end = data + (size & ~(size_t) 7); + + for (const char* p = data; p != end; p += 8) + { + uint64_t k; + std::memcpy(&k, p, sizeof(k)); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + if (size & 7) + { + uint64_t k = 0; + for (int i = (size & 7) - 1; i >= 0; i--) + k = (k << 8) | (uint64_t) end[i]; + + h ^= k; + h *= m; + } + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +// Trampoline helper to avoid moving Logger to misc.h +void start_logger(const std::string& fname) { Logger::start(fname); } + + +#ifdef _WIN32 + #include + #define GETCWD _getcwd +#else + #include + #define GETCWD getcwd +#endif + +size_t str_to_size_t(const std::string& s) { + unsigned long long value = std::stoull(s); + if (value > std::numeric_limits::max()) + std::exit(EXIT_FAILURE); + return static_cast(value); +} + +std::optional read_file_to_string(const std::string& path) { + std::ifstream f(path, std::ios_base::binary); + if (!f) + return std::nullopt; + return std::string(std::istreambuf_iterator(f), std::istreambuf_iterator()); +} + +void remove_whitespace(std::string& s) { + s.erase(std::remove_if(s.begin(), s.end(), [](char c) { return std::isspace(c); }), 
s.end()); +} + +bool is_whitespace(std::string_view s) { + return std::all_of(s.begin(), s.end(), [](char c) { return std::isspace(c); }); +} + +std::string CommandLine::get_binary_directory(std::string argv0) { + std::string pathSeparator; + +#ifdef _WIN32 + pathSeparator = "\\"; + #ifdef _MSC_VER + // Under windows argv[0] may not have the extension. Also _get_pgmptr() had + // issues in some Windows 10 versions, so check returned values carefully. + char* pgmptr = nullptr; + if (!_get_pgmptr(&pgmptr) && pgmptr != nullptr && *pgmptr) + argv0 = pgmptr; + #endif +#else + pathSeparator = "/"; +#endif + + // Extract the working directory + auto workingDirectory = CommandLine::get_working_directory(); + + // Extract the binary directory path from argv0 + auto binaryDirectory = argv0; + size_t pos = binaryDirectory.find_last_of("\\/"); + if (pos == std::string::npos) + binaryDirectory = "." + pathSeparator; + else + binaryDirectory.resize(pos + 1); + + // Pattern replacement: "./" at the start of path is replaced by the working directory + if (binaryDirectory.find("." 
+ pathSeparator) == 0) + binaryDirectory.replace(0, 1, workingDirectory); + + return binaryDirectory; +} + +std::string CommandLine::get_working_directory() { + std::string workingDirectory = ""; + char buff[40000]; + char* cwd = GETCWD(buff, 40000); + if (cwd) + workingDirectory = cwd; + + return workingDirectory; +} + + +} // namespace Stockfish diff --git a/src/misc.h b/src/misc.h new file mode 100644 index 0000000000000000000000000000000000000000..d1c368fdd2459b429deb6d9548e93c3cdbd17eeb --- /dev/null +++ b/src/misc.h @@ -0,0 +1,538 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef MISC_H_INCLUDED +#define MISC_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: keep +// IWYU pragma: no_include <__exception/terminate.h> +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(NO_PREFETCH) && (defined(_MSC_VER) || defined(__INTEL_COMPILER)) + #include +#endif + +#define stringify2(x) #x +#define stringify(x) stringify2(x) + +namespace Stockfish { + +std::string engine_version_info(); +std::string engine_info(bool to_uci = false); +std::string compiler_info(); + +// Prefetch hint enums for explicit call-site control. 
+enum class PrefetchRw { + READ, + WRITE +}; + +// NOTE: PrefetchLoc controls locality / cache level, not whether a prefetch +// is issued. In particular, PrefetchLoc::NONE maps to a non-temporal / +// lowest-locality prefetch (Intel: _MM_HINT_NTA, GCC/Clang: locality = 0) +// and therefore still performs a prefetch. To completely disable +// prefetching, define NO_PREFETCH so that prefetch() becomes a no-op. +enum class PrefetchLoc { + NONE, // Non-temporal / no cache locality (still issues a prefetch) + LOW, // Low locality (e.g. T2 / L2) + MODERATE, // Moderate locality (e.g. T1 / L1) + HIGH // High locality (e.g. T0 / closest cache) +}; + +// Preloads the given address into cache. This is a non-blocking +// function that doesn't stall the CPU waiting for data to be loaded from memory, +// which can be quite slow. +#ifdef NO_PREFETCH +template +void prefetch(const void*) {} +#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) + +constexpr int get_intel_hint(PrefetchRw rw, PrefetchLoc loc) { + if (rw == PrefetchRw::WRITE) + { + #ifdef _MM_HINT_ET0 + return _MM_HINT_ET0; + #else + // Fallback when write-prefetch hint is not available: use T0 + return _MM_HINT_T0; + #endif + } + switch (loc) + { + case PrefetchLoc::NONE : + return _MM_HINT_NTA; + case PrefetchLoc::LOW : + return _MM_HINT_T2; + case PrefetchLoc::MODERATE : + return _MM_HINT_T1; + case PrefetchLoc::HIGH : + return _MM_HINT_T0; + default : + return _MM_HINT_T0; + } +} + +template +void prefetch(const void* addr) { + _mm_prefetch(static_cast(addr), get_intel_hint(RW, LOC)); +} +#else +template +void prefetch(const void* addr) { + __builtin_prefetch(addr, static_cast(RW), static_cast(LOC)); +} +#endif + +void start_logger(const std::string& fname); + +size_t str_to_size_t(const std::string& s); + +#if defined(__linux__) + +struct PipeDeleter { + void operator()(FILE* file) const { + if (file != nullptr) + { + pclose(file); + } + } +}; + +#endif + +// Reads the file as bytes. 
+// Returns std::nullopt if the file does not exist. +std::optional read_file_to_string(const std::string& path); + +void dbg_hit_on(bool cond, int slot = 0); +void dbg_mean_of(int64_t value, int slot = 0); +void dbg_stdev_of(int64_t value, int slot = 0); +void dbg_extremes_of(int64_t value, int slot = 0); +void dbg_correl_of(int64_t value1, int64_t value2, int slot = 0); +void dbg_print(); +void dbg_clear(); + +using TimePoint = std::chrono::milliseconds::rep; // A value in milliseconds +static_assert(sizeof(TimePoint) == sizeof(int64_t), "TimePoint should be 64 bits"); +inline TimePoint now() { + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +} + +inline std::vector split(std::string_view s, std::string_view delimiter) { + std::vector res; + + if (s.empty()) + return res; + + size_t begin = 0; + for (;;) + { + const size_t end = s.find(delimiter, begin); + if (end == std::string::npos) + break; + + res.emplace_back(s.substr(begin, end - begin)); + begin = end + delimiter.size(); + } + + res.emplace_back(s.substr(begin)); + + return res; +} + +void remove_whitespace(std::string& s); +bool is_whitespace(std::string_view s); + +enum SyncCout { + IO_LOCK, + IO_UNLOCK +}; +std::ostream& operator<<(std::ostream&, SyncCout); + +#define sync_cout std::cout << IO_LOCK +#define sync_endl std::endl << IO_UNLOCK + +void sync_cout_start(); +void sync_cout_end(); + +// True if and only if the binary is compiled on a little-endian machine +static inline const std::uint16_t Le = 1; +static inline const bool IsLittleEndian = *reinterpret_cast(&Le) == 1; + + +template +class ValueList { + + public: + std::size_t size() const { return size_; } + int ssize() const { return int(size_); } + void push_back(const T& value) { + assert(size_ < MaxSize); + values_[size_++] = value; + } + const T* begin() const { return values_; } + const T* end() const { return values_ + size_; } + const T& operator[](int index) const { return 
values_[index]; } + + T* make_space(size_t count) { + T* result = &values_[size_]; + size_ += count; + assert(size_ <= MaxSize); + return result; + } + + private: + T values_[MaxSize]; + std::size_t size_ = 0; +}; + + +template +class MultiArray; + +namespace Detail { + +template +struct MultiArrayHelper { + using ChildType = MultiArray; +}; + +template +struct MultiArrayHelper { + using ChildType = T; +}; + +template +constexpr bool is_strictly_assignable_v = + std::is_assignable_v && (std::is_same_v || !std::is_convertible_v); + +} + +// MultiArray is a generic N-dimensional array. +// The template parameters (Size and Sizes) encode the dimensions of the array. +template +class MultiArray { + using ChildType = typename Detail::MultiArrayHelper::ChildType; + using ArrayType = std::array; + ArrayType data_; + + public: + using value_type = typename ArrayType::value_type; + using size_type = typename ArrayType::size_type; + using difference_type = typename ArrayType::difference_type; + using reference = typename ArrayType::reference; + using const_reference = typename ArrayType::const_reference; + using pointer = typename ArrayType::pointer; + using const_pointer = typename ArrayType::const_pointer; + using iterator = typename ArrayType::iterator; + using const_iterator = typename ArrayType::const_iterator; + using reverse_iterator = typename ArrayType::reverse_iterator; + using const_reverse_iterator = typename ArrayType::const_reverse_iterator; + + constexpr auto& at(size_type index) noexcept { return data_.at(index); } + constexpr const auto& at(size_type index) const noexcept { return data_.at(index); } + + constexpr auto& operator[](size_type index) noexcept { return data_[index]; } + constexpr const auto& operator[](size_type index) const noexcept { return data_[index]; } + + constexpr auto& front() noexcept { return data_.front(); } + constexpr const auto& front() const noexcept { return data_.front(); } + constexpr auto& back() noexcept { return 
data_.back(); } + constexpr const auto& back() const noexcept { return data_.back(); } + + auto* data() { return data_.data(); } + const auto* data() const { return data_.data(); } + + constexpr auto begin() noexcept { return data_.begin(); } + constexpr auto end() noexcept { return data_.end(); } + constexpr auto begin() const noexcept { return data_.begin(); } + constexpr auto end() const noexcept { return data_.end(); } + constexpr auto cbegin() const noexcept { return data_.cbegin(); } + constexpr auto cend() const noexcept { return data_.cend(); } + + constexpr auto rbegin() noexcept { return data_.rbegin(); } + constexpr auto rend() noexcept { return data_.rend(); } + constexpr auto rbegin() const noexcept { return data_.rbegin(); } + constexpr auto rend() const noexcept { return data_.rend(); } + constexpr auto crbegin() const noexcept { return data_.crbegin(); } + constexpr auto crend() const noexcept { return data_.crend(); } + + constexpr bool empty() const noexcept { return data_.empty(); } + constexpr size_type size() const noexcept { return data_.size(); } + constexpr size_type max_size() const noexcept { return data_.max_size(); } + + template + void fill(const U& v) { + static_assert(Detail::is_strictly_assignable_v, + "Cannot assign fill value to entry type"); + for (auto& ele : data_) + { + if constexpr (sizeof...(Sizes) == 0) + ele = v; + else + ele.fill(v); + } + } + + constexpr void swap(MultiArray& other) noexcept { data_.swap(other.data_); } +}; + + +// xorshift64star Pseudo-Random Number Generator +// This class is based on original code written and dedicated +// to the public domain by Sebastiano Vigna (2014). 
+// It has the following characteristics: +// +// - Outputs 64-bit numbers +// - Passes Dieharder and SmallCrush test batteries +// - Does not require warm-up, no zeroland to escape +// - Internal state is a single 64-bit integer +// - Period is 2^64 - 1 +// - Speed: 1.60 ns/call (Core i7 @3.40GHz) +// +// For further analysis see +// + +class PRNG { + + uint64_t s; + + uint64_t rand64() { + + s ^= s >> 12, s ^= s << 25, s ^= s >> 27; + return s * 2685821657736338717LL; + } + + public: + PRNG(uint64_t seed) : + s(seed) { + assert(seed); + } + + template + T rand() { + return T(rand64()); + } + + // Special generator used to fast init magic numbers. + // Output values only have 1/8th of their bits set on average. + template + T sparse_rand() { + return T(rand64() & rand64() & rand64()); + } +}; + +inline uint64_t mul_hi64(uint64_t a, uint64_t b) { +#if defined(__GNUC__) && defined(IS_64BIT) + __extension__ using uint128 = unsigned __int128; + return (uint128(a) * uint128(b)) >> 64; +#else + uint64_t aL = uint32_t(a), aH = a >> 32; + uint64_t bL = uint32_t(b), bH = b >> 32; + uint64_t c1 = (aL * bL) >> 32; + uint64_t c2 = aH * bL + c1; + uint64_t c3 = aL * bH + uint32_t(c2); + return aH * bH + (c2 >> 32) + (c3 >> 32); +#endif +} + +uint64_t hash_bytes(const char*, size_t); + +template +inline std::size_t get_raw_data_hash(const T& value) { + // We must have no padding bytes because we're reinterpreting as char + static_assert(std::has_unique_object_representations()); + + return static_cast( + hash_bytes(reinterpret_cast(&value), sizeof(value))); +} + +template +inline void hash_combine(std::size_t& seed, const T& v) { + std::size_t x; + // For primitive types we avoid using the default hasher, which may be + // nondeterministic across program invocations + if constexpr (std::is_integral()) + x = v; + else + x = std::hash{}(v); + seed ^= x + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +inline std::uint64_t hash_string(const std::string& sv) { return 
hash_bytes(sv.data(), sv.size()); } + +template +class FixedString { + public: + FixedString() : + length_(0) { + data_[0] = '\0'; + } + + FixedString(const char* str) { + size_t len = std::strlen(str); + if (len > Capacity) + std::terminate(); + std::memcpy(data_, str, len); + length_ = len; + data_[length_] = '\0'; + } + + FixedString(const std::string& str) { + if (str.size() > Capacity) + std::terminate(); + std::memcpy(data_, str.data(), str.size()); + length_ = str.size(); + data_[length_] = '\0'; + } + + std::size_t size() const { return length_; } + std::size_t capacity() const { return Capacity; } + + const char* c_str() const { return data_; } + const char* data() const { return data_; } + + char& operator[](std::size_t i) { return data_[i]; } + + const char& operator[](std::size_t i) const { return data_[i]; } + + FixedString& operator+=(const char* str) { + size_t len = std::strlen(str); + if (length_ + len > Capacity) + std::terminate(); + std::memcpy(data_ + length_, str, len); + length_ += len; + data_[length_] = '\0'; + return *this; + } + + FixedString& operator+=(const FixedString& other) { return (*this += other.c_str()); } + + operator std::string() const { return std::string(data_, length_); } + + operator std::string_view() const { return std::string_view(data_, length_); } + + template + bool operator==(const T& other) const noexcept { + return (std::string_view) (*this) == other; + } + + template + bool operator!=(const T& other) const noexcept { + return (std::string_view) (*this) != other; + } + + void clear() { + length_ = 0; + data_[0] = '\0'; + } + + private: + char data_[Capacity + 1]; // +1 for null terminator + std::size_t length_; +}; + +struct CommandLine { + public: + CommandLine(int _argc, char** _argv) : + argc(_argc), + argv(_argv) {} + + static std::string get_binary_directory(std::string argv0); + static std::string get_working_directory(); + + int argc; + char** argv; +}; + +namespace Utility { + +template +void 
move_to_front(std::vector& vec, Predicate pred) { + auto it = std::find_if(vec.begin(), vec.end(), pred); + + if (it != vec.end()) + { + std::rotate(vec.begin(), it, it + 1); + } +} +} + +#if defined(__GNUC__) + #define sf_always_inline __attribute__((always_inline)) +#elif defined(_MSC_VER) + #define sf_always_inline __forceinline +#else + // do nothing for other compilers + #define sf_always_inline +#endif + +#if defined(__clang__) + #define sf_assume(cond) __builtin_assume(cond) +#elif defined(__GNUC__) + #if __GNUC__ >= 13 + #define sf_assume(cond) __attribute__((assume(cond))) + #else + #define sf_assume(cond) \ + do \ + { \ + if (!(cond)) \ + __builtin_unreachable(); \ + } while (0) + #endif +#elif defined(_MSC_VER) + #define sf_assume(cond) __assume(cond) +#else + // do nothing for other compilers + #define sf_assume(cond) +#endif + +#ifdef __GNUC__ + #define sf_unreachable() __builtin_unreachable() +#elif defined(_MSC_VER) + #define sf_unreachable() __assume(0) +#else + #define sf_unreachable() +#endif + +} // namespace Stockfish + +template +struct std::hash> { + std::size_t operator()(const Stockfish::FixedString& fstr) const noexcept { + return Stockfish::hash_bytes(fstr.data(), fstr.size()); + } +}; + +#endif // #ifndef MISC_H_INCLUDED diff --git a/src/movegen.cpp b/src/movegen.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e63707a1ff4baa70fc0a64d3feb2b09505f0e14c --- /dev/null +++ b/src/movegen.cpp @@ -0,0 +1,312 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "movegen.h" + +#include +#include + +#include "bitboard.h" +#include "position.h" + +#if defined(USE_AVX512ICL) + #include + #include + #include +#endif + +namespace Stockfish { + +namespace { + +#if defined(USE_AVX512ICL) + +inline Move* write_moves(Move* moveList, uint32_t mask, __m512i vector) { + // Avoid _mm512_mask_compressstoreu_epi16() as it's 256 uOps on Zen4 + _mm512_storeu_si512(reinterpret_cast<__m512i*>(moveList), + _mm512_maskz_compress_epi16(mask, vector)); + return moveList + popcount(mask); +} + +template +inline Move* splat_pawn_moves(Move* moveList, Bitboard to_bb) { + alignas(64) static constexpr auto SPLAT_TABLE = [] { + std::array table{}; + for (int i = 0; i < 64; i++) + { + Square from{uint8_t(std::clamp(i - offset, 0, 63))}; + table[i] = {Move(from, Square{uint8_t(i)})}; + } + return table; + }(); + + auto table = reinterpret_cast(SPLAT_TABLE.data()); + + moveList = + write_moves(moveList, static_cast(to_bb >> 0), _mm512_load_si512(table + 0)); + moveList = + write_moves(moveList, static_cast(to_bb >> 32), _mm512_load_si512(table + 1)); + + return moveList; +} + +inline Move* splat_moves(Move* moveList, Square from, Bitboard to_bb) { + alignas(64) static constexpr auto SPLAT_TABLE = [] { + std::array table{}; + for (uint8_t i = 0; i < 64; i++) + table[i] = {Move(SQUARE_ZERO, Square{i})}; + return table; + }(); + + __m512i fromVec = _mm512_set1_epi16(Move(from, SQUARE_ZERO).raw()); + + auto table = reinterpret_cast(SPLAT_TABLE.data()); + + moveList = write_moves(moveList, static_cast(to_bb >> 0), + _mm512_or_si512(_mm512_load_si512(table + 0), fromVec)); + moveList = 
write_moves(moveList, static_cast(to_bb >> 32), + _mm512_or_si512(_mm512_load_si512(table + 1), fromVec)); + + return moveList; +} + +#else + +template +inline Move* splat_pawn_moves(Move* moveList, Bitboard to_bb) { + while (to_bb) + { + Square to = pop_lsb(to_bb); + *moveList++ = Move(to - offset, to); + } + return moveList; +} + +inline Move* splat_moves(Move* moveList, Square from, Bitboard to_bb) { + while (to_bb) + *moveList++ = Move(from, pop_lsb(to_bb)); + return moveList; +} + +#endif + +template +Move* make_promotions(Move* moveList, [[maybe_unused]] Square to) { + + constexpr bool all = Type == EVASIONS || Type == NON_EVASIONS; + + if constexpr (Type == CAPTURES || all) + *moveList++ = Move::make(to - D, to, QUEEN); + + if constexpr ((Type == CAPTURES && Enemy) || (Type == QUIETS && !Enemy) || all) + { + *moveList++ = Move::make(to - D, to, ROOK); + *moveList++ = Move::make(to - D, to, BISHOP); + *moveList++ = Move::make(to - D, to, KNIGHT); + } + + return moveList; +} + + +template +Move* generate_pawn_moves(const Position& pos, Move* moveList, Bitboard target) { + + constexpr Color Them = ~Us; + constexpr Bitboard TRank7BB = (Us == WHITE ? Rank7BB : Rank2BB); + constexpr Bitboard TRank3BB = (Us == WHITE ? Rank3BB : Rank6BB); + constexpr Direction Up = pawn_push(Us); + constexpr Direction UpRight = (Us == WHITE ? NORTH_EAST : SOUTH_WEST); + constexpr Direction UpLeft = (Us == WHITE ? NORTH_WEST : SOUTH_EAST); + + const Bitboard emptySquares = ~pos.pieces(); + const Bitboard enemies = Type == EVASIONS ? 
pos.checkers() : pos.pieces(Them); + + Bitboard pawnsOn7 = pos.pieces(Us, PAWN) & TRank7BB; + Bitboard pawnsNotOn7 = pos.pieces(Us, PAWN) & ~TRank7BB; + + // Single and double pawn pushes, no promotions + if constexpr (Type != CAPTURES) + { + Bitboard b1 = shift(pawnsNotOn7) & emptySquares; + Bitboard b2 = shift(b1 & TRank3BB) & emptySquares; + + if constexpr (Type == EVASIONS) // Consider only blocking squares + { + b1 &= target; + b2 &= target; + } + + moveList = splat_pawn_moves(moveList, b1); + moveList = splat_pawn_moves(moveList, b2); + } + + // Promotions and underpromotions + if (pawnsOn7) + { + Bitboard b1 = shift(pawnsOn7) & enemies; + Bitboard b2 = shift(pawnsOn7) & enemies; + Bitboard b3 = shift(pawnsOn7) & emptySquares; + + if constexpr (Type == EVASIONS) + b3 &= target; + + while (b1) + moveList = make_promotions(moveList, pop_lsb(b1)); + + while (b2) + moveList = make_promotions(moveList, pop_lsb(b2)); + + while (b3) + moveList = make_promotions(moveList, pop_lsb(b3)); + } + + // Standard and en passant captures + if constexpr (Type == CAPTURES || Type == EVASIONS || Type == NON_EVASIONS) + { + Bitboard b1 = shift(pawnsNotOn7) & enemies; + Bitboard b2 = shift(pawnsNotOn7) & enemies; + + moveList = splat_pawn_moves(moveList, b1); + moveList = splat_pawn_moves(moveList, b2); + + if (pos.ep_square() != SQ_NONE) + { + assert(rank_of(pos.ep_square()) == relative_rank(Us, RANK_6)); + + // An en passant capture cannot resolve a discovered check + if (Type == EVASIONS && (target & (pos.ep_square() + Up))) + return moveList; + + b1 = pawnsNotOn7 & attacks_bb(pos.ep_square(), Them); + + assert(b1); + + while (b1) + *moveList++ = Move::make(pop_lsb(b1), pos.ep_square()); + } + } + + return moveList; +} + + +template +Move* generate_moves(const Position& pos, Move* moveList, Bitboard target) { + + static_assert(Pt != KING && Pt != PAWN, "Unsupported piece type in generate_moves()"); + + Bitboard bb = pos.pieces(Us, Pt); + + while (bb) + { + Square from = 
pop_lsb(bb); + Bitboard b = attacks_bb(from, pos.pieces()) & target; + + moveList = splat_moves(moveList, from, b); + } + + return moveList; +} + + +template +Move* generate_all(const Position& pos, Move* moveList) { + + static_assert(Type != LEGAL, "Unsupported type in generate_all()"); + + const Square ksq = pos.square(Us); + Bitboard target; + + // Skip generating non-king moves when in double check + if (Type != EVASIONS || !more_than_one(pos.checkers())) + { + target = Type == EVASIONS ? between_bb(ksq, lsb(pos.checkers())) + : Type == NON_EVASIONS ? ~pos.pieces(Us) + : Type == CAPTURES ? pos.pieces(~Us) + : ~pos.pieces(); // QUIETS + + moveList = generate_pawn_moves(pos, moveList, target); + moveList = generate_moves(pos, moveList, target); + moveList = generate_moves(pos, moveList, target); + moveList = generate_moves(pos, moveList, target); + moveList = generate_moves(pos, moveList, target); + } + + Bitboard b = attacks_bb(ksq) & (Type == EVASIONS ? ~pos.pieces(Us) : target); + + moveList = splat_moves(moveList, ksq, b); + + if ((Type == QUIETS || Type == NON_EVASIONS) && pos.can_castle(Us & ANY_CASTLING)) + for (CastlingRights cr : {Us & KING_SIDE, Us & QUEEN_SIDE}) + if (!pos.castling_impeded(cr) && pos.can_castle(cr)) + *moveList++ = Move::make(ksq, pos.castling_rook_square(cr)); + + return moveList; +} + +} // namespace + + +// Generates all pseudo-legal captures plus queen promotions +// Generates all pseudo-legal non-captures and underpromotions +// Generates all pseudo-legal check evasions +// Generates all pseudo-legal captures and non-captures +// +// Returns a pointer to the end of the move list. +template +Move* generate(const Position& pos, Move* moveList) { + + static_assert(Type != LEGAL, "Unsupported type in generate()"); + assert((Type == EVASIONS) == bool(pos.checkers())); + + Color us = pos.side_to_move(); + + return us == WHITE ? 
generate_all(pos, moveList) + : generate_all(pos, moveList); +} + +// Explicit template instantiations +template Move* generate(const Position&, Move*); +template Move* generate(const Position&, Move*); +template Move* generate(const Position&, Move*); +template Move* generate(const Position&, Move*); + +// generate generates all the legal moves in the given position + +template<> +Move* generate(const Position& pos, Move* moveList) { + + Color us = pos.side_to_move(); + Bitboard pinned = pos.blockers_for_king(us) & pos.pieces(us); + Square ksq = pos.square(us); + Move* cur = moveList; + + moveList = + pos.checkers() ? generate(pos, moveList) : generate(pos, moveList); + while (cur != moveList) + if (((pinned & cur->from_sq()) || cur->from_sq() == ksq || cur->type_of() == EN_PASSANT) + && !pos.legal(*cur)) + *cur = *(--moveList); + else + ++cur; + + return moveList; +} + +} // namespace Stockfish diff --git a/src/movegen.h b/src/movegen.h new file mode 100644 index 0000000000000000000000000000000000000000..7f209f92a7eee591a65f75223f1ad8f261f7781a --- /dev/null +++ b/src/movegen.h @@ -0,0 +1,73 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef MOVEGEN_H_INCLUDED +#define MOVEGEN_H_INCLUDED + +#include // IWYU pragma: keep +#include + +#include "types.h" + +namespace Stockfish { + +class Position; + +enum GenType { + CAPTURES, + QUIETS, + EVASIONS, + NON_EVASIONS, + LEGAL +}; + +struct ExtMove: public Move { + int value; + + void operator=(Move m) { data = m.raw(); } + + // Inhibit unwanted implicit conversions to Move + // with an ambiguity that yields to a compile error. + operator float() const = delete; +}; + +inline bool operator<(const ExtMove& f, const ExtMove& s) { return f.value < s.value; } + +template +Move* generate(const Position& pos, Move* moveList); + +// The MoveList struct wraps the generate() function and returns a convenient +// list of moves. Using MoveList is sometimes preferable to directly calling +// the lower level generate() function. +template +struct MoveList { + + explicit MoveList(const Position& pos) : + last(generate(pos, moveList)) {} + const Move* begin() const { return moveList; } + const Move* end() const { return last; } + size_t size() const { return last - moveList; } + bool contains(Move move) const { return std::find(begin(), end(), move) != end(); } + + private: + Move moveList[MAX_MOVES], *last; +}; + +} // namespace Stockfish + +#endif // #ifndef MOVEGEN_H_INCLUDED diff --git a/src/movepick.cpp b/src/movepick.cpp new file mode 100644 index 0000000000000000000000000000000000000000..23b7facbb96a70fc77a84b9db9875ab1edab6a3a --- /dev/null +++ b/src/movepick.cpp @@ -0,0 +1,313 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "movepick.h" + +#include +#include +#include + +#include "bitboard.h" +#include "misc.h" +#include "position.h" + +namespace Stockfish { + +namespace { + +enum Stages { + // generate main search moves + MAIN_TT, + CAPTURE_INIT, + GOOD_CAPTURE, + QUIET_INIT, + GOOD_QUIET, + BAD_CAPTURE, + BAD_QUIET, + + // generate evasion moves + EVASION_TT, + EVASION_INIT, + EVASION, + + // generate probcut moves + PROBCUT_TT, + PROBCUT_INIT, + PROBCUT, + + // generate qsearch moves + QSEARCH_TT, + QCAPTURE_INIT, + QCAPTURE +}; + + +// Sort moves in descending order up to and including a given limit. +// The order of moves smaller than the limit is left unspecified. +void partial_insertion_sort(ExtMove* begin, ExtMove* end, int limit) { + + for (ExtMove *sortedEnd = begin, *p = begin + 1; p < end; ++p) + if (p->value >= limit) + { + ExtMove tmp = *p, *q; + *p = *++sortedEnd; + for (q = sortedEnd; q != begin && *(q - 1) < tmp; --q) + *q = *(q - 1); + *q = tmp; + } +} + +} // namespace + + +// Constructors of the MovePicker class. As arguments, we pass information +// to decide which class of moves to emit, to help sorting the (presumably) +// good moves first, and how important move ordering is at the current node. 
+ +// MovePicker constructor for the main search and for the quiescence search +MovePicker::MovePicker(const Position& p, + Move ttm, + Depth d, + const ButterflyHistory* mh, + const LowPlyHistory* lph, + const CapturePieceToHistory* cph, + const PieceToHistory** ch, + const SharedHistories* sh, + int pl) : + pos(p), + mainHistory(mh), + lowPlyHistory(lph), + captureHistory(cph), + continuationHistory(ch), + sharedHistory(sh), + ttMove(ttm), + depth(d), + ply(pl) { + + if (pos.checkers()) + stage = EVASION_TT + !(ttm && pos.pseudo_legal(ttm)); + + else + stage = (depth > 0 ? MAIN_TT : QSEARCH_TT) + !(ttm && pos.pseudo_legal(ttm)); +} + +// MovePicker constructor for ProbCut: we generate captures with Static Exchange +// Evaluation (SEE) greater than or equal to the given threshold. +MovePicker::MovePicker(const Position& p, Move ttm, int th, const CapturePieceToHistory* cph) : + pos(p), + captureHistory(cph), + ttMove(ttm), + threshold(th) { + assert(!pos.checkers()); + + stage = PROBCUT_TT + !(ttm && pos.capture_stage(ttm) && pos.pseudo_legal(ttm)); +} + +// Assigns a numerical value to each move in a list, used for sorting. +// Captures are ordered by Most Valuable Victim (MVV), preferring captures +// with a good history. Quiets moves are ordered using the history tables. 
+template +ExtMove* MovePicker::score(MoveList& ml) { + + static_assert(Type == CAPTURES || Type == QUIETS || Type == EVASIONS, "Wrong type"); + + Color us = pos.side_to_move(); + + [[maybe_unused]] Bitboard threatByLesser[KING + 1]; + if constexpr (Type == QUIETS) + { + threatByLesser[PAWN] = 0; + threatByLesser[KNIGHT] = threatByLesser[BISHOP] = pos.attacks_by(~us); + threatByLesser[ROOK] = + pos.attacks_by(~us) | pos.attacks_by(~us) | threatByLesser[KNIGHT]; + threatByLesser[QUEEN] = pos.attacks_by(~us) | threatByLesser[ROOK]; + threatByLesser[KING] = 0; + } + + ExtMove* it = cur; + for (auto move : ml) + { + ExtMove& m = *it++; + m = move; + + const Square from = m.from_sq(); + const Square to = m.to_sq(); + const Piece pc = pos.moved_piece(m); + const PieceType pt = type_of(pc); + const Piece capturedPiece = pos.piece_on(to); + + if constexpr (Type == CAPTURES) + m.value = (*captureHistory)[pc][to][type_of(capturedPiece)] + + 7 * int(PieceValue[capturedPiece]); + + else if constexpr (Type == QUIETS) + { + // histories + m.value = 2 * (*mainHistory)[us][m.raw()]; + m.value += 2 * sharedHistory->pawn_entry(pos)[pc][to]; + m.value += (*continuationHistory[0])[pc][to]; + m.value += (*continuationHistory[1])[pc][to]; + m.value += (*continuationHistory[2])[pc][to]; + m.value += (*continuationHistory[3])[pc][to]; + m.value += (*continuationHistory[5])[pc][to]; + + // bonus for checks + m.value += (bool(pos.check_squares(pt) & to) && pos.see_ge(m, -75)) * 16384; + + // penalty for moving to a square threatened by a lesser piece + // or bonus for escaping an attack by a lesser piece. 
+ int v = 20 * (bool(threatByLesser[pt] & from) - bool(threatByLesser[pt] & to)); + m.value += PieceValue[pt] * v; + + + if (ply < LOW_PLY_HISTORY_SIZE) + m.value += 8 * (*lowPlyHistory)[ply][m.raw()] / (1 + ply); + } + + else // Type == EVASIONS + { + if (pos.capture_stage(m)) + m.value = PieceValue[capturedPiece] + (1 << 28); + else + m.value = (*mainHistory)[us][m.raw()] + (*continuationHistory[0])[pc][to]; + } + } + return it; +} + +// Returns the next move satisfying a predicate function. +// This never returns the TT move, as it was emitted before. +template +Move MovePicker::select(Pred filter) { + + for (; cur < endCur; ++cur) + if (*cur != ttMove && filter()) + return *cur++; + + return Move::none(); +} + +// This is the most important method of the MovePicker class. We emit one +// new pseudo-legal move on every call until there are no more moves left, +// picking the move with the highest score from a list of generated moves. +Move MovePicker::next_move() { + + constexpr int goodQuietThreshold = -14000; +top: + switch (stage) + { + + case MAIN_TT : + case EVASION_TT : + case QSEARCH_TT : + case PROBCUT_TT : + ++stage; + return ttMove; + + case CAPTURE_INIT : + case PROBCUT_INIT : + case QCAPTURE_INIT : { + MoveList ml(pos); + + cur = endBadCaptures = moves; + endCur = endCaptures = score(ml); + + partial_insertion_sort(cur, endCur, std::numeric_limits::min()); + ++stage; + goto top; + } + + case GOOD_CAPTURE : + if (select([&]() { + if (pos.see_ge(*cur, -cur->value / 18)) + return true; + std::swap(*endBadCaptures++, *cur); + return false; + })) + return *(cur - 1); + + ++stage; + [[fallthrough]]; + + case QUIET_INIT : + if (!skipQuiets) + { + MoveList ml(pos); + + endCur = endGenerated = score(ml); + + partial_insertion_sort(cur, endCur, -3560 * depth); + } + + ++stage; + [[fallthrough]]; + + case GOOD_QUIET : + if (!skipQuiets && select([&]() { return cur->value > goodQuietThreshold; })) + return *(cur - 1); + + // Prepare the pointers to loop over the 
bad captures + cur = moves; + endCur = endBadCaptures; + + ++stage; + [[fallthrough]]; + + case BAD_CAPTURE : + if (select([]() { return true; })) + return *(cur - 1); + + // Prepare the pointers to loop over quiets again + cur = endCaptures; + endCur = endGenerated; + + ++stage; + [[fallthrough]]; + + case BAD_QUIET : + if (!skipQuiets) + return select([&]() { return cur->value <= goodQuietThreshold; }); + + return Move::none(); + + case EVASION_INIT : { + MoveList ml(pos); + + cur = moves; + endCur = endGenerated = score(ml); + + partial_insertion_sort(cur, endCur, std::numeric_limits::min()); + ++stage; + [[fallthrough]]; + } + + case EVASION : + case QCAPTURE : + return select([]() { return true; }); + + case PROBCUT : + return select([&]() { return pos.see_ge(*cur, threshold); }); + } + + assert(false); + return Move::none(); // Silence warning +} + +void MovePicker::skip_quiet_moves() { skipQuiets = true; } + +} // namespace Stockfish diff --git a/src/movepick.h b/src/movepick.h new file mode 100644 index 0000000000000000000000000000000000000000..08bd9a539154ffd93ed2021aeec8e72fcff8881d --- /dev/null +++ b/src/movepick.h @@ -0,0 +1,80 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef MOVEPICK_H_INCLUDED +#define MOVEPICK_H_INCLUDED + +#include "history.h" +#include "movegen.h" +#include "types.h" + +namespace Stockfish { + +class Position; + +// The MovePicker class is used to pick one pseudo-legal move at a time from the +// current position. The most important method is next_move(), which emits one +// new pseudo-legal move on every call, until there are no moves left, when +// Move::none() is returned. In order to improve the efficiency of the alpha-beta +// algorithm, MovePicker attempts to return the moves which are most likely to get +// a cut-off first. +class MovePicker { + + public: + MovePicker(const MovePicker&) = delete; + MovePicker& operator=(const MovePicker&) = delete; + MovePicker(const Position&, + Move, + Depth, + const ButterflyHistory*, + const LowPlyHistory*, + const CapturePieceToHistory*, + const PieceToHistory**, + const SharedHistories*, + int); + MovePicker(const Position&, Move, int, const CapturePieceToHistory*); + Move next_move(); + void skip_quiet_moves(); + + private: + template + Move select(Pred); + template + ExtMove* score(MoveList&); + ExtMove* begin() { return cur; } + ExtMove* end() { return endCur; } + + const Position& pos; + const ButterflyHistory* mainHistory; + const LowPlyHistory* lowPlyHistory; + const CapturePieceToHistory* captureHistory; + const PieceToHistory** continuationHistory; + const SharedHistories* sharedHistory; + Move ttMove; + ExtMove * cur, *endCur, *endBadCaptures, *endCaptures, *endGenerated; + int stage; + int threshold; + Depth depth; + int ply; + bool skipQuiets = false; + ExtMove moves[MAX_MOVES]; +}; + +} // namespace Stockfish + +#endif // #ifndef MOVEPICK_H_INCLUDED diff --git a/src/nnue/features/full_threats.cpp b/src/nnue/features/full_threats.cpp new file mode 100644 index 0000000000000000000000000000000000000000..03ad158e6c149dbd6ba3adf33a21ed18a2a3cf7f --- /dev/null +++ b/src/nnue/features/full_threats.cpp @@ -0,0 +1,343 @@ +/* + Stockfish, a UCI chess 
playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +//Definition of input features FullThreats of NNUE evaluation function + +#include "full_threats.h" + +#include +#include +#include +#include +#include + +#include "../../bitboard.h" +#include "../../misc.h" +#include "../../position.h" +#include "../../types.h" +#include "../nnue_common.h" + +namespace Stockfish::Eval::NNUE::Features { + +struct HelperOffsets { + int cumulativePieceOffset, cumulativeOffset; +}; + +constexpr std::array AllPieces = { + W_PAWN, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING, + B_PAWN, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING, +}; + +template +constexpr auto make_piece_indices_type() { + static_assert(PT != PieceType::PAWN); + + std::array, SQUARE_NB> out{}; + + for (Square from = SQ_A1; from <= SQ_H8; ++from) + { + Bitboard attacks = PseudoAttacks[PT][from]; + + for (Square to = SQ_A1; to <= SQ_H8; ++to) + { + out[from][to] = constexpr_popcount(((1ULL << to) - 1) & attacks); + } + } + + return out; +} + +template +constexpr auto make_piece_indices_piece() { + static_assert(type_of(P) == PieceType::PAWN); + + std::array, SQUARE_NB> out{}; + + constexpr Color C = color_of(P); + + for (Square from = SQ_A1; from <= SQ_H8; ++from) + { + Bitboard attacks = PseudoAttacks[C][from]; + + for (Square to = SQ_A1; to <= SQ_H8; ++to) + { + 
out[from][to] = constexpr_popcount(((1ULL << to) - 1) & attacks); + } + } + + return out; +} + +constexpr auto index_lut2_array() { + constexpr auto KNIGHT_ATTACKS = make_piece_indices_type(); + constexpr auto BISHOP_ATTACKS = make_piece_indices_type(); + constexpr auto ROOK_ATTACKS = make_piece_indices_type(); + constexpr auto QUEEN_ATTACKS = make_piece_indices_type(); + constexpr auto KING_ATTACKS = make_piece_indices_type(); + + std::array, SQUARE_NB>, PIECE_NB> indices{}; + + indices[W_PAWN] = make_piece_indices_piece(); + indices[B_PAWN] = make_piece_indices_piece(); + + indices[W_KNIGHT] = KNIGHT_ATTACKS; + indices[B_KNIGHT] = KNIGHT_ATTACKS; + + indices[W_BISHOP] = BISHOP_ATTACKS; + indices[B_BISHOP] = BISHOP_ATTACKS; + + indices[W_ROOK] = ROOK_ATTACKS; + indices[B_ROOK] = ROOK_ATTACKS; + + indices[W_QUEEN] = QUEEN_ATTACKS; + indices[B_QUEEN] = QUEEN_ATTACKS; + + indices[W_KING] = KING_ATTACKS; + indices[B_KING] = KING_ATTACKS; + + return indices; +} + +constexpr auto init_threat_offsets() { + std::array indices{}; + std::array, PIECE_NB> offsets{}; + + int cumulativeOffset = 0; + for (Piece piece : AllPieces) + { + int pieceIdx = piece; + int cumulativePieceOffset = 0; + + for (Square from = SQ_A1; from <= SQ_H8; ++from) + { + offsets[pieceIdx][from] = cumulativePieceOffset; + + if (type_of(piece) != PAWN) + { + Bitboard attacks = PseudoAttacks[type_of(piece)][from]; + cumulativePieceOffset += constexpr_popcount(attacks); + } + + else if (from >= SQ_A2 && from <= SQ_H7) + { + Bitboard attacks = (pieceIdx < 8) ? 
pawn_attacks_bb(square_bb(from)) + : pawn_attacks_bb(square_bb(from)); + cumulativePieceOffset += constexpr_popcount(attacks); + } + } + + indices[pieceIdx] = {cumulativePieceOffset, cumulativeOffset}; + + cumulativeOffset += numValidTargets[pieceIdx] * cumulativePieceOffset; + } + + return std::pair{indices, offsets}; +} + +constexpr auto helper_offsets = init_threat_offsets().first; +// Lookup array for indexing threats +constexpr auto offsets = init_threat_offsets().second; + +constexpr auto init_index_luts() { + std::array, PIECE_NB>, PIECE_NB> indices{}; + + for (Piece attacker : AllPieces) + { + for (Piece attacked : AllPieces) + { + bool enemy = (attacker ^ attacked) == 8; + PieceType attackerType = type_of(attacker); + PieceType attackedType = type_of(attacked); + + int map = FullThreats::map[attackerType - 1][attackedType - 1]; + bool semi_excluded = attackerType == attackedType && (enemy || attackerType != PAWN); + IndexType feature = helper_offsets[attacker].cumulativeOffset + + (color_of(attacked) * (numValidTargets[attacker] / 2) + map) + * helper_offsets[attacker].cumulativePieceOffset; + + bool excluded = map < 0; + indices[attacker][attacked][0] = excluded ? FullThreats::Dimensions : feature; + indices[attacker][attacked][1] = + excluded || semi_excluded ? 
FullThreats::Dimensions : feature; + } + } + + return indices; +} + +// The final index is calculated from summing data found in these two LUTs, as well +// as offsets[attacker][from] + +// [attacker][attacked][from < to] +constexpr auto index_lut1 = init_index_luts(); +// [attacker][from][to] +constexpr auto index_lut2 = index_lut2_array(); + +// Index of a feature for a given king position and another piece on some square +inline sf_always_inline IndexType FullThreats::make_index( + Color perspective, Piece attacker, Square from, Square to, Piece attacked, Square ksq) { + const std::int8_t orientation = OrientTBL[ksq] ^ (56 * perspective); + unsigned from_oriented = uint8_t(from) ^ orientation; + unsigned to_oriented = uint8_t(to) ^ orientation; + + std::int8_t swap = 8 * perspective; + unsigned attacker_oriented = attacker ^ swap; + unsigned attacked_oriented = attacked ^ swap; + + return index_lut1[attacker_oriented][attacked_oriented][from_oriented < to_oriented] + + offsets[attacker_oriented][from_oriented] + + index_lut2[attacker_oriented][from_oriented][to_oriented]; +} + +// Get a list of indices for active features in ascending order + +void FullThreats::append_active_indices(Color perspective, const Position& pos, IndexList& active) { + Square ksq = pos.square(perspective); + Bitboard occupied = pos.pieces(); + + for (Color color : {WHITE, BLACK}) + { + for (PieceType pt = PAWN; pt < KING; ++pt) + { + Color c = Color(perspective ^ color); + Piece attacker = make_piece(c, pt); + Bitboard bb = pos.pieces(c, pt); + + if (pt == PAWN) + { + auto right = (c == WHITE) ? NORTH_EAST : SOUTH_WEST; + auto left = (c == WHITE) ? NORTH_WEST : SOUTH_EAST; + auto attacks_left = + ((c == WHITE) ? shift(bb) : shift(bb)) & occupied; + auto attacks_right = + ((c == WHITE) ? 
shift(bb) : shift(bb)) & occupied; + + while (attacks_left) + { + Square to = pop_lsb(attacks_left); + Square from = to - right; + Piece attacked = pos.piece_on(to); + IndexType index = make_index(perspective, attacker, from, to, attacked, ksq); + + if (index < Dimensions) + active.push_back(index); + } + + while (attacks_right) + { + Square to = pop_lsb(attacks_right); + Square from = to - left; + Piece attacked = pos.piece_on(to); + IndexType index = make_index(perspective, attacker, from, to, attacked, ksq); + + if (index < Dimensions) + active.push_back(index); + } + } + else + { + while (bb) + { + Square from = pop_lsb(bb); + Bitboard attacks = (attacks_bb(pt, from, occupied)) & occupied; + + while (attacks) + { + Square to = pop_lsb(attacks); + Piece attacked = pos.piece_on(to); + IndexType index = + make_index(perspective, attacker, from, to, attacked, ksq); + + if (index < Dimensions) + active.push_back(index); + } + } + } + } + } +} + +// Get a list of indices for recently changed features + +void FullThreats::append_changed_indices(Color perspective, + Square ksq, + const DiffType& diff, + IndexList& removed, + IndexList& added, + FusedUpdateData* fusedData, + bool first, + const ThreatWeightType* prefetchBase, + IndexType prefetchStride) { + + for (const auto& dirty : diff.list) + { + auto attacker = dirty.pc(); + auto attacked = dirty.threatened_pc(); + auto from = dirty.pc_sq(); + auto to = dirty.threatened_sq(); + auto add = dirty.add(); + + if (fusedData) + { + if (from == fusedData->dp2removed) + { + if (add) + { + if (first) + { + fusedData->dp2removedOriginBoard |= to; + continue; + } + } + else if (fusedData->dp2removedOriginBoard & to) + continue; + } + + if (to != SQ_NONE && to == fusedData->dp2removed) + { + if (add) + { + if (first) + { + fusedData->dp2removedTargetBoard |= from; + continue; + } + } + else if (fusedData->dp2removedTargetBoard & from) + continue; + } + } + + auto& insert = add ? 
added : removed; + const IndexType index = make_index(perspective, attacker, from, to, attacked, ksq); + + if (index < Dimensions) + { + if (prefetchBase) + prefetch( + prefetchBase + static_cast(index) * prefetchStride); + insert.push_back(index); + } + } +} + +bool FullThreats::requires_refresh(const DiffType& diff, Color perspective) { + return perspective == diff.us && (int8_t(diff.ksq) & 0b100) != (int8_t(diff.prevKsq) & 0b100); +} + +} // namespace Stockfish::Eval::NNUE::Features diff --git a/src/nnue/features/full_threats.h b/src/nnue/features/full_threats.h new file mode 100644 index 0000000000000000000000000000000000000000..76f5b74c6f8c5cebf0969497bd096ab96b2ee2e3 --- /dev/null +++ b/src/nnue/features/full_threats.h @@ -0,0 +1,106 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +//Definition of input features Simplified_Threats of NNUE evaluation function + +#ifndef NNUE_FEATURES_FULL_THREATS_INCLUDED +#define NNUE_FEATURES_FULL_THREATS_INCLUDED + +#include + +#include "../../misc.h" +#include "../../types.h" +#include "../nnue_common.h" + +namespace Stockfish { +class Position; +} + +namespace Stockfish::Eval::NNUE::Features { + +static constexpr int numValidTargets[PIECE_NB] = {0, 6, 10, 8, 8, 10, 0, 0, + 0, 6, 10, 8, 8, 10, 0, 0}; + +class FullThreats { + public: + // Feature name + static constexpr const char* Name = "Full_Threats(Friend)"; + + // Hash value embedded in the evaluation file + static constexpr std::uint32_t HashValue = 0x8f234cb8u; + + // Number of feature dimensions + static constexpr IndexType Dimensions = 60144; + + // clang-format off + // Orient a square according to perspective (rotates by 180 for black) + static constexpr std::int8_t OrientTBL[SQUARE_NB] = { + SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1, + SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1, + SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1, + SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1, + SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1, + SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1, + SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1, + SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1, + }; + + static constexpr int map[PIECE_TYPE_NB-2][PIECE_TYPE_NB-2] = { + { 0, 1, -1, 2, -1, -1}, + { 0, 1, 2, 3, 4, -1}, + { 0, 1, 2, 3, -1, -1}, + { 0, 1, 2, 3, -1, -1}, + { 0, 1, 2, 3, 4, -1}, + {-1, -1, -1, -1, -1, -1} + }; + // clang-format on + + struct FusedUpdateData { + Bitboard dp2removedOriginBoard = 0; + Bitboard dp2removedTargetBoard = 0; + + Square dp2removed; + }; + + // Maximum number of simultaneously active features. 
+ static constexpr IndexType MaxActiveDimensions = 128; + using IndexList = ValueList; + using DiffType = DirtyThreats; + + static IndexType + make_index(Color perspective, Piece attkr, Square from, Square to, Piece attkd, Square ksq); + + // Get a list of indices for active features + static void append_active_indices(Color perspective, const Position& pos, IndexList& active); + + // Get a list of indices for recently changed features + static void append_changed_indices(Color perspective, + Square ksq, + const DiffType& diff, + IndexList& removed, + IndexList& added, + FusedUpdateData* fd = nullptr, + bool first = false, + const ThreatWeightType* prefetchBase = nullptr, + IndexType prefetchStride = 0); + + // Returns whether the change stored in this DirtyPiece means + // that a full accumulator refresh is required. + static bool requires_refresh(const DiffType& diff, Color perspective); +}; + +} // namespace Stockfish::Eval::NNUE::Features + +#endif // #ifndef NNUE_FEATURES_FULL_THREATS_INCLUDED diff --git a/src/nnue/features/half_ka_v2_hm.cpp b/src/nnue/features/half_ka_v2_hm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a82e89de486812d870d6fafa84f802b68ee0b715 --- /dev/null +++ b/src/nnue/features/half_ka_v2_hm.cpp @@ -0,0 +1,69 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +//Definition of input features HalfKAv2_hm of NNUE evaluation function + +#include "half_ka_v2_hm.h" + +#include "../../bitboard.h" +#include "../../position.h" +#include "../../types.h" +#include "../nnue_common.h" + +namespace Stockfish::Eval::NNUE::Features { + +// Index of a feature for a given king position and another piece on some square + +IndexType HalfKAv2_hm::make_index(Color perspective, Square s, Piece pc, Square ksq) { + const IndexType flip = 56 * perspective; + return (IndexType(s) ^ OrientTBL[ksq] ^ flip) + PieceSquareIndex[perspective][pc] + + KingBuckets[int(ksq) ^ flip]; +} + +// Get a list of indices for active features + +void HalfKAv2_hm::append_active_indices(Color perspective, const Position& pos, IndexList& active) { + Square ksq = pos.square(perspective); + Bitboard bb = pos.pieces(); + while (bb) + { + Square s = pop_lsb(bb); + active.push_back(make_index(perspective, s, pos.piece_on(s), ksq)); + } +} + +// Get a list of indices for recently changed features + +void HalfKAv2_hm::append_changed_indices( + Color perspective, Square ksq, const DiffType& diff, IndexList& removed, IndexList& added) { + removed.push_back(make_index(perspective, diff.from, diff.pc, ksq)); + if (diff.to != SQ_NONE) + added.push_back(make_index(perspective, diff.to, diff.pc, ksq)); + + if (diff.remove_sq != SQ_NONE) + removed.push_back(make_index(perspective, diff.remove_sq, diff.remove_pc, ksq)); + + if (diff.add_sq != SQ_NONE) + added.push_back(make_index(perspective, diff.add_sq, diff.add_pc, ksq)); +} + +bool HalfKAv2_hm::requires_refresh(const DiffType& diff, Color perspective) { + return diff.pc == make_piece(perspective, KING); +} + +} // namespace Stockfish::Eval::NNUE::Features diff --git a/src/nnue/features/half_ka_v2_hm.h b/src/nnue/features/half_ka_v2_hm.h new file mode 100644 index 
0000000000000000000000000000000000000000..49b0a87a4d01c705028ad0e991b12a3d24c2954d --- /dev/null +++ b/src/nnue/features/half_ka_v2_hm.h @@ -0,0 +1,128 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +//Definition of input features HalfKP of NNUE evaluation function + +#ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED +#define NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED + +#include + +#include "../../misc.h" +#include "../../types.h" +#include "../nnue_common.h" + +namespace Stockfish { +class Position; +} + +namespace Stockfish::Eval::NNUE::Features { + +// Feature HalfKAv2_hm: Combination of the position of own king and the +// position of pieces. Position mirrored such that king is always on e..h files. 
+class HalfKAv2_hm { + + // Unique number for each piece type on each square + enum { + PS_NONE = 0, + PS_W_PAWN = 0, + PS_B_PAWN = 1 * SQUARE_NB, + PS_W_KNIGHT = 2 * SQUARE_NB, + PS_B_KNIGHT = 3 * SQUARE_NB, + PS_W_BISHOP = 4 * SQUARE_NB, + PS_B_BISHOP = 5 * SQUARE_NB, + PS_W_ROOK = 6 * SQUARE_NB, + PS_B_ROOK = 7 * SQUARE_NB, + PS_W_QUEEN = 8 * SQUARE_NB, + PS_B_QUEEN = 9 * SQUARE_NB, + PS_KING = 10 * SQUARE_NB, + PS_NB = 11 * SQUARE_NB + }; + + static constexpr IndexType PieceSquareIndex[COLOR_NB][PIECE_NB] = { + // Convention: W - us, B - them + // Viewed from other side, W and B are reversed + {PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_KING, PS_NONE, + PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_KING, PS_NONE}, + {PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_KING, PS_NONE, + PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_KING, PS_NONE}}; + + public: + // Feature name + static constexpr const char* Name = "HalfKAv2_hm(Friend)"; + + // Hash value embedded in the evaluation file + static constexpr std::uint32_t HashValue = 0x7f234cb8u; + + // Number of feature dimensions + static constexpr IndexType Dimensions = + static_cast(SQUARE_NB) * static_cast(PS_NB) / 2; + +#define B(v) (v * PS_NB) + // clang-format off + static constexpr IndexType KingBuckets[SQUARE_NB] = { + B(28), B(29), B(30), B(31), B(31), B(30), B(29), B(28), + B(24), B(25), B(26), B(27), B(27), B(26), B(25), B(24), + B(20), B(21), B(22), B(23), B(23), B(22), B(21), B(20), + B(16), B(17), B(18), B(19), B(19), B(18), B(17), B(16), + B(12), B(13), B(14), B(15), B(15), B(14), B(13), B(12), + B( 8), B( 9), B(10), B(11), B(11), B(10), B( 9), B( 8), + B( 4), B( 5), B( 6), B( 7), B( 7), B( 6), B( 5), B( 4), + B( 0), B( 1), B( 2), B( 3), B( 3), B( 2), B( 1), B( 0), + }; + // clang-format on +#undef B + // clang-format off + // Orient a square according to perspective (rotates by 180 for black) + 
static constexpr IndexType OrientTBL[SQUARE_NB] = { + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1 , + }; + // clang-format on + + // Maximum number of simultaneously active features. + static constexpr IndexType MaxActiveDimensions = 32; + using IndexList = ValueList; + using DiffType = DirtyPiece; + + // Index of a feature for a given king position and another piece on some square + + static IndexType make_index(Color perspective, Square s, Piece pc, Square ksq); + + // Get a list of indices for active features + + static void append_active_indices(Color perspective, const Position& pos, IndexList& active); + + // Get a list of indices for recently changed features + static void append_changed_indices( + Color perspective, Square ksq, const DiffType& diff, IndexList& removed, IndexList& added); + + // Returns whether the change stored in this DirtyPiece means + // that a full accumulator refresh is required. 
+ static bool requires_refresh(const DiffType& diff, Color perspective); +}; + +} // namespace Stockfish::Eval::NNUE::Features + +#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..6cd44e19c82b562515bdcd3a1543d4d698bbec66 --- /dev/null +++ b/src/nnue/layers/affine_transform.h @@ -0,0 +1,312 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +// Definition of layer AffineTransform of NNUE evaluation function + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED + +#include +#include + +#include "../../memory.h" +#include "../nnue_common.h" +#include "../simd.h" + +/* + This file contains the definition for a fully connected layer (aka affine transform). + + - expected use-case is for when PaddedInputDimensions == 32 and InputDimensions <= 32. 
+ - that's why AVX512 is hard to implement + - expected use-case is small layers + - inputs are processed in chunks of 4, weights are respectively transposed + - accumulation happens directly to int32s +*/ + +namespace Stockfish::Eval::NNUE::Layers { + +#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD) + #define ENABLE_SEQ_OPT +#endif + +// Fallback implementation for older/other architectures. +// Requires the input to be padded to at least 16 values. +#ifndef ENABLE_SEQ_OPT + +template +static void affine_transform_non_ssse3(std::int32_t* output, + const std::int8_t* weights, + const std::int32_t* biases, + const std::uint8_t* input) { + #if defined(USE_SSE2) || defined(USE_NEON) + #if defined(USE_SSE2) + // At least a multiple of 16, with SSE2. + constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; + const __m128i Zeros = _mm_setzero_si128(); + const auto inputVector = reinterpret_cast(input); + + #elif defined(USE_NEON) + constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; + const auto inputVector = reinterpret_cast(input); + #endif + + for (IndexType i = 0; i < OutputDimensions; ++i) + { + const IndexType offset = i * PaddedInputDimensions; + + #if defined(USE_SSE2) + __m128i sumLo = _mm_cvtsi32_si128(biases[i]); + __m128i sumHi = Zeros; + const auto row = reinterpret_cast(&weights[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + __m128i row_j = _mm_load_si128(&row[j]); + __m128i input_j = _mm_load_si128(&inputVector[j]); + __m128i extendedRowLo = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8); + __m128i extendedRowHi = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8); + __m128i extendedInputLo = _mm_unpacklo_epi8(input_j, Zeros); + __m128i extendedInputHi = _mm_unpackhi_epi8(input_j, Zeros); + __m128i productLo = _mm_madd_epi16(extendedRowLo, extendedInputLo); + __m128i productHi = _mm_madd_epi16(extendedRowHi, extendedInputHi); + sumLo = _mm_add_epi32(sumLo, productLo); + sumHi = 
_mm_add_epi32(sumHi, productHi); + } + __m128i sum = _mm_add_epi32(sumLo, sumHi); + __m128i sumHigh_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); + sum = _mm_add_epi32(sum, sumHigh_64); + __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2)); + sum = _mm_add_epi32(sum, sum_second_32); + output[i] = _mm_cvtsi128_si32(sum); + + #elif defined(USE_NEON) + + int32x4_t sum = {biases[i]}; + const auto row = reinterpret_cast(&weights[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + int16x8_t product = vmull_s8(inputVector[j * 2], row[j * 2]); + product = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]); + sum = vpadalq_s16(sum, product); + } + output[i] = SIMD::neon_m128_reduce_add_epi32(sum); + + #endif + } + #else + std::memcpy(output, biases, sizeof(std::int32_t) * OutputDimensions); + + // Traverse weights in transpose order to take advantage of input sparsity + for (IndexType i = 0; i < InputDimensions; ++i) + if (input[i]) + { + const std::int8_t* w = &weights[i]; + const int in = input[i]; + for (IndexType j = 0; j < OutputDimensions; ++j) + output[j] += w[j * PaddedInputDimensions] * in; + } + #endif +} + +#endif // !ENABLE_SEQ_OPT + +template +class AffineTransform { + public: + // Input/output type + using InputType = std::uint8_t; + using OutputType = std::int32_t; + + // Number of input/output dimensions + static constexpr IndexType InputDimensions = InDims; + static constexpr IndexType OutputDimensions = OutDims; + + static constexpr IndexType PaddedInputDimensions = + ceil_to_multiple(InputDimensions, MaxSimdWidth); + static constexpr IndexType PaddedOutputDimensions = + ceil_to_multiple(OutputDimensions, MaxSimdWidth); + + using OutputBuffer = OutputType[PaddedOutputDimensions]; + + // Hash value embedded in the evaluation file + static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) { + std::uint32_t hashValue = 0xCC03DAE4u; + hashValue += OutputDimensions; + hashValue ^= prevHash >> 1; + 
hashValue ^= prevHash << 31; + return hashValue; + } + + static constexpr IndexType get_weight_index_scrambled(IndexType i) { + return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + + i / PaddedInputDimensions * 4 + i % 4; + } + + static constexpr IndexType get_weight_index(IndexType i) { +#ifdef ENABLE_SEQ_OPT + return get_weight_index_scrambled(i); +#else + return i; +#endif + } + + // Read network parameters + bool read_parameters(std::istream& stream) { + read_little_endian(stream, biases, OutputDimensions); + for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + weights[get_weight_index(i)] = read_little_endian(stream); + + return !stream.fail(); + } + + // Write network parameters + bool write_parameters(std::ostream& stream) const { + write_little_endian(stream, biases, OutputDimensions); + + for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + write_little_endian(stream, weights[get_weight_index(i)]); + + return !stream.fail(); + } + + std::size_t get_content_hash() const { + std::size_t h = 0; + hash_combine(h, get_raw_data_hash(biases)); + hash_combine(h, get_raw_data_hash(weights)); + hash_combine(h, get_hash_value(0)); + return h; + } + + // Forward propagation + void propagate(const InputType* input, OutputType* output) const { + +#ifdef ENABLE_SEQ_OPT + + if constexpr (OutputDimensions > 1) + { + #if defined(USE_AVX512) + using vec_t = __m512i; + #define vec_set_32 _mm512_set1_epi32 + #define vec_add_dpbusd_32 SIMD::m512_add_dpbusd_epi32 + #elif defined(USE_AVX2) + using vec_t = __m256i; + #define vec_set_32 _mm256_set1_epi32 + #define vec_add_dpbusd_32 SIMD::m256_add_dpbusd_epi32 + #elif defined(USE_SSSE3) + using vec_t = __m128i; + #define vec_set_32 _mm_set1_epi32 + #define vec_add_dpbusd_32 SIMD::m128_add_dpbusd_epi32 + #elif defined(USE_NEON_DOTPROD) + using vec_t = int32x4_t; + #define vec_set_32 vdupq_n_s32 + #define vec_add_dpbusd_32(acc, a, b) \ + 
SIMD::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \ + vreinterpretq_s8_s32(b)) + #endif + + static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType); + + static_assert(OutputDimensions % OutputSimdWidth == 0); + + constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; + constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth; + + const vec_t* biasvec = reinterpret_cast(biases); + vec_t acc[NumRegs]; + for (IndexType k = 0; k < NumRegs; ++k) + acc[k] = biasvec[k]; + + for (IndexType i = 0; i < NumChunks; ++i) + { + const vec_t in0 = + vec_set_32(load_as(input + i * sizeof(std::int32_t))); + const auto col0 = + reinterpret_cast(&weights[i * OutputDimensions * 4]); + + for (IndexType k = 0; k < NumRegs; ++k) + vec_add_dpbusd_32(acc[k], in0, col0[k]); + } + + vec_t* outptr = reinterpret_cast(output); + for (IndexType k = 0; k < NumRegs; ++k) + outptr[k] = acc[k]; + + #undef vec_set_32 + #undef vec_add_dpbusd_32 + } + else if constexpr (OutputDimensions == 1) + { + // We cannot use AVX512 for the last layer because there are only 32 inputs + // and the buffer is not padded to 64 elements. 
+ #if defined(USE_AVX2) + using vec_t = __m256i; + #define vec_setzero() _mm256_setzero_si256() + #define vec_add_dpbusd_32 SIMD::m256_add_dpbusd_epi32 + #define vec_hadd SIMD::m256_hadd + #elif defined(USE_SSSE3) + using vec_t = __m128i; + #define vec_setzero() _mm_setzero_si128() + #define vec_add_dpbusd_32 SIMD::m128_add_dpbusd_epi32 + #define vec_hadd SIMD::m128_hadd + #elif defined(USE_NEON_DOTPROD) + using vec_t = int32x4_t; + #define vec_setzero() vdupq_n_s32(0) + #define vec_add_dpbusd_32(acc, a, b) \ + SIMD::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \ + vreinterpretq_s8_s32(b)) + #define vec_hadd SIMD::neon_m128_hadd + #endif + + const auto inputVector = reinterpret_cast(input); + + static constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(InputType); + + static_assert(PaddedInputDimensions % InputSimdWidth == 0); + + constexpr IndexType NumChunks = PaddedInputDimensions / InputSimdWidth; + vec_t sum0 = vec_setzero(); + const auto row0 = reinterpret_cast(&weights[0]); + + for (int j = 0; j < int(NumChunks); ++j) + { + const vec_t in = inputVector[j]; + vec_add_dpbusd_32(sum0, in, row0[j]); + } + output[0] = vec_hadd(sum0, biases[0]); + + #undef vec_setzero + #undef vec_add_dpbusd_32 + #undef vec_hadd + } +#else + // Use old implementation for the other architectures. 
+ affine_transform_non_ssse3( + output, weights, biases, input); +#endif + } + + private: + using BiasType = OutputType; + using WeightType = std::int8_t; + + alignas(CacheLineSize) BiasType biases[OutputDimensions]; + alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions]; +}; + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED diff --git a/src/nnue/layers/affine_transform_sparse_input.h b/src/nnue/layers/affine_transform_sparse_input.h new file mode 100644 index 0000000000000000000000000000000000000000..059a773974b560ee5712143fe3aa09987eae0fbd --- /dev/null +++ b/src/nnue/layers/affine_transform_sparse_input.h @@ -0,0 +1,379 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +// Definition of layer AffineTransformSparseInput of NNUE evaluation function + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED +#define NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED + +#include +#include +#include +#include + +#include "../../bitboard.h" +#include "../../memory.h" +#include "../simd.h" +#include "../nnue_common.h" + +/* + This file contains the definition for a fully connected layer (aka affine transform) with block sparse input. 
+*/ + +namespace Stockfish::Eval::NNUE::Layers { + +#if (USE_SSSE3 | (USE_NEON >= 8)) +static constexpr int lsb_index64[64] = { + 0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61, 54, 58, 35, 52, 50, 42, + 21, 44, 38, 32, 29, 23, 17, 11, 4, 62, 46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, + 31, 22, 10, 45, 25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63}; + +constexpr int constexpr_lsb(uint64_t bb) { + assert(bb != 0); + constexpr uint64_t debruijn64 = 0x03F79D71B4CB0A89ULL; + return lsb_index64[((bb ^ (bb - 1)) * debruijn64) >> 58]; +} + +alignas(CacheLineSize) static constexpr struct OffsetIndices { + + std::uint16_t offset_indices[256][8]; + + constexpr OffsetIndices() : + offset_indices() { + for (int i = 0; i < 256; ++i) + { + std::uint64_t j = i, k = 0; + while (j) + { + offset_indices[i][k++] = constexpr_lsb(j); + j &= j - 1; + } + while (k < 8) + offset_indices[i][k++] = 0; + } + } + +} Lookup; + + #if defined(__GNUC__) || defined(__clang__) + #define RESTRICT __restrict__ + #elif defined(_MSC_VER) + #define RESTRICT __restrict + #else + #define RESTRICT + #endif + +// Find indices of nonzero 32-bit values in a packed byte buffer. +// The input pointer addresses a sequence of 32-bit blocks stored in a +// std::uint8_t array. 
+template +void find_nnz(const std::uint8_t* RESTRICT input, + std::uint16_t* RESTRICT out, + IndexType& count_out) { + + #if defined(USE_AVX512ICL) + + constexpr IndexType SimdWidthIn = 64; // 512 bits + constexpr IndexType SimdWidthOut = 32; // 512 bits / 16 bits + constexpr IndexType NumChunks = InputDimensions / SimdWidthOut; + const __m512i increment = _mm512_set1_epi16(SimdWidthOut); + __m512i base = _mm512_set_epi16( // Same permute order as _mm512_packus_epi32() + 31, 30, 29, 28, 15, 14, 13, 12, 27, 26, 25, 24, 11, 10, 9, 8, 23, 22, 21, 20, 7, 6, 5, 4, 19, + 18, 17, 16, 3, 2, 1, 0); + + IndexType count = 0; + for (IndexType i = 0; i < NumChunks; ++i) + { + const __m512i inputV0 = _mm512_load_si512(input + i * 2 * SimdWidthIn); + const __m512i inputV1 = _mm512_load_si512(input + i * 2 * SimdWidthIn + SimdWidthIn); + + // Get a bitmask and gather non zero indices + const __m512i inputV01 = _mm512_packus_epi32(inputV0, inputV1); + const __mmask32 nnzMask = _mm512_test_epi16_mask(inputV01, inputV01); + + // Avoid _mm512_mask_compressstoreu_epi16() as it's 256 uOps on Zen4 + __m512i nnz = _mm512_maskz_compress_epi16(nnzMask, base); + _mm512_storeu_si512(out + count, nnz); + + count += popcount(nnzMask); + base = _mm512_add_epi16(base, increment); + } + count_out = count; + + #elif defined(USE_AVX512) + + constexpr IndexType SimdWidth = 16; // 512 bits / 32 bits + constexpr IndexType NumChunks = InputDimensions / SimdWidth; + const __m512i increment = _mm512_set1_epi32(SimdWidth); + __m512i base = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + + IndexType count = 0; + for (IndexType i = 0; i < NumChunks; ++i) + { + const __m512i inputV = _mm512_load_si512(input + i * SimdWidth * sizeof(std::uint32_t)); + + // Get a bitmask and gather non zero indices + const __mmask16 nnzMask = _mm512_test_epi32_mask(inputV, inputV); + const __m512i nnzV = _mm512_maskz_compress_epi32(nnzMask, base); + _mm512_mask_cvtepi32_storeu_epi16(out + count, 
0xFFFF, nnzV); + count += popcount(nnzMask); + base = _mm512_add_epi32(base, increment); + } + count_out = count; + + #else + + using namespace SIMD; + + constexpr IndexType InputSimdWidth = sizeof(vec_uint_t) / sizeof(std::int32_t); + // Outputs are processed 8 elements at a time, even if the SIMD width is narrower + constexpr IndexType ChunkSize = 8; + constexpr IndexType NumChunks = InputDimensions / ChunkSize; + constexpr IndexType InputsPerChunk = ChunkSize / InputSimdWidth; + + static_assert(InputsPerChunk > 0 && "SIMD width too wide"); + + const auto inputVector = reinterpret_cast(input); + IndexType count = 0; + vec128_t base = vec128_zero; + const vec128_t increment = vec128_set_16(8); + for (IndexType i = 0; i < NumChunks; ++i) + { + // bitmask of nonzero values in this chunk + unsigned nnz = 0; + for (IndexType j = 0; j < InputsPerChunk; ++j) + { + const vec_uint_t inputChunk = inputVector[i * InputsPerChunk + j]; + nnz |= unsigned(vec_nnz(inputChunk)) << (j * InputSimdWidth); + } + const vec128_t offsets = + vec128_load(reinterpret_cast(&Lookup.offset_indices[nnz])); + vec128_storeu(reinterpret_cast(out + count), vec128_add(base, offsets)); + count += popcount(nnz); + base = vec128_add(base, increment); + } + count_out = count; + #endif +} + +#endif + +// Sparse input implementation +template +class AffineTransformSparseInput { + public: + // Input/output type + using InputType = std::uint8_t; + using OutputType = std::int32_t; + + // Number of input/output dimensions + static constexpr IndexType InputDimensions = InDims; + static constexpr IndexType OutputDimensions = OutDims; + + static_assert(OutputDimensions % 16 == 0, + "Only implemented for OutputDimensions divisible by 16."); + + static constexpr IndexType PaddedInputDimensions = + ceil_to_multiple(InputDimensions, MaxSimdWidth); + static constexpr IndexType PaddedOutputDimensions = + ceil_to_multiple(OutputDimensions, MaxSimdWidth); + +#if (USE_SSSE3 | (USE_NEON >= 8)) + static constexpr 
IndexType ChunkSize = 4; +#else + static constexpr IndexType ChunkSize = 1; +#endif + + using OutputBuffer = OutputType[PaddedOutputDimensions]; + + // Hash value embedded in the evaluation file + static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) { + std::uint32_t hashValue = 0xCC03DAE4u; + hashValue += OutputDimensions; + hashValue ^= prevHash >> 1; + hashValue ^= prevHash << 31; + return hashValue; + } + + static constexpr IndexType get_weight_index_scrambled(IndexType i) { + return (i / ChunkSize) % (PaddedInputDimensions / ChunkSize) * OutputDimensions * ChunkSize + + i / PaddedInputDimensions * ChunkSize + i % ChunkSize; + } + + static constexpr IndexType get_weight_index(IndexType i) { +#if (USE_SSSE3 | (USE_NEON >= 8)) + return get_weight_index_scrambled(i); +#else + return i; +#endif + } + + // Read network parameters + bool read_parameters(std::istream& stream) { + read_little_endian(stream, biases, OutputDimensions); + for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + weights[get_weight_index(i)] = read_little_endian(stream); + + return !stream.fail(); + } + + // Write network parameters + bool write_parameters(std::ostream& stream) const { + write_little_endian(stream, biases, OutputDimensions); + + for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + write_little_endian(stream, weights[get_weight_index(i)]); + + return !stream.fail(); + } + + std::size_t get_content_hash() const { + std::size_t h = 0; + hash_combine(h, get_raw_data_hash(biases)); + hash_combine(h, get_raw_data_hash(weights)); + hash_combine(h, get_hash_value(0)); + return h; + } + + // Forward propagation + void propagate(const InputType* input, OutputType* output) const { + +#if (USE_SSSE3 | (USE_NEON >= 8)) + #if defined(USE_AVX512) + using invec_t = __m512i; + using outvec_t = __m512i; + #define vec_add_32 _mm512_add_epi32 + #define vec_set_32 _mm512_set1_epi32 + #define vec_add_dpbusd_32 SIMD::m512_add_dpbusd_epi32 + 
#elif defined(USE_AVX2) + using invec_t = __m256i; + using outvec_t = __m256i; + #define vec_add_32 _mm256_add_epi32 + #define vec_set_32 _mm256_set1_epi32 + #define vec_add_dpbusd_32 SIMD::m256_add_dpbusd_epi32 + #elif defined(USE_SSSE3) + using invec_t = __m128i; + using outvec_t = __m128i; + #define vec_set_32 _mm_set1_epi32 + #define vec_add_dpbusd_32 SIMD::m128_add_dpbusd_epi32 + #elif defined(USE_NEON_DOTPROD) + using invec_t = int8x16_t; + using outvec_t = int32x4_t; + #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a)) + #define vec_add_dpbusd_32 SIMD::dotprod_m128_add_dpbusd_epi32 + #elif defined(USE_NEON) + using invec_t = int8x16_t; + using outvec_t = int32x4_t; + #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a)) + #define vec_add_dpbusd_32 SIMD::neon_m128_add_dpbusd_epi32 + #endif + constexpr IndexType OutputSimdWidth = sizeof(outvec_t) / sizeof(OutputType); + constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / ChunkSize; + constexpr IndexType NumAccums = OutputDimensions / OutputSimdWidth; + // If we're using high-latency dot product instructions, split the accumulators + // to create 3 separate dependency chains and merge at the end + constexpr IndexType NumRegs = + #if defined(USE_VNNI) + 3 * NumAccums; + #else + NumAccums; + #endif + std::uint16_t nnz[NumChunks]; + IndexType count; + + // Find indices of nonzero 32-bit blocks + find_nnz(input, nnz, count); + + const outvec_t* biasvec = reinterpret_cast(biases); + outvec_t acc[NumRegs]; + for (IndexType k = 0; k < NumAccums; ++k) + acc[k] = biasvec[k]; + + const auto* start = nnz; + const auto* end = nnz + count; + + // convince GCC to not do weird pointer arithmetic in the following loop + const std::int8_t* weights_cp = weights; + #if defined(USE_VNNI) + for (IndexType k = NumAccums; k < NumRegs; ++k) + acc[k] = vec_zero(); + + while (start < end - 2) + { + const std::ptrdiff_t i0 = *start++; + const std::ptrdiff_t i1 = *start++; + const std::ptrdiff_t i2 = 
*start++; + const invec_t in0 = + vec_set_32(load_as(input + i0 * sizeof(std::int32_t))); + const invec_t in1 = + vec_set_32(load_as(input + i1 * sizeof(std::int32_t))); + const invec_t in2 = + vec_set_32(load_as(input + i2 * sizeof(std::int32_t))); + const auto col0 = + reinterpret_cast(&weights_cp[i0 * OutputDimensions * ChunkSize]); + const auto col1 = + reinterpret_cast(&weights_cp[i1 * OutputDimensions * ChunkSize]); + const auto col2 = + reinterpret_cast(&weights_cp[i2 * OutputDimensions * ChunkSize]); + for (IndexType k = 0; k < NumAccums; ++k) + { + vec_add_dpbusd_32(acc[k], in0, col0[k]); + vec_add_dpbusd_32(acc[k + NumAccums], in1, col1[k]); + vec_add_dpbusd_32(acc[k + 2 * NumAccums], in2, col2[k]); + } + } + for (IndexType k = 0; k < NumAccums; ++k) + acc[k] = vec_add_32(vec_add_32(acc[k], acc[k + NumAccums]), acc[k + 2 * NumAccums]); + #endif + while (start < end) + { + const std::ptrdiff_t i = *start++; + const invec_t in = vec_set_32(load_as(input + i * sizeof(std::int32_t))); + const auto col = + reinterpret_cast(&weights_cp[i * OutputDimensions * ChunkSize]); + for (IndexType k = 0; k < NumAccums; ++k) + vec_add_dpbusd_32(acc[k], in, col[k]); + } + + outvec_t* outptr = reinterpret_cast(output); + for (IndexType k = 0; k < NumAccums; ++k) + outptr[k] = acc[k]; + + #undef vec_set_32 + #undef vec_add_dpbusd_32 + #ifdef vec_add_32 + #undef vec_add_32 + #endif +#else + // Use dense implementation for the other architectures. 
+ affine_transform_non_ssse3( + output, weights, biases, input); +#endif + } + + private: + using BiasType = OutputType; + using WeightType = std::int8_t; + + alignas(CacheLineSize) BiasType biases[OutputDimensions]; + alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions]; +}; + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h new file mode 100644 index 0000000000000000000000000000000000000000..9ce85d3fa79c2be45a41f1eb47e7b737222a5cc5 --- /dev/null +++ b/src/nnue/layers/clipped_relu.h @@ -0,0 +1,170 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +// Definition of layer ClippedReLU of NNUE evaluation function + +#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED + +#include +#include +#include + +#include "../nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +// Clipped ReLU +template +class ClippedReLU { + public: + // Input/output type + using InputType = std::int32_t; + using OutputType = std::uint8_t; + + // Number of input/output dimensions + static constexpr IndexType InputDimensions = InDims; + static constexpr IndexType OutputDimensions = InputDimensions; + static constexpr IndexType PaddedOutputDimensions = + ceil_to_multiple(OutputDimensions, 32); + + using OutputBuffer = OutputType[PaddedOutputDimensions]; + + // Hash value embedded in the evaluation file + static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) { + std::uint32_t hashValue = 0x538D24C7u; + hashValue += prevHash; + return hashValue; + } + + // Read network parameters + bool read_parameters(std::istream&) { return true; } + + // Write network parameters + bool write_parameters(std::ostream&) const { return true; } + + std::size_t get_content_hash() const { + std::size_t h = 0; + hash_combine(h, get_hash_value(0)); + return h; + } + + // Forward propagation + void propagate(const InputType* input, OutputType* output) const { + +#if defined(USE_AVX2) + if constexpr (InputDimensions % SimdWidth == 0) + { + constexpr IndexType NumChunks = InputDimensions / SimdWidth; + const __m256i Offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m256i*>(output); + for (IndexType i = 0; i < NumChunks; ++i) + { + const __m256i words0 = + _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 0]), + _mm256_load_si256(&in[i * 4 + 1])), + WeightScaleBits); + const __m256i words1 = + _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 2]), + _mm256_load_si256(&in[i * 4 + 3])), 
+ WeightScaleBits); + _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32( + _mm256_packs_epi16(words0, words1), Offsets)); + } + } + else + { + constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2); + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m128i*>(output); + for (IndexType i = 0; i < NumChunks; ++i) + { + const __m128i words0 = _mm_srli_epi16( + _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), + WeightScaleBits); + const __m128i words1 = _mm_srli_epi16( + _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), + WeightScaleBits); + _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1)); + } + } + constexpr IndexType Start = InputDimensions % SimdWidth == 0 + ? InputDimensions / SimdWidth * SimdWidth + : InputDimensions / (SimdWidth / 2) * (SimdWidth / 2); + +#elif defined(USE_SSE2) + constexpr IndexType NumChunks = InputDimensions / SimdWidth; + + #ifndef USE_SSE41 + const __m128i k0x80s = _mm_set1_epi8(-128); + #endif + + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m128i*>(output); + for (IndexType i = 0; i < NumChunks; ++i) + { + #if defined(USE_SSE41) + const __m128i words0 = _mm_srli_epi16( + _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), + WeightScaleBits); + const __m128i words1 = _mm_srli_epi16( + _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), + WeightScaleBits); + _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1)); + #else + const __m128i words0 = _mm_srai_epi16( + _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), + WeightScaleBits); + const __m128i words1 = _mm_srai_epi16( + _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), + WeightScaleBits); + const __m128i packedbytes = _mm_packs_epi16(words0, words1); + _mm_store_si128(&out[i], 
_mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)); + #endif + } + constexpr IndexType Start = NumChunks * SimdWidth; + +#elif defined(USE_NEON) + constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2); + const SIMD::vec_i8x8_t Zero = {0}; + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast(output); + for (IndexType i = 0; i < NumChunks; ++i) + { + int16x8_t shifted; + const auto pack = reinterpret_cast(&shifted); + pack[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits); + pack[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits); + out[i] = vmax_s8(vqmovn_s16(shifted), Zero); + } + constexpr IndexType Start = NumChunks * (SimdWidth / 2); +#else + constexpr IndexType Start = 0; +#endif + + for (IndexType i = Start; i < InputDimensions; ++i) + { + output[i] = static_cast(std::clamp(input[i] >> WeightScaleBits, 0, 127)); + } + } +}; + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED diff --git a/src/nnue/layers/sqr_clipped_relu.h b/src/nnue/layers/sqr_clipped_relu.h new file mode 100644 index 0000000000000000000000000000000000000000..53412d014a42fc4d45c79f4f84e93f3efcbfb965 --- /dev/null +++ b/src/nnue/layers/sqr_clipped_relu.h @@ -0,0 +1,109 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +// Definition of layer ClippedReLU of NNUE evaluation function + +#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#define NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED + +#include +#include +#include + +#include "../nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +// Clipped ReLU +template +class SqrClippedReLU { + public: + // Input/output type + using InputType = std::int32_t; + using OutputType = std::uint8_t; + + // Number of input/output dimensions + static constexpr IndexType InputDimensions = InDims; + static constexpr IndexType OutputDimensions = InputDimensions; + static constexpr IndexType PaddedOutputDimensions = + ceil_to_multiple(OutputDimensions, 32); + + using OutputBuffer = OutputType[PaddedOutputDimensions]; + + // Hash value embedded in the evaluation file + static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) { + std::uint32_t hashValue = 0x538D24C7u; + hashValue += prevHash; + return hashValue; + } + + // Read network parameters + bool read_parameters(std::istream&) { return true; } + + // Write network parameters + bool write_parameters(std::ostream&) const { return true; } + + std::size_t get_content_hash() const { + std::size_t h = 0; + hash_combine(h, get_hash_value(0)); + return h; + } + + // Forward propagation + void propagate(const InputType* input, OutputType* output) const { + +#if defined(USE_SSE2) + constexpr IndexType NumChunks = InputDimensions / 16; + + static_assert(WeightScaleBits == 6); + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m128i*>(output); + for (IndexType i = 0; i < NumChunks; ++i) + { + __m128i words0 = + _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])); + __m128i words1 = + _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])); + + // We shift by WeightScaleBits * 2 = 12 and divide by 128 + // which is an additional shift-right of 7, meaning 19 in total. 
+ // MulHi strips the lower 16 bits so we need to shift out 3 more to match. + words0 = _mm_srli_epi16(_mm_mulhi_epi16(words0, words0), 3); + words1 = _mm_srli_epi16(_mm_mulhi_epi16(words1, words1), 3); + + _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1)); + } + constexpr IndexType Start = NumChunks * 16; + +#else + constexpr IndexType Start = 0; +#endif + + for (IndexType i = Start; i < InputDimensions; ++i) + { + output[i] = static_cast( + // Really should be /127 but we need to make it fast so we right-shift + // by an extra 7 bits instead. Needs to be accounted for in the trainer. + std::min(127ll, ((long long) (input[i]) * input[i]) >> (2 * WeightScaleBits + 7))); + } + } +}; + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp new file mode 100644 index 0000000000000000000000000000000000000000..88fd6e977d8a78273b19faf92269f5726f69492f --- /dev/null +++ b/src/nnue/network.cpp @@ -0,0 +1,415 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "network.h" + +#include +#include +#include +#include +#include +#include + +#define INCBIN_SILENCE_BITCODE_WARNING +#include "../incbin/incbin.h" + +#include "../evaluate.h" +#include "../misc.h" +#include "../position.h" +#include "../types.h" +#include "nnue_architecture.h" +#include "nnue_common.h" +#include "nnue_misc.h" + +// Macro to embed the default efficiently updatable neural network (NNUE) file +// data in the engine binary (using incbin.h, by Dale Weiler). +// This macro invocation will declare the following three variables +// const unsigned char gEmbeddedNNUEData[]; // a pointer to the embedded data +// const unsigned char *const gEmbeddedNNUEEnd; // a marker to the end +// const unsigned int gEmbeddedNNUESize; // the size of the embedded file +// Note that this does not work in Microsoft Visual Studio. +#if !defined(_MSC_VER) && !defined(NNUE_EMBEDDING_OFF) +INCBIN(EmbeddedNNUEBig, EvalFileDefaultNameBig); +INCBIN(EmbeddedNNUESmall, EvalFileDefaultNameSmall); +#else +const unsigned char gEmbeddedNNUEBigData[1] = {0x0}; +const unsigned char* const gEmbeddedNNUEBigEnd = &gEmbeddedNNUEBigData[1]; +const unsigned int gEmbeddedNNUEBigSize = 1; +const unsigned char gEmbeddedNNUESmallData[1] = {0x0}; +const unsigned char* const gEmbeddedNNUESmallEnd = &gEmbeddedNNUESmallData[1]; +const unsigned int gEmbeddedNNUESmallSize = 1; +#endif + +namespace { + +struct EmbeddedNNUE { + EmbeddedNNUE(const unsigned char* embeddedData, + const unsigned char* embeddedEnd, + const unsigned int embeddedSize) : + data(embeddedData), + end(embeddedEnd), + size(embeddedSize) {} + const unsigned char* data; + const unsigned char* end; + const unsigned int size; +}; + +using namespace Stockfish::Eval::NNUE; + +EmbeddedNNUE get_embedded(EmbeddedNNUEType type) { + if (type == EmbeddedNNUEType::BIG) + return EmbeddedNNUE(gEmbeddedNNUEBigData, gEmbeddedNNUEBigEnd, gEmbeddedNNUEBigSize); + else + return EmbeddedNNUE(gEmbeddedNNUESmallData, gEmbeddedNNUESmallEnd, 
gEmbeddedNNUESmallSize); +} + +} + + +namespace Stockfish::Eval::NNUE { + + +namespace Detail { + +// Read evaluation function parameters +template +bool read_parameters(std::istream& stream, T& reference) { + std::uint32_t header; + header = read_little_endian(stream); + if (!stream) + return false; + return reference.read_parameters(stream); +} + +// Write evaluation function parameters +template +bool write_parameters(std::ostream& stream, const T& reference) { + + write_little_endian(stream, T::get_hash_value()); + return reference.write_parameters(stream); +} + +} // namespace Detail + +template +void Network::load(const std::string& rootDirectory, std::string evalfilePath) { +#if defined(DEFAULT_NNUE_DIRECTORY) + std::vector dirs = {"", "", rootDirectory, + stringify(DEFAULT_NNUE_DIRECTORY)}; +#else + std::vector dirs = {"", "", rootDirectory}; +#endif + + if (evalfilePath.empty()) + evalfilePath = evalFile.defaultName; + + for (const auto& directory : dirs) + { + if (std::string(evalFile.current) != evalfilePath) + { + if (directory != "") + { + load_user_net(directory, evalfilePath); + } + + if (directory == "" && evalfilePath == std::string(evalFile.defaultName)) + { + load_internal(); + } + } + } +} + + +template +bool Network::save(const std::optional& filename) const { + std::string actualFilename; + std::string msg; + + if (filename.has_value()) + actualFilename = filename.value(); + else + { + if (std::string(evalFile.current) != std::string(evalFile.defaultName)) + { + msg = "Failed to export a net. " + "A non-embedded net can only be saved if the filename is specified"; + + sync_cout << msg << sync_endl; + return false; + } + + actualFilename = evalFile.defaultName; + } + + std::ofstream stream(actualFilename, std::ios_base::binary); + bool saved = save(stream, evalFile.current, evalFile.netDescription); + + msg = saved ? 
"Network saved successfully to " + actualFilename : "Failed to export a net"; + + sync_cout << msg << sync_endl; + return saved; +} + + +template +NetworkOutput +Network::evaluate(const Position& pos, + AccumulatorStack& accumulatorStack, + AccumulatorCaches::Cache& cache) const { + + constexpr uint64_t alignment = CacheLineSize; + + alignas(alignment) + TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; + + ASSERT_ALIGNED(transformedFeatures, alignment); + + const int bucket = (pos.count() - 1) / 4; + const auto psqt = + featureTransformer.transform(pos, accumulatorStack, cache, transformedFeatures, bucket); + const auto positional = network[bucket].propagate(transformedFeatures); + return {static_cast(psqt / OutputScale), static_cast(positional / OutputScale)}; +} + + +template +void Network::verify(std::string evalfilePath, + const std::function& f) const { + if (evalfilePath.empty()) + evalfilePath = evalFile.defaultName; + + if (std::string(evalFile.current) != evalfilePath) + { + if (f) + { + std::string msg1 = + "Network evaluation parameters compatible with the engine must be available."; + std::string msg2 = "The network file " + evalfilePath + " was not loaded successfully."; + std::string msg3 = "The UCI option EvalFile might need to specify the full path, " + "including the directory name, to the network file."; + std::string msg4 = "The default net can be downloaded from: " + "https://tests.stockfishchess.org/api/nn/" + + std::string(evalFile.defaultName); + std::string msg5 = "The engine will be terminated now."; + + std::string msg = "ERROR: " + msg1 + '\n' + "ERROR: " + msg2 + '\n' + "ERROR: " + msg3 + + '\n' + "ERROR: " + msg4 + '\n' + "ERROR: " + msg5 + '\n'; + + f(msg); + } + + exit(EXIT_FAILURE); + } + + if (f) + { + size_t size = sizeof(featureTransformer) + sizeof(Arch) * LayerStacks; + f("NNUE evaluation using " + evalfilePath + " (" + std::to_string(size / (1024 * 1024)) + + "MiB, (" + 
std::to_string(featureTransformer.TotalInputDimensions) + ", " + + std::to_string(network[0].TransformedFeatureDimensions) + ", " + + std::to_string(network[0].FC_0_OUTPUTS) + ", " + std::to_string(network[0].FC_1_OUTPUTS) + + ", 1))"); + } +} + + +template +NnueEvalTrace +Network::trace_evaluate(const Position& pos, + AccumulatorStack& accumulatorStack, + AccumulatorCaches::Cache& cache) const { + + constexpr uint64_t alignment = CacheLineSize; + + alignas(alignment) + TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; + + ASSERT_ALIGNED(transformedFeatures, alignment); + + NnueEvalTrace t{}; + t.correctBucket = (pos.count() - 1) / 4; + for (IndexType bucket = 0; bucket < LayerStacks; ++bucket) + { + const auto materialist = + featureTransformer.transform(pos, accumulatorStack, cache, transformedFeatures, bucket); + const auto positional = network[bucket].propagate(transformedFeatures); + + t.psqt[bucket] = static_cast(materialist / OutputScale); + t.positional[bucket] = static_cast(positional / OutputScale); + } + + return t; +} + + +template +void Network::load_user_net(const std::string& dir, + const std::string& evalfilePath) { + std::ifstream stream(dir + evalfilePath, std::ios::binary); + auto description = load(stream); + + if (description.has_value()) + { + evalFile.current = evalfilePath; + evalFile.netDescription = description.value(); + } +} + + +template +void Network::load_internal() { + // C++ way to prepare a buffer for a memory stream + class MemoryBuffer: public std::basic_streambuf { + public: + MemoryBuffer(char* p, size_t n) { + setg(p, p, p + n); + setp(p, p + n); + } + }; + + const auto embedded = get_embedded(embeddedType); + + MemoryBuffer buffer(const_cast(reinterpret_cast(embedded.data)), + size_t(embedded.size)); + + std::istream stream(&buffer); + auto description = load(stream); + + if (description.has_value()) + { + evalFile.current = evalFile.defaultName; + evalFile.netDescription = description.value(); + } 
+} + + +template +void Network::initialize() { + initialized = true; +} + + +template +bool Network::save(std::ostream& stream, + const std::string& name, + const std::string& netDescription) const { + if (name.empty() || name == "None") + return false; + + return write_parameters(stream, netDescription); +} + + +template +std::optional Network::load(std::istream& stream) { + initialize(); + std::string description; + + return read_parameters(stream, description) ? std::make_optional(description) : std::nullopt; +} + + +template +std::size_t Network::get_content_hash() const { + if (!initialized) + return 0; + + std::size_t h = 0; + hash_combine(h, featureTransformer); + for (auto&& layerstack : network) + hash_combine(h, layerstack); + hash_combine(h, evalFile); + hash_combine(h, static_cast(embeddedType)); + return h; +} + +// Read network header +template +bool Network::read_header(std::istream& stream, + std::uint32_t* hashValue, + std::string* desc) const { + std::uint32_t magic; + + magic = read_little_endian(stream); + *hashValue = read_little_endian(stream); + std::uint32_t size = read_little_endian(stream); + if (!stream || magic != Version) + return false; + desc->resize(size); + stream.read(&(*desc)[0], size); + return !stream.fail(); +} + + +// Write network header +template +bool Network::write_header(std::ostream& stream, + std::uint32_t hashValue, + const std::string& desc) const { + write_little_endian(stream, Version); + write_little_endian(stream, hashValue); + write_little_endian(stream, std::uint32_t(desc.size())); + stream.write(&desc[0], desc.size()); + return !stream.fail(); +} + + +template +bool Network::read_parameters(std::istream& stream, + std::string& netDescription) { + std::uint32_t hashValue; + if (!read_header(stream, &hashValue, &netDescription)) + return false; + if (false && hashValue != Network::hash) + return false; + if (!Detail::read_parameters(stream, featureTransformer)) + return false; + for (std::size_t i = 0; i < 
LayerStacks; ++i) + { + if (!Detail::read_parameters(stream, network[i])) + return false; + } + return stream && stream.peek() == std::ios::traits_type::eof(); +} + + +template +bool Network::write_parameters(std::ostream& stream, + const std::string& netDescription) const { + if (!write_header(stream, Network::hash, netDescription)) + return false; + if (!Detail::write_parameters(stream, featureTransformer)) + return false; + for (std::size_t i = 0; i < LayerStacks; ++i) + { + if (!Detail::write_parameters(stream, network[i])) + return false; + } + return bool(stream); +} + +// Explicit template instantiations + +template class Network, + FeatureTransformer>; + +template class Network, + FeatureTransformer>; + +} // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/network.h b/src/nnue/network.h new file mode 100644 index 0000000000000000000000000000000000000000..cb433718d43f1f9c25b4ffb6dc9005dcd465b32d --- /dev/null +++ b/src/nnue/network.h @@ -0,0 +1,161 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef NETWORK_H_INCLUDED +#define NETWORK_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../misc.h" +#include "../types.h" +#include "nnue_accumulator.h" +#include "nnue_architecture.h" +#include "nnue_common.h" +#include "nnue_feature_transformer.h" +#include "nnue_misc.h" + +namespace Stockfish { +class Position; +} + +namespace Stockfish::Eval::NNUE { + +enum class EmbeddedNNUEType { + BIG, + SMALL, +}; + +using NetworkOutput = std::tuple; + +// The network must be a trivial type, i.e. the memory must be in-line. +// This is required to allow sharing the network via shared memory, as +// there is no way to run destructors. +template +class Network { + static constexpr IndexType FTDimensions = Arch::TransformedFeatureDimensions; + + public: + Network(EvalFile file, EmbeddedNNUEType type) : + evalFile(file), + embeddedType(type) {} + + Network(const Network& other) = default; + Network(Network&& other) = default; + + Network& operator=(const Network& other) = default; + Network& operator=(Network&& other) = default; + + void load(const std::string& rootDirectory, std::string evalfilePath); + bool save(const std::optional& filename) const; + + std::size_t get_content_hash() const; + + NetworkOutput evaluate(const Position& pos, + AccumulatorStack& accumulatorStack, + AccumulatorCaches::Cache& cache) const; + + + void verify(std::string evalfilePath, const std::function&) const; + NnueEvalTrace trace_evaluate(const Position& pos, + AccumulatorStack& accumulatorStack, + AccumulatorCaches::Cache& cache) const; + + private: + void load_user_net(const std::string&, const std::string&); + void load_internal(); + + void initialize(); + + bool save(std::ostream&, const std::string&, const std::string&) const; + std::optional load(std::istream&); + + bool read_header(std::istream&, std::uint32_t*, std::string*) const; + bool write_header(std::ostream&, std::uint32_t, const std::string&) const; + + 
bool read_parameters(std::istream&, std::string&); + bool write_parameters(std::ostream&, const std::string&) const; + + // Input feature converter + Transformer featureTransformer; + + // Evaluation function + Arch network[LayerStacks]; + + EvalFile evalFile; + EmbeddedNNUEType embeddedType; + + bool initialized = false; + + // Hash value of evaluation function structure + static constexpr std::uint32_t hash = Transformer::get_hash_value() ^ Arch::get_hash_value(); + + template + friend struct AccumulatorCaches::Cache; +}; + +// Definitions of the network types +using SmallFeatureTransformer = FeatureTransformer; +using SmallNetworkArchitecture = + NetworkArchitecture; + +using BigFeatureTransformer = FeatureTransformer; +using BigNetworkArchitecture = NetworkArchitecture; + +using NetworkBig = Network; +using NetworkSmall = Network; + + +struct Networks { + Networks(EvalFile bigFile, EvalFile smallFile) : + big(bigFile, EmbeddedNNUEType::BIG), + small(smallFile, EmbeddedNNUEType::SMALL) {} + + NetworkBig big; + NetworkSmall small; +}; + + +} // namespace Stockfish + +template +struct std::hash> { + std::size_t operator()( + const Stockfish::Eval::NNUE::Network& network) const noexcept { + return network.get_content_hash(); + } +}; + +template<> +struct std::hash { + std::size_t operator()(const Stockfish::Eval::NNUE::Networks& networks) const noexcept { + std::size_t h = 0; + Stockfish::hash_combine(h, networks.big); + Stockfish::hash_combine(h, networks.small); + return h; + } +}; + +#endif diff --git a/src/nnue/nnue_accumulator.cpp b/src/nnue/nnue_accumulator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3f588e37d2bbdacf91b8a04068c019520bed24e5 --- /dev/null +++ b/src/nnue/nnue_accumulator.cpp @@ -0,0 +1,952 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the 
terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "nnue_accumulator.h" + +#include +#include +#include +#include + +#include "../bitboard.h" +#include "../misc.h" +#include "../position.h" +#include "../types.h" +#include "features/half_ka_v2_hm.h" +#include "nnue_architecture.h" +#include "nnue_common.h" +#include "nnue_feature_transformer.h" // IWYU pragma: keep +#include "simd.h" + +namespace Stockfish::Eval::NNUE { + +using namespace SIMD; + +namespace { + +template +void double_inc_update(Color perspective, + const FeatureTransformer& featureTransformer, + const Square ksq, + AccumulatorState& middle_state, + AccumulatorState& target_state, + const AccumulatorState& computed); + +template +void double_inc_update(Color perspective, + const FeatureTransformer& featureTransformer, + const Square ksq, + AccumulatorState& middle_state, + AccumulatorState& target_state, + const AccumulatorState& computed, + const DirtyPiece& dp2); + +template +void update_accumulator_incremental( + Color perspective, + const FeatureTransformer& featureTransformer, + const Square ksq, + AccumulatorState& target_state, + const AccumulatorState& computed); + +template +void update_accumulator_refresh_cache(Color perspective, + const FeatureTransformer& featureTransformer, + const Position& pos, + AccumulatorState& accumulatorState, + AccumulatorCaches::Cache& cache); + +template +void update_threats_accumulator_full(Color perspective, + const FeatureTransformer& featureTransformer, + const Position& pos, + 
AccumulatorState& accumulatorState); +} + +template +const AccumulatorState& AccumulatorStack::latest() const noexcept { + return accumulators()[size - 1]; +} + +// Explicit template instantiations +template const AccumulatorState& AccumulatorStack::latest() const noexcept; +template const AccumulatorState& AccumulatorStack::latest() const noexcept; + +template +AccumulatorState& AccumulatorStack::mut_latest() noexcept { + return mut_accumulators()[size - 1]; +} + +template +const std::array, AccumulatorStack::MaxSize>& +AccumulatorStack::accumulators() const noexcept { + static_assert(std::is_same_v || std::is_same_v, + "Invalid Feature Set Type"); + + if constexpr (std::is_same_v) + return psq_accumulators; + + if constexpr (std::is_same_v) + return threat_accumulators; +} + +template +std::array, AccumulatorStack::MaxSize>& +AccumulatorStack::mut_accumulators() noexcept { + static_assert(std::is_same_v || std::is_same_v, + "Invalid Feature Set Type"); + + if constexpr (std::is_same_v) + return psq_accumulators; + + if constexpr (std::is_same_v) + return threat_accumulators; +} + +void AccumulatorStack::reset() noexcept { + psq_accumulators[0].reset({}); + threat_accumulators[0].reset({}); + size = 1; +} + +std::pair AccumulatorStack::push() noexcept { + assert(size < MaxSize); + auto& dp = psq_accumulators[size].reset(); + auto& dts = threat_accumulators[size].reset(); + new (&dts) DirtyThreats; + size++; + return {dp, dts}; +} + +void AccumulatorStack::pop() noexcept { + assert(size > 1); + size--; +} + +template +void AccumulatorStack::evaluate(const Position& pos, + const FeatureTransformer& featureTransformer, + AccumulatorCaches::Cache& cache) noexcept { + constexpr bool UseThreats = (Dimensions == TransformedFeatureDimensionsBig); + + evaluate_side(WHITE, pos, featureTransformer, cache); + + if (UseThreats) + evaluate_side(WHITE, pos, featureTransformer, cache); + + evaluate_side(BLACK, pos, featureTransformer, cache); + + if (UseThreats) + 
evaluate_side(BLACK, pos, featureTransformer, cache); +} + +template +void AccumulatorStack::evaluate_side(Color perspective, + const Position& pos, + const FeatureTransformer& featureTransformer, + AccumulatorCaches::Cache& cache) noexcept { + + const auto last_usable_accum = + find_last_usable_accumulator(perspective); + + if ((accumulators()[last_usable_accum].template acc()) + .computed[perspective]) + forward_update_incremental(perspective, pos, featureTransformer, + last_usable_accum); + + else + { + if constexpr (std::is_same_v) + update_accumulator_refresh_cache(perspective, featureTransformer, pos, + mut_latest(), cache); + else + update_threats_accumulator_full(perspective, featureTransformer, pos, + mut_latest()); + + backward_update_incremental(perspective, pos, featureTransformer, + last_usable_accum); + } +} + +// Find the earliest usable accumulator, this can either be a computed accumulator or the accumulator +// state just before a change that requires full refresh. +template +std::size_t AccumulatorStack::find_last_usable_accumulator(Color perspective) const noexcept { + + for (std::size_t curr_idx = size - 1; curr_idx > 0; curr_idx--) + { + if ((accumulators()[curr_idx].template acc()).computed[perspective]) + return curr_idx; + + if (FeatureSet::requires_refresh(accumulators()[curr_idx].diff, perspective)) + return curr_idx; + } + + return 0; +} + +template +void AccumulatorStack::forward_update_incremental( + Color perspective, + const Position& pos, + const FeatureTransformer& featureTransformer, + const std::size_t begin) noexcept { + + assert(begin < accumulators().size()); + assert((accumulators()[begin].template acc()).computed[perspective]); + + const Square ksq = pos.square(perspective); + + for (std::size_t next = begin + 1; next < size; next++) + { + if (next + 1 < size) + { + DirtyPiece& dp1 = mut_accumulators()[next].diff; + DirtyPiece& dp2 = mut_accumulators()[next + 1].diff; + + auto& accumulators = mut_accumulators(); + + if 
constexpr (std::is_same_v) + { + if (dp2.remove_sq != SQ_NONE + && (accumulators[next].diff.threateningSqs & square_bb(dp2.remove_sq))) + { + double_inc_update(perspective, featureTransformer, ksq, accumulators[next], + accumulators[next + 1], accumulators[next - 1], dp2); + next++; + continue; + } + } + + if constexpr (std::is_same_v) + { + if (dp1.to != SQ_NONE && dp1.to == dp2.remove_sq) + { + const Square captureSq = dp1.to; + dp1.to = dp2.remove_sq = SQ_NONE; + double_inc_update(perspective, featureTransformer, ksq, accumulators[next], + accumulators[next + 1], accumulators[next - 1]); + dp1.to = dp2.remove_sq = captureSq; + next++; + continue; + } + } + } + + update_accumulator_incremental(perspective, featureTransformer, ksq, + mut_accumulators()[next], + accumulators()[next - 1]); + } + + assert((latest().acc()).computed[perspective]); +} + +template +void AccumulatorStack::backward_update_incremental( + Color perspective, + + const Position& pos, + const FeatureTransformer& featureTransformer, + const std::size_t end) noexcept { + + assert(end < accumulators().size()); + assert(end < size); + assert((latest().template acc()).computed[perspective]); + + const Square ksq = pos.square(perspective); + + for (std::int64_t next = std::int64_t(size) - 2; next >= std::int64_t(end); next--) + update_accumulator_incremental(perspective, featureTransformer, ksq, + mut_accumulators()[next], + accumulators()[next + 1]); + + assert((accumulators()[end].template acc()).computed[perspective]); +} + +// Explicit template instantiations +template void AccumulatorStack::evaluate( + const Position& pos, + const FeatureTransformer& featureTransformer, + AccumulatorCaches::Cache& cache) noexcept; +template void AccumulatorStack::evaluate( + const Position& pos, + const FeatureTransformer& featureTransformer, + AccumulatorCaches::Cache& cache) noexcept; + + +namespace { + +template, bool> = true> +void fused_row_reduce(const ElementType* in, ElementType* out, const Ts* const... 
rows) { + constexpr IndexType size = Width * sizeof(ElementType) / sizeof(typename VectorWrapper::type); + + auto* vecIn = reinterpret_cast(in); + auto* vecOut = reinterpret_cast(out); + + for (IndexType i = 0; i < size; ++i) + vecOut[i] = fused( + vecIn[i], reinterpret_cast(rows)[i]...); +} + +template +struct AccumulatorUpdateContext { + Color perspective; + const FeatureTransformer& featureTransformer; + const AccumulatorState& from; + AccumulatorState& to; + + AccumulatorUpdateContext(Color persp, + const FeatureTransformer& ft, + const AccumulatorState& accF, + AccumulatorState& accT) noexcept : + perspective{persp}, + featureTransformer{ft}, + from{accF}, + to{accT} {} + + template, bool> = true> + void apply(const Ts... indices) { + auto to_weight_vector = [&](const IndexType index) { + return &featureTransformer.weights[index * Dimensions]; + }; + + auto to_psqt_weight_vector = [&](const IndexType index) { + return &featureTransformer.psqtWeights[index * PSQTBuckets]; + }; + + fused_row_reduce( + (from.template acc()).accumulation[perspective].data(), + (to.template acc()).accumulation[perspective].data(), + to_weight_vector(indices)...); + + fused_row_reduce( + (from.template acc()).psqtAccumulation[perspective].data(), + (to.template acc()).psqtAccumulation[perspective].data(), + to_psqt_weight_vector(indices)...); + } + + void apply(const typename FeatureSet::IndexList& added, + const typename FeatureSet::IndexList& removed) { + const auto& fromAcc = from.template acc().accumulation[perspective]; + auto& toAcc = to.template acc().accumulation[perspective]; + + const auto& fromPsqtAcc = from.template acc().psqtAccumulation[perspective]; + auto& toPsqtAcc = to.template acc().psqtAccumulation[perspective]; + +#ifdef VECTOR + using Tiling = SIMDTiling; + vec_t acc[Tiling::NumRegs]; + psqt_vec_t psqt[Tiling::NumPsqtRegs]; + + const auto* threatWeights = &featureTransformer.threatWeights[0]; + + for (IndexType j = 0; j < Dimensions / Tiling::TileHeight; ++j) + 
{ + auto* fromTile = reinterpret_cast(&fromAcc[j * Tiling::TileHeight]); + auto* toTile = reinterpret_cast(&toAcc[j * Tiling::TileHeight]); + + for (IndexType k = 0; k < Tiling::NumRegs; ++k) + acc[k] = fromTile[k]; + + for (int i = 0; i < removed.ssize(); ++i) + { + size_t index = removed[i]; + const size_t offset = Dimensions * index; + auto* column = reinterpret_cast(&threatWeights[offset]); + + #ifdef USE_NEON + for (IndexType k = 0; k < Tiling::NumRegs; k += 2) + { + acc[k] = vec_sub_16(acc[k], vmovl_s8(vget_low_s8(column[k / 2]))); + acc[k + 1] = vec_sub_16(acc[k + 1], vmovl_high_s8(column[k / 2])); + } + #else + for (IndexType k = 0; k < Tiling::NumRegs; ++k) + acc[k] = vec_sub_16(acc[k], vec_convert_8_16(column[k])); + #endif + } + + for (int i = 0; i < added.ssize(); ++i) + { + size_t index = added[i]; + const size_t offset = Dimensions * index; + auto* column = reinterpret_cast(&threatWeights[offset]); + + #ifdef USE_NEON + for (IndexType k = 0; k < Tiling::NumRegs; k += 2) + { + acc[k] = vec_add_16(acc[k], vmovl_s8(vget_low_s8(column[k / 2]))); + acc[k + 1] = vec_add_16(acc[k + 1], vmovl_high_s8(column[k / 2])); + } + #else + for (IndexType k = 0; k < Tiling::NumRegs; ++k) + acc[k] = vec_add_16(acc[k], vec_convert_8_16(column[k])); + #endif + } + + for (IndexType k = 0; k < Tiling::NumRegs; k++) + vec_store(&toTile[k], acc[k]); + + threatWeights += Tiling::TileHeight; + } + + for (IndexType j = 0; j < PSQTBuckets / Tiling::PsqtTileHeight; ++j) + { + auto* fromTilePsqt = + reinterpret_cast(&fromPsqtAcc[j * Tiling::PsqtTileHeight]); + auto* toTilePsqt = + reinterpret_cast(&toPsqtAcc[j * Tiling::PsqtTileHeight]); + + for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k) + psqt[k] = fromTilePsqt[k]; + + for (int i = 0; i < removed.ssize(); ++i) + { + size_t index = removed[i]; + const size_t offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight; + auto* columnPsqt = reinterpret_cast( + &featureTransformer.threatPsqtWeights[offset]); + + for (std::size_t k 
= 0; k < Tiling::NumPsqtRegs; ++k) + psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); + } + + for (int i = 0; i < added.ssize(); ++i) + { + size_t index = added[i]; + const size_t offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight; + auto* columnPsqt = reinterpret_cast( + &featureTransformer.threatPsqtWeights[offset]); + + for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k) + psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); + } + + for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k) + vec_store_psqt(&toTilePsqt[k], psqt[k]); + } + +#else + + toAcc = fromAcc; + toPsqtAcc = fromPsqtAcc; + + for (const auto index : removed) + { + const IndexType offset = Dimensions * index; + + for (IndexType j = 0; j < Dimensions; ++j) + toAcc[j] -= featureTransformer.threatWeights[offset + j]; + + for (std::size_t k = 0; k < PSQTBuckets; ++k) + toPsqtAcc[k] -= featureTransformer.threatPsqtWeights[index * PSQTBuckets + k]; + } + + for (const auto index : added) + { + const IndexType offset = Dimensions * index; + + for (IndexType j = 0; j < Dimensions; ++j) + toAcc[j] += featureTransformer.threatWeights[offset + j]; + + for (std::size_t k = 0; k < PSQTBuckets; ++k) + toPsqtAcc[k] += featureTransformer.threatPsqtWeights[index * PSQTBuckets + k]; + } + +#endif + } +}; + +template +auto make_accumulator_update_context(Color perspective, + const FeatureTransformer& featureTransformer, + const AccumulatorState& accumulatorFrom, + AccumulatorState& accumulatorTo) noexcept { + return AccumulatorUpdateContext{perspective, featureTransformer, + accumulatorFrom, accumulatorTo}; +} + +template +void double_inc_update(Color perspective, + const FeatureTransformer& featureTransformer, + const Square ksq, + AccumulatorState& middle_state, + AccumulatorState& target_state, + const AccumulatorState& computed) { + + assert(computed.acc().computed[perspective]); + assert(!middle_state.acc().computed[perspective]); + assert(!target_state.acc().computed[perspective]); + + 
PSQFeatureSet::IndexList removed, added; + PSQFeatureSet::append_changed_indices(perspective, ksq, middle_state.diff, removed, added); + // you can't capture a piece that was just involved in castling since the rook ends up + // in a square that the king passed + assert(added.size() < 2); + PSQFeatureSet::append_changed_indices(perspective, ksq, target_state.diff, removed, added); + + [[maybe_unused]] const int addedSize = added.ssize(); + [[maybe_unused]] const int removedSize = removed.ssize(); + + assert(addedSize == 1); + assert(removedSize == 2 || removedSize == 3); + + // Workaround compiler warning for uninitialized variables, replicated on + // profile builds on windows with gcc 14.2.0. + // Also helps with optimizations on some compilers. + + sf_assume(addedSize == 1); + sf_assume(removedSize == 2 || removedSize == 3); + + auto updateContext = + make_accumulator_update_context(perspective, featureTransformer, computed, target_state); + + if (removedSize == 2) + { + updateContext.template apply(added[0], removed[0], removed[1]); + } + else + { + updateContext.template apply(added[0], removed[0], removed[1], + removed[2]); + } + + target_state.acc().computed[perspective] = true; +} + +template +void double_inc_update(Color perspective, + const FeatureTransformer& featureTransformer, + const Square ksq, + AccumulatorState& middle_state, + AccumulatorState& target_state, + const AccumulatorState& computed, + const DirtyPiece& dp2) { + + assert(computed.acc().computed[perspective]); + assert(!middle_state.acc().computed[perspective]); + assert(!target_state.acc().computed[perspective]); + + ThreatFeatureSet::FusedUpdateData fusedData; + + fusedData.dp2removed = dp2.remove_sq; + + ThreatFeatureSet::IndexList removed, added; + const auto* pfBase = &featureTransformer.threatWeights[0]; + auto pfStride = static_cast(TransformedFeatureDimensions); + ThreatFeatureSet::append_changed_indices(perspective, ksq, middle_state.diff, removed, added, + &fusedData, true, 
pfBase, pfStride); + ThreatFeatureSet::append_changed_indices(perspective, ksq, target_state.diff, removed, added, + &fusedData, false, pfBase, pfStride); + + auto updateContext = + make_accumulator_update_context(perspective, featureTransformer, computed, target_state); + + updateContext.apply(added, removed); + + target_state.acc().computed[perspective] = true; +} + +template +void update_accumulator_incremental( + Color perspective, + const FeatureTransformer& featureTransformer, + const Square ksq, + AccumulatorState& target_state, + const AccumulatorState& computed) { + + assert((computed.template acc()).computed[perspective]); + assert(!(target_state.template acc()).computed[perspective]); + + // The size must be enough to contain the largest possible update. + // That might depend on the feature set and generally relies on the + // feature set's update cost calculation to be correct and never allow + // updates with more added/removed features than MaxActiveDimensions. + // In this case, the maximum size of both feature addition and removal + // is 2, since we are incrementally updating one move at a time. 
+ typename FeatureSet::IndexList removed, added; + if constexpr (std::is_same_v) + { + const auto* pfBase = &featureTransformer.threatWeights[0]; + auto pfStride = static_cast(TransformedFeatureDimensions); + if constexpr (Forward) + FeatureSet::append_changed_indices(perspective, ksq, target_state.diff, removed, added, + nullptr, false, pfBase, pfStride); + else + FeatureSet::append_changed_indices(perspective, ksq, computed.diff, added, removed, + nullptr, false, pfBase, pfStride); + } + else + { + if constexpr (Forward) + FeatureSet::append_changed_indices(perspective, ksq, target_state.diff, removed, added); + else + FeatureSet::append_changed_indices(perspective, ksq, computed.diff, added, removed); + } + + auto updateContext = + make_accumulator_update_context(perspective, featureTransformer, computed, target_state); + + if constexpr (std::is_same_v) + updateContext.apply(added, removed); + else + { + [[maybe_unused]] const int addedSize = added.ssize(); + [[maybe_unused]] const int removedSize = removed.ssize(); + + assert(addedSize == 1 || addedSize == 2); + assert(removedSize == 1 || removedSize == 2); + assert((Forward && addedSize <= removedSize) || (!Forward && addedSize >= removedSize)); + + // Workaround compiler warning for uninitialized variables, replicated + // on profile builds on windows with gcc 14.2.0. + // Also helps with optimizations on some compilers. 
+ + sf_assume(addedSize == 1 || addedSize == 2); + sf_assume(removedSize == 1 || removedSize == 2); + + if (!(removedSize == 1 || removedSize == 2) || !(addedSize == 1 || addedSize == 2)) + sf_unreachable(); + + if ((Forward && removedSize == 1) || (!Forward && addedSize == 1)) + { + assert(addedSize == 1 && removedSize == 1); + updateContext.template apply(added[0], removed[0]); + } + else if (Forward && addedSize == 1) + { + assert(removedSize == 2); + updateContext.template apply(added[0], removed[0], removed[1]); + } + else if (!Forward && removedSize == 1) + { + assert(addedSize == 2); + updateContext.template apply(added[0], added[1], removed[0]); + } + else + { + assert(addedSize == 2 && removedSize == 2); + updateContext.template apply(added[0], added[1], removed[0], + removed[1]); + } + } + + (target_state.template acc()).computed[perspective] = true; +} + +Bitboard get_changed_pieces(const std::array& oldPieces, + const std::array& newPieces) { +#if defined(USE_AVX512) || defined(USE_AVX2) + static_assert(sizeof(Piece) == 1); + Bitboard sameBB = 0; + + for (int i = 0; i < 64; i += 32) + { + const __m256i old_v = _mm256_loadu_si256(reinterpret_cast(&oldPieces[i])); + const __m256i new_v = _mm256_loadu_si256(reinterpret_cast(&newPieces[i])); + const __m256i cmpEqual = _mm256_cmpeq_epi8(old_v, new_v); + const std::uint32_t equalMask = _mm256_movemask_epi8(cmpEqual); + sameBB |= static_cast(equalMask) << i; + } + return ~sameBB; +#elif defined(USE_NEON) + uint8x16x4_t old_v = vld4q_u8(reinterpret_cast(oldPieces.data())); + uint8x16x4_t new_v = vld4q_u8(reinterpret_cast(newPieces.data())); + auto cmp = [=](const int i) { return vceqq_u8(old_v.val[i], new_v.val[i]); }; + + uint8x16_t cmp0_1 = vsriq_n_u8(cmp(1), cmp(0), 1); + uint8x16_t cmp2_3 = vsriq_n_u8(cmp(3), cmp(2), 1); + uint8x16_t merged = vsriq_n_u8(cmp2_3, cmp0_1, 2); + merged = vsriq_n_u8(merged, merged, 4); + uint8x8_t sameBB = vshrn_n_u16(vreinterpretq_u16_u8(merged), 4); + + return 
~vget_lane_u64(vreinterpret_u64_u8(sameBB), 0); +#else + Bitboard changed = 0; + + for (Square sq = SQUARE_ZERO; sq < SQUARE_NB; ++sq) + changed |= static_cast(oldPieces[sq] != newPieces[sq]) << sq; + + return changed; +#endif +} + +template +void update_accumulator_refresh_cache(Color perspective, + const FeatureTransformer& featureTransformer, + const Position& pos, + AccumulatorState& accumulatorState, + AccumulatorCaches::Cache& cache) { + + using Tiling [[maybe_unused]] = SIMDTiling; + + const Square ksq = pos.square(perspective); + auto& entry = cache[ksq][perspective]; + PSQFeatureSet::IndexList removed, added; + + const Bitboard changedBB = get_changed_pieces(entry.pieces, pos.piece_array()); + Bitboard removedBB = changedBB & entry.pieceBB; + Bitboard addedBB = changedBB & pos.pieces(); + + while (removedBB) + { + Square sq = pop_lsb(removedBB); + removed.push_back(PSQFeatureSet::make_index(perspective, sq, entry.pieces[sq], ksq)); + } + while (addedBB) + { + Square sq = pop_lsb(addedBB); + added.push_back(PSQFeatureSet::make_index(perspective, sq, pos.piece_on(sq), ksq)); + } + + entry.pieceBB = pos.pieces(); + entry.pieces = pos.piece_array(); + + auto& accumulator = accumulatorState.acc(); + accumulator.computed[perspective] = true; + +#ifdef VECTOR + vec_t acc[Tiling::NumRegs]; + psqt_vec_t psqt[Tiling::NumPsqtRegs]; + + const auto* weights = &featureTransformer.weights[0]; + + for (IndexType j = 0; j < Dimensions / Tiling::TileHeight; ++j) + { + auto* accTile = + reinterpret_cast(&accumulator.accumulation[perspective][j * Tiling::TileHeight]); + auto* entryTile = reinterpret_cast(&entry.accumulation[j * Tiling::TileHeight]); + + for (IndexType k = 0; k < Tiling::NumRegs; ++k) + acc[k] = entryTile[k]; + + int i = 0; + for (; i < std::min(removed.ssize(), added.ssize()); ++i) + { + size_t indexR = removed[i]; + const size_t offsetR = Dimensions * indexR; + auto* columnR = reinterpret_cast(&weights[offsetR]); + size_t indexA = added[i]; + const size_t 
offsetA = Dimensions * indexA; + auto* columnA = reinterpret_cast(&weights[offsetA]); + + for (IndexType k = 0; k < Tiling::NumRegs; ++k) + acc[k] = fused(acc[k], columnA[k], columnR[k]); + } + for (; i < removed.ssize(); ++i) + { + size_t index = removed[i]; + const size_t offset = Dimensions * index; + auto* column = reinterpret_cast(&weights[offset]); + + for (IndexType k = 0; k < Tiling::NumRegs; ++k) + acc[k] = vec_sub_16(acc[k], column[k]); + } + for (; i < added.ssize(); ++i) + { + size_t index = added[i]; + const size_t offset = Dimensions * index; + auto* column = reinterpret_cast(&weights[offset]); + + for (IndexType k = 0; k < Tiling::NumRegs; ++k) + acc[k] = vec_add_16(acc[k], column[k]); + } + + for (IndexType k = 0; k < Tiling::NumRegs; k++) + vec_store(&entryTile[k], acc[k]); + for (IndexType k = 0; k < Tiling::NumRegs; k++) + vec_store(&accTile[k], acc[k]); + + weights += Tiling::TileHeight; + } + + for (IndexType j = 0; j < PSQTBuckets / Tiling::PsqtTileHeight; ++j) + { + auto* accTilePsqt = reinterpret_cast( + &accumulator.psqtAccumulation[perspective][j * Tiling::PsqtTileHeight]); + auto* entryTilePsqt = + reinterpret_cast(&entry.psqtAccumulation[j * Tiling::PsqtTileHeight]); + + for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k) + psqt[k] = entryTilePsqt[k]; + + for (int i = 0; i < removed.ssize(); ++i) + { + size_t index = removed[i]; + const size_t offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight; + auto* columnPsqt = + reinterpret_cast(&featureTransformer.psqtWeights[offset]); + + for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k) + psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); + } + for (int i = 0; i < added.ssize(); ++i) + { + size_t index = added[i]; + const size_t offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight; + auto* columnPsqt = + reinterpret_cast(&featureTransformer.psqtWeights[offset]); + + for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k) + psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); + } + + 
for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k) + vec_store_psqt(&entryTilePsqt[k], psqt[k]); + for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k) + vec_store_psqt(&accTilePsqt[k], psqt[k]); + } + +#else + + for (const auto index : removed) + { + const IndexType offset = Dimensions * index; + for (IndexType j = 0; j < Dimensions; ++j) + entry.accumulation[j] -= featureTransformer.weights[offset + j]; + + for (std::size_t k = 0; k < PSQTBuckets; ++k) + entry.psqtAccumulation[k] -= featureTransformer.psqtWeights[index * PSQTBuckets + k]; + } + for (const auto index : added) + { + const IndexType offset = Dimensions * index; + for (IndexType j = 0; j < Dimensions; ++j) + entry.accumulation[j] += featureTransformer.weights[offset + j]; + + for (std::size_t k = 0; k < PSQTBuckets; ++k) + entry.psqtAccumulation[k] += featureTransformer.psqtWeights[index * PSQTBuckets + k]; + } + + // The accumulator of the refresh entry has been updated. + // Now copy its content to the actual accumulator we were refreshing. 
+ accumulator.accumulation[perspective] = entry.accumulation; + accumulator.psqtAccumulation[perspective] = entry.psqtAccumulation; +#endif +} + +template +void update_threats_accumulator_full(Color perspective, + const FeatureTransformer& featureTransformer, + const Position& pos, + AccumulatorState& accumulatorState) { + using Tiling [[maybe_unused]] = SIMDTiling; + + ThreatFeatureSet::IndexList active; + ThreatFeatureSet::append_active_indices(perspective, pos, active); + + auto& accumulator = accumulatorState.acc(); + accumulator.computed[perspective] = true; + +#ifdef VECTOR + vec_t acc[Tiling::NumRegs]; + psqt_vec_t psqt[Tiling::NumPsqtRegs]; + + const auto* threatWeights = &featureTransformer.threatWeights[0]; + + for (IndexType j = 0; j < Dimensions / Tiling::TileHeight; ++j) + { + auto* accTile = + reinterpret_cast(&accumulator.accumulation[perspective][j * Tiling::TileHeight]); + + for (IndexType k = 0; k < Tiling::NumRegs; ++k) + acc[k] = vec_zero(); + + int i = 0; + + for (; i < active.ssize(); ++i) + { + size_t index = active[i]; + const size_t offset = Dimensions * index; + auto* column = reinterpret_cast(&threatWeights[offset]); + + #ifdef USE_NEON + for (IndexType k = 0; k < Tiling::NumRegs; k += 2) + { + acc[k] = vec_add_16(acc[k], vmovl_s8(vget_low_s8(column[k / 2]))); + acc[k + 1] = vec_add_16(acc[k + 1], vmovl_high_s8(column[k / 2])); + } + #else + for (IndexType k = 0; k < Tiling::NumRegs; ++k) + acc[k] = vec_add_16(acc[k], vec_convert_8_16(column[k])); + #endif + } + + for (IndexType k = 0; k < Tiling::NumRegs; k++) + vec_store(&accTile[k], acc[k]); + + threatWeights += Tiling::TileHeight; + } + + for (IndexType j = 0; j < PSQTBuckets / Tiling::PsqtTileHeight; ++j) + { + auto* accTilePsqt = reinterpret_cast( + &accumulator.psqtAccumulation[perspective][j * Tiling::PsqtTileHeight]); + + for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k) + psqt[k] = vec_zero_psqt(); + + for (int i = 0; i < active.ssize(); ++i) + { + size_t index = active[i]; + 
const size_t offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight; + auto* columnPsqt = + reinterpret_cast(&featureTransformer.threatPsqtWeights[offset]); + + for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k) + psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); + } + + for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k) + vec_store_psqt(&accTilePsqt[k], psqt[k]); + } + +#else + + for (IndexType j = 0; j < Dimensions; ++j) + accumulator.accumulation[perspective][j] = 0; + + for (std::size_t k = 0; k < PSQTBuckets; ++k) + accumulator.psqtAccumulation[perspective][k] = 0; + + for (const auto index : active) + { + const IndexType offset = Dimensions * index; + + for (IndexType j = 0; j < Dimensions; ++j) + accumulator.accumulation[perspective][j] += + featureTransformer.threatWeights[offset + j]; + + for (std::size_t k = 0; k < PSQTBuckets; ++k) + accumulator.psqtAccumulation[perspective][k] += + featureTransformer.threatPsqtWeights[index * PSQTBuckets + k]; + } + +#endif +} + +} + +} diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h new file mode 100644 index 0000000000000000000000000000000000000000..438074f430a8673e9d04bfe001b902e9c8072d25 --- /dev/null +++ b/src/nnue/nnue_accumulator.h @@ -0,0 +1,206 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +// Class for difference calculation of NNUE evaluation function + +#ifndef NNUE_ACCUMULATOR_H_INCLUDED +#define NNUE_ACCUMULATOR_H_INCLUDED + +#include +#include +#include +#include +#include + +#include "../types.h" +#include "nnue_architecture.h" +#include "nnue_common.h" + +namespace Stockfish { +class Position; +} + +namespace Stockfish::Eval::NNUE { + +template +struct alignas(CacheLineSize) Accumulator; + +template +class FeatureTransformer; + +// Class that holds the result of affine transformation of input features +template +struct alignas(CacheLineSize) Accumulator { + std::array, COLOR_NB> accumulation; + std::array, COLOR_NB> psqtAccumulation; + std::array computed = {}; +}; + + +// AccumulatorCaches struct provides per-thread accumulator caches, where each +// cache contains multiple entries for each of the possible king squares. +// When the accumulator needs to be refreshed, the cached entry is used to more +// efficiently update the accumulator, instead of rebuilding it from scratch. +// This idea, was first described by Luecx (author of Koivisto) and +// is commonly referred to as "Finny Tables". 
+struct AccumulatorCaches { + + template + AccumulatorCaches(const Networks& networks) { + clear(networks); + } + + template + struct alignas(CacheLineSize) Cache { + + struct alignas(CacheLineSize) Entry { + std::array accumulation; + std::array psqtAccumulation; + std::array pieces; + Bitboard pieceBB; + + // To initialize a refresh entry, we set all its bitboards empty, + // so we put the biases in the accumulation, without any weights on top + void clear(const std::array& biases) { + accumulation = biases; + std::memset(reinterpret_cast(this) + offsetof(Entry, psqtAccumulation), + 0, sizeof(Entry) - offsetof(Entry, psqtAccumulation)); + } + }; + + template + void clear(const Network& network) { + for (auto& entries1D : entries) + for (auto& entry : entries1D) + entry.clear(network.featureTransformer.biases); + } + + std::array& operator[](Square sq) { return entries[sq]; } + + std::array, SQUARE_NB> entries; + }; + + template + void clear(const Networks& networks) { + big.clear(networks.big); + small.clear(networks.small); + } + + Cache big; + Cache small; +}; + + +template +struct AccumulatorState { + Accumulator accumulatorBig; + Accumulator accumulatorSmall; + typename FeatureSet::DiffType diff; + + template + auto& acc() noexcept { + static_assert(Size == TransformedFeatureDimensionsBig + || Size == TransformedFeatureDimensionsSmall, + "Invalid size for accumulator"); + + if constexpr (Size == TransformedFeatureDimensionsBig) + return accumulatorBig; + else if constexpr (Size == TransformedFeatureDimensionsSmall) + return accumulatorSmall; + } + + template + const auto& acc() const noexcept { + static_assert(Size == TransformedFeatureDimensionsBig + || Size == TransformedFeatureDimensionsSmall, + "Invalid size for accumulator"); + + if constexpr (Size == TransformedFeatureDimensionsBig) + return accumulatorBig; + else if constexpr (Size == TransformedFeatureDimensionsSmall) + return accumulatorSmall; + } + + void reset(const typename FeatureSet::DiffType& 
dp) noexcept { + diff = dp; + accumulatorBig.computed.fill(false); + accumulatorSmall.computed.fill(false); + } + + typename FeatureSet::DiffType& reset() noexcept { + accumulatorBig.computed.fill(false); + accumulatorSmall.computed.fill(false); + return diff; + } +}; + +class AccumulatorStack { + public: + static constexpr std::size_t MaxSize = MAX_PLY + 1; + + template + [[nodiscard]] const AccumulatorState& latest() const noexcept; + + void reset() noexcept; + std::pair push() noexcept; + void pop() noexcept; + + template + void evaluate(const Position& pos, + const FeatureTransformer& featureTransformer, + AccumulatorCaches::Cache& cache) noexcept; + + private: + template + [[nodiscard]] AccumulatorState& mut_latest() noexcept; + + template + [[nodiscard]] const std::array, MaxSize>& accumulators() const noexcept; + + template + [[nodiscard]] std::array, MaxSize>& mut_accumulators() noexcept; + + template + void evaluate_side(Color perspective, + const Position& pos, + const FeatureTransformer& featureTransformer, + AccumulatorCaches::Cache& cache) noexcept; + + template + [[nodiscard]] std::size_t find_last_usable_accumulator(Color perspective) const noexcept; + + template + void forward_update_incremental(Color perspective, + const Position& pos, + const FeatureTransformer& featureTransformer, + const std::size_t begin) noexcept; + + template + void backward_update_incremental(Color perspective, + const Position& pos, + const FeatureTransformer& featureTransformer, + const std::size_t end) noexcept; + + std::array, MaxSize> psq_accumulators; + std::array, MaxSize> threat_accumulators; + std::size_t size = 1; +}; + +} // namespace Stockfish::Eval::NNUE + +#endif // NNUE_ACCUMULATOR_H_INCLUDED diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h new file mode 100644 index 0000000000000000000000000000000000000000..94985be980380dcc42c4438c832a74554bf365db --- /dev/null +++ b/src/nnue/nnue_architecture.h @@ -0,0 +1,165 @@ +/* + Stockfish, a UCI 
chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +// Input features and network structure used in NNUE evaluation function + +#ifndef NNUE_ARCHITECTURE_H_INCLUDED +#define NNUE_ARCHITECTURE_H_INCLUDED + +#include +#include +#include + +#include "features/half_ka_v2_hm.h" +#include "features/full_threats.h" +#include "layers/affine_transform.h" +#include "layers/affine_transform_sparse_input.h" +#include "layers/clipped_relu.h" +#include "layers/sqr_clipped_relu.h" +#include "nnue_common.h" + +namespace Stockfish::Eval::NNUE { + +// Input features used in evaluation function +using ThreatFeatureSet = Features::FullThreats; +using PSQFeatureSet = Features::HalfKAv2_hm; + +// Number of input feature dimensions after conversion +constexpr IndexType TransformedFeatureDimensionsBig = 256; +constexpr int L2Big = 31; +constexpr int L3Big = 32; + +constexpr IndexType TransformedFeatureDimensionsSmall = 128; +constexpr int L2Small = 15; +constexpr int L3Small = 32; + +constexpr IndexType PSQTBuckets = 8; +constexpr IndexType LayerStacks = 8; + +// If vector instructions are enabled, we update and refresh the +// accumulator tile by tile such that each tile fits in the CPU's +// vector registers. 
+static_assert(PSQTBuckets % 8 == 0, + "Per feature PSQT values cannot be processed at granularity lower than 8 at a time."); + +template +struct NetworkArchitecture { + static constexpr IndexType TransformedFeatureDimensions = L1; + static constexpr int FC_0_OUTPUTS = L2; + static constexpr int FC_1_OUTPUTS = L3; + + Layers::AffineTransformSparseInput fc_0; + Layers::SqrClippedReLU ac_sqr_0; + Layers::ClippedReLU ac_0; + Layers::AffineTransform fc_1; + Layers::ClippedReLU ac_1; + Layers::AffineTransform fc_2; + + // Hash value embedded in the evaluation file + static constexpr std::uint32_t get_hash_value() { + // input slice hash + std::uint32_t hashValue = 0xEC42E90Du; + hashValue ^= TransformedFeatureDimensions * 2; + + hashValue = decltype(fc_0)::get_hash_value(hashValue); + hashValue = decltype(ac_0)::get_hash_value(hashValue); + hashValue = decltype(fc_1)::get_hash_value(hashValue); + hashValue = decltype(ac_1)::get_hash_value(hashValue); + hashValue = decltype(fc_2)::get_hash_value(hashValue); + + return hashValue; + } + + // Read network parameters + bool read_parameters(std::istream& stream) { + return fc_0.read_parameters(stream) && ac_0.read_parameters(stream) + && fc_1.read_parameters(stream) && ac_1.read_parameters(stream) + && fc_2.read_parameters(stream); + } + + // Write network parameters + bool write_parameters(std::ostream& stream) const { + return fc_0.write_parameters(stream) && ac_0.write_parameters(stream) + && fc_1.write_parameters(stream) && ac_1.write_parameters(stream) + && fc_2.write_parameters(stream); + } + + std::int32_t propagate(const TransformedFeatureType* transformedFeatures) const { + struct alignas(CacheLineSize) Buffer { + alignas(CacheLineSize) typename decltype(fc_0)::OutputBuffer fc_0_out; + alignas(CacheLineSize) typename decltype(ac_sqr_0)::OutputType + ac_sqr_0_out[ceil_to_multiple(FC_0_OUTPUTS * 2, 32)]; + alignas(CacheLineSize) typename decltype(ac_0)::OutputBuffer ac_0_out; + alignas(CacheLineSize) typename 
decltype(fc_1)::OutputBuffer fc_1_out; + alignas(CacheLineSize) typename decltype(ac_1)::OutputBuffer ac_1_out; + alignas(CacheLineSize) typename decltype(fc_2)::OutputBuffer fc_2_out; + + Buffer() { std::memset(this, 0, sizeof(*this)); } + }; + +#if defined(__clang__) && (__APPLE__) + // workaround for a bug reported with xcode 12 + static thread_local auto tlsBuffer = std::make_unique(); + // Access TLS only once, cache result. + Buffer& buffer = *tlsBuffer; +#else + alignas(CacheLineSize) static thread_local Buffer buffer; +#endif + + fc_0.propagate(transformedFeatures, buffer.fc_0_out); + ac_sqr_0.propagate(buffer.fc_0_out, buffer.ac_sqr_0_out); + ac_0.propagate(buffer.fc_0_out, buffer.ac_0_out); + std::memcpy(buffer.ac_sqr_0_out + FC_0_OUTPUTS, buffer.ac_0_out, + FC_0_OUTPUTS * sizeof(typename decltype(ac_0)::OutputType)); + fc_1.propagate(buffer.ac_sqr_0_out, buffer.fc_1_out); + ac_1.propagate(buffer.fc_1_out, buffer.ac_1_out); + fc_2.propagate(buffer.ac_1_out, buffer.fc_2_out); + + // buffer.fc_0_out[FC_0_OUTPUTS] is such that 1.0 is equal to 127*(1< +struct std::hash> { + std::size_t + operator()(const Stockfish::Eval::NNUE::NetworkArchitecture& arch) const noexcept { + return arch.get_content_hash(); + } +}; + +#endif // #ifndef NNUE_ARCHITECTURE_H_INCLUDED diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h new file mode 100644 index 0000000000000000000000000000000000000000..82cefde038bb56536ecccd9caba636015eb1cc1b --- /dev/null +++ b/src/nnue/nnue_common.h @@ -0,0 +1,298 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +// Constants used in NNUE evaluation function + +#ifndef NNUE_COMMON_H_INCLUDED +#define NNUE_COMMON_H_INCLUDED + +#include +#include +#include +#include +#include +#include + +#include "../misc.h" + +#if defined(USE_AVX2) + #include + +#elif defined(USE_SSE41) + #include + +#elif defined(USE_SSSE3) + #include + +#elif defined(USE_SSE2) + #include + +#elif defined(USE_NEON) + #include +#endif + +namespace Stockfish::Eval::NNUE { + +using BiasType = std::int16_t; +using ThreatWeightType = std::int8_t; +using WeightType = std::int16_t; +using PSQTWeightType = std::int32_t; +using IndexType = std::uint32_t; + +// Version of the evaluation file +constexpr std::uint32_t Version = 0x7AF32F20u; + +// Constant used in evaluation value calculation +constexpr int OutputScale = 16; +constexpr int WeightScaleBits = 6; + +// Size of cache line (in bytes) +constexpr std::size_t CacheLineSize = 64; + +constexpr const char Leb128MagicString[] = "COMPRESSED_LEB128"; +constexpr const std::size_t Leb128MagicStringSize = sizeof(Leb128MagicString) - 1; + +// SIMD width (in bytes) +#if defined(USE_AVX2) +constexpr std::size_t SimdWidth = 32; + +#elif defined(USE_SSE2) +constexpr std::size_t SimdWidth = 16; + +#elif defined(USE_NEON) +constexpr std::size_t SimdWidth = 16; +#endif + +constexpr std::size_t MaxSimdWidth = 32; + +// Type of input feature after conversion +using TransformedFeatureType = std::uint8_t; + +// Round n up to be a multiple of base +template +constexpr IntType ceil_to_multiple(IntType n, IntType base) { + return (n + base - 1) / base * base; +} + + +// Utility to read an integer (signed or unsigned, any size) +// 
from a stream in little-endian order. We swap the byte order after the read if +// necessary to return a result with the byte ordering of the compiling machine. +template +inline IntType read_little_endian(std::istream& stream) { + IntType result; + + if (IsLittleEndian) + stream.read(reinterpret_cast(&result), sizeof(IntType)); + else + { + std::uint8_t u[sizeof(IntType)]; + std::make_unsigned_t v = 0; + + stream.read(reinterpret_cast(u), sizeof(IntType)); + for (std::size_t i = 0; i < sizeof(IntType); ++i) + v = (v << 8) | u[sizeof(IntType) - i - 1]; + + std::memcpy(&result, &v, sizeof(IntType)); + } + + return result; +} + + +// Utility to write an integer (signed or unsigned, any size) +// to a stream in little-endian order. We swap the byte order before the write if +// necessary to always write in little-endian order, independently of the byte +// ordering of the compiling machine. +template +inline void write_little_endian(std::ostream& stream, IntType value) { + + if (IsLittleEndian) + stream.write(reinterpret_cast(&value), sizeof(IntType)); + else + { + std::uint8_t u[sizeof(IntType)]; + std::make_unsigned_t v = value; + + std::size_t i = 0; + // if constexpr to silence the warning about shift by 8 + if constexpr (sizeof(IntType) > 1) + { + for (; i + 1 < sizeof(IntType); ++i) + { + u[i] = std::uint8_t(v); + v >>= 8; + } + } + u[i] = std::uint8_t(v); + + stream.write(reinterpret_cast(u), sizeof(IntType)); + } +} + + +// Read integers in bulk from a little-endian stream. +// This reads N integers from stream s and puts them in array out. +template +inline void read_little_endian(std::istream& stream, IntType* out, std::size_t count) { + if (IsLittleEndian) + stream.read(reinterpret_cast(out), sizeof(IntType) * count); + else + for (std::size_t i = 0; i < count; ++i) + out[i] = read_little_endian(stream); +} + + +// Write integers in bulk to a little-endian stream. +// This takes N integers from array values and writes them on stream s. 
+template +inline void write_little_endian(std::ostream& stream, const IntType* values, std::size_t count) { + if (IsLittleEndian) + stream.write(reinterpret_cast(values), sizeof(IntType) * count); + else + for (std::size_t i = 0; i < count; ++i) + write_little_endian(stream, values[i]); +} + +// Read N signed integers from the stream s, putting them in the array out. +// The stream is assumed to be compressed using the signed LEB128 format. +// See https://en.wikipedia.org/wiki/LEB128 for a description of the compression scheme. +template +inline void read_leb_128_detail(std::istream& stream, + std::array& out, + std::uint32_t& bytes_left, + BufType& buf, + std::uint32_t& buf_pos) { + + static_assert(std::is_signed_v, "Not implemented for unsigned types"); + static_assert(sizeof(IntType) <= 4, "Not implemented for types larger than 32 bit"); + + IntType result = 0; + size_t shift = 0, i = 0; + while (i < Count) + { + if (buf_pos == buf.size()) + { + stream.read(reinterpret_cast(buf.data()), + std::min(std::size_t(bytes_left), buf.size())); + buf_pos = 0; + } + + std::uint8_t byte = buf[buf_pos++]; + --bytes_left; + result |= (byte & 0x7f) << (shift % 32); + shift += 7; + + if ((byte & 0x80) == 0) + { + out[i++] = (shift >= 32 || (byte & 0x40) == 0) ? result : result | ~((1 << shift) - 1); + result = 0; + shift = 0; + } + } +} + +template +inline void read_leb_128(std::istream& stream, Arrays&... 
outs) { + // Check the presence of our LEB128 magic string + char leb128MagicString[Leb128MagicStringSize]; + stream.read(leb128MagicString, Leb128MagicStringSize); + if (stream.fail() || strncmp(Leb128MagicString, leb128MagicString, Leb128MagicStringSize) != 0) + { + stream.setstate(std::ios::failbit); + return; + } + + auto bytes_left = read_little_endian(stream); + std::array buf; + std::uint32_t buf_pos = std::uint32_t(buf.size()); + + (read_leb_128_detail(stream, outs, bytes_left, buf, buf_pos), ...); + + if (bytes_left != 0) + stream.setstate(std::ios::failbit); +} + + +// Write signed integers to a stream with LEB128 compression. +// This takes N integers from array values, compresses them with +// the LEB128 algorithm and writes the result on the stream s. +// See https://en.wikipedia.org/wiki/LEB128 for a description of the compression scheme. +template +inline void write_leb_128(std::ostream& stream, const std::array& values) { + + // Write our LEB128 magic string + stream.write(Leb128MagicString, Leb128MagicStringSize); + + static_assert(std::is_signed_v, "Not implemented for unsigned types"); + + std::uint32_t byte_count = 0; + for (std::size_t i = 0; i < Count; ++i) + { + IntType value = values[i]; + std::uint8_t byte; + do + { + byte = value & 0x7f; + value >>= 7; + ++byte_count; + } while ((byte & 0x40) == 0 ? value != 0 : value != -1); + } + + write_little_endian(stream, byte_count); + + const std::uint32_t BUF_SIZE = 4096; + std::uint8_t buf[BUF_SIZE]; + std::uint32_t buf_pos = 0; + + auto flush = [&]() { + if (buf_pos > 0) + { + stream.write(reinterpret_cast(buf), buf_pos); + buf_pos = 0; + } + }; + + auto write = [&](std::uint8_t b) { + buf[buf_pos++] = b; + if (buf_pos == BUF_SIZE) + flush(); + }; + + for (std::size_t i = 0; i < Count; ++i) + { + IntType value = values[i]; + while (true) + { + std::uint8_t byte = value & 0x7f; + value >>= 7; + if ((byte & 0x40) == 0 ? 
value == 0 : value == -1) + { + write(byte); + break; + } + write(byte | 0x80); + } + } + + flush(); +} + +} // namespace Stockfish::Eval::NNUE + +#endif // #ifndef NNUE_COMMON_H_INCLUDED diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h new file mode 100644 index 0000000000000000000000000000000000000000..9ea09b0c16335a3892349d8d3f3a35db784dc9eb --- /dev/null +++ b/src/nnue/nnue_feature_transformer.h @@ -0,0 +1,456 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +// A class that converts the input features of the NNUE evaluation function + +#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#define NNUE_FEATURE_TRANSFORMER_H_INCLUDED + +#include +#include +#include +#include +#include + +#include "../position.h" +#include "../types.h" +#include "nnue_accumulator.h" +#include "nnue_architecture.h" +#include "nnue_common.h" +#include "simd.h" + +namespace Stockfish::Eval::NNUE { + +// Returns the inverse of a permutation +template +constexpr std::array +invert_permutation(const std::array& order) { + std::array inverse{}; + for (std::size_t i = 0; i < order.size(); i++) + inverse[order[i]] = i; + return inverse; +} + +// Divide a byte region of size TotalSize to chunks of size +// BlockSize, and permute the blocks by a given order +template +void permute(std::array& data, const std::array& order) { + constexpr std::size_t TotalSize = N * sizeof(T); + + static_assert(TotalSize % (BlockSize * OrderSize) == 0, + "ChunkSize * OrderSize must perfectly divide TotalSize"); + + constexpr std::size_t ProcessChunkSize = BlockSize * OrderSize; + + std::array buffer{}; + + std::byte* const bytes = reinterpret_cast(data.data()); + + for (std::size_t i = 0; i < TotalSize; i += ProcessChunkSize) + { + std::byte* const values = &bytes[i]; + + for (std::size_t j = 0; j < OrderSize; j++) + { + auto* const buffer_chunk = &buffer[j * BlockSize]; + auto* const value_chunk = &values[order[j] * BlockSize]; + + std::copy(value_chunk, value_chunk + BlockSize, buffer_chunk); + } + + std::copy(std::begin(buffer), std::end(buffer), values); + } +} + +// Input feature converter +template +class FeatureTransformer { + static constexpr bool UseThreats = + (TransformedFeatureDimensions == TransformedFeatureDimensionsBig); + // Number of output dimensions for one side + static constexpr IndexType HalfDimensions = TransformedFeatureDimensions; + + public: + // Output type + using OutputType = TransformedFeatureType; + + // Number of input/output dimensions + 
static constexpr IndexType InputDimensions = PSQFeatureSet::Dimensions; + static constexpr IndexType ThreatInputDimensions = ThreatFeatureSet::Dimensions; + static constexpr IndexType TotalInputDimensions = + InputDimensions + (UseThreats ? ThreatInputDimensions : 0); + static constexpr IndexType OutputDimensions = HalfDimensions; + + // Size of forward propagation buffer + static constexpr std::size_t BufferSize = OutputDimensions * sizeof(OutputType); + + // Store the order by which 128-bit blocks of a 1024-bit data must + // be permuted so that calling packus on adjacent vectors of 16-bit + // integers loaded from the data results in the pre-permutation order + static constexpr auto PackusEpi16Order = []() -> std::array { +#if defined(USE_AVX512) + // _mm512_packus_epi16 after permutation: + // | 0 | 2 | 4 | 6 | // Vector 0 + // | 1 | 3 | 5 | 7 | // Vector 1 + // | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | // Packed Result + return {0, 2, 4, 6, 1, 3, 5, 7}; +#elif defined(USE_AVX2) + // _mm256_packus_epi16 after permutation: + // | 0 | 2 | | 4 | 6 | // Vector 0, 2 + // | 1 | 3 | | 5 | 7 | // Vector 1, 3 + // | 0 | 1 | 2 | 3 | | 4 | 5 | 6 | 7 | // Packed Result + return {0, 2, 1, 3, 4, 6, 5, 7}; +#else + return {0, 1, 2, 3, 4, 5, 6, 7}; +#endif + }(); + + static constexpr auto InversePackusEpi16Order = invert_permutation(PackusEpi16Order); + + static constexpr std::uint32_t combine_hash(std::initializer_list hashes) { + std::uint32_t hash = 0; + for (const auto component_hash : hashes) + { + hash = (hash << 1) | (hash >> 31); + hash ^= component_hash; + } + return hash; + } + + // Hash value embedded in the evaluation file + static constexpr std::uint32_t get_hash_value() { + return (UseThreats ? 
combine_hash({ThreatFeatureSet::HashValue, PSQFeatureSet::HashValue}) + : PSQFeatureSet::HashValue) + ^ (OutputDimensions * 2); + } + + void permute_weights() { + permute<16>(biases, PackusEpi16Order); + permute<16>(weights, PackusEpi16Order); + + if constexpr (UseThreats) + permute<8>(threatWeights, PackusEpi16Order); + } + + void unpermute_weights() { + permute<16>(biases, InversePackusEpi16Order); + permute<16>(weights, InversePackusEpi16Order); + + if constexpr (UseThreats) + permute<8>(threatWeights, InversePackusEpi16Order); + } + + // Read network parameters + bool read_parameters(std::istream& stream) { + const std::streampos beginPos = stream.tellg(); + + if constexpr (UseThreats) + { + // Primary path: Full_Threats + HalfKAv2_hm^ export layout + read_leb_128(stream, biases); + read_little_endian(stream, threatWeights.data(), + ThreatInputDimensions * HalfDimensions); + read_leb_128(stream, weights); + read_leb_128(stream, threatPsqtWeights); + read_leb_128(stream, psqtWeights); + + if (stream.fail()) + { + // Fallback path: HalfKAv2_hm^ only export layout (no threat tensors) + stream.clear(); + stream.seekg(beginPos); + std::fill(threatWeights.begin(), threatWeights.end(), 0); + std::fill(threatPsqtWeights.begin(), threatPsqtWeights.end(), 0); + + read_leb_128(stream, biases); + read_leb_128(stream, weights); + read_leb_128(stream, psqtWeights); + } + } + else + { + read_leb_128(stream, biases); + read_leb_128(stream, weights); + read_leb_128(stream, psqtWeights); + } + + if (stream.fail()) + return false; + + permute_weights(); + return true; + } + + // Write network parameters + bool write_parameters(std::ostream& stream) const { + std::unique_ptr copy = std::make_unique(*this); + + copy->unpermute_weights(); + + write_leb_128(stream, copy->biases); + + if constexpr (UseThreats) + { + write_little_endian(stream, copy->threatWeights.data(), + ThreatInputDimensions * HalfDimensions); + write_leb_128(stream, copy->weights); + + auto combinedPsqtWeights = + 
std::make_unique>(); + + std::copy(std::begin(copy->threatPsqtWeights), + std::begin(copy->threatPsqtWeights) + ThreatInputDimensions * PSQTBuckets, + combinedPsqtWeights->begin()); + + std::copy(std::begin(copy->psqtWeights), + std::begin(copy->psqtWeights) + InputDimensions * PSQTBuckets, + combinedPsqtWeights->begin() + ThreatInputDimensions * PSQTBuckets); + + write_leb_128(stream, *combinedPsqtWeights); + } + else + { + write_leb_128(stream, copy->weights); + write_leb_128(stream, copy->psqtWeights); + } + + return !stream.fail(); + } + + std::size_t get_content_hash() const { + std::size_t h = 0; + + hash_combine(h, get_raw_data_hash(biases)); + hash_combine(h, get_raw_data_hash(weights)); + hash_combine(h, get_raw_data_hash(psqtWeights)); + + if constexpr (UseThreats) + { + hash_combine(h, get_raw_data_hash(threatWeights)); + hash_combine(h, get_raw_data_hash(threatPsqtWeights)); + } + + hash_combine(h, get_hash_value()); + + return h; + } + + // Convert input features + std::int32_t transform(const Position& pos, + AccumulatorStack& accumulatorStack, + AccumulatorCaches::Cache& cache, + OutputType* output, + int bucket) const { + + using namespace SIMD; + accumulatorStack.evaluate(pos, *this, cache); + const auto& accumulatorState = accumulatorStack.latest(); + const auto& threatAccumulatorState = accumulatorStack.latest(); + + const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; + const auto& psqtAccumulation = (accumulatorState.acc()).psqtAccumulation; + auto psqt = + (psqtAccumulation[perspectives[0]][bucket] - psqtAccumulation[perspectives[1]][bucket]); + + if constexpr (UseThreats) + { + const auto& threatPsqtAccumulation = + (threatAccumulatorState.acc()).psqtAccumulation; + psqt = (psqt + threatPsqtAccumulation[perspectives[0]][bucket] + - threatPsqtAccumulation[perspectives[1]][bucket]) + / 2; + } + else + psqt /= 2; + + const auto& accumulation = (accumulatorState.acc()).accumulation; + const auto& threatAccumulation = + 
(threatAccumulatorState.acc()).accumulation; + + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = (HalfDimensions / 2) * p; + +#if defined(VECTOR) + + constexpr IndexType OutputChunkSize = MaxChunkSize; + static_assert((HalfDimensions / 2) % OutputChunkSize == 0); + constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; + + const vec_t Zero = vec_zero(); + const vec_t One = vec_set_16(255); + + const vec_t* in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); + const vec_t* in1 = + reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); + vec_t* out = reinterpret_cast(output + offset); + + // Per the NNUE architecture, here we want to multiply pairs of + // clipped elements and divide the product by 128. To do this, + // we can naively perform min/max operation to clip each of the + // four int16 vectors, mullo pairs together, then pack them into + // one int8 vector. However, there exists a faster way. + + // The idea here is to use the implicit clipping from packus to + // save us two vec_max_16 instructions. This clipping works due + // to the fact that any int16 integer below zero will be zeroed + // on packus. + + // Consider the case where the second element is negative. + // If we do standard clipping, that element will be zero, which + // means our pairwise product is zero. If we perform packus and + // remove the lower-side clip for the second element, then our + // product before packus will be negative, and is zeroed on pack. + // The two operation produce equivalent results, but the second + // one (using packus) saves one max operation per pair. + + // But here we run into a problem: mullo does not preserve the + // sign of the multiplication. We can get around this by doing + // mulhi, which keeps the sign. But that requires an additional + // tweak. + + // mulhi cuts off the last 16 bits of the resulting product, + // which is the same as performing a rightward shift of 16 bits. 
+ // We can use this to our advantage. Recall that we want to + // divide the final product by 128, which is equivalent to a + // 7-bit right shift. Intuitively, if we shift the clipped + // value left by 9, and perform mulhi, which shifts the product + // right by 16 bits, then we will net a right shift of 7 bits. + // However, this won't work as intended. Since we clip the + // values to have a maximum value of 127, shifting it by 9 bits + // might occupy the signed bit, resulting in some positive + // values being interpreted as negative after the shift. + + // There is a way, however, to get around this limitation. When + // loading the network, scale accumulator weights and biases by + // 2. To get the same pairwise multiplication result as before, + // we need to divide the product by 128 * 2 * 2 = 512, which + // amounts to a right shift of 9 bits. So now we only have to + // shift left by 7 bits, perform mulhi (shifts right by 16 bits) + // and net a 9 bit right shift. Since we scaled everything by + // two, the values are clipped at 127 * 2 = 254, which occupies + // 8 bits. Shifting it by 7 bits left will no longer occupy the + // signed bit, so we are safe. + + // Note that on NEON processors, we shift left by 6 instead + // because the instruction "vqdmulhq_s16" also doubles the + // return value after the multiplication, adding an extra shift + // to the left by 1, so we compensate by shifting less before + // the multiplication. 
+ + constexpr int shift = + #if defined(USE_SSE2) + 7; + #else + 6; + #endif + if constexpr (UseThreats) + { + const vec_t* tin0 = + reinterpret_cast(&(threatAccumulation[perspectives[p]][0])); + const vec_t* tin1 = reinterpret_cast( + &(threatAccumulation[perspectives[p]][HalfDimensions / 2])); + for (IndexType j = 0; j < NumOutputChunks; ++j) + { + const vec_t acc0a = vec_add_16(in0[j * 2 + 0], tin0[j * 2 + 0]); + const vec_t acc0b = vec_add_16(in0[j * 2 + 1], tin0[j * 2 + 1]); + const vec_t acc1a = vec_add_16(in1[j * 2 + 0], tin1[j * 2 + 0]); + const vec_t acc1b = vec_add_16(in1[j * 2 + 1], tin1[j * 2 + 1]); + + const vec_t sum0a = + vec_slli_16(vec_max_16(vec_min_16(acc0a, One), Zero), shift); + const vec_t sum0b = + vec_slli_16(vec_max_16(vec_min_16(acc0b, One), Zero), shift); + const vec_t sum1a = vec_min_16(acc1a, One); + const vec_t sum1b = vec_min_16(acc1b, One); + + const vec_t pa = vec_mulhi_16(sum0a, sum1a); + const vec_t pb = vec_mulhi_16(sum0b, sum1b); + + out[j] = vec_packus_16(pa, pb); + } + } + else + { + for (IndexType j = 0; j < NumOutputChunks; ++j) + { + const vec_t sum0a = + vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero), shift); + const vec_t sum0b = + vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero), shift); + const vec_t sum1a = vec_min_16(in1[j * 2 + 0], One); + const vec_t sum1b = vec_min_16(in1[j * 2 + 1], One); + + const vec_t pa = vec_mulhi_16(sum0a, sum1a); + const vec_t pb = vec_mulhi_16(sum0b, sum1b); + + out[j] = vec_packus_16(pa, pb); + } + } + +#else + + for (IndexType j = 0; j < HalfDimensions / 2; ++j) + { + BiasType sum0 = accumulation[static_cast(perspectives[p])][j + 0]; + BiasType sum1 = + accumulation[static_cast(perspectives[p])][j + HalfDimensions / 2]; + + if constexpr (UseThreats) + { + sum0 += threatAccumulation[static_cast(perspectives[p])][j + 0]; + sum1 += + threatAccumulation[static_cast(perspectives[p])][j + HalfDimensions / 2]; + } + + sum0 = std::clamp(sum0, 0, 255); + sum1 = 
std::clamp(sum1, 0, 255); + + output[offset + j] = static_cast(unsigned(sum0 * sum1) / 512); + } + +#endif + } + + return psqt; + } // end of function transform() + + alignas(CacheLineSize) std::array biases; + alignas(CacheLineSize) std::array weights; + alignas(CacheLineSize) + std::array threatWeights; + alignas(CacheLineSize) std::array psqtWeights; + alignas(CacheLineSize) + std::array threatPsqtWeights; +}; + +} // namespace Stockfish::Eval::NNUE + + +template +struct std::hash> { + std::size_t + operator()(const Stockfish::Eval::NNUE::FeatureTransformer& ft) + const noexcept { + return ft.get_content_hash(); + } +}; + +#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..66a6764a33de778ddc839bb259aa4212d6399f0e --- /dev/null +++ b/src/nnue/nnue_misc.cpp @@ -0,0 +1,193 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +// Code for calculating NNUE evaluation function + +#include "nnue_misc.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../position.h" +#include "../types.h" +#include "../uci.h" +#include "network.h" +#include "nnue_accumulator.h" + +namespace Stockfish::Eval::NNUE { + + +constexpr std::string_view PieceToChar(" PNBRQK pnbrqk"); + + +namespace { +// Converts a Value into (centi)pawns and writes it in a buffer. +// The buffer must have capacity for at least 5 chars. +void format_cp_compact(Value v, char* buffer, const Position& pos) { + + buffer[0] = (v < 0 ? '-' : v > 0 ? '+' : ' '); + + int cp = std::abs(UCIEngine::to_cp(v, pos)); + if (cp >= 10000) + { + buffer[1] = '0' + cp / 10000; + cp %= 10000; + buffer[2] = '0' + cp / 1000; + cp %= 1000; + buffer[3] = '0' + cp / 100; + buffer[4] = ' '; + } + else if (cp >= 1000) + { + buffer[1] = '0' + cp / 1000; + cp %= 1000; + buffer[2] = '0' + cp / 100; + cp %= 100; + buffer[3] = '.'; + buffer[4] = '0' + cp / 10; + } + else + { + buffer[1] = '0' + cp / 100; + cp %= 100; + buffer[2] = '.'; + buffer[3] = '0' + cp / 10; + cp %= 10; + buffer[4] = '0' + cp / 1; + } +} + + +// Converts a Value into pawns, always keeping two decimals +void format_cp_aligned_dot(Value v, std::stringstream& stream, const Position& pos) { + + const double pawns = std::abs(0.01 * UCIEngine::to_cp(v, pos)); + + stream << (v < 0 ? '-' + : v > 0 ? '+' + : ' ') + << std::setiosflags(std::ios::fixed) << std::setw(6) << std::setprecision(2) << pawns; +} +} + + +// Returns a string with the value of each piece on a board, +// and a table for (PSQT, Layers) values bucket by bucket. 
+std::string +trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::AccumulatorCaches& caches) { + + std::stringstream ss; + + char board[3 * 8 + 1][8 * 8 + 2]; + std::memset(board, ' ', sizeof(board)); + for (int row = 0; row < 3 * 8 + 1; ++row) + board[row][8 * 8 + 1] = '\0'; + + // A lambda to output one box of the board + auto writeSquare = [&board, &pos](File file, Rank rank, Piece pc, Value value) { + const int x = int(file) * 8; + const int y = (7 - int(rank)) * 3; + for (int i = 1; i < 8; ++i) + board[y][x + i] = board[y + 3][x + i] = '-'; + for (int i = 1; i < 3; ++i) + board[y + i][x] = board[y + i][x + 8] = '|'; + board[y][x] = board[y][x + 8] = board[y + 3][x + 8] = board[y + 3][x] = '+'; + if (pc != NO_PIECE) + board[y + 1][x + 4] = PieceToChar[pc]; + if (is_valid(value)) + format_cp_compact(value, &board[y + 2][x + 2], pos); + }; + + auto accumulators = std::make_unique(); + + // We estimate the value of each piece by doing a differential evaluation from + // the current base eval, simulating the removal of the piece from its square. + auto [psqt, positional] = networks.big.evaluate(pos, *accumulators, caches.big); + Value base = psqt + positional; + base = pos.side_to_move() == WHITE ? base : -base; + + for (File f = FILE_A; f <= FILE_H; ++f) + for (Rank r = RANK_1; r <= RANK_8; ++r) + { + Square sq = make_square(f, r); + Piece pc = pos.piece_on(sq); + Value v = VALUE_NONE; + + if (pc != NO_PIECE && type_of(pc) != KING) + { + pos.remove_piece(sq); + + accumulators->reset(); + std::tie(psqt, positional) = networks.big.evaluate(pos, *accumulators, caches.big); + Value eval = psqt + positional; + eval = pos.side_to_move() == WHITE ? 
eval : -eval; + v = base - eval; + + pos.put_piece(pc, sq); + } + + writeSquare(f, r, pc, v); + } + + ss << " NNUE derived piece values:\n"; + for (int row = 0; row < 3 * 8 + 1; ++row) + ss << board[row] << '\n'; + ss << '\n'; + + accumulators->reset(); + auto t = networks.big.trace_evaluate(pos, *accumulators, caches.big); + + ss << " NNUE network contributions " + << (pos.side_to_move() == WHITE ? "(White to move)" : "(Black to move)") << std::endl + << "+------------+------------+------------+------------+\n" + << "| Bucket | Material | Positional | Total |\n" + << "| | (PSQT) | (Layers) | |\n" + << "+------------+------------+------------+------------+\n"; + + for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket) + { + ss << "| " << bucket << " " // + << " | "; + format_cp_aligned_dot(t.psqt[bucket], ss, pos); + ss << " " // + << " | "; + format_cp_aligned_dot(t.positional[bucket], ss, pos); + ss << " " // + << " | "; + format_cp_aligned_dot(t.psqt[bucket] + t.positional[bucket], ss, pos); + ss << " " // + << " |"; + if (bucket == t.correctBucket) + ss << " <-- this bucket is used"; + ss << '\n'; + } + + ss << "+------------+------------+------------+------------+\n"; + + return ss.str(); +} + + +} // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/nnue_misc.h b/src/nnue/nnue_misc.h new file mode 100644 index 0000000000000000000000000000000000000000..ecece5589c27f83800053bbbfca118c233d4f4b5 --- /dev/null +++ b/src/nnue/nnue_misc.h @@ -0,0 +1,74 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef NNUE_MISC_H_INCLUDED +#define NNUE_MISC_H_INCLUDED + +#include +#include +#include + +#include "../misc.h" +#include "../types.h" +#include "nnue_architecture.h" + +namespace Stockfish { + +class Position; + +namespace Eval::NNUE { + +// EvalFile uses fixed string types because it's part of the network structure which must be trivial. +struct EvalFile { + // Default net name, will use one of the EvalFileDefaultName* macros defined + // in evaluate.h + FixedString<256> defaultName; + // Selected net name, either via uci option or default + FixedString<256> current; + // Net description extracted from the net file + FixedString<256> netDescription; +}; + +struct NnueEvalTrace { + static_assert(LayerStacks == PSQTBuckets); + + Value psqt[LayerStacks]; + Value positional[LayerStacks]; + std::size_t correctBucket; +}; + +struct Networks; +struct AccumulatorCaches; + +std::string trace(Position& pos, const Networks& networks, AccumulatorCaches& caches); + +} // namespace Stockfish::Eval::NNUE +} // namespace Stockfish + +template<> +struct std::hash { + std::size_t operator()(const Stockfish::Eval::NNUE::EvalFile& evalFile) const noexcept { + std::size_t h = 0; + Stockfish::hash_combine(h, evalFile.defaultName); + Stockfish::hash_combine(h, evalFile.current); + Stockfish::hash_combine(h, evalFile.netDescription); + return h; + } +}; + +#endif // #ifndef NNUE_MISC_H_INCLUDED diff --git a/src/nnue/simd.h b/src/nnue/simd.h new file mode 100644 index 0000000000000000000000000000000000000000..601792c1cd0d5476900d3491d3d1c243a4e2fd04 --- /dev/null +++ b/src/nnue/simd.h @@ -0,0 +1,440 @@ +/* + Stockfish, a UCI chess 
playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef NNUE_SIMD_H_INCLUDED +#define NNUE_SIMD_H_INCLUDED + +#if defined(USE_AVX2) + #include + +#elif defined(USE_SSE41) + #include + +#elif defined(USE_SSSE3) + #include + +#elif defined(USE_SSE2) + #include + +#elif defined(USE_NEON) + #include +#endif + +#include "../types.h" +#include "nnue_common.h" + +namespace Stockfish::Eval::NNUE::SIMD { + +// If vector instructions are enabled, we update and refresh the +// accumulator tile by tile such that each tile fits in the CPU's +// vector registers. 
+#define VECTOR + +#ifdef USE_AVX512 +using vec_t = __m512i; +using vec_i8_t = __m256i; +using vec128_t = __m128i; +using psqt_vec_t = __m256i; +using vec_uint_t = __m512i; + #define vec_load(a) _mm512_load_si512(a) + #define vec_store(a, b) _mm512_store_si512(a, b) + #define vec_convert_8_16(a) _mm512_cvtepi8_epi16(a) + #define vec_add_16(a, b) _mm512_add_epi16(a, b) + #define vec_sub_16(a, b) _mm512_sub_epi16(a, b) + #define vec_mulhi_16(a, b) _mm512_mulhi_epi16(a, b) + #define vec_zero() _mm512_setzero_epi32() + #define vec_set_16(a) _mm512_set1_epi16(a) + #define vec_max_16(a, b) _mm512_max_epi16(a, b) + #define vec_min_16(a, b) _mm512_min_epi16(a, b) + #define vec_slli_16(a, b) _mm512_slli_epi16(a, b) + // Inverse permuted at load time + #define vec_packus_16(a, b) _mm512_packus_epi16(a, b) + #define vec_load_psqt(a) _mm256_load_si256(a) + #define vec_store_psqt(a, b) _mm256_store_si256(a, b) + #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b) + #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b) + #define vec_zero_psqt() _mm256_setzero_si256() + + #ifdef USE_SSSE3 + #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512()) + #endif + + #define vec128_zero _mm_setzero_si128() + #define vec128_set_16(a) _mm_set1_epi16(a) + #define vec128_load(a) _mm_load_si128(a) + #define vec128_storeu(a, b) _mm_storeu_si128(a, b) + #define vec128_add(a, b) _mm_add_epi16(a, b) + #define NumRegistersSIMD 16 + #define MaxChunkSize 64 + +#elif USE_AVX2 +using vec_t = __m256i; +using vec_i8_t = __m128i; +using vec128_t = __m128i; +using psqt_vec_t = __m256i; +using vec_uint_t = __m256i; + #define vec_load(a) _mm256_load_si256(a) + #define vec_store(a, b) _mm256_store_si256(a, b) + #define vec_convert_8_16(a) _mm256_cvtepi8_epi16(a) + #define vec_add_16(a, b) _mm256_add_epi16(a, b) + #define vec_sub_16(a, b) _mm256_sub_epi16(a, b) + #define vec_mulhi_16(a, b) _mm256_mulhi_epi16(a, b) + #define vec_zero() _mm256_setzero_si256() + #define vec_set_16(a) 
_mm256_set1_epi16(a) + #define vec_max_16(a, b) _mm256_max_epi16(a, b) + #define vec_min_16(a, b) _mm256_min_epi16(a, b) + #define vec_slli_16(a, b) _mm256_slli_epi16(a, b) + // Inverse permuted at load time + #define vec_packus_16(a, b) _mm256_packus_epi16(a, b) + #define vec_load_psqt(a) _mm256_load_si256(a) + #define vec_store_psqt(a, b) _mm256_store_si256(a, b) + #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b) + #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b) + #define vec_zero_psqt() _mm256_setzero_si256() + + #ifdef USE_SSSE3 + #if defined(USE_VNNI) && !defined(USE_AVXVNNI) + #define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256()) + #else + #define vec_nnz(a) \ + _mm256_movemask_ps( \ + _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256()))) + #endif + #endif + + #define vec128_zero _mm_setzero_si128() + #define vec128_set_16(a) _mm_set1_epi16(a) + #define vec128_load(a) _mm_load_si128(a) + #define vec128_storeu(a, b) _mm_storeu_si128(a, b) + #define vec128_add(a, b) _mm_add_epi16(a, b) + + #define NumRegistersSIMD 12 + #define MaxChunkSize 32 + +#elif USE_SSE2 +using vec_t = __m128i; +using vec_i8_t = std::uint64_t; // for the correct size -- will be loaded into an xmm reg +using vec128_t = __m128i; +using psqt_vec_t = __m128i; +using vec_uint_t = __m128i; + #define vec_load(a) (*(a)) + #define vec_store(a, b) *(a) = (b) + #define vec_add_16(a, b) _mm_add_epi16(a, b) + #define vec_sub_16(a, b) _mm_sub_epi16(a, b) + #define vec_mulhi_16(a, b) _mm_mulhi_epi16(a, b) + #define vec_zero() _mm_setzero_si128() + #define vec_set_16(a) _mm_set1_epi16(a) + #define vec_max_16(a, b) _mm_max_epi16(a, b) + #define vec_min_16(a, b) _mm_min_epi16(a, b) + #define vec_slli_16(a, b) _mm_slli_epi16(a, b) + #define vec_packus_16(a, b) _mm_packus_epi16(a, b) + #define vec_load_psqt(a) (*(a)) + #define vec_store_psqt(a, b) *(a) = (b) + #define vec_add_psqt_32(a, b) _mm_add_epi32(a, b) + #define vec_sub_psqt_32(a, b) _mm_sub_epi32(a, b) + #define 
vec_zero_psqt() _mm_setzero_si128() + + #ifdef USE_SSSE3 + #define vec_nnz(a) \ + _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128()))) + #endif + + #ifdef __i386__ +inline __m128i _mm_cvtsi64_si128(int64_t val) { + return _mm_loadl_epi64(reinterpret_cast(&val)); +} + #endif + + #ifdef USE_SSE41 + #define vec_convert_8_16(a) _mm_cvtepi8_epi16(_mm_cvtsi64_si128(static_cast(a))) + #else +// Credit: Yoshie2000 +inline __m128i vec_convert_8_16(uint64_t x) { + __m128i v8 = _mm_cvtsi64_si128(static_cast(x)); + __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(), v8); + return _mm_unpacklo_epi8(v8, sign); +} + #endif + + #define vec128_zero _mm_setzero_si128() + #define vec128_set_16(a) _mm_set1_epi16(a) + #define vec128_load(a) _mm_load_si128(a) + #define vec128_storeu(a, b) _mm_storeu_si128(a, b) + #define vec128_add(a, b) _mm_add_epi16(a, b) + + #define NumRegistersSIMD (Is64Bit ? 12 : 6) + #define MaxChunkSize 16 + +#elif USE_NEON +using vec_i8x8_t __attribute__((may_alias)) = int8x8_t; +using vec_i16x8_t __attribute__((may_alias)) = int16x8_t; +using vec_i8x16_t __attribute__((may_alias)) = int8x16_t; +using vec_u16x8_t __attribute__((may_alias)) = uint16x8_t; +using vec_i32x4_t __attribute__((may_alias)) = int32x4_t; + +using vec_t __attribute__((may_alias)) = int16x8_t; +using vec_i8_t __attribute__((may_alias)) = int8x16_t; +using psqt_vec_t __attribute__((may_alias)) = int32x4_t; +using vec128_t __attribute__((may_alias)) = uint16x8_t; +using vec_uint_t __attribute__((may_alias)) = uint32x4_t; + #define vec_load(a) (*(a)) + #define vec_store(a, b) *(a) = (b) + #define vec_add_16(a, b) vaddq_s16(a, b) + #define vec_sub_16(a, b) vsubq_s16(a, b) + #define vec_mulhi_16(a, b) vqdmulhq_s16(a, b) + #define vec_zero() vec_t{0} + #define vec_set_16(a) vdupq_n_s16(a) + #define vec_max_16(a, b) vmaxq_s16(a, b) + #define vec_min_16(a, b) vminq_s16(a, b) + #define vec_slli_16(a, b) vshlq_s16(a, vec_set_16(b)) + #define vec_packus_16(a, b) 
reinterpret_cast(vcombine_u8(vqmovun_s16(a), vqmovun_s16(b))) + #define vec_load_psqt(a) (*(a)) + #define vec_store_psqt(a, b) *(a) = (b) + #define vec_add_psqt_32(a, b) vaddq_s32(a, b) + #define vec_sub_psqt_32(a, b) vsubq_s32(a, b) + #define vec_zero_psqt() psqt_vec_t{0} + +static constexpr std::uint32_t Mask[4] = {1, 2, 4, 8}; + #define vec_nnz(a) vaddvq_u32(vandq_u32(vtstq_u32(a, a), vld1q_u32(Mask))) + #define vec128_zero vdupq_n_u16(0) + #define vec128_set_16(a) vdupq_n_u16(a) + #define vec128_load(a) vld1q_u16(reinterpret_cast(a)) + #define vec128_storeu(a, b) vst1q_u16(reinterpret_cast(a), b) + #define vec128_add(a, b) vaddq_u16(a, b) + + #define NumRegistersSIMD 16 + #define MaxChunkSize 16 + + #ifndef __aarch64__ +// Single instruction doesn't exist on 32-bit ARM +inline int16x8_t vmovl_high_s8(int8x16_t val) { return vmovl_s8(vget_high_s8(val)); } + #endif + +#else + #undef VECTOR + +#endif + +struct Vec16Wrapper { +#ifdef VECTOR + using type = vec_t; + static type add(const type& lhs, const type& rhs) { return vec_add_16(lhs, rhs); } + static type sub(const type& lhs, const type& rhs) { return vec_sub_16(lhs, rhs); } +#else + using type = BiasType; + static type add(const type& lhs, const type& rhs) { return lhs + rhs; } + static type sub(const type& lhs, const type& rhs) { return lhs - rhs; } +#endif +}; + +struct Vec32Wrapper { +#ifdef VECTOR + using type = psqt_vec_t; + static type add(const type& lhs, const type& rhs) { return vec_add_psqt_32(lhs, rhs); } + static type sub(const type& lhs, const type& rhs) { return vec_sub_psqt_32(lhs, rhs); } +#else + using type = PSQTWeightType; + static type add(const type& lhs, const type& rhs) { return lhs + rhs; } + static type sub(const type& lhs, const type& rhs) { return lhs - rhs; } +#endif +}; + +enum UpdateOperation { + Add, + Sub +}; + +template = true> +typename VecWrapper::type fused(const typename VecWrapper::type& in) { + return in; +} + +template, bool> = true, + std::enable_if_t = true> +typename 
VecWrapper::type +fused(const typename VecWrapper::type& in, const T& operand, const Ts&... operands) { + switch (update_op) + { + case Add : + return fused(VecWrapper::add(in, operand), operands...); + case Sub : + return fused(VecWrapper::sub(in, operand), operands...); + default : + static_assert(update_op == Add || update_op == Sub, + "Only Add and Sub are currently supported."); + return typename VecWrapper::type(); + } +} + +#if defined(USE_AVX512) + +[[maybe_unused]] static int m512_hadd(__m512i sum, int bias) { + return _mm512_reduce_add_epi32(sum) + bias; +} + +[[maybe_unused]] static void m512_add_dpbusd_epi32(__m512i& acc, __m512i a, __m512i b) { + + #if defined(USE_VNNI) + acc = _mm512_dpbusd_epi32(acc, a, b); + #else + __m512i product0 = _mm512_maddubs_epi16(a, b); + product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1)); + acc = _mm512_add_epi32(acc, product0); + #endif +} + +#endif + +#if defined(USE_AVX2) + +[[maybe_unused]] static int m256_hadd(__m256i sum, int bias) { + __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); + return _mm_cvtsi128_si32(sum128) + bias; +} + +[[maybe_unused]] static void m256_add_dpbusd_epi32(__m256i& acc, __m256i a, __m256i b) { + + #if defined(USE_VNNI) + acc = _mm256_dpbusd_epi32(acc, a, b); + #else + __m256i product0 = _mm256_maddubs_epi16(a, b); + product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1)); + acc = _mm256_add_epi32(acc, product0); + #endif +} + +#endif + +#if defined(USE_SSSE3) + +[[maybe_unused]] static int m128_hadd(__m128i sum, int bias) { + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB + return _mm_cvtsi128_si32(sum) + bias; +} + +[[maybe_unused]] static void m128_add_dpbusd_epi32(__m128i& acc, 
__m128i a, __m128i b) { + + __m128i product0 = _mm_maddubs_epi16(a, b); + product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1)); + acc = _mm_add_epi32(acc, product0); +} + +#endif + +#if defined(USE_NEON_DOTPROD) + +[[maybe_unused]] static void +dotprod_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) { + + acc = vdotq_s32(acc, a, b); +} +#endif + +#if defined(USE_NEON) + +[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) { + #if USE_NEON >= 8 + return vaddvq_s32(s); + #else + return s[0] + s[1] + s[2] + s[3]; + #endif +} + +[[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) { + return neon_m128_reduce_add_epi32(sum) + bias; +} + +#endif + +#if USE_NEON >= 8 +[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) { + + int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b)); + int16x8_t product1 = vmull_high_s8(a, b); + int16x8_t sum = vpaddq_s16(product0, product1); + acc = vpadalq_s16(acc, sum); +} +#endif + + +// Compute optimal SIMD register count for feature transformer accumulation. +template +class SIMDTiling { +#ifdef VECTOR + // We use __m* types as template arguments, which causes GCC to emit warnings + // about losing some attribute information. This is irrelevant to us as we + // only take their size, so the following pragma are harmless. 
+ #if defined(__GNUC__) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wignored-attributes" + #endif + + template + static constexpr int BestRegisterCount() { + constexpr std::size_t RegisterSize = sizeof(SIMDRegisterType); + constexpr std::size_t LaneSize = sizeof(LaneType); + + static_assert(RegisterSize >= LaneSize); + static_assert(MaxRegisters <= NumRegistersSIMD); + static_assert(MaxRegisters > 0); + static_assert(NumRegistersSIMD > 0); + static_assert(RegisterSize % LaneSize == 0); + static_assert((NumLanes * LaneSize) % RegisterSize == 0); + + const int ideal = (NumLanes * LaneSize) / RegisterSize; + if (ideal <= MaxRegisters) + return ideal; + + // Look for the largest divisor of the ideal register count that is smaller than MaxRegisters + for (int divisor = MaxRegisters; divisor > 1; --divisor) + if (ideal % divisor == 0) + return divisor; + + return 1; + } + + #if defined(__GNUC__) + #pragma GCC diagnostic pop + #endif + + public: + static constexpr int NumRegs = + BestRegisterCount(); + static constexpr int NumPsqtRegs = + BestRegisterCount(); + + static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2; + static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4; + + static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions"); + static_assert(PSQTBuckets % PsqtTileHeight == 0, "PsqtTileHeight must divide PSQTBuckets"); +#endif +}; +} + +#endif diff --git a/src/numa.h b/src/numa.h new file mode 100644 index 0000000000000000000000000000000000000000..afd868dd085ce26f15c198fc47dbc323abf019a8 --- /dev/null +++ b/src/numa.h @@ -0,0 +1,1718 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the 
License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef NUMA_H_INCLUDED +#define NUMA_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "shm.h" + +// We support linux very well, but we explicitly do NOT support Android, +// because there is no affected systems, not worth maintaining. +#if defined(__linux__) && !defined(__ANDROID__) + #if !defined(_GNU_SOURCE) + #define _GNU_SOURCE + #endif + #include +#elif defined(_WIN64) + + #if _WIN32_WINNT < 0x0601 + #undef _WIN32_WINNT + #define _WIN32_WINNT 0x0601 // Force to include needed API prototypes + #endif + +// On Windows each processor group can have up to 64 processors. 
+// https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups +static constexpr size_t WIN_PROCESSOR_GROUP_SIZE = 64; + + #if !defined(NOMINMAX) + #define NOMINMAX + #endif + #include + #if defined small + #undef small + #endif + +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadselectedcpusetmasks +using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT); + +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getthreadselectedcpusetmasks +using GetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT, PUSHORT); + +#endif + +#include "misc.h" + +namespace Stockfish { + +using CpuIndex = size_t; +using NumaIndex = size_t; + +inline CpuIndex get_hardware_concurrency() { + CpuIndex concurrency = std::thread::hardware_concurrency(); + + // Get all processors across all processor groups on windows, since + // hardware_concurrency() only returns the number of processors in + // the first group, because only these are available to std::thread. +#ifdef _WIN64 + concurrency = std::max(concurrency, GetActiveProcessorCount(ALL_PROCESSOR_GROUPS)); +#endif + + return concurrency; +} + +inline const CpuIndex SYSTEM_THREADS_NB = std::max(1, get_hardware_concurrency()); + +#if defined(_WIN64) + +struct WindowsAffinity { + std::optional> oldApi; + std::optional> newApi; + + // We also provide diagnostic for when the affinity is set to nullopt + // whether it was due to being indeterminate. If affinity is indeterminate + // it is best to assume it is not set at all, so consistent with the meaning + // of the nullopt affinity. 
+ bool isNewDeterminate = true; + bool isOldDeterminate = true; + + std::optional> get_combined() const { + if (!oldApi.has_value()) + return newApi; + if (!newApi.has_value()) + return oldApi; + + std::set intersect; + std::set_intersection(oldApi->begin(), oldApi->end(), newApi->begin(), newApi->end(), + std::inserter(intersect, intersect.begin())); + return intersect; + } + + // Since Windows 11 and Windows Server 2022 thread affinities can span + // processor groups and can be set as such by a new WinAPI function. However, + // we may need to force using the old API if we detect that the process has + // affinity set by the old API already and we want to override that. Due to the + // limitations of the old API we cannot detect its use reliably. There will be + // cases where we detect not use but it has actually been used and vice versa. + + bool likely_used_old_api() const { return oldApi.has_value() || !isOldDeterminate; } +}; + +inline std::pair> get_process_group_affinity() { + + // GetProcessGroupAffinity requires the GroupArray argument to be + // aligned to 4 bytes instead of just 2. + static constexpr size_t GroupArrayMinimumAlignment = 4; + static_assert(GroupArrayMinimumAlignment >= alignof(USHORT)); + + // The function should succeed the second time, but it may fail if the group + // affinity has changed between GetProcessGroupAffinity calls. In such case + // we consider this a hard error, as we Cannot work with unstable affinities + // anyway. 
+ static constexpr int MAX_TRIES = 2; + USHORT GroupCount = 1; + for (int i = 0; i < MAX_TRIES; ++i) + { + auto GroupArray = std::make_unique( + GroupCount + (GroupArrayMinimumAlignment / alignof(USHORT) - 1)); + + USHORT* GroupArrayAligned = align_ptr_up(GroupArray.get()); + + const BOOL status = + GetProcessGroupAffinity(GetCurrentProcess(), &GroupCount, GroupArrayAligned); + + if (status == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER) + { + break; + } + + if (status != 0) + { + return std::make_pair(status, + std::vector(GroupArrayAligned, GroupArrayAligned + GroupCount)); + } + } + + return std::make_pair(0, std::vector()); +} + +// On Windows there are two ways to set affinity, and therefore 2 ways to get it. +// These are not consistent, so we have to check both. In some cases it is actually +// not possible to determine affinity. For example when two different threads have +// affinity on different processor groups, set using SetThreadAffinityMask, we cannot +// retrieve the actual affinities. +// From documentation on GetProcessAffinityMask: +// > If the calling process contains threads in multiple groups, +// > the function returns zero for both affinity masks. +// In such cases we just give up and assume we have affinity for all processors. +// nullopt means no affinity is set, that is, all processors are allowed +inline WindowsAffinity get_process_affinity() { + HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); + auto GetThreadSelectedCpuSetMasks_f = GetThreadSelectedCpuSetMasks_t( + (void (*)()) GetProcAddress(k32, "GetThreadSelectedCpuSetMasks")); + + BOOL status = 0; + + WindowsAffinity affinity; + + if (GetThreadSelectedCpuSetMasks_f != nullptr) + { + USHORT RequiredMaskCount; + status = GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), nullptr, 0, &RequiredMaskCount); + + // We expect ERROR_INSUFFICIENT_BUFFER from GetThreadSelectedCpuSetMasks, + // but other failure is an actual error. 
+ if (status == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER) + { + affinity.isNewDeterminate = false; + } + else if (RequiredMaskCount > 0) + { + // If RequiredMaskCount then these affinities were never set, but it's + // not consistent so GetProcessAffinityMask may still return some affinity. + auto groupAffinities = std::make_unique(RequiredMaskCount); + + status = GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), groupAffinities.get(), + RequiredMaskCount, &RequiredMaskCount); + + if (status == 0) + { + affinity.isNewDeterminate = false; + } + else + { + std::set cpus; + + for (USHORT i = 0; i < RequiredMaskCount; ++i) + { + const size_t procGroupIndex = groupAffinities[i].Group; + + for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j) + { + if (groupAffinities[i].Mask & (KAFFINITY(1) << j)) + cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j); + } + } + + affinity.newApi = std::move(cpus); + } + } + } + + // NOTE: There is no way to determine full affinity using the old API if + // individual threads set affinity on different processor groups. + + DWORD_PTR proc, sys; + status = GetProcessAffinityMask(GetCurrentProcess(), &proc, &sys); + + // If proc == 0 then we cannot determine affinity because it spans processor groups. + // On Windows 11 and Server 2022 it will instead + // > If, however, hHandle specifies a handle to the current process, the function + // > always uses the calling thread's primary group (which by default is the same + // > as the process' primary group) in order to set the + // > lpProcessAffinityMask and lpSystemAffinityMask. + // So it will never be indeterminate here. We can only make assumptions later. + if (status == 0 || proc == 0) + { + affinity.isOldDeterminate = false; + return affinity; + } + + // If SetProcessAffinityMask was never called the affinity must span + // all processor groups, but if it was called it must only span one. 
+ + std::vector groupAffinity; // We need to capture this later and capturing + // from structured bindings requires c++20. + + std::tie(status, groupAffinity) = get_process_group_affinity(); + if (status == 0) + { + affinity.isOldDeterminate = false; + return affinity; + } + + if (groupAffinity.size() == 1) + { + // We detect the case when affinity is set to all processors and correctly + // leave affinity.oldApi as nullopt. + if (GetActiveProcessorGroupCount() != 1 || proc != sys) + { + std::set cpus; + + const size_t procGroupIndex = groupAffinity[0]; + + const uint64_t mask = static_cast(proc); + for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j) + { + if (mask & (KAFFINITY(1) << j)) + cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j); + } + + affinity.oldApi = std::move(cpus); + } + } + else + { + // If we got here it means that either SetProcessAffinityMask was never set + // or we're on Windows 11/Server 2022. + + // Since Windows 11 and Windows Server 2022 the behaviour of + // GetProcessAffinityMask changed: + // > If, however, hHandle specifies a handle to the current process, + // > the function always uses the calling thread's primary group + // > (which by default is the same as the process' primary group) + // > in order to set the lpProcessAffinityMask and lpSystemAffinityMask. + // In which case we can actually retrieve the full affinity. + + if (GetThreadSelectedCpuSetMasks_f != nullptr) + { + std::thread th([&]() { + std::set cpus; + bool isAffinityFull = true; + + for (auto procGroupIndex : groupAffinity) + { + const int numActiveProcessors = + GetActiveProcessorCount(static_cast(procGroupIndex)); + + // We have to schedule to two different processors + // and & the affinities we get. Otherwise our processor + // choice could influence the resulting affinity. + // We assume the processor IDs within the group are + // filled sequentially from 0. 
+ uint64_t procCombined = std::numeric_limits::max(); + uint64_t sysCombined = std::numeric_limits::max(); + + for (int i = 0; i < std::min(numActiveProcessors, 2); ++i) + { + GROUP_AFFINITY GroupAffinity; + std::memset(&GroupAffinity, 0, sizeof(GROUP_AFFINITY)); + GroupAffinity.Group = static_cast(procGroupIndex); + + GroupAffinity.Mask = static_cast(1) << i; + + status = + SetThreadGroupAffinity(GetCurrentThread(), &GroupAffinity, nullptr); + if (status == 0) + { + affinity.isOldDeterminate = false; + return; + } + + SwitchToThread(); + + DWORD_PTR proc2, sys2; + status = GetProcessAffinityMask(GetCurrentProcess(), &proc2, &sys2); + if (status == 0) + { + affinity.isOldDeterminate = false; + return; + } + + procCombined &= static_cast(proc2); + sysCombined &= static_cast(sys2); + } + + if (procCombined != sysCombined) + isAffinityFull = false; + + for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j) + { + if (procCombined & (KAFFINITY(1) << j)) + cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j); + } + } + + // We have to detect the case where the affinity was not set, + // or is set to all processors so that we correctly produce as + // std::nullopt result. 
+ if (!isAffinityFull) + { + affinity.oldApi = std::move(cpus); + } + }); + + th.join(); + } + } + + return affinity; +} + +// Type machinery used to emulate Cache->GroupCount + +template +struct HasGroupCount: std::false_type {}; + +template +struct HasGroupCount().Cache.GroupCount)>>: std::true_type { +}; + +template::value, bool> = true> +std::set readCacheMembers(const T* info, Pred&& is_cpu_allowed) { + std::set cpus; + // On Windows 10 this will read a 0 because GroupCount doesn't exist + int groupCount = std::max(info->Cache.GroupCount, WORD(1)); + for (WORD procGroup = 0; procGroup < groupCount; ++procGroup) + { + for (BYTE number = 0; number < WIN_PROCESSOR_GROUP_SIZE; ++number) + { + WORD groupNumber = info->Cache.GroupMasks[procGroup].Group; + const CpuIndex c = static_cast(groupNumber) * WIN_PROCESSOR_GROUP_SIZE + + static_cast(number); + if (!(info->Cache.GroupMasks[procGroup].Mask & (1ULL << number)) || !is_cpu_allowed(c)) + continue; + cpus.insert(c); + } + } + return cpus; +} + +template::value, bool> = true> +std::set readCacheMembers(const T* info, Pred&& is_cpu_allowed) { + std::set cpus; + for (BYTE number = 0; number < WIN_PROCESSOR_GROUP_SIZE; ++number) + { + WORD groupNumber = info->Cache.GroupMask.Group; + const CpuIndex c = static_cast(groupNumber) * WIN_PROCESSOR_GROUP_SIZE + + static_cast(number); + if (!(info->Cache.GroupMask.Mask & (1ULL << number)) || !is_cpu_allowed(c)) + continue; + cpus.insert(c); + } + return cpus; +} + +#endif + +#if defined(__linux__) && !defined(__ANDROID__) + +inline std::set get_process_affinity() { + + std::set cpus; + + // For unsupported systems, or in case of a soft error, we may assume + // all processors are available for use. + [[maybe_unused]] auto set_to_all_cpus = [&]() { + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + cpus.insert(c); + }; + + // cpu_set_t by default holds 1024 entries. 
This may not be enough soon, + // but there is no easy way to determine how many threads there actually + // is. In this case we just choose a reasonable upper bound. + static constexpr CpuIndex MaxNumCpus = 1024 * 64; + + cpu_set_t* mask = CPU_ALLOC(MaxNumCpus); + if (mask == nullptr) + std::exit(EXIT_FAILURE); + + const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus); + + CPU_ZERO_S(masksize, mask); + + const int status = sched_getaffinity(0, masksize, mask); + + if (status != 0) + { + CPU_FREE(mask); + std::exit(EXIT_FAILURE); + } + + for (CpuIndex c = 0; c < MaxNumCpus; ++c) + if (CPU_ISSET_S(c, masksize, mask)) + cpus.insert(c); + + CPU_FREE(mask); + + return cpus; +} + +#endif + +#if defined(__linux__) && !defined(__ANDROID__) + +inline static const auto STARTUP_PROCESSOR_AFFINITY = get_process_affinity(); + +#elif defined(_WIN64) + +inline static const auto STARTUP_PROCESSOR_AFFINITY = get_process_affinity(); +inline static const auto STARTUP_USE_OLD_AFFINITY_API = + STARTUP_PROCESSOR_AFFINITY.likely_used_old_api(); + +#endif + +// We want to abstract the purpose of storing the numa node index somewhat. +// Whoever is using this does not need to know the specifics of the replication +// machinery to be able to access NUMA replicated memory. 
+class NumaReplicatedAccessToken { + public: + NumaReplicatedAccessToken() : + n(0) {} + + explicit NumaReplicatedAccessToken(NumaIndex idx) : + n(idx) {} + + NumaIndex get_numa_index() const { return n; } + + private: + NumaIndex n; +}; + +struct L3Domain { + NumaIndex systemNumaIndex{}; + std::set cpus{}; +}; + +// Use system NUMA nodes +struct SystemNumaPolicy {}; +// Use system-reported L3 domains +struct L3DomainsPolicy {}; +// Group system-reported L3 domains until they reach bundleSize +struct BundledL3Policy { + size_t bundleSize; +}; + +using NumaAutoPolicy = std::variant; + +// Designed as immutable, because there is no good reason to alter an already +// existing config in a way that doesn't require recreating it completely, and +// it would be complex and expensive to maintain class invariants. +// The CPU (processor) numbers always correspond to the actual numbering used +// by the system. The NUMA node numbers MAY NOT correspond to the system's +// numbering of the NUMA nodes. In particular, by default, if the processor has +// non-uniform cache access within a NUMA node (i.e., a non-unified L3 cache structure), +// then L3 domains within a system NUMA node will be used to subdivide it +// into multiple logical NUMA nodes in the config. Additionally, empty nodes may +// be removed, or the user may create custom nodes. +// +// As a special case, when performing system-wide replication of read-only data +// (i.e., LazyNumaReplicatedSystemWide), the system NUMA node is used, rather than +// custom or L3-aware nodes. See that class's get_discriminator() function. +// +// It is guaranteed that NUMA nodes are NOT empty: every node exposed by NumaConfig +// has at least one processor assigned. +// +// We use startup affinities so as not to modify its own behaviour in time. +// +// Since Stockfish doesn't support exceptions all places where an exception +// should be thrown are replaced by std::exit. 
+class NumaConfig { + public: + NumaConfig() : + highestCpuIndex(0), + customAffinity(false) { + const auto numCpus = SYSTEM_THREADS_NB; + add_cpu_range_to_node(NumaIndex{0}, CpuIndex{0}, numCpus - 1); + } + + // This function gets a NumaConfig based on the system's provided information. + // The available policies are documented above. + static NumaConfig from_system([[maybe_unused]] const NumaAutoPolicy& policy, + bool respectProcessAffinity = true) { + NumaConfig cfg = empty(); + +#if !((defined(__linux__) && !defined(__ANDROID__)) || defined(_WIN64)) + // Fallback for unsupported systems. + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + cfg.add_cpu_to_node(NumaIndex{0}, c); +#else + + #if defined(_WIN64) + + std::optional> allowedCpus; + + if (respectProcessAffinity) + allowedCpus = STARTUP_PROCESSOR_AFFINITY.get_combined(); + + // The affinity cannot be determined in all cases on Windows, + // but we at least guarantee that the number of allowed processors + // is >= number of processors in the affinity mask. In case the user + // is not satisfied they must set the processor numbers explicitly. 
+ auto is_cpu_allowed = [&allowedCpus](CpuIndex c) { + return !allowedCpus.has_value() || allowedCpus->count(c) == 1; + }; + + #elif defined(__linux__) && !defined(__ANDROID__) + + std::set allowedCpus; + + if (respectProcessAffinity) + allowedCpus = STARTUP_PROCESSOR_AFFINITY; + + auto is_cpu_allowed = [respectProcessAffinity, &allowedCpus](CpuIndex c) { + return !respectProcessAffinity || allowedCpus.count(c) == 1; + }; + + #endif + + bool l3Success = false; + if (!std::holds_alternative(policy)) + { + size_t l3BundleSize = 0; + if (const auto* v = std::get_if(&policy)) + { + l3BundleSize = v->bundleSize; + } + if (auto l3Cfg = + try_get_l3_aware_config(respectProcessAffinity, l3BundleSize, is_cpu_allowed)) + { + cfg = std::move(*l3Cfg); + l3Success = true; + } + } + if (!l3Success) + cfg = from_system_numa(respectProcessAffinity, is_cpu_allowed); + + #if defined(_WIN64) + // Split the NUMA nodes to be contained within a group if necessary. + // This is needed between Windows 10 Build 20348 and Windows 11, because + // the new NUMA allocation behaviour was introduced while there was + // still no way to set thread affinity spanning multiple processor groups. + // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support + // We also do this is if need to force old API for some reason. + // + // 2024-08-26: It appears that we need to actually always force this behaviour. + // While Windows allows this to work now, such assignments have bad interaction + // with the scheduler - in particular it still prefers scheduling on the thread's + // "primary" node, even if it means scheduling SMT processors first. + // See https://github.com/official-stockfish/Stockfish/issues/5551 + // See https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups + // + // Each process is assigned a primary group at creation, and by default all + // of its threads' primary group is the same. 
Each thread's ideal processor + // is in the thread's primary group, so threads will preferentially be + // scheduled to processors on their primary group, but they are able to + // be scheduled to processors on any other group. + // + // used to be guarded by if (STARTUP_USE_OLD_AFFINITY_API) + { + NumaConfig splitCfg = empty(); + + NumaIndex splitNodeIndex = 0; + for (const auto& cpus : cfg.nodes) + { + if (cpus.empty()) + continue; + + size_t lastProcGroupIndex = *(cpus.begin()) / WIN_PROCESSOR_GROUP_SIZE; + for (CpuIndex c : cpus) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + if (procGroupIndex != lastProcGroupIndex) + { + splitNodeIndex += 1; + lastProcGroupIndex = procGroupIndex; + } + splitCfg.add_cpu_to_node(splitNodeIndex, c); + } + splitNodeIndex += 1; + } + + cfg = std::move(splitCfg); + } + #endif + +#endif + + // We have to ensure no empty NUMA nodes persist. + cfg.remove_empty_numa_nodes(); + + // If the user explicitly opts out from respecting the current process affinity + // then it may be inconsistent with the current affinity (obviously), so we + // consider it custom. 
+ if (!respectProcessAffinity) + cfg.customAffinity = true; + + return cfg; + } + + // ':'-separated numa nodes + // ','-separated cpu indices + // supports "first-last" range syntax for cpu indices + // For example "0-15,128-143:16-31,144-159:32-47,160-175:48-63,176-191" + static NumaConfig from_string(const std::string& s) { + NumaConfig cfg = empty(); + + NumaIndex n = 0; + for (auto&& nodeStr : split(s, ":")) + { + auto indices = indices_from_shortened_string(std::string(nodeStr)); + if (!indices.empty()) + { + for (auto idx : indices) + { + if (!cfg.add_cpu_to_node(n, CpuIndex(idx))) + std::exit(EXIT_FAILURE); + } + + n += 1; + } + } + + cfg.customAffinity = true; + + return cfg; + } + + NumaConfig(const NumaConfig&) = delete; + NumaConfig(NumaConfig&&) = default; + NumaConfig& operator=(const NumaConfig&) = delete; + NumaConfig& operator=(NumaConfig&&) = default; + + bool is_cpu_assigned(CpuIndex n) const { return nodeByCpu.count(n) == 1; } + + NumaIndex num_numa_nodes() const { return nodes.size(); } + + CpuIndex num_cpus_in_numa_node(NumaIndex n) const { + assert(n < nodes.size()); + return nodes[n].size(); + } + + CpuIndex num_cpus() const { return nodeByCpu.size(); } + + bool requires_memory_replication() const { return customAffinity || nodes.size() > 1; } + + std::string to_string() const { + std::string str; + + bool isFirstNode = true; + for (auto&& cpus : nodes) + { + if (!isFirstNode) + str += ":"; + + bool isFirstSet = true; + auto rangeStart = cpus.begin(); + for (auto it = cpus.begin(); it != cpus.end(); ++it) + { + auto next = std::next(it); + if (next == cpus.end() || *next != *it + 1) + { + // cpus[i] is at the end of the range (may be of size 1) + if (!isFirstSet) + str += ","; + + const CpuIndex last = *it; + + if (it != rangeStart) + { + const CpuIndex first = *rangeStart; + + str += std::to_string(first); + str += "-"; + str += std::to_string(last); + } + else + str += std::to_string(last); + + rangeStart = next; + isFirstSet = false; + } 
+ } + + isFirstNode = false; + } + + return str; + } + + bool suggests_binding_threads(CpuIndex numThreads) const { + // If we can reasonably determine that the threads cannot be contained + // by the OS within the first NUMA node then we advise distributing + // and binding threads. When the threads are not bound we can only use + // NUMA memory replicated objects from the first node, so when the OS + // has to schedule on other nodes we lose performance. We also suggest + // binding if there's enough threads to distribute among nodes with minimal + // disparity. We try to ignore small nodes, in particular the empty ones. + + // If the affinity set by the user does not match the affinity given by + // the OS then binding is necessary to ensure the threads are running on + // correct processors. + if (customAffinity) + return true; + + // We obviously cannot distribute a single thread, so a single thread + // should never be bound. + if (numThreads <= 1) + return false; + + size_t largestNodeSize = 0; + for (auto&& cpus : nodes) + if (cpus.size() > largestNodeSize) + largestNodeSize = cpus.size(); + + auto is_node_small = [largestNodeSize](const std::set& node) { + static constexpr double SmallNodeThreshold = 0.6; + return static_cast(node.size()) / static_cast(largestNodeSize) + <= SmallNodeThreshold; + }; + + size_t numNotSmallNodes = 0; + for (auto&& cpus : nodes) + if (!is_node_small(cpus)) + numNotSmallNodes += 1; + + return (numThreads > largestNodeSize / 2 || numThreads >= numNotSmallNodes * 4) + && nodes.size() > 1; + } + + std::vector distribute_threads_among_numa_nodes(CpuIndex numThreads) const { + std::vector ns; + + if (nodes.size() == 1) + { + // Special case for when there's no NUMA nodes. This doesn't buy us + // much, but let's keep the default path simple. 
+ ns.resize(numThreads, NumaIndex{0}); + } + else + { + std::vector occupation(nodes.size(), 0); + for (CpuIndex c = 0; c < numThreads; ++c) + { + NumaIndex bestNode{0}; + float bestNodeFill = std::numeric_limits::max(); + for (NumaIndex n = 0; n < nodes.size(); ++n) + { + float fill = + static_cast(occupation[n] + 1) / static_cast(nodes[n].size()); + // NOTE: Do we want to perhaps fill the first available node + // up to 50% first before considering other nodes? + // Probably not, because it would interfere with running + // multiple instances. We basically shouldn't favor any + // particular node. + if (fill < bestNodeFill) + { + bestNode = n; + bestNodeFill = fill; + } + } + ns.emplace_back(bestNode); + occupation[bestNode] += 1; + } + } + + return ns; + } + + NumaReplicatedAccessToken bind_current_thread_to_numa_node(NumaIndex n) const { + if (n >= nodes.size() || nodes[n].size() == 0) + std::exit(EXIT_FAILURE); + +#if defined(__linux__) && !defined(__ANDROID__) + + cpu_set_t* mask = CPU_ALLOC(highestCpuIndex + 1); + if (mask == nullptr) + std::exit(EXIT_FAILURE); + + const size_t masksize = CPU_ALLOC_SIZE(highestCpuIndex + 1); + + CPU_ZERO_S(masksize, mask); + + for (CpuIndex c : nodes[n]) + CPU_SET_S(c, masksize, mask); + + const int status = sched_setaffinity(0, masksize, mask); + + CPU_FREE(mask); + + if (status != 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. + // This is defensive, allowed because this code is not performance critical. + sched_yield(); + +#elif defined(_WIN64) + + // Requires Windows 11. No good way to set thread affinity spanning + // processor groups before that. 
+ HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); + auto SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t( + (void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks")); + + // We ALWAYS set affinity with the new API if available, because + // there's no downsides, and we forcibly keep it consistent with + // the old API should we need to use it. I.e. we always keep this + // as a superset of what we set with SetThreadGroupAffinity. + if (SetThreadSelectedCpuSetMasks_f != nullptr) + { + // Only available on Windows 11 and Windows Server 2022 onwards + const USHORT numProcGroups = USHORT( + ((highestCpuIndex + 1) + WIN_PROCESSOR_GROUP_SIZE - 1) / WIN_PROCESSOR_GROUP_SIZE); + auto groupAffinities = std::make_unique(numProcGroups); + std::memset(groupAffinities.get(), 0, sizeof(GROUP_AFFINITY) * numProcGroups); + for (WORD i = 0; i < numProcGroups; ++i) + groupAffinities[i].Group = i; + + for (CpuIndex c : nodes[n]) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE; + groupAffinities[procGroupIndex].Mask |= KAFFINITY(1) << idxWithinProcGroup; + } + + HANDLE hThread = GetCurrentThread(); + + const BOOL status = + SetThreadSelectedCpuSetMasks_f(hThread, groupAffinities.get(), numProcGroups); + if (status == 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. + // This is defensive, allowed because this code is not performance critical. + SwitchToThread(); + } + + // Sometimes we need to force the old API, but do not use it unless necessary. + if (SetThreadSelectedCpuSetMasks_f == nullptr || STARTUP_USE_OLD_AFFINITY_API) + { + // On earlier windows version (since windows 7) we cannot run a single thread + // on multiple processor groups, so we need to restrict the group. + // We assume the group of the first processor listed for this node. + // Processors from outside this group will not be assigned for this thread. 
+ // Normally this won't be an issue because windows used to assign NUMA nodes + // such that they cannot span processor groups. However, since Windows 10 + // Build 20348 the behaviour changed, so there's a small window of versions + // between this and Windows 11 that might exhibit problems with not all + // processors being utilized. + // + // We handle this in NumaConfig::from_system by manually splitting the + // nodes when we detect that there is no function to set affinity spanning + // processor nodes. This is required because otherwise our thread distribution + // code may produce suboptimal results. + // + // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support + GROUP_AFFINITY affinity; + std::memset(&affinity, 0, sizeof(GROUP_AFFINITY)); + // We use an ordered set to be sure to get the smallest cpu number here. + const size_t forcedProcGroupIndex = *(nodes[n].begin()) / WIN_PROCESSOR_GROUP_SIZE; + affinity.Group = static_cast(forcedProcGroupIndex); + for (CpuIndex c : nodes[n]) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE; + // We skip processors that are not in the same processor group. + // If everything was set up correctly this will never be an issue, + // but we have to account for bad NUMA node specification. + if (procGroupIndex != forcedProcGroupIndex) + continue; + + affinity.Mask |= KAFFINITY(1) << idxWithinProcGroup; + } + + HANDLE hThread = GetCurrentThread(); + + const BOOL status = SetThreadGroupAffinity(hThread, &affinity, nullptr); + if (status == 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. This is + // defensive, allowed because this code is not performance critical. 
+ SwitchToThread(); + } + +#endif + + return NumaReplicatedAccessToken(n); + } + + template + void execute_on_numa_node(NumaIndex n, FuncT&& f) const { + std::thread th([this, &f, n]() { + bind_current_thread_to_numa_node(n); + std::forward(f)(); + }); + + th.join(); + } + + std::vector> nodes; + std::map nodeByCpu; + + private: + CpuIndex highestCpuIndex; + + bool customAffinity; + + static NumaConfig empty() { return NumaConfig(EmptyNodeTag{}); } + + struct EmptyNodeTag {}; + + NumaConfig(EmptyNodeTag) : + highestCpuIndex(0), + customAffinity(false) {} + + void remove_empty_numa_nodes() { + std::vector> newNodes; + for (auto&& cpus : nodes) + if (!cpus.empty()) + newNodes.emplace_back(std::move(cpus)); + nodes = std::move(newNodes); + } + + // Returns true if successful + // Returns false if failed, i.e. when the cpu is already present + // strong guarantee, the structure remains unmodified + bool add_cpu_to_node(NumaIndex n, CpuIndex c) { + if (is_cpu_assigned(c)) + return false; + + while (nodes.size() <= n) + nodes.emplace_back(); + + nodes[n].insert(c); + nodeByCpu[c] = n; + + if (c > highestCpuIndex) + highestCpuIndex = c; + + return true; + } + + // Returns true if successful + // Returns false if failed, i.e. 
when any of the cpus is already present + // strong guarantee, the structure remains unmodified + bool add_cpu_range_to_node(NumaIndex n, CpuIndex cfirst, CpuIndex clast) { + for (CpuIndex c = cfirst; c <= clast; ++c) + if (is_cpu_assigned(c)) + return false; + + while (nodes.size() <= n) + nodes.emplace_back(); + + for (CpuIndex c = cfirst; c <= clast; ++c) + { + nodes[n].insert(c); + nodeByCpu[c] = n; + } + + if (clast > highestCpuIndex) + highestCpuIndex = clast; + + return true; + } + + static std::vector indices_from_shortened_string(const std::string& s) { + std::vector indices; + + if (s.empty()) + return indices; + + for (const auto& ss : split(s, ",")) + { + if (ss.empty()) + continue; + + auto parts = split(ss, "-"); + if (parts.size() == 1) + { + const CpuIndex c = CpuIndex{str_to_size_t(std::string(parts[0]))}; + indices.emplace_back(c); + } + else if (parts.size() == 2) + { + const CpuIndex cfirst = CpuIndex{str_to_size_t(std::string(parts[0]))}; + const CpuIndex clast = CpuIndex{str_to_size_t(std::string(parts[1]))}; + for (size_t c = cfirst; c <= clast; ++c) + { + indices.emplace_back(c); + } + } + } + + return indices; + } + + // This function queries the system for the mapping of processors to NUMA nodes. + // On Linux we read from standardized kernel sysfs, with a fallback to single NUMA + // node. On Windows we utilize GetNumaProcessorNodeEx, which has its quirks, see + // comment for Windows implementation of get_process_affinity. + template + static NumaConfig from_system_numa([[maybe_unused]] bool respectProcessAffinity, + [[maybe_unused]] Pred&& is_cpu_allowed) { + NumaConfig cfg = empty(); + +#if defined(__linux__) && !defined(__ANDROID__) + + // On Linux things are straightforward, since there's no processor groups and + // any thread can be scheduled on all processors. 
+ // We try to gather this information from the sysfs first + // https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-devices-node + + bool useFallback = false; + auto fallback = [&]() { + useFallback = true; + cfg = empty(); + }; + + // /sys/devices/system/node/online contains information about active NUMA nodes + auto nodeIdsStr = read_file_to_string("/sys/devices/system/node/online"); + if (!nodeIdsStr.has_value() || nodeIdsStr->empty()) + { + fallback(); + } + else + { + remove_whitespace(*nodeIdsStr); + for (size_t n : indices_from_shortened_string(*nodeIdsStr)) + { + // /sys/devices/system/node/node.../cpulist + std::string path = + std::string("/sys/devices/system/node/node") + std::to_string(n) + "/cpulist"; + auto cpuIdsStr = read_file_to_string(path); + // Now, we only bail if the file does not exist. Some nodes may be + // empty, that's fine. An empty node still has a file that appears + // to have some whitespace, so we need to handle that. + if (!cpuIdsStr.has_value()) + { + fallback(); + break; + } + else + { + remove_whitespace(*cpuIdsStr); + for (size_t c : indices_from_shortened_string(*cpuIdsStr)) + { + if (is_cpu_allowed(c)) + cfg.add_cpu_to_node(n, c); + } + } + } + } + + if (useFallback) + { + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + if (is_cpu_allowed(c)) + cfg.add_cpu_to_node(NumaIndex{0}, c); + } + +#elif defined(_WIN64) + + WORD numProcGroups = GetActiveProcessorGroupCount(); + for (WORD procGroup = 0; procGroup < numProcGroups; ++procGroup) + { + for (BYTE number = 0; number < WIN_PROCESSOR_GROUP_SIZE; ++number) + { + PROCESSOR_NUMBER procnum; + procnum.Group = procGroup; + procnum.Number = number; + procnum.Reserved = 0; + USHORT nodeNumber; + + const BOOL status = GetNumaProcessorNodeEx(&procnum, &nodeNumber); + const CpuIndex c = static_cast(procGroup) * WIN_PROCESSOR_GROUP_SIZE + + static_cast(number); + if (status != 0 && nodeNumber != std::numeric_limits::max() + && is_cpu_allowed(c)) + { + 
cfg.add_cpu_to_node(nodeNumber, c); + } + } + } + +#else + + abort(); // should not reach here + +#endif + + return cfg; + } + + template + static std::optional try_get_l3_aware_config( + bool respectProcessAffinity, size_t bundleSize, [[maybe_unused]] Pred&& is_cpu_allowed) { + // Get the normal system configuration so we know to which NUMA node + // each L3 domain belongs. + NumaConfig systemConfig = + NumaConfig::from_system(SystemNumaPolicy{}, respectProcessAffinity); + std::vector l3Domains; + +#if defined(__linux__) && !defined(__ANDROID__) + + std::set seenCpus; + auto nextUnseenCpu = [&seenCpus]() { + for (CpuIndex i = 0;; ++i) + if (!seenCpus.count(i)) + return i; + }; + + while (true) + { + CpuIndex next = nextUnseenCpu(); + auto siblingsStr = + read_file_to_string("/sys/devices/system/cpu/cpu" + std::to_string(next) + + "/cache/index3/shared_cpu_list"); + + if (!siblingsStr.has_value() || siblingsStr->empty()) + { + break; // we have read all available CPUs + } + + L3Domain domain; + for (size_t c : indices_from_shortened_string(*siblingsStr)) + { + if (is_cpu_allowed(c)) + { + domain.systemNumaIndex = systemConfig.nodeByCpu.at(c); + domain.cpus.insert(c); + } + seenCpus.insert(c); + } + if (!domain.cpus.empty()) + { + l3Domains.emplace_back(std::move(domain)); + } + } + +#elif defined(_WIN64) + + DWORD bufSize = 0; + GetLogicalProcessorInformationEx(RelationCache, nullptr, &bufSize); + if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) + return std::nullopt; + + std::vector buffer(bufSize); + auto info = reinterpret_cast(buffer.data()); + if (!GetLogicalProcessorInformationEx(RelationCache, info, &bufSize)) + return std::nullopt; + + while (reinterpret_cast(info) < buffer.data() + bufSize) + { + info = std::launder(info); + if (info->Relationship == RelationCache && info->Cache.Level == 3) + { + L3Domain domain{}; + domain.cpus = readCacheMembers(info, is_cpu_allowed); + if (!domain.cpus.empty()) + { + domain.systemNumaIndex = 
systemConfig.nodeByCpu.at(*domain.cpus.begin()); + l3Domains.push_back(std::move(domain)); + } + } + // Variable length data structure, advance to next + info = reinterpret_cast( + reinterpret_cast(info) + info->Size); + } +#endif + + if (!l3Domains.empty()) + return {NumaConfig::from_l3_info(std::move(l3Domains), bundleSize)}; + + return std::nullopt; + } + + + static NumaConfig from_l3_info(std::vector&& domains, size_t bundleSize) { + assert(!domains.empty()); + + std::map> list; + for (auto& d : domains) + list[d.systemNumaIndex].emplace_back(std::move(d)); + + NumaConfig cfg = empty(); + NumaIndex n = 0; + for (auto& [_, ds] : list) + { + bool changed; + // Scan through pairs and merge them. With roughly equal L3 sizes, should give + // a decent distribution. + do + { + changed = false; + for (size_t j = 0; j + 1 < ds.size(); ++j) + { + if (ds[j].cpus.size() + ds[j + 1].cpus.size() <= bundleSize) + { + changed = true; + ds[j].cpus.merge(ds[j + 1].cpus); + ds.erase(ds.begin() + j + 1); + } + } + // ds.size() has decreased if changed is true, so this loop will terminate + } while (changed); + for (const L3Domain& d : ds) + { + const NumaIndex dn = n++; + for (CpuIndex cpu : d.cpus) + { + cfg.add_cpu_to_node(dn, cpu); + } + } + } + return cfg; + } +}; + +class NumaReplicationContext; + +// Instances of this class are tracked by the NumaReplicationContext instance. +// NumaReplicationContext informs all tracked instances when NUMA configuration changes. 
+class NumaReplicatedBase { + public: + NumaReplicatedBase(NumaReplicationContext& ctx); + + NumaReplicatedBase(const NumaReplicatedBase&) = delete; + NumaReplicatedBase(NumaReplicatedBase&& other) noexcept; + + NumaReplicatedBase& operator=(const NumaReplicatedBase&) = delete; + NumaReplicatedBase& operator=(NumaReplicatedBase&& other) noexcept; + + virtual void on_numa_config_changed() = 0; + virtual ~NumaReplicatedBase(); + + const NumaConfig& get_numa_config() const; + + private: + NumaReplicationContext* context; +}; + +// We force boxing with a unique_ptr. If this becomes an issue due to added +// indirection we may need to add an option for a custom boxing type. When the +// NUMA config changes the value stored at the index 0 is replicated to other nodes. +template +class NumaReplicated: public NumaReplicatedBase { + public: + using ReplicatorFuncType = std::function; + + NumaReplicated(NumaReplicationContext& ctx) : + NumaReplicatedBase(ctx) { + replicate_from(T{}); + } + + NumaReplicated(NumaReplicationContext& ctx, T&& source) : + NumaReplicatedBase(ctx) { + replicate_from(std::move(source)); + } + + NumaReplicated(const NumaReplicated&) = delete; + NumaReplicated(NumaReplicated&& other) noexcept : + NumaReplicatedBase(std::move(other)), + instances(std::exchange(other.instances, {})) {} + + NumaReplicated& operator=(const NumaReplicated&) = delete; + NumaReplicated& operator=(NumaReplicated&& other) noexcept { + NumaReplicatedBase::operator=(*this, std::move(other)); + instances = std::exchange(other.instances, {}); + + return *this; + } + + NumaReplicated& operator=(T&& source) { + replicate_from(std::move(source)); + + return *this; + } + + ~NumaReplicated() override = default; + + const T& operator[](NumaReplicatedAccessToken token) const { + assert(token.get_numa_index() < instances.size()); + return *(instances[token.get_numa_index()]); + } + + const T& operator*() const { return *(instances[0]); } + + const T* operator->() const { return 
instances[0].get(); } + + template + void modify_and_replicate(FuncT&& f) { + auto source = std::move(instances[0]); + std::forward(f)(*source); + replicate_from(std::move(*source)); + } + + void on_numa_config_changed() override { + // Use the first one as the source. It doesn't matter which one we use, + // because they all must be identical, but the first one is guaranteed to exist. + auto source = std::move(instances[0]); + replicate_from(std::move(*source)); + } + + private: + std::vector> instances; + + void replicate_from(T&& source) { + instances.clear(); + + const NumaConfig& cfg = get_numa_config(); + if (cfg.requires_memory_replication()) + { + for (NumaIndex n = 0; n < cfg.num_numa_nodes(); ++n) + { + cfg.execute_on_numa_node( + n, [this, &source]() { instances.emplace_back(std::make_unique(source)); }); + } + } + else + { + assert(cfg.num_numa_nodes() == 1); + // We take advantage of the fact that replication is not required + // and reuse the source value, avoiding one copy operation. + instances.emplace_back(std::make_unique(std::move(source))); + } + } +}; + +// We force boxing with a unique_ptr. If this becomes an issue due to added +// indirection we may need to add an option for a custom boxing type. 
+template +class LazyNumaReplicated: public NumaReplicatedBase { + public: + using ReplicatorFuncType = std::function; + + LazyNumaReplicated(NumaReplicationContext& ctx) : + NumaReplicatedBase(ctx) { + prepare_replicate_from(T{}); + } + + LazyNumaReplicated(NumaReplicationContext& ctx, T&& source) : + NumaReplicatedBase(ctx) { + prepare_replicate_from(std::move(source)); + } + + LazyNumaReplicated(const LazyNumaReplicated&) = delete; + LazyNumaReplicated(LazyNumaReplicated&& other) noexcept : + NumaReplicatedBase(std::move(other)), + instances(std::exchange(other.instances, {})) {} + + LazyNumaReplicated& operator=(const LazyNumaReplicated&) = delete; + LazyNumaReplicated& operator=(LazyNumaReplicated&& other) noexcept { + NumaReplicatedBase::operator=(*this, std::move(other)); + instances = std::exchange(other.instances, {}); + + return *this; + } + + LazyNumaReplicated& operator=(T&& source) { + prepare_replicate_from(std::move(source)); + + return *this; + } + + ~LazyNumaReplicated() override = default; + + const T& operator[](NumaReplicatedAccessToken token) const { + assert(token.get_numa_index() < instances.size()); + ensure_present(token.get_numa_index()); + return *(instances[token.get_numa_index()]); + } + + const T& operator*() const { return *(instances[0]); } + + const T* operator->() const { return instances[0].get(); } + + template + void modify_and_replicate(FuncT&& f) { + auto source = std::move(instances[0]); + std::forward(f)(*source); + prepare_replicate_from(std::move(*source)); + } + + void on_numa_config_changed() override { + // Use the first one as the source. It doesn't matter which one we use, + // because they all must be identical, but the first one is guaranteed to exist. 
+ auto source = std::move(instances[0]); + prepare_replicate_from(std::move(*source)); + } + + private: + mutable std::vector> instances; + mutable std::mutex mutex; + + void ensure_present(NumaIndex idx) const { + assert(idx < instances.size()); + + if (instances[idx] != nullptr) + return; + + assert(idx != 0); + + std::unique_lock lock(mutex); + // Check again for races. + if (instances[idx] != nullptr) + return; + + const NumaConfig& cfg = get_numa_config(); + cfg.execute_on_numa_node( + idx, [this, idx]() { instances[idx] = std::make_unique(*instances[0]); }); + } + + void prepare_replicate_from(T&& source) { + instances.clear(); + + const NumaConfig& cfg = get_numa_config(); + if (cfg.requires_memory_replication()) + { + assert(cfg.num_numa_nodes() > 0); + + // We just need to make sure the first instance is there. + // Note that we cannot move here as we need to reallocate the data + // on the correct NUMA node. + cfg.execute_on_numa_node( + 0, [this, &source]() { instances.emplace_back(std::make_unique(source)); }); + + // Prepare others for lazy init. + instances.resize(cfg.num_numa_nodes()); + } + else + { + assert(cfg.num_numa_nodes() == 1); + // We take advantage of the fact that replication is not required + // and reuse the source value, avoiding one copy operation. + instances.emplace_back(std::make_unique(std::move(source))); + } + } +}; + +// Utilizes shared memory. 
+template +class LazyNumaReplicatedSystemWide: public NumaReplicatedBase { + public: + using ReplicatorFuncType = std::function; + + LazyNumaReplicatedSystemWide(NumaReplicationContext& ctx) : + NumaReplicatedBase(ctx) { + prepare_replicate_from(std::make_unique()); + } + + LazyNumaReplicatedSystemWide(NumaReplicationContext& ctx, std::unique_ptr&& source) : + NumaReplicatedBase(ctx) { + prepare_replicate_from(std::move(source)); + } + + LazyNumaReplicatedSystemWide(const LazyNumaReplicatedSystemWide&) = delete; + LazyNumaReplicatedSystemWide(LazyNumaReplicatedSystemWide&& other) noexcept : + NumaReplicatedBase(std::move(other)), + instances(std::exchange(other.instances, {})) {} + + LazyNumaReplicatedSystemWide& operator=(const LazyNumaReplicatedSystemWide&) = delete; + LazyNumaReplicatedSystemWide& operator=(LazyNumaReplicatedSystemWide&& other) noexcept { + NumaReplicatedBase::operator=(*this, std::move(other)); + instances = std::exchange(other.instances, {}); + + return *this; + } + + LazyNumaReplicatedSystemWide& operator=(std::unique_ptr&& source) { + prepare_replicate_from(std::move(source)); + + return *this; + } + + ~LazyNumaReplicatedSystemWide() override = default; + + const T& operator[](NumaReplicatedAccessToken token) const { + assert(token.get_numa_index() < instances.size()); + ensure_present(token.get_numa_index()); + return *(instances[token.get_numa_index()]); + } + + const T& operator*() const { return *(instances[0]); } + + const T* operator->() const { return &*instances[0]; } + + std::vector>> + get_status_and_errors() const { + std::vector>> + status; + status.reserve(instances.size()); + + for (const auto& instance : instances) + { + status.emplace_back(instance.get_status(), instance.get_error_message()); + } + + return status; + } + + template + void modify_and_replicate(FuncT&& f) { + auto source = std::make_unique(*instances[0]); + std::forward(f)(*source); + prepare_replicate_from(std::move(source)); + } + + void 
on_numa_config_changed() override { + // Use the first one as the source. It doesn't matter which one we use, + // because they all must be identical, but the first one is guaranteed to exist. + auto source = std::make_unique(*instances[0]); + prepare_replicate_from(std::move(source)); + } + + private: + mutable std::vector> instances; + mutable std::mutex mutex; + + std::size_t get_discriminator(NumaIndex idx) const { + const NumaConfig& cfg = get_numa_config(); + const NumaConfig& cfg_sys = NumaConfig::from_system(SystemNumaPolicy{}, false); + // as a discriminator, locate the hardware/system numadomain this cpuindex belongs to + CpuIndex cpu = *cfg.nodes[idx].begin(); // get a CpuIndex from NumaIndex + NumaIndex sys_idx = cfg_sys.is_cpu_assigned(cpu) ? cfg_sys.nodeByCpu.at(cpu) : 0; + std::string s = cfg_sys.to_string() + "$" + std::to_string(sys_idx); + return static_cast(hash_string(s)); + } + + void ensure_present(NumaIndex idx) const { + assert(idx < instances.size()); + + if (instances[idx] != nullptr) + return; + + assert(idx != 0); + + std::unique_lock lock(mutex); + // Check again for races. + if (instances[idx] != nullptr) + return; + + const NumaConfig& cfg = get_numa_config(); + cfg.execute_on_numa_node(idx, [this, idx]() { + instances[idx] = SystemWideSharedConstant(*instances[0], get_discriminator(idx)); + }); + } + + void prepare_replicate_from(std::unique_ptr&& source) { + instances.clear(); + + const NumaConfig& cfg = get_numa_config(); + // We just need to make sure the first instance is there. + // Note that we cannot move here as we need to reallocate the data + // on the correct NUMA node. + // Even in the case of a single NUMA node we have to copy since it's shared memory. + if (cfg.requires_memory_replication()) + { + assert(cfg.num_numa_nodes() > 0); + + cfg.execute_on_numa_node(0, [this, &source]() { + instances.emplace_back(SystemWideSharedConstant(*source, get_discriminator(0))); + }); + + // Prepare others for lazy init. 
+ instances.resize(cfg.num_numa_nodes()); + } + else + { + assert(cfg.num_numa_nodes() == 1); + instances.emplace_back(SystemWideSharedConstant(*source, get_discriminator(0))); + } + } +}; + +class NumaReplicationContext { + public: + NumaReplicationContext(NumaConfig&& cfg) : + config(std::move(cfg)) {} + + NumaReplicationContext(const NumaReplicationContext&) = delete; + NumaReplicationContext(NumaReplicationContext&&) = delete; + + NumaReplicationContext& operator=(const NumaReplicationContext&) = delete; + NumaReplicationContext& operator=(NumaReplicationContext&&) = delete; + + ~NumaReplicationContext() { + // The context must outlive replicated objects + if (!trackedReplicatedObjects.empty()) + std::exit(EXIT_FAILURE); + } + + void attach(NumaReplicatedBase* obj) { + assert(trackedReplicatedObjects.count(obj) == 0); + trackedReplicatedObjects.insert(obj); + } + + void detach(NumaReplicatedBase* obj) { + assert(trackedReplicatedObjects.count(obj) == 1); + trackedReplicatedObjects.erase(obj); + } + + // oldObj may be invalid at this point + void move_attached([[maybe_unused]] NumaReplicatedBase* oldObj, NumaReplicatedBase* newObj) { + assert(trackedReplicatedObjects.count(oldObj) == 1); + assert(trackedReplicatedObjects.count(newObj) == 0); + trackedReplicatedObjects.erase(oldObj); + trackedReplicatedObjects.insert(newObj); + } + + void set_numa_config(NumaConfig&& cfg) { + config = std::move(cfg); + for (auto&& obj : trackedReplicatedObjects) + obj->on_numa_config_changed(); + } + + const NumaConfig& get_numa_config() const { return config; } + + private: + NumaConfig config; + + // std::set uses std::less by default, which is required for pointer comparison + std::set trackedReplicatedObjects; +}; + +inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicationContext& ctx) : + context(&ctx) { + context->attach(this); +} + +inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicatedBase&& other) noexcept : + context(std::exchange(other.context, nullptr)) { + 
context->move_attached(&other, this); +} + +inline NumaReplicatedBase& NumaReplicatedBase::operator=(NumaReplicatedBase&& other) noexcept { + context = std::exchange(other.context, nullptr); + + context->move_attached(&other, this); + + return *this; +} + +inline NumaReplicatedBase::~NumaReplicatedBase() { + if (context != nullptr) + context->detach(this); +} + +inline const NumaConfig& NumaReplicatedBase::get_numa_config() const { + return context->get_numa_config(); +} + +} // namespace Stockfish + + +#endif // #ifndef NUMA_H_INCLUDED diff --git a/src/perft.h b/src/perft.h new file mode 100644 index 0000000000000000000000000000000000000000..24d125cbf4739681549069e0afa3447bb468b77d --- /dev/null +++ b/src/perft.h @@ -0,0 +1,67 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef PERFT_H_INCLUDED +#define PERFT_H_INCLUDED + +#include + +#include "movegen.h" +#include "position.h" +#include "types.h" +#include "uci.h" + +namespace Stockfish::Benchmark { + +// Utility to verify move generation. All the leaf nodes up +// to the given depth are generated and counted, and the sum is returned. 
+template +uint64_t perft(Position& pos, Depth depth) { + + StateInfo st; + + uint64_t cnt, nodes = 0; + const bool leaf = (depth == 2); + + for (const auto& m : MoveList(pos)) + { + if (Root && depth <= 1) + cnt = 1, nodes++; + else + { + pos.do_move(m, st); + cnt = leaf ? MoveList(pos).size() : perft(pos, depth - 1); + nodes += cnt; + pos.undo_move(m); + } + if (Root) + sync_cout << UCIEngine::move(m, pos.is_chess960()) << ": " << cnt << sync_endl; + } + return nodes; +} + +inline uint64_t perft(const std::string& fen, Depth depth, bool isChess960) { + StateInfo st; + Position p; + p.set(fen, isChess960, &st); + + return perft(p, depth); +} +} + +#endif // PERFT_H_INCLUDED diff --git a/src/position.cpp b/src/position.cpp new file mode 100644 index 0000000000000000000000000000000000000000..daadf59ecb57079dea4bf3c61ea064b9b4564ac9 --- /dev/null +++ b/src/position.cpp @@ -0,0 +1,1566 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "position.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bitboard.h" +#include "history.h" +#include "misc.h" +#include "movegen.h" +#include "syzygy/tbprobe.h" +#include "tt.h" +#include "uci.h" + +using std::string; + +namespace Stockfish { + +namespace Zobrist { + +Key psq[PIECE_NB][SQUARE_NB]; +Key enpassant[FILE_NB]; +Key castling[CASTLING_RIGHT_NB]; +Key side, noPawns; + +} + +namespace { + +constexpr std::string_view PieceToChar(" PNBRQK pnbrqk"); + +static constexpr Piece Pieces[] = {W_PAWN, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING, + B_PAWN, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING}; +} // namespace + + +// Returns an ASCII representation of the position +std::ostream& operator<<(std::ostream& os, const Position& pos) { + + os << "\n +---+---+---+---+---+---+---+---+\n"; + + for (Rank r = RANK_8;; --r) + { + for (File f = FILE_A; f <= FILE_H; ++f) + os << " | " << PieceToChar[pos.piece_on(make_square(f, r))]; + + os << " | " << (1 + r) << "\n +---+---+---+---+---+---+---+---+\n"; + + if (r == RANK_1) + break; + } + + os << " a b c d e f g h\n" + << "\nFen: " << pos.fen() << "\nKey: " << std::hex << std::uppercase << std::setfill('0') + << std::setw(16) << pos.key() << std::setfill(' ') << std::dec << "\nCheckers: "; + + for (Bitboard b = pos.checkers(); b;) + os << UCIEngine::square(pop_lsb(b)) << " "; + + if (Tablebases::MaxCardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING)) + { + StateInfo st; + + Position p; + p.set(pos.fen(), pos.is_chess960(), &st); + Tablebases::ProbeState s1, s2; + Tablebases::WDLScore wdl = Tablebases::probe_wdl(p, &s1); + int dtz = Tablebases::probe_dtz(p, &s2); + os << "\nTablebases WDL: " << std::setw(4) << wdl << " (" << s1 << ")" + << "\nTablebases DTZ: " << std::setw(4) << dtz << " (" << s2 << ")"; + } + + return os; +} + + +// Implements Marcel van Kervinck's cuckoo algorithm to detect 
repetition of positions +// for 3-fold repetition draws. The algorithm uses two hash tables with Zobrist hashes +// to allow fast detection of recurring positions. For details see: +// http://web.archive.org/web/20201107002606/https://marcelk.net/2013-04-06/paper/upcoming-rep-v2.pdf + +// First and second hash functions for indexing the cuckoo tables +inline int H1(Key h) { return h & 0x1fff; } +inline int H2(Key h) { return (h >> 16) & 0x1fff; } + +// Cuckoo tables with Zobrist hashes of valid reversible moves, and the moves themselves +std::array cuckoo; +std::array cuckooMove; + +// Initializes at startup the various arrays used to compute hash keys +void Position::init() { + + PRNG rng(1070372); + + for (Piece pc : Pieces) + for (Square s = SQ_A1; s <= SQ_H8; ++s) + Zobrist::psq[pc][s] = rng.rand(); + // pawns on these squares will promote + std::fill_n(Zobrist::psq[W_PAWN] + SQ_A8, 8, 0); + std::fill_n(Zobrist::psq[B_PAWN], 8, 0); + + for (File f = FILE_A; f <= FILE_H; ++f) + Zobrist::enpassant[f] = rng.rand(); + + for (int cr = NO_CASTLING; cr <= ANY_CASTLING; ++cr) + Zobrist::castling[cr] = rng.rand(); + + Zobrist::side = rng.rand(); + Zobrist::noPawns = rng.rand(); + + // Prepare the cuckoo tables + cuckoo.fill(0); + cuckooMove.fill(Move::none()); + [[maybe_unused]] int count = 0; + for (Piece pc : Pieces) + for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1) + for (Square s2 = Square(s1 + 1); s2 <= SQ_H8; ++s2) + if ((type_of(pc) != PAWN) && (attacks_bb(type_of(pc), s1, 0) & s2)) + { + Move move = Move(s1, s2); + Key key = Zobrist::psq[pc][s1] ^ Zobrist::psq[pc][s2] ^ Zobrist::side; + int i = H1(key); + while (true) + { + std::swap(cuckoo[i], key); + std::swap(cuckooMove[i], move); + if (move == Move::none()) // Arrived at empty slot? + break; + i = (i == H1(key)) ? H2(key) : H1(key); // Push victim to alternative slot + } + count++; + } + assert(count == 3668); +} + + +// Initializes the position object with the given FEN string. 
+// This function is not very robust - make sure that input FENs are correct, +// this is assumed to be the responsibility of the GUI. +Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si) { + /* + A FEN string defines a particular position using only the ASCII character set. + + A FEN string contains six fields separated by a space. The fields are: + + 1) Piece placement (from white's perspective). Each rank is described, starting + with rank 8 and ending with rank 1. Within each rank, the contents of each + square are described from file A through file H. Following the Standard + Algebraic Notation (SAN), each piece is identified by a single letter taken + from the standard English names. White pieces are designated using upper-case + letters ("PNBRQK") whilst Black uses lowercase ("pnbrqk"). Blank squares are + noted using digits 1 through 8 (the number of blank squares), and "/" + separates ranks. + + 2) Active color. "w" means white moves next, "b" means black. + + 3) Castling availability. If neither side can castle, this is "-". Otherwise, + this has one or more letters: "K" (White can castle kingside), "Q" (White + can castle queenside), "k" (Black can castle kingside), and/or "q" (Black + can castle queenside). + + 4) En passant target square (in algebraic notation). If there's no en passant + target square, this is "-". If a pawn has just made a 2-square move, this + is the position "behind" the pawn. Following X-FEN standard, this is recorded + only if there is a pawn in position to make an en passant capture, and if + there really is a pawn that might have advanced two squares. + + 5) Halfmove clock. This is the number of halfmoves since the last pawn advance + or capture. This is used to determine if a draw can be claimed under the + fifty-move rule. + + 6) Fullmove number. The number of the full move. It starts at 1, and is + incremented after Black's move. 
+*/ + + unsigned char col, row, token; + size_t idx; + Square sq = SQ_A8; + std::istringstream ss(fenStr); + + std::memset(reinterpret_cast(this), 0, sizeof(Position)); + std::memset(si, 0, sizeof(StateInfo)); + st = si; + + ss >> std::noskipws; + + // 1. Piece placement + while ((ss >> token) && !isspace(token)) + { + if (isdigit(token)) + sq += (token - '0') * EAST; // Advance the given number of files + + else if (token == '/') + sq += 2 * SOUTH; + + else if ((idx = PieceToChar.find(token)) != string::npos) + { + put_piece(Piece(idx), sq); + ++sq; + } + } + + // 2. Active color + ss >> token; + sideToMove = (token == 'w' ? WHITE : BLACK); + ss >> token; + + // 3. Castling availability. Compatible with 3 standards: Normal FEN standard, + // Shredder-FEN that uses the letters of the columns on which the rooks began + // the game instead of KQkq and also X-FEN standard that, in case of Chess960, + // if an inner rook is associated with the castling right, the castling tag is + // replaced by the file letter of the involved rook, as for the Shredder-FEN. + while ((ss >> token) && !isspace(token)) + { + Square rsq; + Color c = islower(token) ? BLACK : WHITE; + Piece rook = make_piece(c, ROOK); + + token = char(toupper(token)); + + if (token == 'K') + for (rsq = relative_square(c, SQ_H1); piece_on(rsq) != rook; --rsq) + {} + + else if (token == 'Q') + for (rsq = relative_square(c, SQ_A1); piece_on(rsq) != rook; ++rsq) + {} + + else if (token >= 'A' && token <= 'H') + rsq = make_square(File(token - 'A'), relative_rank(c, RANK_1)); + + else + continue; + + set_castling_right(c, rsq); + } + + // 4. En passant square. + // Ignore if square is invalid or not on side to move relative rank 6. + bool enpassant = false, legalEP = false; + + if (((ss >> col) && (col >= 'a' && col <= 'h')) + && ((ss >> row) && (row == (sideToMove == WHITE ? 
'6' : '3')))) + { + st->epSquare = make_square(File(col - 'a'), Rank(row - '1')); + + Bitboard pawns = attacks_bb(st->epSquare, ~sideToMove) & pieces(sideToMove, PAWN); + Bitboard target = (pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove))); + Bitboard occ = pieces() ^ target ^ st->epSquare; + + // En passant square will be considered only if + // a) side to move have a pawn threatening epSquare + // b) there is an enemy pawn in front of epSquare + // c) there is no piece on epSquare or behind epSquare + enpassant = + pawns && target && !(pieces() & (st->epSquare | (st->epSquare + pawn_push(sideToMove)))); + + // If no pawn can execute the en passant capture without leaving the king in check, don't record the epSquare + while (pawns) + legalEP |= !(attackers_to(square(sideToMove), occ ^ pop_lsb(pawns)) + & pieces(~sideToMove) & ~target); + } + + if (!enpassant || !legalEP) + st->epSquare = SQ_NONE; + + // 5-6. Halfmove clock and fullmove number + ss >> std::skipws >> st->rule50 >> gamePly; + + // Convert from fullmove starting from 1 to gamePly starting from 0, + // handle also common incorrect FEN with fullmove = 0. + gamePly = std::max(2 * (gamePly - 1), 0) + (sideToMove == BLACK); + + chess960 = isChess960; + set_state(); + + assert(pos_is_ok()); + + return *this; +} + + +// Helper function used to set castling +// rights given the corresponding color and the rook starting square. +void Position::set_castling_right(Color c, Square rfrom) { + + Square kfrom = square(c); + CastlingRights cr = c & (kfrom < rfrom ? KING_SIDE : QUEEN_SIDE); + + st->castlingRights |= cr; + castlingRightsMask[kfrom] |= cr; + castlingRightsMask[rfrom] |= cr; + castlingRookSquare[cr] = rfrom; + + Square kto = relative_square(c, cr & KING_SIDE ? SQ_G1 : SQ_C1); + Square rto = relative_square(c, cr & KING_SIDE ? 
SQ_F1 : SQ_D1); + + castlingPath[cr] = (between_bb(rfrom, rto) | between_bb(kfrom, kto)) & ~(kfrom | rfrom); +} + + +// Sets king attacks to detect if a move gives check +void Position::set_check_info() const { + + update_slider_blockers(WHITE); + update_slider_blockers(BLACK); + + Square ksq = square(~sideToMove); + + st->checkSquares[PAWN] = attacks_bb(ksq, ~sideToMove); + st->checkSquares[KNIGHT] = attacks_bb(ksq); + st->checkSquares[BISHOP] = attacks_bb(ksq, pieces()); + st->checkSquares[ROOK] = attacks_bb(ksq, pieces()); + st->checkSquares[QUEEN] = st->checkSquares[BISHOP] | st->checkSquares[ROOK]; + st->checkSquares[KING] = 0; +} + + +// Computes the hash keys of the position, and other +// data that once computed is updated incrementally as moves are made. +// The function is only used when a new position is set up +void Position::set_state() const { + + st->key = 0; + st->minorPieceKey = 0; + st->nonPawnKey[WHITE] = st->nonPawnKey[BLACK] = 0; + st->pawnKey = Zobrist::noPawns; + st->nonPawnMaterial[WHITE] = st->nonPawnMaterial[BLACK] = VALUE_ZERO; + st->checkersBB = attackers_to(square(sideToMove)) & pieces(~sideToMove); + + set_check_info(); + + for (Bitboard b = pieces(); b;) + { + Square s = pop_lsb(b); + Piece pc = piece_on(s); + st->key ^= Zobrist::psq[pc][s]; + + if (type_of(pc) == PAWN) + st->pawnKey ^= Zobrist::psq[pc][s]; + + else + { + st->nonPawnKey[color_of(pc)] ^= Zobrist::psq[pc][s]; + + if (type_of(pc) != KING) + { + st->nonPawnMaterial[color_of(pc)] += PieceValue[pc]; + + if (type_of(pc) <= BISHOP) + st->minorPieceKey ^= Zobrist::psq[pc][s]; + } + } + } + + if (st->epSquare != SQ_NONE) + st->key ^= Zobrist::enpassant[file_of(st->epSquare)]; + + if (sideToMove == BLACK) + st->key ^= Zobrist::side; + + st->key ^= Zobrist::castling[st->castlingRights]; + st->materialKey = compute_material_key(); +} + +Key Position::compute_material_key() const { + Key k = 0; + for (Piece pc : Pieces) + for (int cnt = 0; cnt < pieceCount[pc]; ++cnt) + k ^= 
Zobrist::psq[pc][8 + cnt]; + return k; +} + + +// Overload to initialize the position object with the given endgame code string +// like "KBPKN". It's mainly a helper to get the material key out of an endgame code. +Position& Position::set(const string& code, Color c, StateInfo* si) { + + assert(code[0] == 'K'); + + string sides[] = {code.substr(code.find('K', 1)), // Weak + code.substr(0, std::min(code.find('v'), code.find('K', 1)))}; // Strong + + assert(sides[0].length() > 0 && sides[0].length() < 8); + assert(sides[1].length() > 0 && sides[1].length() < 8); + + std::transform(sides[c].begin(), sides[c].end(), sides[c].begin(), tolower); + + string fenStr = "8/" + sides[0] + char(8 - sides[0].length() + '0') + "/8/8/8/8/" + sides[1] + + char(8 - sides[1].length() + '0') + "/8 w - - 0 10"; + + return set(fenStr, false, si); +} + + +// Returns a FEN representation of the position. In case of +// Chess960 the Shredder-FEN notation is used. This is mainly a debugging function. +string Position::fen() const { + + int emptyCnt; + std::ostringstream ss; + + for (Rank r = RANK_8;; --r) + { + for (File f = FILE_A; f <= FILE_H; ++f) + { + for (emptyCnt = 0; f <= FILE_H && empty(make_square(f, r)); ++f) + ++emptyCnt; + + if (emptyCnt) + ss << emptyCnt; + + if (f <= FILE_H) + ss << PieceToChar[piece_on(make_square(f, r))]; + } + + if (r == RANK_1) + break; + ss << '/'; + } + + ss << (sideToMove == WHITE ? " w " : " b "); + + if (can_castle(WHITE_OO)) + ss << (chess960 ? char('A' + file_of(castling_rook_square(WHITE_OO))) : 'K'); + + if (can_castle(WHITE_OOO)) + ss << (chess960 ? char('A' + file_of(castling_rook_square(WHITE_OOO))) : 'Q'); + + if (can_castle(BLACK_OO)) + ss << (chess960 ? char('a' + file_of(castling_rook_square(BLACK_OO))) : 'k'); + + if (can_castle(BLACK_OOO)) + ss << (chess960 ? char('a' + file_of(castling_rook_square(BLACK_OOO))) : 'q'); + + if (!can_castle(ANY_CASTLING)) + ss << '-'; + + ss << (ep_square() == SQ_NONE ? 
" - " : " " + UCIEngine::square(ep_square()) + " ") + << st->rule50 << " " << 1 + (gamePly - (sideToMove == BLACK)) / 2; + + return ss.str(); +} + +// Calculates st->blockersForKing[c] and st->pinners[~c], +// which store respectively the pieces preventing king of color c from being in check +// and the slider pieces of color ~c pinning pieces of color c to the king. +void Position::update_slider_blockers(Color c) const { + + Square ksq = square(c); + + st->blockersForKing[c] = 0; + st->pinners[~c] = 0; + + // Snipers are sliders that attack 's' when a piece and other snipers are removed + Bitboard snipers = ((attacks_bb(ksq) & pieces(QUEEN, ROOK)) + | (attacks_bb(ksq) & pieces(QUEEN, BISHOP))) + & pieces(~c); + Bitboard occupancy = pieces() ^ snipers; + + while (snipers) + { + Square sniperSq = pop_lsb(snipers); + Bitboard b = between_bb(ksq, sniperSq) & occupancy; + + if (b && !more_than_one(b)) + { + st->blockersForKing[c] |= b; + if (b & pieces(c)) + st->pinners[~c] |= sniperSq; + } + } +} + + +// Computes a bitboard of all pieces which attack a given square. +// Slider attacks use the occupied bitboard to indicate occupancy. 
+// Computes a bitboard of all pieces of both colors that attack square s,
+// using the given (possibly hypothetical) occupancy for slider attacks.
+Bitboard Position::attackers_to(Square s, Bitboard occupied) const {
+
+    return (attacks_bb(s, occupied) & pieces(ROOK, QUEEN))
+         | (attacks_bb(s, occupied) & pieces(BISHOP, QUEEN))
+         | (attacks_bb(s, BLACK) & pieces(WHITE, PAWN))
+         | (attacks_bb(s, WHITE) & pieces(BLACK, PAWN))
+         | (attacks_bb(s) & pieces(KNIGHT)) | (attacks_bb(s) & pieces(KING));
+}
+
+// Boolean variant of attackers_to(): answers only whether at least one piece
+// of color c attacks s. Short-circuits with ||, so it is cheaper than building
+// the full attackers bitboard when only existence matters.
+bool Position::attackers_to_exist(Square s, Bitboard occupied, Color c) const {
+
+    return (attacks_bb(s, occupied) & pieces(c, ROOK, QUEEN))
+        || (attacks_bb(s, occupied) & pieces(c, BISHOP, QUEEN))
+        || (attacks_bb(s, ~c) & pieces(c, PAWN))
+        || (attacks_bb(s) & pieces(c, KNIGHT)) || (attacks_bb(s) & pieces(c, KING));
+}
+
+// Tests whether a pseudo-legal move is legal
+bool Position::legal(Move m) const {
+
+    assert(m.is_ok());
+
+    Color us = sideToMove;
+    Square from = m.from_sq();
+    Square to = m.to_sq();
+
+    assert(color_of(moved_piece(m)) == us);
+    assert(piece_on(square(us)) == make_piece(us, KING));
+
+    // En passant captures are a tricky special case. Because they are rather
+    // uncommon, we do it simply by testing whether the king is attacked after
+    // the move is made.
+    if (m.type_of() == EN_PASSANT)
+    {
+        Square ksq = square(us);
+        Square capsq = to - pawn_push(us);
+        // Both the moving pawn and the captured pawn leave their squares,
+        // and the destination square becomes occupied.
+        Bitboard occupied = (pieces() ^ from ^ capsq) | to;
+
+        assert(to == ep_square());
+        assert(moved_piece(m) == make_piece(us, PAWN));
+        assert(piece_on(capsq) == make_piece(~us, PAWN));
+        assert(piece_on(to) == NO_PIECE);
+
+        return !(attacks_bb(ksq, occupied) & pieces(~us, QUEEN, ROOK))
+            && !(attacks_bb(ksq, occupied) & pieces(~us, QUEEN, BISHOP));
+    }
+
+    // Castling moves generation does not check if the castling path is clear of
+    // enemy attacks, it is delayed at a later time: now!
+    if (m.type_of() == CASTLING)
+    {
+        // After castling, the rook and king final positions are the same in
+        // Chess960 as they would be in standard chess.
+        to = relative_square(us, to > from ? SQ_G1 : SQ_C1);
+        // Walk from the king's destination back to its origin; none of the
+        // traversed squares may be attacked by the opponent.
+        Direction step = to > from ? WEST : EAST;
+
+        for (Square s = to; s != from; s += step)
+            if (attackers_to_exist(s, pieces(), ~us))
+                return false;
+
+        // In case of Chess960, verify if the Rook blocks some checks.
+        // For instance an enemy queen in SQ_A1 when castling rook is in SQ_B1.
+        return !chess960 || !(blockers_for_king(us) & m.to_sq());
+    }
+
+    // If the moving piece is a king, check whether the destination square is
+    // attacked by the opponent.
+    if (type_of(piece_on(from)) == KING)
+        return !(attackers_to_exist(to, pieces() ^ from, ~us));
+
+    // A non-king move is legal if and only if it is not pinned or it
+    // is moving along the ray towards or away from the king.
+    return !(blockers_for_king(us) & from) || line_bb(from, to) & pieces(us, KING);
+}
+
+
+// Takes a random move and tests whether the move is
+// pseudo-legal. It is used to validate moves from TT that can be corrupted
+// due to SMP concurrent access or hash position key aliasing.
+bool Position::pseudo_legal(const Move m) const {
+
+    Color us = sideToMove;
+    Square from = m.from_sq();
+    Square to = m.to_sq();
+    Piece pc = moved_piece(m);
+
+    // Use a slower but simpler function for uncommon cases
+    // yet we skip the legality check of MoveList().
+    // NOTE(review): both branches of this ternary are textually identical here;
+    // presumably they select different move-generation types (e.g. evasions vs.
+    // non-evasions) and the template arguments were lost in this copy — verify.
+    if (m.type_of() != NORMAL)
+        return checkers() ? MoveList(*this).contains(m)
+                          : MoveList(*this).contains(m);
+
+    // Is not a promotion, so the promotion piece must be empty
+    assert(m.promotion_type() - KNIGHT == NO_PIECE_TYPE);
+
+    // If the 'from' square is not occupied by a piece belonging to the side to
+    // move, the move is obviously not legal.
+    if (pc == NO_PIECE || color_of(pc) != us)
+        return false;
+
+    // The destination square cannot be occupied by a friendly piece
+    if (pieces(us) & to)
+        return false;
+
+    // Handle the special case of a pawn move
+    if (type_of(pc) == PAWN)
+    {
+        // We have already handled promotion moves, so destination cannot be on the 8th/1st rank
+        if ((Rank8BB | Rank1BB) & to)
+            return false;
+
+        // Check if it's a valid capture, single push, or double push
+        const bool isCapture = bool(attacks_bb(from, us) & pieces(~us) & to);
+        const bool isSinglePush = (from + pawn_push(us) == to) && empty(to);
+        const bool isDoublePush = (from + 2 * pawn_push(us) == to)
+                               && (relative_rank(us, from) == RANK_2) && empty(to)
+                               && empty(to - pawn_push(us));
+
+        if (!(isCapture || isSinglePush || isDoublePush))
+            return false;
+    }
+    else if (!(attacks_bb(type_of(pc), from, pieces()) & to))
+        return false;
+
+    // Evasions generator already takes care to avoid some kind of illegal moves
+    // and legal() relies on this. We therefore have to take care that the same
+    // kind of moves are filtered out here.
+    if (checkers())
+    {
+        if (type_of(pc) != KING)
+        {
+            // Double check? In this case, a king move is required
+            if (more_than_one(checkers()))
+                return false;
+
+            // Our move must be a blocking interposition or a capture of the checking piece
+            if (!(between_bb(square(us), lsb(checkers())) & to))
+                return false;
+        }
+        // In case of king moves under check we have to remove the king so as to catch
+        // invalid moves like b1a1 when opposite queen is on c1.
+        else if (attackers_to_exist(to, pieces() ^ from, ~us))
+            return false;
+    }
+
+    return true;
+}
+
+
+// Tests whether a pseudo-legal move gives a check
+bool Position::gives_check(Move m) const {
+
+    assert(m.is_ok());
+    assert(color_of(moved_piece(m)) == sideToMove);
+
+    Square from = m.from_sq();
+    Square to = m.to_sq();
+
+    // Is there a direct check?
+    if (check_squares(type_of(piece_on(from))) & to)
+        return true;
+
+    // Is there a discovered check?
+    if (blockers_for_king(~sideToMove) & from)
+        return !(line_bb(from, to) & pieces(~sideToMove, KING)) || m.type_of() == CASTLING;
+
+    switch (m.type_of())
+    {
+    case NORMAL :
+        return false;
+
+    case PROMOTION :
+        return attacks_bb(m.promotion_type(), to, pieces() ^ from) & pieces(~sideToMove, KING);
+
+    // En passant capture with check? We have already handled the case of direct
+    // checks and ordinary discovered check, so the only case we need to handle
+    // is the unusual case of a discovered check through the captured pawn.
+    case EN_PASSANT : {
+        Square capsq = make_square(file_of(to), rank_of(from));
+        Bitboard b = (pieces() ^ from ^ capsq) | to;
+
+        return (attacks_bb(square(~sideToMove), b) & pieces(sideToMove, QUEEN, ROOK))
+             | (attacks_bb(square(~sideToMove), b)
+                & pieces(sideToMove, QUEEN, BISHOP));
+    }
+    default : //CASTLING
+    {
+        // Castling is encoded as 'king captures the rook'
+        Square rto = relative_square(sideToMove, to > from ? SQ_F1 : SQ_D1);
+
+        return check_squares(ROOK) & rto;
+    }
+    }
+}
+
+
+// Makes a move, and saves all information necessary
+// to a StateInfo object. The move is assumed to be legal. Pseudo-legal
+// moves should be filtered out before this function is called.
+// If a pointer to the TT table is passed, the entry for the new position
+// will be prefetched, and likewise for shared history.
+// NOTE(review): default arguments on an out-of-line member-function definition
+// are unusual; confirm the in-class declaration does not also supply defaults
+// (re-declaring a default argument would be ill-formed).
+void Position::do_move(Move m,
+                       StateInfo& newSt,
+                       bool givesCheck,
+                       DirtyPiece& dp,
+                       DirtyThreats& dts,
+                       const TranspositionTable* tt = nullptr,
+                       const SharedHistories* history = nullptr) {
+
+    assert(m.is_ok());
+    assert(&newSt != st);
+
+    // Start the incremental key update from the old key with the side flipped.
+    Key k = st->key ^ Zobrist::side;
+
+    // Copy some fields of the old state to our new StateInfo object except the
+    // ones which are going to be recalculated from scratch anyway and then switch
+    // our state pointer to point to the new (ready to be updated) state.
+    // Only the fields laid out before 'key' in StateInfo are copied.
+    std::memcpy(&newSt, st, offsetof(StateInfo, key));
+    newSt.previous = st;
+    st = &newSt;
+
+    // Increment ply counters. In particular, rule50 will be reset to zero later on
+    // in case of a capture or a pawn move.
+    ++gamePly;
+    ++st->rule50;
+    ++st->pliesFromNull;
+
+    Color us = sideToMove;
+    Color them = ~us;
+    Square from = m.from_sq();
+    Square to = m.to_sq();
+    Piece pc = piece_on(from);
+    Piece captured = m.type_of() == EN_PASSANT ? make_piece(them, PAWN) : piece_on(to);
+
+    // Initialize the dirty-piece / dirty-threat records for the incremental
+    // NNUE-style updates; add_sq/remove_sq default to SQ_NONE ("no change").
+    dp.pc = pc;
+    dp.from = from;
+    dp.to = to;
+    dp.add_sq = SQ_NONE;
+    dts.us = us;
+    dts.prevKsq = square(us);
+    dts.threatenedSqs = dts.threateningSqs = 0;
+
+    assert(color_of(pc) == us);
+    assert(captured == NO_PIECE || color_of(captured) == (m.type_of() != CASTLING ? them : us));
+    assert(type_of(captured) != KING);
+
+    if (m.type_of() == CASTLING)
+    {
+        assert(pc == make_piece(us, KING));
+        assert(captured == make_piece(us, ROOK));
+
+        Square rfrom, rto;
+        do_castling(us, from, to, rfrom, rto, &dts, &dp);
+
+        // 'captured' here is our own rook: hash it from its old to its new square,
+        // then clear it so the capture branch below is skipped.
+        k ^= Zobrist::psq[captured][rfrom] ^ Zobrist::psq[captured][rto];
+        st->nonPawnKey[us] ^= Zobrist::psq[captured][rfrom] ^ Zobrist::psq[captured][rto];
+        captured = NO_PIECE;
+    }
+    else if (captured)
+    {
+        Square capsq = to;
+
+        // If the captured piece is a pawn, update pawn hash key, otherwise
+        // update non-pawn material.
+        if (type_of(captured) == PAWN)
+        {
+            if (m.type_of() == EN_PASSANT)
+            {
+                // The captured pawn sits one square behind the destination.
+                capsq -= pawn_push(us);
+
+                assert(pc == make_piece(us, PAWN));
+                assert(to == st->epSquare);
+                assert(relative_rank(us, to) == RANK_6);
+                assert(piece_on(to) == NO_PIECE);
+                assert(piece_on(capsq) == make_piece(them, PAWN));
+
+                // Update board and piece lists in ep case, normal captures are updated later
+                remove_piece(capsq, &dts);
+            }
+
+            st->pawnKey ^= Zobrist::psq[captured][capsq];
+        }
+        else
+        {
+            st->nonPawnMaterial[them] -= PieceValue[captured];
+            st->nonPawnKey[them] ^= Zobrist::psq[captured][capsq];
+
+            if (type_of(captured) <= BISHOP)
+                st->minorPieceKey ^= Zobrist::psq[captured][capsq];
+        }
+
+        dp.remove_pc = captured;
+        dp.remove_sq = capsq;
+
+        k ^= Zobrist::psq[captured][capsq];
+        // For en passant the piece count was already decremented by the
+        // remove_piece() above, hence the conditional -1 adjustment.
+        st->materialKey ^=
+          Zobrist::psq[captured][8 + pieceCount[captured] - (m.type_of() != EN_PASSANT)];
+
+        // Reset rule 50 counter
+        st->rule50 = 0;
+    }
+    else
+        dp.remove_sq = SQ_NONE;
+
+    // Update hash key
+    k ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to];
+
+    // Reset en passant square
+    if (st->epSquare != SQ_NONE)
+    {
+        k ^= Zobrist::enpassant[file_of(st->epSquare)];
+        st->epSquare = SQ_NONE;
+    }
+
+    // Update castling rights.
+    k ^= Zobrist::castling[st->castlingRights];
+    st->castlingRights &= ~(castlingRightsMask[from] | castlingRightsMask[to]);
+    k ^= Zobrist::castling[st->castlingRights];
+
+    // Move the piece. The tricky Chess960 castling is handled earlier
+    if (m.type_of() != CASTLING)
+    {
+        if (captured && m.type_of() != EN_PASSANT)
+        {
+            remove_piece(from, &dts);
+            swap_piece(to, pc, &dts);
+        }
+        else
+            move_piece(from, to, &dts);
+    }
+
+    // If the moving piece is a pawn do some special extra work
+    if (type_of(pc) == PAWN)
+    {
+        // Check if the en passant square needs to be set. Accurate e.p. info is needed
+        // for correct zobrist key generation and 3-fold checking.
+        // A double push flips bit 4 of the square index and nothing else.
+        if ((int(to) ^ int(from)) == 16)
+        {
+            Square epSquare = to - pawn_push(us);
+            Bitboard pawns = attacks_bb(epSquare, us) & pieces(them, PAWN);
+
+            // If there are no pawns attacking the ep square, ep is not possible.
+            if (pawns)
+            {
+                Square ksq = square(them);
+                Bitboard notBlockers = ~st->previous->blockersForKing[them];
+                bool noDiscovery = (from & notBlockers) || file_of(from) == file_of(ksq);
+
+                // If the pawn gives discovered check, ep is never legal. Else, if at least one
+                // pawn was not a blocker for the enemy king or lies on the same line as the
+                // enemy king and en passant square, a legal capture exists.
+                if (noDiscovery && (pawns & (notBlockers | line_bb(epSquare, ksq))))
+                {
+                    st->epSquare = epSquare;
+                    k ^= Zobrist::enpassant[file_of(epSquare)];
+                }
+            }
+        }
+
+        else if (m.type_of() == PROMOTION)
+        {
+            Piece promotion = make_piece(us, m.promotion_type());
+            PieceType promotionType = type_of(promotion);
+
+            assert(relative_rank(us, to) == RANK_8);
+            assert(type_of(promotion) >= KNIGHT && type_of(promotion) <= QUEEN);
+
+            swap_piece(to, promotion, &dts);
+
+            dp.add_pc = promotion;
+            dp.add_sq = to;
+            dp.to = SQ_NONE;
+
+            // Update hash keys
+            // Zobrist::psq[pc][to] is zero, so we don't need to clear it
+            k ^= Zobrist::psq[promotion][to];
+            st->materialKey ^= Zobrist::psq[promotion][8 + pieceCount[promotion] - 1]
+                             ^ Zobrist::psq[pc][8 + pieceCount[pc]];
+            st->nonPawnKey[us] ^= Zobrist::psq[promotion][to];
+
+            if (promotionType <= BISHOP)
+                st->minorPieceKey ^= Zobrist::psq[promotion][to];
+
+            // Update material
+            st->nonPawnMaterial[us] += PieceValue[promotion];
+        }
+
+        // Update pawn hash key
+        st->pawnKey ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to];
+
+        // Reset rule 50 draw counter
+        st->rule50 = 0;
+    }
+
+    else
+    {
+        st->nonPawnKey[us] ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to];
+
+        if (type_of(pc) <= BISHOP)
+            st->minorPieceKey ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to];
+    }
+
+    // Update the key with the final value
+    st->key = k;
+    if (tt)
+        prefetch(tt->first_entry(key()));
+
+    if (history)
+    {
+        prefetch(&history->pawn_entry(*this)[pc][to]);
+        prefetch(&history->pawn_correction_entry(*this));
+        prefetch(&history->minor_piece_correction_entry(*this));
+        // NOTE(review): two identical prefetch calls below — presumably one per
+        // color (template arguments appear to have been stripped in this copy);
+        // confirm against the SharedHistories API.
+        prefetch(&history->nonpawn_correction_entry(*this));
+        prefetch(&history->nonpawn_correction_entry(*this));
+    }
+
+    // Set capture piece
+    st->capturedPiece = captured;
+
+    // Calculate checkers bitboard (if move gives check)
+    st->checkersBB = givesCheck ? attackers_to(square(them)) & pieces(us) : 0;
+
+    sideToMove = ~sideToMove;
+
+    // Update king attacks used for fast check detection
+    set_check_info();
+
+    // Calculate the repetition info. It is the ply distance from the previous
+    // occurrence of the same position, negative in the 3-fold case, or zero
+    // if the position was not repeated.
+    st->repetition = 0;
+    int end = std::min(st->rule50, st->pliesFromNull);
+    if (end >= 4)
+    {
+        StateInfo* stp = st->previous->previous;
+        for (int i = 4; i <= end; i += 2)
+        {
+            stp = stp->previous->previous;
+            if (stp->key == st->key)
+            {
+                st->repetition = stp->repetition ? -i : i;
+                break;
+            }
+        }
+    }
+
+    dts.ksq = square(us);
+
+    assert(pos_is_ok());
+
+    assert(dp.pc != NO_PIECE);
+    assert(!(bool(captured) || m.type_of() == CASTLING) ^ (dp.remove_sq != SQ_NONE));
+    assert(dp.from != SQ_NONE);
+    assert(!(dp.add_sq != SQ_NONE) ^ (m.type_of() == PROMOTION || m.type_of() == CASTLING));
+}
+
+
+// Unmakes a move. When it returns, the position should
+// be restored to exactly the same state as before the move was made.
+void Position::undo_move(Move m) {
+
+    assert(m.is_ok());
+
+    sideToMove = ~sideToMove;
+
+    Color us = sideToMove;
+    Square from = m.from_sq();
+    Square to = m.to_sq();
+    Piece pc = piece_on(to);
+
+    assert(empty(from) || m.type_of() == CASTLING);
+    assert(type_of(st->capturedPiece) != KING);
+
+    if (m.type_of() == PROMOTION)
+    {
+        assert(relative_rank(us, to) == RANK_8);
+        assert(type_of(pc) == m.promotion_type());
+        assert(type_of(pc) >= KNIGHT && type_of(pc) <= QUEEN);
+
+        // Demote back to a pawn before the piece is moved home below.
+        remove_piece(to);
+        pc = make_piece(us, PAWN);
+        put_piece(pc, to);
+    }
+
+    if (m.type_of() == CASTLING)
+    {
+        // NOTE(review): template arguments appear stripped throughout this copy;
+        // this call presumably selects the "undo" instantiation of do_castling.
+        Square rfrom, rto;
+        do_castling(us, from, to, rfrom, rto);
+    }
+    else
+    {
+        move_piece(to, from);  // Put the piece back at the source square
+
+        if (st->capturedPiece)
+        {
+            Square capsq = to;
+
+            if (m.type_of() == EN_PASSANT)
+            {
+                capsq -= pawn_push(us);
+
+                assert(type_of(pc) == PAWN);
+                assert(to == st->previous->epSquare);
+                assert(relative_rank(us, to) == RANK_6);
+                assert(piece_on(capsq) == NO_PIECE);
+                assert(st->capturedPiece == make_piece(~us, PAWN));
+            }
+
+            put_piece(st->capturedPiece, capsq);  // Restore the captured piece
+        }
+    }
+
+    // Finally point our state pointer back to the previous state
+    st = st->previous;
+    --gamePly;
+
+    assert(pos_is_ok());
+}
+
+// Appends one (attacker, victim, from, to) record to the dirty-threat list.
+// For piece placement the aggregate threatened/threatening square masks are
+// updated as well; for removal only the list entry is recorded.
+// NOTE(review): the bare 'template' lines in this copy are missing their
+// parameter lists (e.g. the boolean PutPiece parameter) — extraction artifact.
+template
+inline void add_dirty_threat(
+  DirtyThreats* const dts, Piece pc, Piece threatened, Square s, Square threatenedSq) {
+    if (PutPiece)
+    {
+        dts->threatenedSqs |= threatenedSq;
+        dts->threateningSqs |= s;
+    }
+
+    dts->list.push_back({pc, threatened, s, threatenedSq, PutPiece});
+}
+
+#ifdef USE_AVX512ICL
+// Given a DirtyThreat template and bit offsets to insert the piece type and square, write the threats
+// present at the given bitboard.
+template
+void write_multiple_dirties(const Position& p,
+                            Bitboard mask,
+                            DirtyThreat dt_template,
+                            DirtyThreats* dts) {
+    // Each DirtyThreat must pack into 32 bits so 16 of them fit in one zmm store.
+    static_assert(sizeof(DirtyThreat) == 4);
+
+    const __m512i board = _mm512_loadu_si512(p.piece_array().data());
+    const __m512i AllSquares = _mm512_set_epi8(
+      63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41,
+      40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
+      17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+
+    const int dt_count = popcount(mask);
+    assert(dt_count <= 16);
+
+    const __m512i template_v = _mm512_set1_epi32(dt_template.raw());
+    // Reserve space for dt_count records; the full 64-byte store below may
+    // write past them, which make_space must tolerate (scratch slack).
+    auto* write = dts->list.make_space(dt_count);
+
+    // Extract the list of squares and upconvert to 32 bits. There are never more than 16
+    // incoming threats so this is sufficient.
+    __m512i threat_squares = _mm512_maskz_compress_epi8(mask, AllSquares);
+    threat_squares = _mm512_cvtepi8_epi32(_mm512_castsi512_si128(threat_squares));
+
+    // Gather the piece codes sitting on those squares from the board array
+    // (one byte per 32-bit lane, selected by the 0x1111... byte mask).
+    __m512i threat_pieces =
+      _mm512_maskz_permutexvar_epi8(0x1111111111111111ULL, threat_squares, board);
+
+    // Shift the piece and square into place
+    threat_squares = _mm512_slli_epi32(threat_squares, SqShift);
+    threat_pieces = _mm512_slli_epi32(threat_pieces, PcShift);
+
+    // OR template, square field and piece field together in a single op.
+    const __m512i dirties =
+      _mm512_ternarylogic_epi32(template_v, threat_squares, threat_pieces, 254 /* A | B | C */);
+    _mm512_storeu_si512(write, dirties);
+}
+#endif
+
+// Records into dts every threat change caused by putting (or removing) piece
+// pc on square s: threats made by pc, threats against pc, and discovered
+// slider threats through s. noRaysContaining suppresses rays already handled.
+template
+void Position::update_piece_threats(Piece pc,
+                                    Square s,
+                                    DirtyThreats* const dts,
+                                    [[maybe_unused]] Bitboard noRaysContaining) const {
+    const Bitboard occupied = pieces();
+    const Bitboard rookQueens = pieces(ROOK, QUEEN);
+    const Bitboard bishopQueens = pieces(BISHOP, QUEEN);
+    const Bitboard rAttacks = attacks_bb(s, occupied);
+    const Bitboard bAttacks = attacks_bb(s, occupied);
+    const Bitboard kings = pieces(KING);
+    Bitboard occupiedNoK = occupied ^ kings;
+
+    // Sliders that see square s directly.
+    Bitboard sliders = (rookQueens & rAttacks) | (bishopQueens & bAttacks);
+    // For each slider seeing s: record the discovered threat on the far side
+    // of s (at most one piece), and optionally the direct threat against pc.
+    auto process_sliders = [&](bool addDirectAttacks) {
+        while (sliders)
+        {
+            Square sliderSq = pop_lsb(sliders);
+            Piece slider = piece_on(sliderSq);
+
+            const Bitboard ray = RayPassBB[sliderSq][s];
+            const Bitboard discovered = ray & (rAttacks | bAttacks) & occupiedNoK;
+
+            assert(!more_than_one(discovered));
+            if (discovered && (RayPassBB[sliderSq][s] & noRaysContaining) != noRaysContaining)
+            {
+                const Square threatenedSq = lsb(discovered);
+                const Piece threatenedPc = piece_on(threatenedSq);
+                add_dirty_threat(dts, slider, threatenedPc, sliderSq, threatenedSq);
+            }
+
+            if (addDirectAttacks)
+                add_dirty_threat(dts, slider, pc, sliderSq, s);
+        }
+    };
+
+    // Kings neither appear as threat victims nor as ordinary attackers here;
+    // only the discovered-threat bookkeeping applies.
+    if (type_of(pc) == KING)
+    {
+        if constexpr (ComputeRay)
+            process_sliders(false);
+        return;
+    }
+
+
+    const Bitboard knights = pieces(KNIGHT);
+    const Bitboard whitePawns = pieces(WHITE, PAWN);
+    const Bitboard blackPawns = pieces(BLACK, PAWN);
+
+
+    // Squares pc attacks (kings excluded as victims), and non-slider attackers of s.
+    Bitboard threatened = attacks_bb(pc, s, occupied) & occupiedNoK;
+    Bitboard incoming_threats =
+      (PseudoAttacks[KNIGHT][s] & knights) | (attacks_bb(s, WHITE) & blackPawns)
+      | (attacks_bb(s, BLACK) & whitePawns) | (PseudoAttacks[KING][s] & kings);
+
+#ifdef USE_AVX512ICL
+    if constexpr (PutPiece)
+    {
+        dts->threatenedSqs |= threatened;
+        // A bit may only be set if that square actually produces a threat, so we
+        // must guard setting the square accordingly
+        dts->threateningSqs |= Bitboard(bool(threatened)) << s;
+    }
+
+    DirtyThreat dt_template{pc, NO_PIECE, s, Square(0), PutPiece};
+    write_multiple_dirties(
+      *this, threatened, dt_template, dts);
+
+    Bitboard all_attackers = sliders | incoming_threats;
+
+    if constexpr (PutPiece)
+    {
+        dts->threatenedSqs |= Bitboard(bool(all_attackers)) << s;  // same as above
+        dts->threateningSqs |= all_attackers;
+    }
+
+    dt_template = {NO_PIECE, pc, Square(0), s, PutPiece};
+    write_multiple_dirties(*this, all_attackers,
+                           dt_template, dts);
+#else
+    // Scalar path: one dirty record per square attacked by pc.
+    while (threatened)
+    {
+        Square threatenedSq = pop_lsb(threatened);
+        Piece threatenedPc = piece_on(threatenedSq);
+
+        assert(threatenedSq != s);
+        assert(threatenedPc);
+
+        add_dirty_threat(dts, pc, threatenedPc, s, threatenedSq);
+    }
+#endif
+
+    if constexpr (ComputeRay)
+    {
+#ifndef USE_AVX512ICL
+        process_sliders(true);
+#else  // for ICL, direct threats were processed earlier (all_attackers)
+        process_sliders(false);
+#endif
+    }
+    else
+    {
+        incoming_threats |= sliders;
+    }
+
+#ifndef USE_AVX512ICL
+    // Scalar path: one dirty record per attacker of s.
+    while (incoming_threats)
+    {
+        Square srcSq = pop_lsb(incoming_threats);
+        Piece srcPc = piece_on(srcSq);
+
+        assert(srcSq != s);
+        assert(srcPc != NO_PIECE);
+
+        add_dirty_threat(dts, srcPc, pc, srcSq, s);
+    }
+#endif
+}
+
+// Helper used to do/undo a castling move. This is a bit
+// tricky in Chess960 where from/to squares can overlap.
+// The Do template flag selects making vs. unmaking the castling move.
+template
+void Position::do_castling(Color us,
+                           Square from,
+                           Square& to,
+                           Square& rfrom,
+                           Square& rto,
+                           DirtyThreats* const dts,
+                           DirtyPiece* const dp) {
+
+    bool kingSide = to > from;
+    rfrom = to;  // Castling is encoded as "king captures friendly rook"
+    rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
+    to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
+
+    assert(!Do || dp);
+
+    if (Do)
+    {
+        dp->to = to;
+        dp->remove_pc = dp->add_pc = make_piece(us, ROOK);
+        dp->remove_sq = rfrom;
+        dp->add_sq = rto;
+    }
+
+    // Remove both pieces first since squares could overlap in Chess960
+    remove_piece(Do ? from : to, dts);
+    remove_piece(Do ? rfrom : rto, dts);
+    put_piece(make_piece(us, KING), Do ? to : from, dts);
+    put_piece(make_piece(us, ROOK), Do ? rto : rfrom, dts);
+}
+
+
+// Used to do a "null move": it flips
+// the side to move without executing any move on the board.
+void Position::do_null_move(StateInfo& newSt) {
+
+    assert(!checkers());
+    assert(&newSt != st);
+
+    // Unlike do_move(), the full StateInfo is copied here (sizeof, not
+    // offsetof(key)), since nothing is recomputed from scratch.
+    std::memcpy(&newSt, st, sizeof(StateInfo));
+
+    newSt.previous = st;
+    st = &newSt;
+
+    // A null move forfeits any en passant right; keep the key consistent.
+    if (st->epSquare != SQ_NONE)
+    {
+        st->key ^= Zobrist::enpassant[file_of(st->epSquare)];
+        st->epSquare = SQ_NONE;
+    }
+
+    st->key ^= Zobrist::side;
+
+    st->pliesFromNull = 0;
+
+    sideToMove = ~sideToMove;
+
+    set_check_info();
+
+    st->repetition = 0;
+
+    assert(pos_is_ok());
+}
+
+
+// Must be used to undo a "null move"
+void Position::undo_null_move() {
+
+    assert(!checkers());
+
+    st = st->previous;
+    sideToMove = ~sideToMove;
+}
+
+
+// Tests if the SEE (Static Exchange Evaluation)
+// value of move is greater or equal to the given threshold. We'll use an
+// algorithm similar to alpha-beta pruning with a null window.
+bool Position::see_ge(Move m, int threshold) const {
+
+    assert(m.is_ok());
+
+    // Only deal with normal moves, assume others pass a simple SEE
+    if (m.type_of() != NORMAL)
+        return VALUE_ZERO >= threshold;
+
+    Square from = m.from_sq(), to = m.to_sq();
+
+    assert(piece_on(from) != NO_PIECE);
+
+    // If winning the captured piece alone does not reach the threshold, fail.
+    int swap = PieceValue[piece_on(to)] - threshold;
+    if (swap < 0)
+        return false;
+
+    // If we still meet the threshold even after losing the moving piece, pass.
+    swap = PieceValue[piece_on(from)] - swap;
+    if (swap <= 0)
+        return true;
+
+    assert(color_of(piece_on(from)) == sideToMove);
+    Bitboard occupied = pieces() ^ from ^ to;  // xoring to is important for pinned piece logic
+    Color stm = sideToMove;
+    Bitboard attackers = attackers_to(to, occupied);
+    Bitboard stmAttackers, bb;
+    // res == 1 means the side NOT to move in the current iteration is winning.
+    int res = 1;
+
+    while (true)
+    {
+        stm = ~stm;
+        attackers &= occupied;
+
+        // If stm has no more attackers then give up: stm loses
+        if (!(stmAttackers = attackers & pieces(stm)))
+            break;
+
+        // Don't allow pinned pieces to attack as long as there are
+        // pinners on their original square.
+        if (pinners(~stm) & occupied)
+        {
+            stmAttackers &= ~blockers_for_king(stm);
+
+            if (!stmAttackers)
+                break;
+        }
+
+        res ^= 1;
+
+        // Locate and remove the next least valuable attacker, and add to
+        // the bitboard 'attackers' any X-ray attackers behind it.
+        if ((bb = stmAttackers & pieces(PAWN)))
+        {
+            if ((swap = PawnValue - swap) < res)
+                break;
+            occupied ^= least_significant_square_bb(bb);
+
+            // A pawn capture can uncover a diagonal slider behind it.
+            attackers |= attacks_bb(to, occupied) & pieces(BISHOP, QUEEN);
+        }
+
+        else if ((bb = stmAttackers & pieces(KNIGHT)))
+        {
+            if ((swap = KnightValue - swap) < res)
+                break;
+            occupied ^= least_significant_square_bb(bb);
+        }
+
+        else if ((bb = stmAttackers & pieces(BISHOP)))
+        {
+            if ((swap = BishopValue - swap) < res)
+                break;
+            occupied ^= least_significant_square_bb(bb);
+
+            attackers |= attacks_bb(to, occupied) & pieces(BISHOP, QUEEN);
+        }
+
+        else if ((bb = stmAttackers & pieces(ROOK)))
+        {
+            if ((swap = RookValue - swap) < res)
+                break;
+            occupied ^= least_significant_square_bb(bb);
+
+            attackers |= attacks_bb(to, occupied) & pieces(ROOK, QUEEN);
+        }
+
+        else if ((bb = stmAttackers & pieces(QUEEN)))
+        {
+            swap = QueenValue - swap;
+            // implies that the previous recapture was done by a higher rated piece than a Queen (King is excluded)
+            assert(swap >= res);
+            occupied ^= least_significant_square_bb(bb);
+
+            // A queen capture can uncover sliders on both line types.
+            attackers |= (attacks_bb(to, occupied) & pieces(BISHOP, QUEEN))
+                       | (attacks_bb(to, occupied) & pieces(ROOK, QUEEN));
+        }
+
+        else  // KING
+            // If we "capture" with the king but the opponent still has attackers,
+            // reverse the result.
+            return (attackers & ~pieces(stm)) ? res ^ 1 : res;
+    }
+
+    return bool(res);
+}
+
+// Tests whether the position is drawn by 50-move rule
+// or by repetition. It does not detect stalemates.
+bool Position::is_draw(int ply) const {
+
+    // 50-move rule: not a draw if the side to move is checkmated on this move
+    // (in check with no legal replies).
+    if (st->rule50 > 99 && (!checkers() || MoveList(*this).size()))
+        return true;
+
+    return is_repetition(ply);
+}
+
+// Return a draw score if a position repeats once earlier but strictly
+// after the root, or repeats twice before or at the root.
+bool Position::is_repetition(int ply) const { return st->repetition && st->repetition < ply; }
+
+// Tests whether there has been at least one repetition
+// of positions since the last capture or pawn move.
+bool Position::has_repeated() const {
+
+    StateInfo* stc = st;
+    int end = std::min(st->rule50, st->pliesFromNull);
+    while (end-- >= 4)
+    {
+        if (stc->repetition)
+            return true;
+
+        stc = stc->previous;
+    }
+    return false;
+}
+
+
+// Tests if the position has a move which draws by repetition.
+// This function accurately matches the outcome of is_draw() over all legal moves.
+// Uses the precomputed cuckoo hash tables (cuckoo/cuckooMove, probed via H1/H2)
+// to map a key difference back to the reversible move that would repeat.
+bool Position::upcoming_repetition(int ply) const {
+
+    int j;
+
+    int end = std::min(st->rule50, st->pliesFromNull);
+
+    if (end < 3)
+        return false;
+
+    Key originalKey = st->key;
+    StateInfo* stp = st->previous;
+    // 'other' accumulates the xor of key differences over odd plies back;
+    // it becomes zero exactly when only one reversible move separates the keys.
+    Key other = originalKey ^ stp->key ^ Zobrist::side;
+
+    for (int i = 3; i <= end; i += 2)
+    {
+        stp = stp->previous;
+        other ^= stp->key ^ stp->previous->key ^ Zobrist::side;
+        stp = stp->previous;
+
+        if (other != 0)
+            continue;
+
+        Key moveKey = originalKey ^ stp->key;
+        if ((j = H1(moveKey), cuckoo[j] == moveKey) || (j = H2(moveKey), cuckoo[j] == moveKey))
+        {
+            Move move = cuckooMove[j];
+            Square s1 = move.from_sq();
+            Square s2 = move.to_sq();
+
+            // The repeating move must have a clear path between its squares.
+            if (!((between_bb(s1, s2) ^ s2) & pieces()))
+            {
+                if (ply > i)
+                    return true;
+
+                // For nodes before or at the root, check that the move is a
+                // repetition rather than a move to the current position.
+                if (stp->repetition)
+                    return true;
+            }
+        }
+    }
+    return false;
+}
+
+
+// Flips position with the white and black sides reversed. This
+// is only useful for debugging e.g. for finding evaluation symmetry bugs.
+void Position::flip() {
+
+    string f, token;
+    std::stringstream ss(fen());
+
+    // Rebuild the piece-placement field rank by rank, prepending so that
+    // ranks end up mirrored top-to-bottom.
+    for (Rank r = RANK_8;; --r)  // Piece placement
+    {
+        std::getline(ss, token, r > RANK_1 ? '/' : ' ');
+        f.insert(0, token + (f.empty() ? " " : "/"));
+
+        if (r == RANK_1)
+            break;
+    }
+
+    ss >> token;  // Active color
+    f += (token == "w" ? "B " : "W ");  // Will be lowercased later
+
+    ss >> token;  // Castling availability
+    f += token + " ";
+
+    // Swap case everywhere: white pieces become black and vice versa
+    // (this also fixes up the color letter appended above).
+    std::transform(f.begin(), f.end(), f.begin(),
+                   [](char c) { return char(islower(c) ? toupper(c) : tolower(c)); });
+
+    ss >> token;  // En passant square
+    f += (token == "-" ? token : token.replace(1, 1, token[1] == '3' ? "6" : "3"));
+
+    std::getline(ss, token);  // Half and full moves
+    f += token;
+
+    set(f, is_chess960(), st);
+
+    assert(pos_is_ok());
+}
+
+
+// Verifies the incrementally-updated material key against a from-scratch recomputation.
+bool Position::material_key_is_ok() const { return compute_material_key() == st->materialKey; }
+
+
+// Performs some consistency checks for the position object
+// and raise an assert if something wrong is detected.
+// This is meant to be helpful when debugging.
+bool Position::pos_is_ok() const {
+
+    constexpr bool Fast = true;  // Quick (default) or full check?
+
+    if ((sideToMove != WHITE && sideToMove != BLACK) || piece_on(square(WHITE)) != W_KING
+        || piece_on(square(BLACK)) != B_KING
+        || (ep_square() != SQ_NONE && relative_rank(sideToMove, ep_square()) != RANK_6))
+        assert(0 && "pos_is_ok: Default");
+
+    // All checks below are skipped in the (default) fast mode.
+    if (Fast)
+        return true;
+
+    if (pieceCount[W_KING] != 1 || pieceCount[B_KING] != 1
+        || attackers_to_exist(square(~sideToMove), pieces(), sideToMove))
+        assert(0 && "pos_is_ok: Kings");
+
+    if ((pieces(PAWN) & (Rank1BB | Rank8BB)) || pieceCount[W_PAWN] > 8 || pieceCount[B_PAWN] > 8)
+        assert(0 && "pos_is_ok: Pawns");
+
+
+    if (ep_square() != SQ_NONE)
+    {
+        Square ksq = square(sideToMove);
+
+        // An ep square is only valid if the double-pushed pawn exists, some
+        // pawn can capture it, and at least one such capture is legal
+        // (does not leave our own king in check).
+        Bitboard captured = (ep_square() + pawn_push(~sideToMove)) & pieces(~sideToMove, PAWN);
+        Bitboard pawns = attacks_bb(ep_square(), ~sideToMove) & pieces(sideToMove, PAWN);
+        Bitboard potentialCheckers = pieces(~sideToMove) ^ captured;
+
+        if (!captured || !pawns
+            || ((attackers_to(ksq, pieces() ^ captured ^ ep_square() ^ lsb(pawns))
+                 & potentialCheckers)
+                && (attackers_to(ksq, pieces() ^ captured ^ ep_square() ^ msb(pawns))
+                    & potentialCheckers)))
+            assert(0 && "pos_is_ok: En passant square");
+    }
+
+    if ((pieces(WHITE) & pieces(BLACK)) || (pieces(WHITE) | pieces(BLACK)) != pieces()
+        || popcount(pieces(WHITE)) > 16 || popcount(pieces(BLACK)) > 16)
+        assert(0 && "pos_is_ok: Bitboards");
+
+    for (PieceType p1 = PAWN; p1 <= KING; ++p1)
+        for (PieceType p2 = PAWN; p2 <= KING; ++p2)
+            if (p1 != p2 && (pieces(p1) & pieces(p2)))
+                assert(0 && "pos_is_ok: Bitboards");
+
+
+    for (Piece pc : Pieces)
+        if (pieceCount[pc] != popcount(pieces(color_of(pc), type_of(pc)))
+            || pieceCount[pc] != std::count(board.begin(), board.end(), pc))
+            assert(0 && "pos_is_ok: Pieces");
+
+    for (Color c : {WHITE, BLACK})
+        for (CastlingRights cr : {c & KING_SIDE, c & QUEEN_SIDE})
+        {
+            if (!can_castle(cr))
+                continue;
+
+            if (piece_on(castlingRookSquare[cr]) != make_piece(c, ROOK)
+                || castlingRightsMask[castlingRookSquare[cr]] != cr
+                || (castlingRightsMask[square(c)] & cr) != cr)
+                assert(0 && "pos_is_ok: Castling");
+        }
+
+    assert(material_key_is_ok() && "pos_is_ok: materialKey");
+
+    return true;
+}
+
+}  // namespace Stockfish
diff --git a/src/position.h b/src/position.h
new file mode 100644
index 0000000000000000000000000000000000000000..e02a400d32f6554bcd21417457f8d8ae9fe0aa5d
--- /dev/null
+++ b/src/position.h
@@ -0,0 +1,414 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program. If not, see .
+*/
+
+#ifndef POSITION_H_INCLUDED
+#define POSITION_H_INCLUDED
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "bitboard.h"
+#include "types.h"
+
+namespace Stockfish {
+
+class TranspositionTable;
+struct SharedHistories;
+
+// StateInfo struct stores information needed to restore a Position object to
+// its previous state when we retract a move. Whenever a move is made on the
+// board (by calling Position::do_move), a StateInfo object must be passed.
+ +struct StateInfo { + + // Copied when making a move + Key materialKey; + Key pawnKey; + Key minorPieceKey; + Key nonPawnKey[COLOR_NB]; + Value nonPawnMaterial[COLOR_NB]; + int castlingRights; + int rule50; + int pliesFromNull; + Square epSquare; + + // Not copied when making a move (will be recomputed anyhow) + Key key; + Bitboard checkersBB; + StateInfo* previous; + Bitboard blockersForKing[COLOR_NB]; + Bitboard pinners[COLOR_NB]; + Bitboard checkSquares[PIECE_TYPE_NB]; + Piece capturedPiece; + int repetition; +}; + + +// A list to keep track of the position states along the setup moves (from the +// start position to the position just before the search starts). Needed by +// 'draw by repetition' detection. Use a std::deque because pointers to +// elements are not invalidated upon list resizing. +using StateListPtr = std::unique_ptr>; + +// Position class stores information regarding the board representation as +// pieces, side to move, hash keys, castling info, etc. Important methods are +// do_move() and undo_move(), used by the search to update node info when +// traversing the search tree. +class Position { + public: + static void init(); + + Position() = default; + Position(const Position&) = delete; + Position& operator=(const Position&) = delete; + + // FEN string input/output + Position& set(const std::string& fenStr, bool isChess960, StateInfo* si); + Position& set(const std::string& code, Color c, StateInfo* si); + std::string fen() const; + + // Position representation + Bitboard pieces() const; // All pieces + template + Bitboard pieces(PieceTypes... pts) const; + Bitboard pieces(Color c) const; + template + Bitboard pieces(Color c, PieceTypes... 
pts) const; + Piece piece_on(Square s) const; + const std::array& piece_array() const; + Square ep_square() const; + bool empty(Square s) const; + template + int count(Color c) const; + template + int count() const; + template + Square square(Color c) const; + + // Castling + bool can_castle(CastlingRights cr) const; + bool castling_impeded(CastlingRights cr) const; + Square castling_rook_square(CastlingRights cr) const; + + // Checking + Bitboard checkers() const; + Bitboard blockers_for_king(Color c) const; + Bitboard check_squares(PieceType pt) const; + Bitboard pinners(Color c) const; + + // Attacks to/from a given square + Bitboard attackers_to(Square s) const; + Bitboard attackers_to(Square s, Bitboard occupied) const; + bool attackers_to_exist(Square s, Bitboard occupied, Color c) const; + void update_slider_blockers(Color c) const; + template + Bitboard attacks_by(Color c) const; + + // Properties of moves + bool legal(Move m) const; + bool pseudo_legal(const Move m) const; + bool capture(Move m) const; + bool capture_stage(Move m) const; + bool gives_check(Move m) const; + Piece moved_piece(Move m) const; + Piece captured_piece() const; + + // Doing and undoing moves + void do_move(Move m, StateInfo& newSt, const TranspositionTable* tt); + void do_move(Move m, + StateInfo& newSt, + bool givesCheck, + DirtyPiece& dp, + DirtyThreats& dts, + const TranspositionTable* tt, + const SharedHistories* worker); + void undo_move(Move m); + void do_null_move(StateInfo& newSt); + void undo_null_move(); + + // Static Exchange Evaluation + bool see_ge(Move m, int threshold = 0) const; + + // Accessing hash keys + Key key() const; + Key material_key() const; + Key pawn_key() const; + Key minor_piece_key() const; + Key non_pawn_key(Color c) const; + + // Other properties of the position + Color side_to_move() const; + int game_ply() const; + bool is_chess960() const; + bool is_draw(int ply) const; + bool is_repetition(int ply) const; + bool upcoming_repetition(int ply) 
const; + bool has_repeated() const; + int rule50_count() const; + Value non_pawn_material(Color c) const; + Value non_pawn_material() const; + + // Position consistency check, for debugging + bool pos_is_ok() const; + bool material_key_is_ok() const; + void flip(); + + StateInfo* state() const; + + void put_piece(Piece pc, Square s, DirtyThreats* const dts = nullptr); + void remove_piece(Square s, DirtyThreats* const dts = nullptr); + void swap_piece(Square s, Piece pc, DirtyThreats* const dts = nullptr); + + private: + // Initialization helpers (used while setting up a position) + void set_castling_right(Color c, Square rfrom); + Key compute_material_key() const; + void set_state() const; + void set_check_info() const; + + // Other helpers + template + void update_piece_threats(Piece pc, + Square s, + DirtyThreats* const dts, + Bitboard noRaysContaining = -1ULL) const; + void move_piece(Square from, Square to, DirtyThreats* const dts = nullptr); + template + void do_castling(Color us, + Square from, + Square& to, + Square& rfrom, + Square& rto, + DirtyThreats* const dts = nullptr, + DirtyPiece* const dp = nullptr); + Key adjust_key50(Key k) const; + + // Data members + std::array board; + std::array byTypeBB; + std::array byColorBB; + + int pieceCount[PIECE_NB]; + int castlingRightsMask[SQUARE_NB]; + Square castlingRookSquare[CASTLING_RIGHT_NB]; + Bitboard castlingPath[CASTLING_RIGHT_NB]; + StateInfo* st; + int gamePly; + Color sideToMove; + bool chess960; + DirtyPiece scratch_dp; + DirtyThreats scratch_dts; +}; + +std::ostream& operator<<(std::ostream& os, const Position& pos); + +inline Color Position::side_to_move() const { return sideToMove; } + +inline Piece Position::piece_on(Square s) const { + assert(is_ok(s)); + return board[s]; +} + +inline const std::array& Position::piece_array() const { return board; } + +inline bool Position::empty(Square s) const { return piece_on(s) == NO_PIECE; } + +inline Piece Position::moved_piece(Move m) const { return 
piece_on(m.from_sq()); } + +inline Bitboard Position::pieces() const { return byTypeBB[ALL_PIECES]; } + +template +inline Bitboard Position::pieces(PieceTypes... pts) const { + return (byTypeBB[pts] | ...); +} + +inline Bitboard Position::pieces(Color c) const { return byColorBB[c]; } + +template +inline Bitboard Position::pieces(Color c, PieceTypes... pts) const { + return pieces(c) & pieces(pts...); +} + +template +inline int Position::count(Color c) const { + return pieceCount[make_piece(c, Pt)]; +} + +template +inline int Position::count() const { + return count(WHITE) + count(BLACK); +} + +template +inline Square Position::square(Color c) const { + assert(count(c) == 1); + return lsb(pieces(c, Pt)); +} + +inline Square Position::ep_square() const { return st->epSquare; } + +inline bool Position::can_castle(CastlingRights cr) const { return st->castlingRights & cr; } + +inline bool Position::castling_impeded(CastlingRights cr) const { + assert(cr == WHITE_OO || cr == WHITE_OOO || cr == BLACK_OO || cr == BLACK_OOO); + return pieces() & castlingPath[cr]; +} + +inline Square Position::castling_rook_square(CastlingRights cr) const { + assert(cr == WHITE_OO || cr == WHITE_OOO || cr == BLACK_OO || cr == BLACK_OOO); + return castlingRookSquare[cr]; +} + +inline Bitboard Position::attackers_to(Square s) const { return attackers_to(s, pieces()); } + +template +inline Bitboard Position::attacks_by(Color c) const { + + if constexpr (Pt == PAWN) + return c == WHITE ? 
pawn_attacks_bb(pieces(WHITE, PAWN)) + : pawn_attacks_bb(pieces(BLACK, PAWN)); + else + { + Bitboard threats = 0; + Bitboard attackers = pieces(c, Pt); + while (attackers) + threats |= attacks_bb(pop_lsb(attackers), pieces()); + return threats; + } +} + +inline Bitboard Position::checkers() const { return st->checkersBB; } + +inline Bitboard Position::blockers_for_king(Color c) const { return st->blockersForKing[c]; } + +inline Bitboard Position::pinners(Color c) const { return st->pinners[c]; } + +inline Bitboard Position::check_squares(PieceType pt) const { return st->checkSquares[pt]; } + +inline Key Position::key() const { return adjust_key50(st->key); } + +inline Key Position::adjust_key50(Key k) const { + return st->rule50 < 14 ? k : k ^ make_key((st->rule50 - 14) / 8); +} + +inline Key Position::pawn_key() const { return st->pawnKey; } + +inline Key Position::material_key() const { return st->materialKey; } + +inline Key Position::minor_piece_key() const { return st->minorPieceKey; } + +inline Key Position::non_pawn_key(Color c) const { return st->nonPawnKey[c]; } + +inline Value Position::non_pawn_material(Color c) const { return st->nonPawnMaterial[c]; } + +inline Value Position::non_pawn_material() const { + return non_pawn_material(WHITE) + non_pawn_material(BLACK); +} + +inline int Position::game_ply() const { return gamePly; } + +inline int Position::rule50_count() const { return st->rule50; } + +inline bool Position::is_chess960() const { return chess960; } + +inline bool Position::capture(Move m) const { + assert(m.is_ok()); + return (!empty(m.to_sq()) && m.type_of() != CASTLING) || m.type_of() == EN_PASSANT; +} + +// Returns true if a move is generated from the capture stage, having also +// queen promotions covered, i.e. consistency with the capture stage move +// generation is needed to avoid the generation of duplicate moves. 
+inline bool Position::capture_stage(Move m) const { + assert(m.is_ok()); + return capture(m) || m.promotion_type() == QUEEN; +} + +inline Piece Position::captured_piece() const { return st->capturedPiece; } + +inline void Position::put_piece(Piece pc, Square s, DirtyThreats* const dts) { + board[s] = pc; + byTypeBB[ALL_PIECES] |= byTypeBB[type_of(pc)] |= s; + byColorBB[color_of(pc)] |= s; + pieceCount[pc]++; + pieceCount[make_piece(color_of(pc), ALL_PIECES)]++; + + if (dts) + update_piece_threats(pc, s, dts); +} + +inline void Position::remove_piece(Square s, DirtyThreats* const dts) { + Piece pc = board[s]; + + if (dts) + update_piece_threats(pc, s, dts); + + byTypeBB[ALL_PIECES] ^= s; + byTypeBB[type_of(pc)] ^= s; + byColorBB[color_of(pc)] ^= s; + board[s] = NO_PIECE; + pieceCount[pc]--; + pieceCount[make_piece(color_of(pc), ALL_PIECES)]--; +} + +inline void Position::move_piece(Square from, Square to, DirtyThreats* const dts) { + Piece pc = board[from]; + Bitboard fromTo = from | to; + + if (dts) + update_piece_threats(pc, from, dts, fromTo); + + byTypeBB[ALL_PIECES] ^= fromTo; + byTypeBB[type_of(pc)] ^= fromTo; + byColorBB[color_of(pc)] ^= fromTo; + board[from] = NO_PIECE; + board[to] = pc; + + if (dts) + update_piece_threats(pc, to, dts, fromTo); +} + +inline void Position::swap_piece(Square s, Piece pc, DirtyThreats* const dts) { + Piece old = board[s]; + + remove_piece(s); + + if (dts) + update_piece_threats(old, s, dts); + + put_piece(pc, s); + + if (dts) + update_piece_threats(pc, s, dts); +} + +inline void Position::do_move(Move m, StateInfo& newSt, const TranspositionTable* tt = nullptr) { + new (&scratch_dts) DirtyThreats; + do_move(m, newSt, gives_check(m), scratch_dp, scratch_dts, tt, nullptr); +} + +inline StateInfo* Position::state() const { return st; } + +} // namespace Stockfish + +#endif // #ifndef POSITION_H_INCLUDED diff --git a/src/score.cpp b/src/score.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..ea62577b9f48fae1e6900ec21239e44470ff32df --- /dev/null +++ b/src/score.cpp @@ -0,0 +1,48 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "score.h" + +#include +#include +#include + +#include "uci.h" + +namespace Stockfish { + +Score::Score(Value v, const Position& pos) { + assert(-VALUE_INFINITE < v && v < VALUE_INFINITE); + + if (!is_decisive(v)) + { + score = InternalUnits{UCIEngine::to_cp(v, pos)}; + } + else if (std::abs(v) <= VALUE_TB) + { + auto distance = VALUE_TB - std::abs(v); + score = (v > 0) ? Tablebase{distance, true} : Tablebase{-distance, false}; + } + else + { + auto distance = VALUE_MATE - std::abs(v); + score = (v > 0) ? 
Mate{distance} : Mate{-distance}; + } +} + +} \ No newline at end of file diff --git a/src/score.h b/src/score.h new file mode 100644 index 0000000000000000000000000000000000000000..cf89d3cdd54ca9bd4c0b54f96bce2eb204e92506 --- /dev/null +++ b/src/score.h @@ -0,0 +1,70 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef SCORE_H_INCLUDED +#define SCORE_H_INCLUDED + +#include +#include + +#include "types.h" + +namespace Stockfish { + +class Position; + +class Score { + public: + struct Mate { + int plies; + }; + + struct Tablebase { + int plies; + bool win; + }; + + struct InternalUnits { + int value; + }; + + Score() = default; + Score(Value v, const Position& pos); + + template + bool is() const { + return std::holds_alternative(score); + } + + template + T get() const { + return std::get(score); + } + + template + decltype(auto) visit(F&& f) const { + return std::visit(std::forward(f), score); + } + + private: + std::variant score; +}; + +} + +#endif // #ifndef SCORE_H_INCLUDED diff --git a/src/search.cpp b/src/search.cpp new file mode 100644 index 0000000000000000000000000000000000000000..028f61cce2b07be0ae6f6bcf34217ed81f83f873 --- /dev/null +++ b/src/search.cpp @@ -0,0 +1,2217 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "search.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bitboard.h" +#include "evaluate.h" +#include "history.h" +#include "misc.h" +#include "movegen.h" +#include "movepick.h" +#include "nnue/network.h" +#include "nnue/nnue_accumulator.h" +#include "position.h" +#include "syzygy/tbprobe.h" +#include "thread.h" +#include "timeman.h" +#include "tt.h" +#include "types.h" +#include "uci.h" +#include "ucioption.h" + +namespace Stockfish { + +namespace TB = Tablebases; + +void syzygy_extend_pv(const OptionsMap& options, + const Search::LimitsType& limits, + Stockfish::Position& pos, + Stockfish::Search::RootMove& rootMove, + Value& v); + +using namespace Search; + +namespace { + +constexpr int SEARCHEDLIST_CAPACITY = 32; +using SearchedList = ValueList; + +// (*Scalers): +// The values with Scaler asterisks have proven non-linear scaling. +// They are optimized to time controls of 180 + 1.8 and longer, +// so changing them or adding conditions that are similar requires +// tests at these types of time controls. + +// (*Scaler) All tuned parameters at time controls shorter than +// optimized for require verifications at longer time controls + +int correction_value(const Worker& w, const Position& pos, const Stack* const ss) { + const Color us = pos.side_to_move(); + const auto m = (ss - 1)->currentMove; + const auto& shared = w.sharedHistory; + const int pcv = shared.pawn_correction_entry(pos).at(us).pawn; + const int micv = shared.minor_piece_correction_entry(pos).at(us).minor; + const int wnpcv = shared.nonpawn_correction_entry(pos).at(us).nonPawnWhite; + const int bnpcv = shared.nonpawn_correction_entry(pos).at(us).nonPawnBlack; + const int cntcv = + m.is_ok() ? 
(*(ss - 2)->continuationCorrectionHistory)[pos.piece_on(m.to_sq())][m.to_sq()] + + (*(ss - 4)->continuationCorrectionHistory)[pos.piece_on(m.to_sq())][m.to_sq()] + : 8; + + return 11433 * pcv + 8823 * micv + 12749 * (wnpcv + bnpcv) + 8022 * cntcv; +} + +// Add correctionHistory value to raw staticEval and guarantee evaluation +// does not hit the tablebase range. +Value to_corrected_static_eval(const Value v, const int cv) { + return std::clamp(v + cv / 131072, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1); +} + +void update_correction_history(const Position& pos, + Stack* const ss, + Search::Worker& workerThread, + const int bonus) { + const Move m = (ss - 1)->currentMove; + const Color us = pos.side_to_move(); + + constexpr int nonPawnWeight = 181; + auto& shared = workerThread.sharedHistory; + + shared.pawn_correction_entry(pos).at(us).pawn << bonus; + shared.minor_piece_correction_entry(pos).at(us).minor << bonus * 155 / 128; + shared.nonpawn_correction_entry(pos).at(us).nonPawnWhite << bonus * nonPawnWeight / 128; + shared.nonpawn_correction_entry(pos).at(us).nonPawnBlack << bonus * nonPawnWeight / 128; + + // Branchless: use mask to zero bonus when move is not ok + const int mask = int(m.is_ok()); + const Square to = m.to_sq_unchecked(); + const Piece pc = pos.piece_on(to); + const int bonus2 = (bonus * 129 / 128) * mask; + const int bonus4 = (bonus * 61 / 128) * mask; + (*(ss - 2)->continuationCorrectionHistory)[pc][to] << bonus2; + (*(ss - 4)->continuationCorrectionHistory)[pc][to] << bonus4; +} + +// Add a small random component to draw evaluations to avoid 3-fold blindness +Value value_draw(size_t nodes) { return VALUE_DRAW - 1 + Value(nodes & 0x2); } +Value value_to_tt(Value v, int ply); +Value value_from_tt(Value v, int ply, int r50c); +void update_pv(Move* pv, Move move, const Move* childPv); +void update_continuation_histories(Stack* ss, Piece pc, Square to, int bonus); +void update_quiet_histories( + const Position& pos, Stack* ss, 
Search::Worker& workerThread, Move move, int bonus); +void update_all_stats(const Position& pos, + Stack* ss, + Search::Worker& workerThread, + Move bestMove, + Square prevSq, + SearchedList& quietsSearched, + SearchedList& capturesSearched, + Depth depth, + Move ttMove); + +bool is_shuffling(Move move, Stack* const ss, const Position& pos) { + if (pos.capture_stage(move) || pos.rule50_count() < 11) + return false; + if (pos.state()->pliesFromNull <= 6 || ss->ply < 19) + return false; + return move.from_sq() == (ss - 2)->currentMove.to_sq() + && (ss - 2)->currentMove.from_sq() == (ss - 4)->currentMove.to_sq(); +} + +} // namespace + +Search::Worker::Worker(SharedState& sharedState, + std::unique_ptr sm, + size_t threadId, + size_t numaThreadId, + size_t numaTotalThreads, + NumaReplicatedAccessToken token) : + // Unpack the SharedState struct into member variables + sharedHistory(sharedState.sharedHistories.at(token.get_numa_index())), + threadIdx(threadId), + numaThreadIdx(numaThreadId), + numaTotal(numaTotalThreads), + numaAccessToken(token), + manager(std::move(sm)), + options(sharedState.options), + threads(sharedState.threads), + tt(sharedState.tt), + networks(sharedState.networks), + refreshTable(networks[token]) { + clear(); +} + +void Search::Worker::ensure_network_replicated() { + // Access once to force lazy initialization. + // We do this because we want to avoid initialization during search. + (void) (networks[numaAccessToken]); +} + +void Search::Worker::start_searching() { + + accumulatorStack.reset(); + + // Non-main threads go directly to iterative_deepening() + if (!is_mainthread()) + { + iterative_deepening(); + return; + } + + main_manager()->tm.init(limits, rootPos.side_to_move(), rootPos.game_ply(), options, + main_manager()->originalTimeAdjust); + tt.new_search(); + + if (rootMoves.empty()) + { + rootMoves.emplace_back(Move::none()); + main_manager()->updates.onUpdateNoMoves( + {0, {rootPos.checkers() ? 
-VALUE_MATE : VALUE_DRAW, rootPos}}); + } + else + { + threads.start_searching(); // start non-main threads + iterative_deepening(); // main thread start searching + } + + // When we reach the maximum depth, we can arrive here without a raise of + // threads.stop. However, if we are pondering or in an infinite search, + // the UCI protocol states that we shouldn't print the best move before the + // GUI sends a "stop" or "ponderhit" command. We therefore simply wait here + // until the GUI sends one of those commands. + while (!threads.stop && (main_manager()->ponder || limits.infinite)) + {} // Busy wait for a stop or a ponder reset + + // Stop the threads if not already stopped (also raise the stop if + // "ponderhit" just reset threads.ponder) + threads.stop = true; + + // Wait until all threads have finished + threads.wait_for_search_finished(); + + // When playing in 'nodes as time' mode, subtract the searched nodes from + // the available ones before exiting. + if (limits.npmsec) + main_manager()->tm.advance_nodes_time(threads.nodes_searched() + - limits.inc[rootPos.side_to_move()]); + + Worker* bestThread = this; + Skill skill = + Skill(options["Skill Level"], options["UCI_LimitStrength"] ? 
int(options["UCI_Elo"]) : 0); + + if (int(options["MultiPV"]) == 1 && !limits.depth && !limits.mate && !skill.enabled() + && rootMoves[0].pv[0] != Move::none()) + bestThread = threads.get_best_thread()->worker.get(); + + main_manager()->bestPreviousScore = bestThread->rootMoves[0].score; + main_manager()->bestPreviousAverageScore = bestThread->rootMoves[0].averageScore; + + // Send again PV info if we have a new best thread + if (bestThread != this) + main_manager()->pv(*bestThread, threads, tt, bestThread->completedDepth); + + std::string ponder; + + if (bestThread->rootMoves[0].pv.size() > 1 + || bestThread->rootMoves[0].extract_ponder_from_tt(tt, rootPos)) + ponder = UCIEngine::move(bestThread->rootMoves[0].pv[1], rootPos.is_chess960()); + + auto bestmove = UCIEngine::move(bestThread->rootMoves[0].pv[0], rootPos.is_chess960()); + main_manager()->updates.onBestmove(bestmove, ponder); +} + +// Main iterative deepening loop. It calls search() +// repeatedly with increasing depth until the allocated thinking time has been +// consumed, the user stops the search, or the maximum search depth is reached. +void Search::Worker::iterative_deepening() { + + SearchManager* mainThread = (is_mainthread() ? main_manager() : nullptr); + + Move pv[MAX_PLY + 1]; + + Depth lastBestMoveDepth = 0; + Value lastBestScore = -VALUE_INFINITE; + auto lastBestPV = std::vector{Move::none()}; + + Value alpha, beta; + Value bestValue = -VALUE_INFINITE; + Color us = rootPos.side_to_move(); + double timeReduction = 1, totBestMoveChanges = 0; + int delta, iterIdx = 0; + + // Allocate stack with extra size to allow access from (ss - 7) to (ss + 2): + // (ss - 7) is needed for update_continuation_histories(ss - 1) which accesses (ss - 6), + // (ss + 2) is needed for initialization of cutOffCnt. 
+ Stack stack[MAX_PLY + 10] = {}; + Stack* ss = stack + 7; + + for (int i = 7; i > 0; --i) + { + (ss - i)->continuationHistory = + &continuationHistory[0][0][NO_PIECE][0]; // Use as a sentinel + (ss - i)->continuationCorrectionHistory = &continuationCorrectionHistory[NO_PIECE][0]; + (ss - i)->staticEval = VALUE_NONE; + } + + for (int i = 0; i <= MAX_PLY + 2; ++i) + (ss + i)->ply = i; + + ss->pv = pv; + + if (mainThread) + { + if (mainThread->bestPreviousScore == VALUE_INFINITE) + mainThread->iterValue.fill(VALUE_ZERO); + else + mainThread->iterValue.fill(mainThread->bestPreviousScore); + } + + size_t multiPV = size_t(options["MultiPV"]); + Skill skill(options["Skill Level"], options["UCI_LimitStrength"] ? int(options["UCI_Elo"]) : 0); + + // When playing with strength handicap enable MultiPV search that we will + // use behind-the-scenes to retrieve a set of possible moves. + if (skill.enabled()) + multiPV = std::max(multiPV, size_t(4)); + + multiPV = std::min(multiPV, rootMoves.size()); + + int searchAgainCounter = 0; + + lowPlyHistory.fill(100); + + for (Color c : {WHITE, BLACK}) + for (int i = 0; i < UINT_16_HISTORY_SIZE; i++) + mainHistory[c][i] = mainHistory[c][i] * 778 / 1024; + + // Iterative deepening loop until requested to stop or the target depth is reached + while (++rootDepth < MAX_PLY && !threads.stop + && !(limits.depth && mainThread && rootDepth > limits.depth)) + { + // Age out PV variability metric + if (mainThread) + totBestMoveChanges /= 2; + + // Save the last iteration's scores before the first PV line is searched and + // all the move scores except the (new) PV are set to -VALUE_INFINITE. + for (RootMove& rm : rootMoves) + rm.previousScore = rm.score; + + size_t pvFirst = 0; + pvLast = 0; + + if (!threads.increaseDepth) + searchAgainCounter++; + + // MultiPV loop. 
We perform a full root search for each PV line + for (pvIdx = 0; pvIdx < multiPV; ++pvIdx) + { + if (pvIdx == pvLast) + { + pvFirst = pvLast; + for (pvLast++; pvLast < rootMoves.size(); pvLast++) + if (rootMoves[pvLast].tbRank != rootMoves[pvFirst].tbRank) + break; + } + + // Reset UCI info selDepth for each depth and each PV line + selDepth = 0; + + // Reset aspiration window starting size + delta = 5 + threadIdx % 8 + std::abs(rootMoves[pvIdx].meanSquaredScore) / 9968; + Value avg = rootMoves[pvIdx].averageScore; + alpha = std::max(avg - delta, -VALUE_INFINITE); + beta = std::min(avg + delta, VALUE_INFINITE); + + // Adjust optimism based on root move's averageScore + optimism[us] = 142 * avg / (std::abs(avg) + 86); + optimism[~us] = -optimism[us]; + + // Start with a small aspiration window and, in the case of a fail + // high/low, re-search with a bigger window until we don't fail + // high/low anymore. + int failedHighCnt = 0; + while (true) + { + // Adjust the effective depth searched, but ensure at least one + // effective increment for every four searchAgain steps (see issue #2717). + Depth adjustedDepth = + std::max(1, rootDepth - failedHighCnt - 3 * (searchAgainCounter + 1) / 4); + rootDelta = beta - alpha; + bestValue = search(rootPos, ss, alpha, beta, adjustedDepth, false); + + // Bring the best move to the front. It is critical that sorting + // is done with a stable algorithm because all the values but the + // first and eventually the new best one is set to -VALUE_INFINITE + // and we want to keep the same order for all the moves except the + // new PV that goes to the front. Note that in the case of MultiPV + // search the already searched PV lines are preserved. + std::stable_sort(rootMoves.begin() + pvIdx, rootMoves.begin() + pvLast); + + // If search has been stopped, we break immediately. Sorting is + // safe because RootMoves is still valid, although it refers to + // the previous iteration. 
+ if (threads.stop) + break; + + // When failing high/low give some update before a re-search. To avoid + // excessive output that could hang GUIs like Fritz 19, only start + // at nodes > 10M (rather than depth N, which can be reached quickly) + if (mainThread && multiPV == 1 && (bestValue <= alpha || bestValue >= beta) + && nodes > 10000000) + main_manager()->pv(*this, threads, tt, rootDepth); + + // In case of failing low/high increase aspiration window and re-search, + // otherwise exit the loop. + if (bestValue <= alpha) + { + beta = alpha; + alpha = std::max(bestValue - delta, -VALUE_INFINITE); + + failedHighCnt = 0; + if (mainThread) + mainThread->stopOnPonderhit = false; + } + else if (bestValue >= beta) + { + alpha = std::max(beta - delta, alpha); + beta = std::min(bestValue + delta, VALUE_INFINITE); + ++failedHighCnt; + } + else + break; + + delta += delta / 3; + + assert(alpha >= -VALUE_INFINITE && beta <= VALUE_INFINITE); + } + + // Sort the PV lines searched so far and update the GUI + std::stable_sort(rootMoves.begin() + pvFirst, rootMoves.begin() + pvIdx + 1); + + if (mainThread + && (threads.stop || pvIdx + 1 == multiPV || nodes > 10000000) + // A thread that aborted search can have a mated-in/TB-loss score and + // PV that cannot be trusted, i.e. it can be delayed or refuted if we + // would have had time to fully search other root-moves. Thus here we + // suppress any exact mated-in/TB loss output and, if we do, below pick + // the score/PV from the previously completed iteration with the most + // recent bestmove change. + && !(threads.stop && is_loss(rootMoves[0].uciScore) + && rootMoves[0].score == rootMoves[0].uciScore)) + main_manager()->pv(*this, threads, tt, rootDepth); + + if (threads.stop) + break; + } + + if (!threads.stop) + completedDepth = rootDepth; + + // We make sure not to pick an unproven mated-in score, + // in case this thread prematurely stopped search (aborted-search). 
+ if (completedDepth != rootDepth && rootMoves[0].score != -VALUE_INFINITE + && is_loss(rootMoves[0].score)) + { + // Bring the last best move to the front for best thread selection. + Utility::move_to_front(rootMoves, [&lastBestPV = std::as_const(lastBestPV)]( + const auto& rm) { return rm == lastBestPV[0]; }); + rootMoves[0].pv = lastBestPV; + rootMoves[0].score = rootMoves[0].uciScore = lastBestScore; + } + else if (rootMoves[0].pv[0] != lastBestPV[0]) + { + lastBestPV = rootMoves[0].pv; + lastBestScore = rootMoves[0].score; + lastBestMoveDepth = rootDepth; + } + + if (!mainThread) + continue; + + // Have we found a "mate in x"? + if (limits.mate && rootMoves[0].score == rootMoves[0].uciScore + && ((rootMoves[0].score >= VALUE_MATE_IN_MAX_PLY + && VALUE_MATE - rootMoves[0].score <= 2 * limits.mate) + || (rootMoves[0].score != -VALUE_INFINITE + && rootMoves[0].score <= VALUE_MATED_IN_MAX_PLY + && VALUE_MATE + rootMoves[0].score <= 2 * limits.mate))) + threads.stop = true; + + // If the skill level is enabled and time is up, pick a sub-optimal best move + if (skill.enabled() && skill.time_to_pick(rootDepth)) + skill.pick_best(rootMoves, multiPV); + + // Use part of the gained time from a previous stable move for the current move + for (auto&& th : threads) + { + totBestMoveChanges += th->worker->bestMoveChanges; + th->worker->bestMoveChanges = 0; + } + + // Do we have time for the next iteration? Can we stop searching now? 
+ if (limits.use_time_management() && !threads.stop && !mainThread->stopOnPonderhit) + { + uint64_t nodesEffort = + rootMoves[0].effort * 100000 / std::max(size_t(1), size_t(nodes)); + + double fallingEval = (11.85 + 2.24 * (mainThread->bestPreviousAverageScore - bestValue) + + 0.93 * (mainThread->iterValue[iterIdx] - bestValue)) + / 100.0; + fallingEval = std::clamp(fallingEval, 0.57, 1.70); + + // If the bestMove is stable over several iterations, reduce time accordingly + double k = 0.51; + double center = lastBestMoveDepth + 12.15; + + timeReduction = 0.66 + 0.85 / (0.98 + std::exp(-k * (completedDepth - center))); + + double reduction = (1.43 + mainThread->previousTimeReduction) / (2.28 * timeReduction); + + double bestMoveInstability = 1.02 + 2.14 * totBestMoveChanges / threads.size(); + + double highBestMoveEffort = nodesEffort >= 93340 ? 0.76 : 1.0; + + double totalTime = mainThread->tm.optimum() * fallingEval * reduction + * bestMoveInstability * highBestMoveEffort; + + // Cap used time in case of a single legal move for a better viewer experience + if (rootMoves.size() == 1) + totalTime = std::min(502.0, totalTime); + + auto elapsedTime = elapsed(); + + // Stop the search if we have exceeded the totalTime or maximum + if (elapsedTime > std::min(totalTime, double(mainThread->tm.maximum()))) + { + // If we are allowed to ponder do not stop the search now but + // keep pondering until the GUI sends "ponderhit" or "stop". 
+ if (mainThread->ponder) + mainThread->stopOnPonderhit = true; + else + threads.stop = true; + } + else + threads.increaseDepth = mainThread->ponder || elapsedTime <= totalTime * 0.50; + } + + mainThread->iterValue[iterIdx] = bestValue; + iterIdx = (iterIdx + 1) & 3; + } + + if (!mainThread) + return; + + mainThread->previousTimeReduction = timeReduction; + + // If the skill level is enabled, swap the best PV line with the sub-optimal one + if (skill.enabled()) + std::swap(rootMoves[0], + *std::find(rootMoves.begin(), rootMoves.end(), + skill.best ? skill.best : skill.pick_best(rootMoves, multiPV))); +} + + +void Search::Worker::do_move(Position& pos, const Move move, StateInfo& st, Stack* const ss) { + do_move(pos, move, st, pos.gives_check(move), ss); +} + +void Search::Worker::do_move( + Position& pos, const Move move, StateInfo& st, const bool givesCheck, Stack* const ss) { + bool capture = pos.capture_stage(move); + // Preferable over fetch_add to avoid locking instructions + nodes.store(nodes.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + + auto [dirtyPiece, dirtyThreats] = accumulatorStack.push(); + pos.do_move(move, st, givesCheck, dirtyPiece, dirtyThreats, &tt, &sharedHistory); + + if (ss != nullptr) + { + ss->currentMove = move; + ss->continuationHistory = + &continuationHistory[ss->inCheck][capture][dirtyPiece.pc][move.to_sq()]; + ss->continuationCorrectionHistory = + &continuationCorrectionHistory[dirtyPiece.pc][move.to_sq()]; + } +} + +void Search::Worker::do_null_move(Position& pos, StateInfo& st, Stack* const ss) { + pos.do_null_move(st); + ss->currentMove = Move::null(); + ss->continuationHistory = &continuationHistory[0][0][NO_PIECE][0]; + ss->continuationCorrectionHistory = &continuationCorrectionHistory[NO_PIECE][0]; +} + +void Search::Worker::undo_move(Position& pos, const Move move) { + pos.undo_move(move); + accumulatorStack.pop(); +} + +void Search::Worker::undo_null_move(Position& pos) { pos.undo_null_move(); } + + +// 
Reset histories, usually before a new game +void Search::Worker::clear() { + mainHistory.fill(0); + captureHistory.fill(-689); + + // Each thread is responsible for clearing their part of shared history + sharedHistory.correctionHistory.clear_range(0, numaThreadIdx, numaTotal); + sharedHistory.pawnHistory.clear_range(-1238, numaThreadIdx, numaTotal); + + ttMoveHistory = 0; + + for (auto& to : continuationCorrectionHistory) + for (auto& h : to) + h.fill(7); + + for (bool inCheck : {false, true}) + for (StatsType c : {NoCaptures, Captures}) + for (auto& to : continuationHistory[inCheck][c]) + for (auto& h : to) + h.fill(-541); + + for (size_t i = 1; i < reductions.size(); ++i) + reductions[i] = int(2809 / 128.0 * std::log(i)); + + refreshTable.clear(networks[numaAccessToken]); +} + + +// Main search function for both PV and non-PV nodes +template +Value Search::Worker::search( + Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode) { + + constexpr bool PvNode = nodeType != NonPV; + constexpr bool rootNode = nodeType == Root; + const bool allNode = !(PvNode || cutNode); + + // Dive into quiescence search when the depth reaches zero + if (depth <= 0) + return qsearch(pos, ss, alpha, beta); + + // Limit the depth if extensions made it too large + depth = std::min(depth, MAX_PLY - 1); + + // Check if we have an upcoming move that draws by repetition + if (!rootNode && alpha < VALUE_DRAW && pos.upcoming_repetition(ss->ply)) + { + alpha = value_draw(nodes); + if (alpha >= beta) + return alpha; + } + + assert(-VALUE_INFINITE <= alpha && alpha < beta && beta <= VALUE_INFINITE); + assert(PvNode || (alpha == beta - 1)); + assert(0 < depth && depth < MAX_PLY); + assert(!(PvNode && cutNode)); + + Move pv[MAX_PLY + 1]; + StateInfo st; + + Key posKey; + Move move, excludedMove, bestMove; + Depth extension, newDepth; + Value bestValue, value, eval, maxValue, probCutBeta; + bool givesCheck, improving, priorCapture, opponentWorsening; + bool capture, 
ttCapture; + int priorReduction; + Piece movedPiece; + + SearchedList capturesSearched; + SearchedList quietsSearched; + + // Step 1. Initialize node + ss->inCheck = pos.checkers(); + priorCapture = pos.captured_piece(); + Color us = pos.side_to_move(); + ss->moveCount = 0; + bestValue = -VALUE_INFINITE; + maxValue = VALUE_INFINITE; + + // Check for the available remaining time + if (is_mainthread()) + main_manager()->check_time(*this); + + // Used to send selDepth info to GUI (selDepth counts from 1, ply from 0) + if (PvNode && selDepth < ss->ply + 1) + selDepth = ss->ply + 1; + + if (!rootNode) + { + // Step 2. Check for aborted search and immediate draw + if (threads.stop.load(std::memory_order_relaxed) || pos.is_draw(ss->ply) + || ss->ply >= MAX_PLY) + return (ss->ply >= MAX_PLY && !ss->inCheck) ? evaluate(pos) : value_draw(nodes); + + // Step 3. Mate distance pruning. Even if we mate at the next move our score + // would be at best mate_in(ss->ply + 1), but if alpha is already bigger because + // a shorter mate was found upward in the tree then there is no need to search + // because we will never beat the current alpha. Same logic but with reversed + // signs apply also in the opposite condition of being mated instead of giving + // mate. In this case, return a fail-high score. + alpha = std::max(mated_in(ss->ply), alpha); + beta = std::min(mate_in(ss->ply + 1), beta); + if (alpha >= beta) + return alpha; + } + + assert(0 <= ss->ply && ss->ply < MAX_PLY); + + Square prevSq = ((ss - 1)->currentMove).is_ok() ? ((ss - 1)->currentMove).to_sq() : SQ_NONE; + bestMove = Move::none(); + priorReduction = (ss - 1)->reduction; + (ss - 1)->reduction = 0; + ss->statScore = 0; + (ss + 2)->cutoffCnt = 0; + + // Step 4. Transposition table lookup + excludedMove = ss->excludedMove; + posKey = pos.key(); + auto [ttHit, ttData, ttWriter] = tt.probe(posKey); + // Need further processing of the saved data + ss->ttHit = ttHit; + ttData.move = rootNode ? 
rootMoves[pvIdx].pv[0] : ttHit ? ttData.move : Move::none(); + ttData.value = ttHit ? value_from_tt(ttData.value, ss->ply, pos.rule50_count()) : VALUE_NONE; + ss->ttPv = excludedMove ? ss->ttPv : PvNode || (ttHit && ttData.is_pv); + ttCapture = ttData.move && pos.capture_stage(ttData.move); + + // Step 6. Static evaluation of the position + Value unadjustedStaticEval = VALUE_NONE; + const auto correctionValue = correction_value(*this, pos, ss); + // Skip early pruning when in check + if (ss->inCheck) + ss->staticEval = eval = (ss - 2)->staticEval; + else if (excludedMove) + unadjustedStaticEval = eval = ss->staticEval; + else if (ss->ttHit) + { + // Never assume anything about values stored in TT + unadjustedStaticEval = ttData.eval; + if (!is_valid(unadjustedStaticEval)) + unadjustedStaticEval = evaluate(pos); + + ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, correctionValue); + + // ttValue can be used as a better position evaluation + if (is_valid(ttData.value) + && (ttData.bound & (ttData.value > eval ? BOUND_LOWER : BOUND_UPPER))) + eval = ttData.value; + } + else + { + unadjustedStaticEval = evaluate(pos); + ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, correctionValue); + + // Static evaluation is saved as it was before adjustment by correction history + ttWriter.write(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_UNSEARCHED, Move::none(), + unadjustedStaticEval, tt.generation()); + } + + // Set up the improving flag, which is true if current static evaluation is + // bigger than the previous static evaluation at our turn (if we were in + // check at our previous move we go back until we weren't in check) and is + // false otherwise. The improving flag is used in various pruning heuristics. + // Similarly, opponentWorsening is true if our static evaluation is better + // for us than at the last ply. 
+ improving = ss->staticEval > (ss - 2)->staticEval; + opponentWorsening = ss->staticEval > -(ss - 1)->staticEval; + + // Hindsight adjustment of reductions based on static evaluation difference. + if (priorReduction >= 3 && !opponentWorsening) + depth++; + if (priorReduction >= 2 && depth >= 2 && ss->staticEval + (ss - 1)->staticEval > 188) + depth--; + + // At non-PV nodes we check for an early TT cutoff + if (!PvNode && !excludedMove && ttData.depth > depth - (ttData.value <= beta) + && is_valid(ttData.value) // Can happen when !ttHit or when access race in probe() + && (ttData.bound & (ttData.value >= beta ? BOUND_LOWER : BOUND_UPPER)) + && (cutNode == (ttData.value >= beta) || depth > 5)) + { + // If ttMove is quiet, update move sorting heuristics on TT hit + if (ttData.move && ttData.value >= beta) + { + // Bonus for a quiet ttMove that fails high + if (!ttCapture) + update_quiet_histories(pos, ss, *this, ttData.move, + std::min(121 * depth - 75, 932)); + + // Extra penalty for early quiet moves of the previous ply + if (prevSq != SQ_NONE && (ss - 1)->moveCount < 4 && !priorCapture) + update_continuation_histories(ss - 1, pos.piece_on(prevSq), prevSq, -2104); + } + + // Partial workaround for the graph history interaction problem + // For high rule50 counts don't produce transposition table cutoffs. + if (pos.rule50_count() < 96) + { + if (depth >= 7 && ttData.move && pos.pseudo_legal(ttData.move) && pos.legal(ttData.move) + && !is_decisive(ttData.value)) + { + pos.do_move(ttData.move, st); + Key nextPosKey = pos.key(); + auto [ttHitNext, ttDataNext, ttWriterNext] = tt.probe(nextPosKey); + pos.undo_move(ttData.move); + + // Check that the ttValue after the tt move would also trigger a cutoff + if (!is_valid(ttDataNext.value)) + return ttData.value; + + if ((ttData.value >= beta) == (-ttDataNext.value >= beta)) + return ttData.value; + } + else + return ttData.value; + } + } + + // Step 5. 
Tablebases probe + if (!rootNode && !excludedMove && tbConfig.cardinality) + { + int piecesCount = pos.count(); + + if (piecesCount <= tbConfig.cardinality + && (piecesCount < tbConfig.cardinality || depth >= tbConfig.probeDepth) + && pos.rule50_count() == 0 && !pos.can_castle(ANY_CASTLING)) + { + TB::ProbeState err; + TB::WDLScore wdl = Tablebases::probe_wdl(pos, &err); + + // Force check of time on the next occasion + if (is_mainthread()) + main_manager()->callsCnt = 0; + + if (err != TB::ProbeState::FAIL) + { + // Preferable over fetch_add to avoid locking instructions + tbHits.store(tbHits.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + + int drawScore = tbConfig.useRule50 ? 1 : 0; + + Value tbValue = VALUE_TB - ss->ply; + + // Use the range VALUE_TB to VALUE_TB_WIN_IN_MAX_PLY to score + value = wdl < -drawScore ? -tbValue + : wdl > drawScore ? tbValue + : VALUE_DRAW + 2 * wdl * drawScore; + + Bound b = wdl < -drawScore ? BOUND_UPPER + : wdl > drawScore ? BOUND_LOWER + : BOUND_EXACT; + + if (b == BOUND_EXACT || (b == BOUND_LOWER ? value >= beta : value <= alpha)) + { + ttWriter.write(posKey, value_to_tt(value, ss->ply), ss->ttPv, b, + std::min(MAX_PLY - 1, depth + 6), Move::none(), VALUE_NONE, + tt.generation()); + + return value; + } + + if (PvNode) + { + if (b == BOUND_LOWER) + bestValue = value, alpha = std::max(alpha, bestValue); + else + maxValue = value; + } + } + } + } + + if (ss->inCheck) + goto moves_loop; + + // Use static evaluation difference to improve quiet move ordering + if (((ss - 1)->currentMove).is_ok() && !(ss - 1)->inCheck && !priorCapture) + { + int evalDiff = std::clamp(-int((ss - 1)->staticEval + ss->staticEval), -213, 175) + 59; + mainHistory[~us][((ss - 1)->currentMove).raw()] << evalDiff * 10; + if (!ttHit && type_of(pos.piece_on(prevSq)) != PAWN + && ((ss - 1)->currentMove).type_of() != PROMOTION) + sharedHistory.pawn_entry(pos)[pos.piece_on(prevSq)][prevSq] << evalDiff * 13; + } + + + // Step 7. 
Razoring + // If eval is really low, skip search entirely and return the qsearch value. + // For PvNodes, we must have a guard against mates being returned. + if (!PvNode && eval < alpha - 507 - 312 * depth * depth) + return qsearch(pos, ss, alpha, beta); + + // Step 8. Futility pruning: child node + // The depth condition is important for mate finding. + { + auto futility_margin = [&](Depth d) { + Value futilityMult = 77 - 22 * !ss->ttHit; + + return futilityMult * d + - (2661 * improving + 355 * opponentWorsening) * futilityMult / 1024 // + + std::abs(correctionValue) / 176900; + }; + + if (!ss->ttPv && depth < 16 && eval - futility_margin(depth) >= beta && eval >= beta + && (!ttData.move || ttCapture) && !is_loss(beta) && !is_win(eval)) + return (2 * beta + eval) / 3; + } + + // Step 9. Null move search with verification search + if (cutNode && ss->staticEval >= beta - 17 * depth - 50 * improving + 359 && !excludedMove + && pos.non_pawn_material(us) && ss->ply >= nmpMinPly && !is_loss(beta)) + { + assert((ss - 1)->currentMove != Move::null()); + + // Null move dynamic reduction based on depth + Depth R = 7 + depth / 3; + do_null_move(pos, st, ss); + + Value nullValue = -search(pos, ss + 1, -beta, -beta + 1, depth - R, false); + + undo_null_move(pos); + + // Do not return unproven mate or TB scores + if (nullValue >= beta && !is_win(nullValue)) + { + if (nmpMinPly || depth < 16) + return nullValue; + + assert(!nmpMinPly); // Recursive verification is not allowed + + // Do verification search at high depths, with null move pruning disabled + // until ply exceeds nmpMinPly. + nmpMinPly = ss->ply + 3 * (depth - R) / 4; + + Value v = search(pos, ss, beta - 1, beta, depth - R, false); + + nmpMinPly = 0; + + if (v >= beta) + return nullValue; + } + } + + improving |= ss->staticEval >= beta; + + // Step 10. Internal iterative reductions + // At sufficient depth, reduce depth for PV/Cut nodes without a TTMove. + // (*Scaler) Making IIR more aggressive scales poorly. 
+ if (!allNode && depth >= 6 && !ttData.move && priorReduction <= 3) + depth--; + + // Step 11. ProbCut + // If we have a good enough capture (or queen promotion) and a reduced search + // returns a value much above beta, we can (almost) safely prune the previous move. + probCutBeta = beta + 229 - 63 * improving; + if (depth >= 3 + && !is_decisive(beta) + // If value from transposition table is lower than probCutBeta, don't attempt + // probCut there + && !(is_valid(ttData.value) && ttData.value < probCutBeta)) + { + assert(probCutBeta < VALUE_INFINITE && probCutBeta > beta); + + MovePicker mp(pos, ttData.move, probCutBeta - ss->staticEval, &captureHistory); + Depth probCutDepth = depth - 4; + + while ((move = mp.next_move()) != Move::none()) + { + assert(move.is_ok()); + + if (move == excludedMove || !pos.legal(move)) + continue; + + assert(pos.capture_stage(move)); + + do_move(pos, move, st, ss); + + // Perform a preliminary qsearch to verify that the move holds + value = -qsearch(pos, ss + 1, -probCutBeta, -probCutBeta + 1); + + // If the qsearch held, perform the regular search + if (value >= probCutBeta && probCutDepth > 0) + value = -search(pos, ss + 1, -probCutBeta, -probCutBeta + 1, probCutDepth, + !cutNode); + + undo_move(pos, move); + + if (value >= probCutBeta) + { + // Save ProbCut data into transposition table + ttWriter.write(posKey, value_to_tt(value, ss->ply), ss->ttPv, BOUND_LOWER, + probCutDepth + 1, move, unadjustedStaticEval, tt.generation()); + + if (!is_decisive(value)) + return value - (probCutBeta - beta); + } + } + } + +moves_loop: // When in check, search starts here + + // Step 12. 
A small Probcut idea + probCutBeta = beta + 416; + if ((ttData.bound & BOUND_LOWER) && ttData.depth >= depth - 4 && ttData.value >= probCutBeta + && !is_decisive(beta) && is_valid(ttData.value) && !is_decisive(ttData.value)) + return probCutBeta; + + const PieceToHistory* contHist[] = { + (ss - 1)->continuationHistory, (ss - 2)->continuationHistory, (ss - 3)->continuationHistory, + (ss - 4)->continuationHistory, (ss - 5)->continuationHistory, (ss - 6)->continuationHistory}; + + + MovePicker mp(pos, ttData.move, depth, &mainHistory, &lowPlyHistory, &captureHistory, contHist, + &sharedHistory, ss->ply); + + value = bestValue; + + int moveCount = 0; + + // Step 13. Loop through all pseudo-legal moves until no moves remain + // or a beta cutoff occurs. + while ((move = mp.next_move()) != Move::none()) + { + assert(move.is_ok()); + + if (move == excludedMove) + continue; + + // Check for legality + if (!pos.legal(move)) + continue; + + // At root obey the "searchmoves" option and skip moves not listed in Root + // Move List. In MultiPV mode we also skip PV moves that have been already + // searched and those of lower "TB rank" if we are in a TB root position. + if (rootNode && !std::count(rootMoves.begin() + pvIdx, rootMoves.begin() + pvLast, move)) + continue; + + ss->moveCount = ++moveCount; + + if (rootNode && is_mainthread() && nodes > 10000000) + { + main_manager()->updates.onIter( + {depth, UCIEngine::move(move, pos.is_chess960()), moveCount + pvIdx}); + } + if (PvNode) + (ss + 1)->pv = nullptr; + + extension = 0; + capture = pos.capture_stage(move); + movedPiece = pos.moved_piece(move); + givesCheck = pos.gives_check(move); + + // Calculate new depth for this move + newDepth = depth - 1; + + int delta = beta - alpha; + + Depth r = reduction(improving, depth, moveCount, delta); + + // Increase reduction for ttPv nodes (*Scaler) + // Larger values scale well + if (ss->ttPv) + r += 949; + + // Step 14. Pruning at shallow depths. 
+ // Depth conditions are important for mate finding. + if (!rootNode && pos.non_pawn_material(us) && !is_loss(bestValue)) + { + // Skip quiet moves if movecount exceeds our FutilityMoveCount threshold + if (moveCount >= (3 + depth * depth) / (2 - improving)) + mp.skip_quiet_moves(); + + // Reduced depth of the next LMR search + int lmrDepth = newDepth - r / 1024; + + if (capture || givesCheck) + { + Piece capturedPiece = pos.piece_on(move.to_sq()); + int captHist = captureHistory[movedPiece][move.to_sq()][type_of(capturedPiece)]; + + // Futility pruning for captures + if (!givesCheck && lmrDepth < 7) + { + Value futilityValue = ss->staticEval + 235 + 211 * lmrDepth + + PieceValue[capturedPiece] + 126 * captHist / 1024; + + if (futilityValue <= alpha) + continue; + } + + // SEE based pruning for captures and checks + // Avoid pruning sacrifices of our last piece for stalemate + int margin = std::max(185 * depth + captHist / 28, 0); + if ((alpha >= VALUE_DRAW || pos.non_pawn_material(us) != PieceValue[movedPiece]) + && !pos.see_ge(move, -margin)) + continue; + } + else + { + int history = (*contHist[0])[movedPiece][move.to_sq()] + + (*contHist[1])[movedPiece][move.to_sq()] + + sharedHistory.pawn_entry(pos)[movedPiece][move.to_sq()]; + + // Continuation history based pruning + if (history < -3826 * depth) + continue; + + history += 73 * mainHistory[us][move.raw()] / 32; + + // (*Scaler): Generally, lower divisors scales well + lmrDepth += history / 2917; + + Value futilityValue = ss->staticEval + 42 + 157 * !bestMove + 120 * lmrDepth + + 86 * (ss->staticEval > alpha); + + // Futility pruning: parent node + // (*Scaler): Generally, more frequent futility pruning + // scales well + if (!ss->inCheck && lmrDepth < 13 && futilityValue <= alpha) + { + if (bestValue <= futilityValue && !is_decisive(bestValue) + && !is_win(futilityValue)) + bestValue = futilityValue; + continue; + } + + lmrDepth = std::max(lmrDepth, 0); + + // Prune moves with negative SEE + if 
(!pos.see_ge(move, -25 * lmrDepth * lmrDepth)) + continue; + } + } + + // Step 15. Extensions + // Singular extension search. If all moves but one + // fail low on a search of (alpha-s, beta-s), and just one fails high on + // (alpha, beta), then that move is singular and should be extended. To + // verify this we do a reduced search on the position excluding the ttMove + // and if the result is lower than ttValue minus a margin, then we will + // extend the ttMove. Recursive singular search is avoided. + + // (*Scaler) Generally, higher singularBeta (i.e closer to ttValue) + // and lower extension margins scale well. + if (!rootNode && move == ttData.move && !excludedMove && depth >= 6 + ss->ttPv + && is_valid(ttData.value) && !is_decisive(ttData.value) && (ttData.bound & BOUND_LOWER) + && ttData.depth >= depth - 3 && !is_shuffling(move, ss, pos)) + { + Value singularBeta = ttData.value - (58 + 67 * (ss->ttPv && !PvNode)) * depth / 57; + Depth singularDepth = newDepth / 2; + + ss->excludedMove = move; + value = search(pos, ss, singularBeta - 1, singularBeta, singularDepth, cutNode); + ss->excludedMove = Move::none(); + + if (value < singularBeta) + { + int corrValAdj = std::abs(correctionValue) / 220870; + int doubleMargin = -4 + 213 * PvNode - 196 * !ttCapture - corrValAdj + - 943 * ttMoveHistory / 123477 - (ss->ply > rootDepth) * 45; + int tripleMargin = 73 + 324 * PvNode - 229 * !ttCapture + 87 * ss->ttPv - corrValAdj + - (ss->ply > rootDepth) * 50; + + extension = + 1 + (value < singularBeta - doubleMargin) + (value < singularBeta - tripleMargin); + + depth++; + } + + // Multi-cut pruning + // Our ttMove is assumed to fail high based on the bound of the TT entry, + // and if after excluding the ttMove with a reduced search we fail high + // over the original beta, we assume this expected cut-node is not + // singular (multiple moves fail high), and we can prune the whole + // subtree by returning a softbound. 
+ else if (value >= beta && !is_decisive(value)) + { + ttMoveHistory << std::max(-394 - 105 * depth, -3692); + return value; + } + + // Negative extensions + // If other moves failed high over (ttValue - margin) without the + // ttMove on a reduced search, but we cannot do multi-cut because + // (ttValue - margin) is lower than the original beta, we do not know + // if the ttMove is singular or can do a multi-cut, so we reduce the + // ttMove in favor of other moves based on some conditions: + + // If the ttMove is assumed to fail high over current beta + else if (ttData.value >= beta) + extension = -3; + + // If we are on a cutNode but the ttMove is not assumed to fail high + // over current beta + else if (cutNode) + extension = -2; + } + + // Step 16. Make the move + do_move(pos, move, st, givesCheck, ss); + + // Add extension to new depth + newDepth += extension; + uint64_t nodeCount = rootNode ? uint64_t(nodes) : 0; + + // Decrease reduction for PvNodes (*Scaler) + if (ss->ttPv) + r -= 2823 + PvNode * 1013 + (ttData.value > alpha) * 910 + + (ttData.depth >= depth) * (933 + cutNode * 979); + + r += 690; // Base reduction offset to compensate for other tweaks + r -= moveCount * 70; + r -= std::abs(correctionValue) / 26878; + + // Increase reduction for cut nodes + if (cutNode) + r += 3582 + 1015 * !ttData.move; + + // Increase reduction if ttMove is a capture + if (ttCapture) + r += 1075; + + // Increase reduction if next ply has a lot of fail high + if ((ss + 1)->cutoffCnt > 1) + r += 249 + 1073 * ((ss + 1)->cutoffCnt > 2) + 1064 * allNode; + + // For first picked move (ttMove) reduce reduction + if (move == ttData.move) + r -= 2069; + + if (capture) + ss->statScore = 892 * int(PieceValue[pos.captured_piece()]) / 128 + + captureHistory[movedPiece][move.to_sq()][type_of(pos.captured_piece())]; + else + ss->statScore = 2 * mainHistory[us][move.raw()] + + (*contHist[0])[movedPiece][move.to_sq()] + + (*contHist[1])[movedPiece][move.to_sq()]; + + // 
Decrease/increase reduction for moves with a good/bad history + r -= ss->statScore * 454 / 4096; + + // Scale up reductions for expected ALL nodes + if (allNode) + r += r * 276 / (256 * depth + 254); + + // Step 17. Late moves reduction / extension (LMR) + if (depth >= 2 && moveCount > 1) + { + // In general we want to cap the LMR depth search at newDepth, but when + // reduction is negative, we allow this move a limited search extension + // beyond the first move depth. + // To prevent problems when the max value is less than the min value, + // std::clamp has been replaced by a more robust implementation. + Depth d = std::max(1, std::min(newDepth - r / 1024, newDepth + 2)) + PvNode; + + ss->reduction = newDepth - d; + value = -search(pos, ss + 1, -(alpha + 1), -alpha, d, true); + ss->reduction = 0; + + // Do a full-depth search when reduced LMR search fails high + // (*Scaler) Shallower searches here don't scale well + if (value > alpha) + { + // Adjust full-depth search based on LMR results - if the result was + // good enough search deeper, if it was bad enough search shallower. + const bool doDeeperSearch = d < newDepth && value > bestValue + 50; + const bool doShallowerSearch = value < bestValue + 9; + + newDepth += doDeeperSearch - doShallowerSearch; + + if (newDepth > d) + value = -search(pos, ss + 1, -(alpha + 1), -alpha, newDepth, !cutNode); + + // Post LMR continuation history updates + update_continuation_histories(ss, movedPiece, move.to_sq(), 1342); + } + } + + // Step 18. 
Full-depth search when LMR is skipped + else if (!PvNode || moveCount > 1) + { + // Increase reduction if ttMove is not present + if (!ttData.move) + r += 993; + + // Note that if expected reduction is high, we reduce search depth here + value = -search(pos, ss + 1, -(alpha + 1), -alpha, + newDepth - (r > 4302) - (r > 5919 && newDepth > 2), !cutNode); + } + + // For PV nodes only, do a full PV search on the first move or after a fail high, + // otherwise let the parent node fail low with value <= alpha and try another move. + if (PvNode && (moveCount == 1 || value > alpha)) + { + (ss + 1)->pv = pv; + (ss + 1)->pv[0] = Move::none(); + + // Extend move from transposition table if we are about to dive into qsearch. + // decisive score handling improves mate finding and retrograde analysis. + if (move == ttData.move + && ((is_valid(ttData.value) && is_decisive(ttData.value) && ttData.depth > 0) + || ttData.depth > 1)) + newDepth = std::max(newDepth, 1); + + value = -search(pos, ss + 1, -beta, -alpha, newDepth, false); + } + + // Step 19. Undo move + undo_move(pos, move); + + assert(value > -VALUE_INFINITE && value < VALUE_INFINITE); + + // Step 20. Check for a new best move + // Finished searching the move. If a stop occurred, the return value of + // the search cannot be trusted, and we return immediately without updating + // best move, principal variation nor transposition table. + if (threads.stop.load(std::memory_order_relaxed)) + return VALUE_ZERO; + + if (rootNode) + { + RootMove& rm = *std::find(rootMoves.begin(), rootMoves.end(), move); + + rm.effort += nodes - nodeCount; + + rm.averageScore = + rm.averageScore != -VALUE_INFINITE ? (value + rm.averageScore) / 2 : value; + + rm.meanSquaredScore = rm.meanSquaredScore != -VALUE_INFINITE * VALUE_INFINITE + ? (value * std::abs(value) + rm.meanSquaredScore) / 2 + : value * std::abs(value); + + // PV move or new best move? 
+ if (moveCount == 1 || value > alpha) + { + rm.score = rm.uciScore = value; + rm.selDepth = selDepth; + rm.scoreLowerbound = rm.scoreUpperbound = false; + + if (value >= beta) + { + rm.scoreLowerbound = true; + rm.uciScore = beta; + } + else if (value <= alpha) + { + rm.scoreUpperbound = true; + rm.uciScore = alpha; + } + + rm.pv.resize(1); + + assert((ss + 1)->pv); + + for (Move* m = (ss + 1)->pv; *m != Move::none(); ++m) + rm.pv.push_back(*m); + + // We record how often the best move has been changed in each iteration. + // This information is used for time management. In MultiPV mode, + // we must take care to only do this for the first PV line. + if (moveCount > 1 && !pvIdx) + ++bestMoveChanges; + } + else + // All other moves but the PV, are set to the lowest value: this + // is not a problem when sorting because the sort is stable and the + // move position in the list is preserved - just the PV is pushed up. + rm.score = -VALUE_INFINITE; + } + + // In case we have an alternative move equal in eval to the current bestmove, + // promote it to bestmove by pretending it just exceeds alpha (but not beta). + int inc = (value == bestValue && ss->ply + 2 >= rootDepth && (int(nodes) & 14) == 0 + && !is_win(std::abs(value) + 1)); + + if (value + inc > bestValue) + { + bestValue = value; + + if (value + inc > alpha) + { + bestMove = move; + + if (PvNode && !rootNode) // Update pv even in fail-high case + update_pv(ss->pv, move, (ss + 1)->pv); + + if (value >= beta) + { + // (*Scaler) Infrequent and small updates scale well + ss->cutoffCnt += (extension < 2) || PvNode; + assert(value >= beta); // Fail high + break; + } + + // Reduce other moves if we have found at least one score improvement + if (depth > 2 && depth < 14 && !is_decisive(value)) + depth -= 2; + + assert(depth > 0); + alpha = value; // Update alpha! Always alpha < beta + } + } + + // If the move is worse than some previously searched move, + // remember it, to update its stats later. 
+ if (move != bestMove && moveCount <= SEARCHEDLIST_CAPACITY) + { + if (capture) + capturesSearched.push_back(move); + else + quietsSearched.push_back(move); + } + } + + // Step 21. Check for mate and stalemate + // All legal moves have been searched and if there are no legal moves, it + // must be a mate or a stalemate. If we are in a singular extension search then + // return a fail low score. + + assert(moveCount || !ss->inCheck || excludedMove || !MoveList(pos).size()); + + // Adjust best value for fail high cases + if (bestValue >= beta && !is_decisive(bestValue) && !is_decisive(alpha)) + bestValue = (bestValue * depth + beta) / (depth + 1); + + if (!moveCount) + bestValue = excludedMove ? alpha : ss->inCheck ? mated_in(ss->ply) : VALUE_DRAW; + + // If there is a move that produces search value greater than alpha, + // we update the stats of searched moves. + else if (bestMove) + { + update_all_stats(pos, ss, *this, bestMove, prevSq, quietsSearched, capturesSearched, depth, + ttData.move); + if (!PvNode) + ttMoveHistory << (bestMove == ttData.move ? 
804 : -860); + } + + // Bonus for prior quiet countermove that caused the fail low + else if (!priorCapture && prevSq != SQ_NONE) + { + int bonusScale = -227; + bonusScale -= (ss - 1)->statScore / 101; + bonusScale += std::min(58 * depth, 488); + bonusScale += 172 * ((ss - 1)->moveCount > 8); + bonusScale += 150 * (!ss->inCheck && bestValue <= ss->staticEval - 113); + bonusScale += 154 * (!(ss - 1)->inCheck && bestValue <= -(ss - 1)->staticEval - 68); + + bonusScale = std::max(bonusScale, 0); + + // scaledBonus ranges from 0 to roughly 2.3M, overflows happen for multipliers larger than 900 + const int scaledBonus = std::min(137 * depth - 79, 1394) * bonusScale; + + update_continuation_histories(ss - 1, pos.piece_on(prevSq), prevSq, + scaledBonus * 222 / 16384); + + mainHistory[~us][((ss - 1)->currentMove).raw()] << scaledBonus * 221 / 32768; + + if (type_of(pos.piece_on(prevSq)) != PAWN && ((ss - 1)->currentMove).type_of() != PROMOTION) + sharedHistory.pawn_entry(pos)[pos.piece_on(prevSq)][prevSq] << scaledBonus * 286 / 8192; + } + + // Bonus for prior capture countermove that caused the fail low + else if (priorCapture && prevSq != SQ_NONE) + { + Piece capturedPiece = pos.captured_piece(); + assert(capturedPiece != NO_PIECE); + captureHistory[pos.piece_on(prevSq)][prevSq][type_of(capturedPiece)] << 993; + } + + if (PvNode) + bestValue = std::min(bestValue, maxValue); + + // If no good move is found and the previous position was ttPv, then the previous + // opponent move is probably good and the new position is added to the search tree. + if (bestValue <= alpha) + ss->ttPv = ss->ttPv || (ss - 1)->ttPv; + + // Write gathered information in transposition table. Note that the + // static evaluation is saved as it was before correction history. + if (!excludedMove && !(rootNode && pvIdx)) + ttWriter.write(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv, + bestValue >= beta ? BOUND_LOWER + : PvNode && bestMove ? BOUND_EXACT + : BOUND_UPPER, + moveCount != 0 ? 
depth : std::min(MAX_PLY - 1, depth + 6), bestMove, + unadjustedStaticEval, tt.generation()); + + // Adjust correction history if the best move is not a capture + // and the error direction matches whether we are above/below bounds. + if (!ss->inCheck && !(bestMove && pos.capture(bestMove)) + && (bestValue > ss->staticEval) == bool(bestMove)) + { + auto bonus = std::clamp(int(bestValue - ss->staticEval) * depth / (bestMove ? 10 : 8), + -CORRECTION_HISTORY_LIMIT / 4, CORRECTION_HISTORY_LIMIT / 4); + update_correction_history(pos, ss, *this, bonus); + } + + assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE); + + return bestValue; +} + + +// Quiescence search function, which is called by the main search function with +// depth zero, or recursively with further decreasing depth. With depth <= 0, we +// "should" be using static eval only, but tactical moves may confuse the static eval. +// To fight this horizon effect, we implement this qsearch of tactical moves. +// See https://www.chessprogramming.org/Horizon_Effect +// and https://www.chessprogramming.org/Quiescence_Search +template +Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta) { + + static_assert(nodeType != Root); + constexpr bool PvNode = nodeType == PV; + + assert(alpha >= -VALUE_INFINITE && alpha < beta && beta <= VALUE_INFINITE); + assert(PvNode || (alpha == beta - 1)); + + // Check if we have an upcoming move that draws by repetition + if (alpha < VALUE_DRAW && pos.upcoming_repetition(ss->ply)) + { + alpha = value_draw(nodes); + if (alpha >= beta) + return alpha; + } + + Move pv[MAX_PLY + 1]; + StateInfo st; + + Key posKey; + Move move, bestMove; + Value bestValue, value, futilityBase; + bool pvHit, givesCheck, capture; + int moveCount; + + // Step 1. 
Initialize node + if (PvNode) + { + (ss + 1)->pv = pv; + ss->pv[0] = Move::none(); + } + + bestMove = Move::none(); + ss->inCheck = pos.checkers(); + moveCount = 0; + + // Used to send selDepth info to GUI (selDepth counts from 1, ply from 0) + if (PvNode && selDepth < ss->ply + 1) + selDepth = ss->ply + 1; + + // Step 2. Check for an immediate draw or maximum ply reached + if (pos.is_draw(ss->ply) || ss->ply >= MAX_PLY) + return (ss->ply >= MAX_PLY && !ss->inCheck) ? evaluate(pos) : VALUE_DRAW; + + assert(0 <= ss->ply && ss->ply < MAX_PLY); + + // Step 3. Transposition table lookup + posKey = pos.key(); + auto [ttHit, ttData, ttWriter] = tt.probe(posKey); + // Need further processing of the saved data + ss->ttHit = ttHit; + ttData.move = ttHit ? ttData.move : Move::none(); + ttData.value = ttHit ? value_from_tt(ttData.value, ss->ply, pos.rule50_count()) : VALUE_NONE; + pvHit = ttHit && ttData.is_pv; + + // At non-PV nodes we check for an early TT cutoff + if (!PvNode && ttData.depth >= DEPTH_QS + && is_valid(ttData.value) // Can happen when !ttHit or when access race in probe() + && (ttData.bound & (ttData.value >= beta ? BOUND_LOWER : BOUND_UPPER))) + return ttData.value; + + // Step 4. Static evaluation of the position + Value unadjustedStaticEval = VALUE_NONE; + if (ss->inCheck) + bestValue = futilityBase = -VALUE_INFINITE; + else + { + const auto correctionValue = correction_value(*this, pos, ss); + + if (ss->ttHit) + { + // Never assume anything about values stored in TT + unadjustedStaticEval = ttData.eval; + + if (!is_valid(unadjustedStaticEval)) + unadjustedStaticEval = evaluate(pos); + + ss->staticEval = bestValue = + to_corrected_static_eval(unadjustedStaticEval, correctionValue); + + // ttValue can be used as a better position evaluation + if (is_valid(ttData.value) && !is_decisive(ttData.value) + && (ttData.bound & (ttData.value > bestValue ? 
BOUND_LOWER : BOUND_UPPER))) + bestValue = ttData.value; + } + else + { + unadjustedStaticEval = evaluate(pos); + ss->staticEval = bestValue = + to_corrected_static_eval(unadjustedStaticEval, correctionValue); + } + + // Stand pat. Return immediately if static value is at least beta + if (bestValue >= beta) + { + if (!is_decisive(bestValue)) + bestValue = (bestValue + beta) / 2; + + if (!ss->ttHit) + ttWriter.write(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER, + DEPTH_UNSEARCHED, Move::none(), unadjustedStaticEval, + tt.generation()); + return bestValue; + } + + if (bestValue > alpha) + alpha = bestValue; + + futilityBase = ss->staticEval + 351; + } + + const PieceToHistory* contHist[] = {(ss - 1)->continuationHistory}; + + Square prevSq = ((ss - 1)->currentMove).is_ok() ? ((ss - 1)->currentMove).to_sq() : SQ_NONE; + + // Initialize a MovePicker object for the current position, and prepare to search + // the moves. We presently use two stages of move generator in quiescence search: + // captures, or evasions only when in check. + MovePicker mp(pos, ttData.move, DEPTH_QS, &mainHistory, &lowPlyHistory, &captureHistory, + contHist, &sharedHistory, ss->ply); + + // Step 5. Loop through all pseudo-legal moves until no moves remain or a beta + // cutoff occurs. + while ((move = mp.next_move()) != Move::none()) + { + assert(move.is_ok()); + + if (!pos.legal(move)) + continue; + + givesCheck = pos.gives_check(move); + capture = pos.capture_stage(move); + + moveCount++; + + // Step 6. Pruning + if (!is_loss(bestValue)) + { + // Futility pruning and moveCount pruning + if (!givesCheck && move.to_sq() != prevSq && !is_loss(futilityBase) + && move.type_of() != PROMOTION) + { + if (moveCount > 2) + continue; + + Value futilityValue = futilityBase + PieceValue[pos.piece_on(move.to_sq())]; + + // If static eval + value of piece we are going to capture is + // much lower than alpha, we can prune this move. 
+ if (futilityValue <= alpha) + { + bestValue = std::max(bestValue, futilityValue); + continue; + } + + // If static exchange evaluation is low enough + // we can prune this move. + if (!pos.see_ge(move, alpha - futilityBase)) + { + bestValue = std::max(bestValue, std::min(alpha, futilityBase)); + continue; + } + } + + // Skip non-captures + if (!capture) + continue; + + // Do not search moves with bad enough SEE values + if (!pos.see_ge(move, -72)) + continue; + } + + // Step 7. Make and search the move + do_move(pos, move, st, givesCheck, ss); + + value = -qsearch(pos, ss + 1, -beta, -alpha); + undo_move(pos, move); + + assert(value > -VALUE_INFINITE && value < VALUE_INFINITE); + + // Step 8. Check for a new best move + if (value > bestValue) + { + bestValue = value; + + if (value > alpha) + { + bestMove = move; + + if (PvNode) // Update pv even in fail-high case + update_pv(ss->pv, move, (ss + 1)->pv); + + if (value < beta) // Update alpha here! + alpha = value; + else + break; // Fail high + } + } + } + + // Step 9. Check for mate + // All legal moves have been searched. A special case: if we are + // in check and no legal moves were found, it is checkmate. + if (ss->inCheck && bestValue == -VALUE_INFINITE) + { + assert(!MoveList(pos).size()); + return mated_in(ss->ply); // Plies to mate from the root + } + + if (!is_decisive(bestValue) && bestValue > beta) + bestValue = (bestValue + beta) / 2; + + Color us = pos.side_to_move(); + if (!ss->inCheck && !moveCount && !pos.non_pawn_material(us) + && type_of(pos.captured_piece()) >= ROOK) + { + if (!((us == WHITE ? shift(pos.pieces(us, PAWN)) + : shift(pos.pieces(us, PAWN))) + & ~pos.pieces())) // no pawn pushes available + { + pos.state()->checkersBB = Rank1BB; // search for legal king-moves only + if (!MoveList(pos).size()) // stalemate + bestValue = VALUE_DRAW; + pos.state()->checkersBB = 0; + } + } + + // Save gathered info in transposition table. 
The static evaluation
+    // is saved as it was before adjustment by correction history.
+    ttWriter.write(posKey, value_to_tt(bestValue, ss->ply), pvHit,
+                   bestValue >= beta ? BOUND_LOWER : BOUND_UPPER, DEPTH_QS, bestMove,
+                   unadjustedStaticEval, tt.generation());
+
+    assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE);
+
+    return bestValue;
+}
+
+// Late-move-reduction amount, in internal depth units. Combines the
+// precomputed per-depth ('d') and per-move-number ('mn') tables, subtracts a
+// term growing with the window shift 'delta' relative to 'rootDelta', and
+// reduces more at non-improving nodes (i == false adds reductionScale * 217/512).
+Depth Search::Worker::reduction(bool i, Depth d, int mn, int delta) const {
+    int reductionScale = reductions[d] * reductions[mn];
+    return reductionScale - delta * 576 / rootDelta + !i * reductionScale * 217 / 512 + 1182;
+}
+
+// elapsed() returns the time elapsed since the search started. If the
+// 'nodestime' option is enabled, it will return the count of nodes searched
+// instead. This function is called to check whether the search should be
+// stopped based on predefined thresholds like time limits or nodes searched.
+//
+// elapsed_time() returns the actual time elapsed since the start of the search.
+// This function is intended for use only when printing PV outputs, and not used
+// for making decisions within the search algorithm itself.
+TimePoint Search::Worker::elapsed() const {
+    return main_manager()->tm.elapsed([this]() { return threads.nodes_searched(); });
+}
+
+TimePoint Search::Worker::elapsed_time() const { return main_manager()->tm.elapsed_time(); }
+
+// Static evaluation wrapper: forwards to the NNUE evaluation using this
+// worker's NUMA-local network replica, its accumulator stack, refresh table,
+// and the optimism value for the side to move.
+Value Search::Worker::evaluate(const Position& pos) {
+    return Eval::evaluate(networks[numaAccessToken], pos, accumulatorStack, refreshTable,
+                          optimism[pos.side_to_move()]);
+}
+
+namespace {
+// Adjusts a mate or TB score from "plies to mate from the root" to
+// "plies to mate from the current position". Standard scores are unchanged.
+// The function is called before storing a value in the transposition table.
+Value value_to_tt(Value v, int ply) { return is_win(v) ? v + ply : is_loss(v) ?
v - ply : v; }
+
+
+// Inverse of value_to_tt(): it adjusts a mate or TB score from the transposition
+// table (which refers to the plies to mate/be mated from current position) to
+// "plies to mate/be mated (TB win/loss) from the root". However, to avoid
+// potentially false mate or TB scores related to the 50 moves rule and the
+// graph history interaction, we return the highest non-TB score instead.
+Value value_from_tt(Value v, int ply, int r50c) {
+
+    if (!is_valid(v))
+        return VALUE_NONE;
+
+    // handle TB win or better
+    if (is_win(v))
+    {
+        // Downgrade a potentially false mate score: fewer than (100 - r50c)
+        // plies remain before a 50-move draw, so the mate distance cannot be trusted.
+        if (v >= VALUE_MATE_IN_MAX_PLY && VALUE_MATE - v > 100 - r50c)
+            return VALUE_TB_WIN_IN_MAX_PLY - 1;
+
+        // Downgrade a potentially false TB score.
+        if (VALUE_TB - v > 100 - r50c)
+            return VALUE_TB_WIN_IN_MAX_PLY - 1;
+
+        return v - ply;
+    }
+
+    // handle TB loss or worse
+    if (is_loss(v))
+    {
+        // Downgrade a potentially false mate score.
+        if (v <= VALUE_MATED_IN_MAX_PLY && VALUE_MATE + v > 100 - r50c)
+            return VALUE_TB_LOSS_IN_MAX_PLY + 1;
+
+        // Downgrade a potentially false TB score.
+        if (VALUE_TB + v > 100 - r50c)
+            return VALUE_TB_LOSS_IN_MAX_PLY + 1;
+
+        return v + ply;
+    }
+
+    return v;
+}
+
+
+// Adds current move and appends child pv[] (terminated by Move::none())
+void update_pv(Move* pv, Move move, const Move* childPv) {
+
+    for (*pv++ = move; childPv && *childPv != Move::none();)
+        *pv++ = *childPv++;
+    *pv = Move::none();
+}
+
+
+// Updates stats at the end of search() when a bestMove is found.
+// 'bonus' rewards the best move (larger when it confirms the TT move),
+// 'malus' penalizes the searched moves that failed; both are capped and
+// grow with depth.
+void update_all_stats(const Position& pos,
+                      Stack*          ss,
+                      Search::Worker& workerThread,
+                      Move            bestMove,
+                      Square          prevSq,
+                      SearchedList&   quietsSearched,
+                      SearchedList&   capturesSearched,
+                      Depth           depth,
+                      Move            ttMove) {
+
+    CapturePieceToHistory& captureHistory = workerThread.captureHistory;
+    Piece                  movedPiece     = pos.moved_piece(bestMove);
+    PieceType              capturedPiece;
+
+    int bonus =
+      std::min(124 * depth - 84, 1376) + 349 * (bestMove == ttMove) + (ss - 1)->statScore / 32;
+    int malus = std::min(872 * depth - 212, 2104);
+
+    if (!pos.capture_stage(bestMove))
+    {
+        update_quiet_histories(pos, ss, workerThread, bestMove, bonus * 810 / 1024);
+
+        int actualMalus = malus * 1159 / 1024;
+        // Decrease stats for all non-best quiet moves (malus decays geometrically
+        // so earlier-searched moves are penalized more than later ones)
+        for (Move move : quietsSearched)
+        {
+            actualMalus = actualMalus * 963 / 1024;
+            update_quiet_histories(pos, ss, workerThread, move, -actualMalus);
+        }
+    }
+    else
+    {
+        // Increase stats for the best move in case it was a capture move
+        capturedPiece = type_of(pos.piece_on(bestMove.to_sq()));
+        captureHistory[movedPiece][bestMove.to_sq()][capturedPiece] << bonus * 1290 / 1024;
+    }
+
+    // Extra penalty for a quiet early move that was not a TT move in
+    // previous ply when it gets refuted.
+    if (prevSq != SQ_NONE && ((ss - 1)->moveCount == 1 + (ss - 1)->ttHit) && !pos.captured_piece())
+        update_continuation_histories(ss - 1, pos.piece_on(prevSq), prevSq, -malus * 596 / 1024);
+
+    // Decrease stats for all non-best capture moves
+    for (Move move : capturesSearched)
+    {
+        movedPiece    = pos.moved_piece(move);
+        capturedPiece = type_of(pos.piece_on(move.to_sq()));
+        captureHistory[movedPiece][move.to_sq()][capturedPiece] << -malus * 1561 / 1024;
+    }
+}
+
+
+// Updates histories of the move pairs formed by moves
+// at ply -1, -2, -3, -4, -5, and -6 with current move.
+void update_continuation_histories(Stack* ss, Piece pc, Square to, int bonus) {
+    static constexpr std::array conthist_bonuses = {
+      {{1, 1106}, {2, 705}, {3, 316}, {4, 572}, {5, 126}, {6, 427}}};
+
+    // Multipliers for positive history consistency: the more of the already
+    // visited (shallower) continuation entries were positive, the larger the
+    // multiplier applied to the following updates.
+    constexpr int CMHCMultipliers[] = {87, 94, 106, 118, 114, 128, 128};
+    int           positiveCount     = 0;
+
+    for (const auto [i, weight] : conthist_bonuses)
+    {
+        // Only update the first 2 continuation histories if we are in check
+        if (ss->inCheck && i > 2)
+            break;
+
+        if (((ss - i)->currentMove).is_ok())
+        {
+            auto& historyEntry = (*(ss - i)->continuationHistory)[pc][to];
+            if (historyEntry > 0)
+                positiveCount++;
+
+            int multiplier = CMHCMultipliers[positiveCount];
+            historyEntry << (bonus * weight * multiplier / 131072) + 82 * (i < 2);
+        }
+    }
+}
+
+// Updates move sorting heuristics for a quiet move: main history, low-ply
+// history, continuation histories, and the pawn-structure-keyed shared history.
+
+void update_quiet_histories(
+  const Position& pos, Stack* ss, Search::Worker& workerThread, Move move, int bonus) {
+
+    Color us = pos.side_to_move();
+    workerThread.mainHistory[us][move.raw()] << bonus;  // Untuned to prevent duplicate effort
+
+    if (ss->ply < LOW_PLY_HISTORY_SIZE)
+        workerThread.lowPlyHistory[ss->ply][move.raw()] << bonus * 714 / 1024;
+
+    update_continuation_histories(ss, pos.moved_piece(move), move.to_sq(), bonus * 898 / 1024);
+
+    workerThread.sharedHistory.pawn_entry(pos)[pos.moved_piece(move)][move.to_sq()]
+      << bonus * (bonus > 0 ?
967 : 535) / 1024;
+}
+
+}  // namespace
+
+// When playing with strength handicap, choose the best move among a set of
+// RootMoves using a statistical rule dependent on 'level'. Idea by Heinz van Saanen.
+Move Skill::pick_best(const RootMoves& rootMoves, size_t multiPV) {
+    static PRNG rng(now());  // PRNG sequence should be non-deterministic
+
+    // RootMoves are already sorted by score in descending order
+    Value  topScore = rootMoves[0].score;
+    int    delta    = std::min(topScore - rootMoves[multiPV - 1].score, int(PawnValue));
+    int    maxScore = -VALUE_INFINITE;
+    double weakness = 120 - 2 * level;
+
+    // Choose best move. For each move score we add two terms, both dependent on
+    // weakness. One is deterministic and bigger for weaker levels, and one is
+    // random. Then we choose the move with the resulting highest score.
+    for (size_t i = 0; i < multiPV; ++i)
+    {
+        // This is our magic formula
+        int push = int(weakness * int(topScore - rootMoves[i].score)
+                       + delta * (rng.rand() % int(weakness)))
+                 / 128;
+
+        if (rootMoves[i].score + push >= maxScore)
+        {
+            maxScore = rootMoves[i].score + push;
+            best     = rootMoves[i].pv[0];
+        }
+    }
+
+    return best;
+}
+
+// Used to print debug info and, more importantly, to detect
+// when we are out of available time and thus stop the search.
+void SearchManager::check_time(Search::Worker& worker) {
+    if (--callsCnt > 0)
+        return;
+
+    // When using nodes, ensure checking rate is not lower than 0.1% of nodes
+    callsCnt = worker.limits.nodes ?
std::min(512, int(worker.limits.nodes / 1024)) : 512;
+
+    static TimePoint lastInfoTime = now();
+
+    TimePoint elapsed = tm.elapsed([&worker]() { return worker.threads.nodes_searched(); });
+    TimePoint tick    = worker.limits.startTime + elapsed;
+
+    // Throttle dbg_print() to at most once per 1000 TimePoint units
+    if (tick - lastInfoTime >= 1000)
+    {
+        lastInfoTime = tick;
+        dbg_print();
+    }
+
+    // We should not stop pondering until told so by the GUI
+    if (ponder)
+        return;
+
+    if (
+      // Later we rely on the fact that we can at least use the mainthread previous
+      // root-search score and PV in a multithreaded environment to prove mated-in scores.
+      worker.completedDepth >= 1
+      && ((worker.limits.use_time_management() && (elapsed > tm.maximum() || stopOnPonderhit))
+          || (worker.limits.movetime && elapsed >= worker.limits.movetime)
+          || (worker.limits.nodes && worker.threads.nodes_searched() >= worker.limits.nodes)))
+        worker.threads.stop = true;
+}
+
+// Used to correct and extend PVs for moves that have a TB (but not a mate) score.
+// Keeps the search based PV for as long as it is verified to maintain the game
+// outcome, truncates afterwards. Finally, extends to mate the PV, providing a
+// possible continuation (but not a proven mating line).
+void syzygy_extend_pv(const OptionsMap&         options,
+                      const Search::LimitsType& limits,
+                      Position&                 pos,
+                      RootMove&                 rootMove,
+                      Value&                    v) {
+
+    auto t_start      = std::chrono::steady_clock::now();
+    int  moveOverhead = int(options["Move Overhead"]);
+    bool rule50       = bool(options["Syzygy50MoveRule"]);
+
+    // Do not use more than moveOverhead / 2 time, if time management is active
+    auto time_abort = [&t_start, &moveOverhead, &limits]() -> bool {
+        auto t_end = std::chrono::steady_clock::now();
+        return limits.use_time_management()
+            && 2 * std::chrono::duration(t_end - t_start).count()
+                 > moveOverhead;
+    };
+
+    std::list sts;
+
+    // Step 0, do the rootMove, no correction allowed, as needed for MultiPV in TB.
+    auto& stRoot = sts.emplace_back();
+    pos.do_move(rootMove.pv[0], stRoot);
+    int ply = 1;
+
+    // Step 1, walk the PV to the last position in TB with correct decisive score
+    while (size_t(ply) < rootMove.pv.size())
+    {
+        Move& pvMove = rootMove.pv[ply];
+
+        RootMoves legalMoves;
+        for (const auto& m : MoveList(pos))
+            legalMoves.emplace_back(m);
+
+        Tablebases::Config config =
+          Tablebases::rank_root_moves(options, pos, legalMoves, false, time_abort);
+        RootMove& rm = *std::find(legalMoves.begin(), legalMoves.end(), pvMove);
+
+        // Stop if the PV move is not among the top-ranked TB moves
+        if (legalMoves[0].tbRank != rm.tbRank)
+            break;
+
+        ply++;
+
+        auto& st = sts.emplace_back();
+        pos.do_move(pvMove, st);
+
+        // Do not allow for repetitions or drawing moves along the PV in TB regime
+        if (config.rootInTB && ((rule50 && pos.is_draw(ply)) || pos.is_repetition(ply)))
+        {
+            pos.undo_move(pvMove);
+            ply--;
+            break;
+        }
+
+        // Full PV shown will thus be validated and end in TB.
+        // If we cannot validate the full PV in time, we do not show it.
+        if (config.rootInTB && time_abort())
+            break;
+    }
+
+    // Resize the PV to the correct part
+    rootMove.pv.resize(ply);
+
+    // Step 2, now extend the PV to mate, as if the user explored syzygy-tables.info
+    // using top ranked moves (minimal DTZ), which gives optimal mates only for simple
+    // endgames e.g. KRvK.
+    while (!(rule50 && pos.is_draw(0)))
+    {
+        if (time_abort())
+            break;
+
+        RootMoves legalMoves;
+        for (const auto& m : MoveList(pos))
+        {
+            auto&     rm = legalMoves.emplace_back(m);
+            StateInfo tmpSI;
+            pos.do_move(m, tmpSI);
+            // Give a score of each move to break DTZ ties restricting opponent mobility,
+            // but not giving the opponent a capture.
+            for (const auto& mOpp : MoveList(pos))
+                rm.tbRank -= pos.capture(mOpp) ? 100 : 1;
+            pos.undo_move(m);
+        }
+
+        // Mate found
+        if (legalMoves.size() == 0)
+            break;
+
+        // Sort moves according to their above assigned rank.
+        // This will break ties for moves with equal DTZ in rank_root_moves.
+        std::stable_sort(
+          legalMoves.begin(), legalMoves.end(),
+          [](const Search::RootMove& a, const Search::RootMove& b) { return a.tbRank > b.tbRank; });
+
+        // The winning side tries to minimize DTZ, the losing side maximizes it
+        Tablebases::Config config =
+          Tablebases::rank_root_moves(options, pos, legalMoves, true, time_abort);
+
+        // If DTZ is not available we might not find a mate, so we bail out
+        if (!config.rootInTB || config.cardinality > 0)
+            break;
+
+        ply++;
+
+        Move& pvMove = legalMoves[0].pv[0];
+        rootMove.pv.push_back(pvMove);
+        auto& st = sts.emplace_back();
+        pos.do_move(pvMove, st);
+    }
+
+    // Finding a draw in this function is an exceptional case, that cannot happen when rule50 is false or
+    // during engine game play, since we have a winning score, and play correctly
+    // with TB support. However, it can be that a position is draw due to the 50 move
+    // rule if it has been been reached on the board with a non-optimal 50 move counter
+    // (e.g. 8/8/6k1/3B4/3K4/4N3/8/8 w - - 54 106 ) which TB with dtz counter rounding
+    // cannot always correctly rank. See also
+    // https://github.com/official-stockfish/Stockfish/issues/5175#issuecomment-2058893495
+    // We adjust the score to match the found PV. Note that a TB loss score can be
+    // displayed if the engine did not find a drawing move yet, but eventually search
+    // will figure it out (e.g. 1kq5/q2r4/5K2/8/8/8/8/7Q w - - 96 1 )
+    if (pos.is_draw(0))
+        v = VALUE_DRAW;
+
+    // Undo the PV moves
+    for (auto it = rootMove.pv.rbegin(); it != rootMove.pv.rend(); ++it)
+        pos.undo_move(*it);
+
+    // Inform if we couldn't get a full extension in time
+    if (time_abort())
+        sync_cout
+          << "info string Syzygy based PV extension requires more time, increase Move Overhead as needed."
+          << sync_endl;
+}
+
+// Emits one "info ..." line per MultiPV entry via the registered update callback.
+void SearchManager::pv(Search::Worker&           worker,
+                       const ThreadPool&         threads,
+                       const TranspositionTable& tt,
+                       Depth                     depth) {
+
+    const auto nodes     = threads.nodes_searched();
+    auto&      rootMoves = worker.rootMoves;
+    auto&      pos       = worker.rootPos;
+    size_t     pvIdx     = worker.pvIdx;
+    size_t     multiPV   = std::min(size_t(worker.options["MultiPV"]), rootMoves.size());
+    uint64_t   tbHits    = threads.tb_hits() + (worker.tbConfig.rootInTB ? rootMoves.size() : 0);
+
+    for (size_t i = 0; i < multiPV; ++i)
+    {
+        bool updated = rootMoves[i].score != -VALUE_INFINITE;
+
+        if (depth == 1 && !updated && i > 0)
+            continue;
+
+        // Fall back to the previous iteration's depth/score if this line was
+        // not searched yet at the current depth
+        Depth d = updated ? depth : std::max(1, depth - 1);
+        Value v = updated ? rootMoves[i].uciScore : rootMoves[i].previousScore;
+
+        if (v == -VALUE_INFINITE)
+            v = VALUE_ZERO;
+
+        bool tb = worker.tbConfig.rootInTB && std::abs(v) <= VALUE_TB;
+        v       = tb ? rootMoves[i].tbScore : v;
+
+        bool isExact = i != pvIdx || tb || !updated;  // tablebase- and previous-scores are exact
+
+        // Potentially correct and extend the PV, and in exceptional cases v
+        if (is_decisive(v) && std::abs(v) < VALUE_MATE_IN_MAX_PLY
+            && ((!rootMoves[i].scoreLowerbound && !rootMoves[i].scoreUpperbound) || isExact))
+            syzygy_extend_pv(worker.options, worker.limits, pos, rootMoves[i], v);
+
+        std::string pv;
+        for (Move m : rootMoves[i].pv)
+            pv += UCIEngine::move(m, pos.is_chess960()) + " ";
+
+        // Remove last whitespace
+        if (!pv.empty())
+            pv.pop_back();
+
+        auto wdl   = worker.options["UCI_ShowWDL"] ? UCIEngine::wdl(v, pos) : "";
+        auto bound = rootMoves[i].scoreLowerbound
+                     ? "lowerbound"
+                     : (rootMoves[i].scoreUpperbound ?
"upperbound" : ""); + + InfoFull info; + + info.depth = d; + info.selDepth = rootMoves[i].selDepth; + info.multiPV = i + 1; + info.score = {v, pos}; + info.wdl = wdl; + + if (!isExact) + info.bound = bound; + + TimePoint time = std::max(TimePoint(1), tm.elapsed_time()); + info.timeMs = time; + info.nodes = nodes; + info.nps = nodes * 1000 / time; + info.tbHits = tbHits; + info.pv = pv; + info.hashfull = tt.hashfull(); + + updates.onUpdateFull(info); + } +} + +// Called in case we have no ponder move before exiting the search, +// for instance, in case we stop the search during a fail high at root. +// We try hard to have a ponder move to return to the GUI, +// otherwise in case of 'ponder on' we have nothing to think about. +bool RootMove::extract_ponder_from_tt(const TranspositionTable& tt, Position& pos) { + + StateInfo st; + + assert(pv.size() == 1); + if (pv[0] == Move::none()) + return false; + + pos.do_move(pv[0], st, &tt); + + auto [ttHit, ttData, ttWriter] = tt.probe(pos.key()); + if (ttHit) + { + if (MoveList(pos).contains(ttData.move)) + pv.push_back(ttData.move); + } + + pos.undo_move(pv[0]); + return pv.size() > 1; +} + + +} // namespace Stockfish diff --git a/src/search.h b/src/search.h new file mode 100644 index 0000000000000000000000000000000000000000..202f7c8db2e6002353c4c691de9f6d8db4ee5957 --- /dev/null +++ b/src/search.h @@ -0,0 +1,379 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef SEARCH_H_INCLUDED +#define SEARCH_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "history.h" +#include "misc.h" +#include "nnue/network.h" +#include "nnue/nnue_accumulator.h" +#include "numa.h" +#include "position.h" +#include "score.h" +#include "syzygy/tbprobe.h" +#include "timeman.h" +#include "types.h" + +namespace Stockfish { + +// Different node types, used as a template parameter +enum NodeType { + NonPV, + PV, + Root +}; + +class TranspositionTable; +class ThreadPool; +class OptionsMap; + +namespace Search { + +// Stack struct keeps track of the information we need to remember from nodes +// shallower and deeper in the tree during the search. Each search thread has +// its own array of Stack objects, indexed by the current ply. +struct Stack { + Move* pv; + PieceToHistory* continuationHistory; + CorrectionHistory* continuationCorrectionHistory; + int ply; + Move currentMove; + Move excludedMove; + Value staticEval; + int statScore; + int moveCount; + bool inCheck; + bool ttPv; + bool ttHit; + int cutoffCnt; + int reduction; +}; + + +// RootMove struct is used for moves at the root of the tree. For each root move +// we store a score and a PV (really a refutation in the case of moves which +// fail low). Score is normally set at -VALUE_INFINITE for all non-pv moves. +struct RootMove { + + explicit RootMove(Move m) : + pv(1, m) {} + bool extract_ponder_from_tt(const TranspositionTable& tt, Position& pos); + bool operator==(const Move& m) const { return pv[0] == m; } + // Sort in descending order + bool operator<(const RootMove& m) const { + return m.score != score ? 
m.score < score : m.previousScore < previousScore; + } + + uint64_t effort = 0; + Value score = -VALUE_INFINITE; + Value previousScore = -VALUE_INFINITE; + Value averageScore = -VALUE_INFINITE; + Value meanSquaredScore = -VALUE_INFINITE * VALUE_INFINITE; + Value uciScore = -VALUE_INFINITE; + bool scoreLowerbound = false; + bool scoreUpperbound = false; + int selDepth = 0; + int tbRank = 0; + Value tbScore; + std::vector pv; +}; + +using RootMoves = std::vector; + + +// LimitsType struct stores information sent by the caller about the analysis required. +struct LimitsType { + + // Init explicitly due to broken value-initialization of non POD in MSVC + LimitsType() { + time[WHITE] = time[BLACK] = inc[WHITE] = inc[BLACK] = npmsec = movetime = TimePoint(0); + movestogo = depth = mate = perft = infinite = 0; + nodes = 0; + ponderMode = false; + } + + bool use_time_management() const { return time[WHITE] || time[BLACK]; } + + std::vector searchmoves; + TimePoint time[COLOR_NB], inc[COLOR_NB], npmsec, movetime, startTime; + int movestogo, depth, mate, perft, infinite; + uint64_t nodes; + bool ponderMode; +}; + + +// The UCI stores the uci options, thread pool, and transposition table. +// This struct is used to easily forward data to the Search::Worker class. +struct SharedState { + SharedState(const OptionsMap& optionsMap, + ThreadPool& threadPool, + TranspositionTable& transpositionTable, + std::map& sharedHists, + const LazyNumaReplicatedSystemWide& nets) : + options(optionsMap), + threads(threadPool), + tt(transpositionTable), + sharedHistories(sharedHists), + networks(nets) {} + + const OptionsMap& options; + ThreadPool& threads; + TranspositionTable& tt; + std::map& sharedHistories; + const LazyNumaReplicatedSystemWide& networks; +}; + +class Worker; + +// Null Object Pattern, implement a common interface for the SearchManagers. +// A Null Object will be given to non-mainthread workers. 
+class ISearchManager { + public: + virtual ~ISearchManager() {} + virtual void check_time(Search::Worker&) = 0; +}; + +struct InfoShort { + int depth; + Score score; +}; + +struct InfoFull: InfoShort { + int selDepth; + size_t multiPV; + std::string_view wdl; + std::string_view bound; + size_t timeMs; + size_t nodes; + size_t nps; + size_t tbHits; + std::string_view pv; + int hashfull; +}; + +struct InfoIteration { + int depth; + std::string_view currmove; + size_t currmovenumber; +}; + +// Skill structure is used to implement strength limit. If we have a UCI_Elo, +// we convert it to an appropriate skill level, anchored to the Stash engine. +// This method is based on a fit of the Elo results for games played between +// Stockfish at various skill levels and various versions of the Stash engine. +// Skill 0 .. 19 now covers CCRL Blitz Elo from 1320 to 3190, approximately +// Reference: https://github.com/vondele/Stockfish/commit/a08b8d4e9711c2 +struct Skill { + // Lowest and highest Elo ratings used in the skill level calculation + constexpr static int LowestElo = 1320; + constexpr static int HighestElo = 3190; + + Skill(int skill_level, int uci_elo) { + if (uci_elo) + { + double e = double(uci_elo - LowestElo) / (HighestElo - LowestElo); + level = std::clamp((((37.2473 * e - 40.8525) * e + 22.2943) * e - 0.311438), 0.0, 19.0); + } + else + level = double(skill_level); + } + bool enabled() const { return level < 20.0; } + bool time_to_pick(Depth depth) const { return depth == 1 + int(level); } + Move pick_best(const RootMoves&, size_t multiPV); + + double level; + Move best = Move::none(); +}; + +// SearchManager manages the search from the main thread. It is responsible for +// keeping track of the time, and storing data strictly related to the main thread. 
+class SearchManager: public ISearchManager { + public: + using UpdateShort = std::function; + using UpdateFull = std::function; + using UpdateIter = std::function; + using UpdateBestmove = std::function; + + struct UpdateContext { + UpdateShort onUpdateNoMoves; + UpdateFull onUpdateFull; + UpdateIter onIter; + UpdateBestmove onBestmove; + }; + + + SearchManager(const UpdateContext& updateContext) : + updates(updateContext) {} + + void check_time(Search::Worker& worker) override; + + void pv(Search::Worker& worker, + const ThreadPool& threads, + const TranspositionTable& tt, + Depth depth); + + Stockfish::TimeManagement tm; + double originalTimeAdjust; + int callsCnt; + std::atomic_bool ponder; + + std::array iterValue; + double previousTimeReduction; + Value bestPreviousScore; + Value bestPreviousAverageScore; + bool stopOnPonderhit; + + size_t id; + + const UpdateContext& updates; +}; + +class NullSearchManager: public ISearchManager { + public: + void check_time(Search::Worker&) override {} +}; + +// Search::Worker is the class that does the actual search. +// It is instantiated once per thread, and it is responsible for keeping track +// of the search history, and storing data required for the search. +class Worker { + public: + Worker(SharedState&, + std::unique_ptr, + size_t, + size_t, + size_t, + NumaReplicatedAccessToken); + + // Called at instantiation to initialize reductions tables. + // Reset histories, usually before a new game. + void clear(); + + // Called when the program receives the UCI 'go' command. + // It searches from the root position and outputs the "bestmove". 
+ void start_searching(); + + bool is_mainthread() const { return threadIdx == 0; } + + void ensure_network_replicated(); + + // Public because they need to be updatable by the stats + ButterflyHistory mainHistory; + LowPlyHistory lowPlyHistory; + + CapturePieceToHistory captureHistory; + ContinuationHistory continuationHistory[2][2]; + CorrectionHistory continuationCorrectionHistory; + + TTMoveHistory ttMoveHistory; + SharedHistories& sharedHistory; + + private: + void iterative_deepening(); + + void do_move(Position& pos, const Move move, StateInfo& st, Stack* const ss); + void + do_move(Position& pos, const Move move, StateInfo& st, const bool givesCheck, Stack* const ss); + void do_null_move(Position& pos, StateInfo& st, Stack* const ss); + void undo_move(Position& pos, const Move move); + void undo_null_move(Position& pos); + + // This is the main search function, for both PV and non-PV nodes + template + Value search(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode); + + // Quiescence search function, which is called by the main search + template + Value qsearch(Position& pos, Stack* ss, Value alpha, Value beta); + + Depth reduction(bool i, Depth d, int mn, int delta) const; + + // Pointer to the search manager, only allowed to be called by the main thread + SearchManager* main_manager() const { + assert(threadIdx == 0); + return static_cast(manager.get()); + } + + TimePoint elapsed() const; + TimePoint elapsed_time() const; + + Value evaluate(const Position&); + + LimitsType limits; + + size_t pvIdx, pvLast; + std::atomic nodes, tbHits, bestMoveChanges; + int selDepth, nmpMinPly; + + Value optimism[COLOR_NB]; + + Position rootPos; + StateInfo rootState; + RootMoves rootMoves; + Depth rootDepth, completedDepth; + Value rootDelta; + + size_t threadIdx, numaThreadIdx, numaTotal; + NumaReplicatedAccessToken numaAccessToken; + + // Reductions lookup table initialized at startup + std::array reductions; // [depth or moveNumber] + + // 
The main thread has a SearchManager, the others have a NullSearchManager + std::unique_ptr manager; + + Tablebases::Config tbConfig; + + const OptionsMap& options; + ThreadPool& threads; + TranspositionTable& tt; + const LazyNumaReplicatedSystemWide& networks; + + // Used by NNUE + Eval::NNUE::AccumulatorStack accumulatorStack; + Eval::NNUE::AccumulatorCaches refreshTable; + + friend class Stockfish::ThreadPool; + friend class SearchManager; +}; + +struct ConthistBonus { + int index; + int weight; +}; + + +} // namespace Search + +} // namespace Stockfish + +#endif // #ifndef SEARCH_H_INCLUDED diff --git a/src/shm.h b/src/shm.h new file mode 100644 index 0000000000000000000000000000000000000000..d581bf08ac7f91b9975bfd4c9cd1f80c8aa0dd9b --- /dev/null +++ b/src/shm.h @@ -0,0 +1,634 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef SHM_H_INCLUDED +#define SHM_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) && !defined(__ANDROID__) + #include "shm_linux.h" +#endif + +#if defined(__ANDROID__) + #include + #define SF_MAX_SEM_NAME_LEN NAME_MAX +#endif + +#include "types.h" + +#include "memory.h" + +#if defined(_WIN32) + + #if _WIN32_WINNT < 0x0601 + #undef _WIN32_WINNT + #define _WIN32_WINNT 0x0601 // Force to include needed API prototypes + #endif + + #if !defined(NOMINMAX) + #define NOMINMAX + #endif + #include +#elif defined(__linux__) + #include + #include + #include + #include + #include + #include + #include +#endif + + +#if defined(__APPLE__) + #include + #include + +#elif defined(__sun) + #include + +#elif defined(__FreeBSD__) + #include + #include + #include + +#elif defined(__NetBSD__) || defined(__DragonFly__) || defined(__linux__) + #include + #include +#endif + + +namespace Stockfish { + +// argv[0] CANNOT be used because we need to identify the executable. +// argv[0] contains the command used to invoke it, which does not involve the full path. +// Just using a path is not fully resilient either, as the executable could +// have changed if it wasn't locked by the OS. Ideally we would hash the executable +// but it's not really that important at this point. +// If the path is longer than 4095 bytes the hash will be computed from an unspecified +// amount of bytes of the path; in particular it can a hash of an empty string. 
+inline std::string getExecutablePathHash() {
+    // NOTE(review): despite the name, this returns the raw executable path
+    // (empty on failure), which callers presumably use as the identity/hash
+    // input — confirm the naming upstream.
+    char        executable_path[4096] = {0};
+    std::size_t path_length           = 0;
+
+#if defined(_WIN32)
+    path_length = GetModuleFileNameA(NULL, executable_path, sizeof(executable_path));
+
+#elif defined(__APPLE__)
+    uint32_t size = sizeof(executable_path);
+    if (_NSGetExecutablePath(executable_path, &size) == 0)
+    {
+        path_length = std::strlen(executable_path);
+    }
+
+#elif defined(__sun)  // Solaris
+    const char* path = getexecname();
+    if (path)
+    {
+        std::strncpy(executable_path, path, sizeof(executable_path) - 1);
+        path_length = std::strlen(executable_path);
+    }
+
+#elif defined(__FreeBSD__)
+    size_t size   = sizeof(executable_path);
+    int    mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
+    if (sysctl(mib, 4, executable_path, &size, NULL, 0) == 0)
+    {
+        path_length = std::strlen(executable_path);
+    }
+
+#elif defined(__NetBSD__) || defined(__DragonFly__)
+    ssize_t len = readlink("/proc/curproc/exe", executable_path, sizeof(executable_path) - 1);
+    if (len >= 0)
+    {
+        executable_path[len] = '\0';
+        path_length          = len;
+    }
+
+#elif defined(__linux__)
+    ssize_t len = readlink("/proc/self/exe", executable_path, sizeof(executable_path) - 1);
+    if (len >= 0)
+    {
+        executable_path[len] = '\0';
+        path_length          = len;
+    }
+
+#endif
+
+    // In case of any error the path will be empty.
+    return std::string(executable_path, path_length);
+}
+
+// Tracks how a system-wide shared constant ended up being allocated.
+enum class SystemWideSharedConstantAllocationStatus {
+    NoAllocation,
+    LocalMemory,
+    SharedMemory
+};
+
+#if defined(_WIN32)
+
+// Formats the given Win32 error code as a human-readable string
+// (empty string when error == 0, i.e. no error recorded).
+inline std::string GetLastErrorAsString(DWORD error) {
+    //Get the error message ID, if any.
+    DWORD errorMessageID = error;
+    if (errorMessageID == 0)
+    {
+        return std::string();  //No error message has been recorded
+    }
+
+    LPSTR messageBuffer = nullptr;
+
+    //Ask Win32 to give us the string version of that message ID.
+ //The parameters we pass in, tell Win32 to create the buffer that holds the message for us (because we don't yet know how long the message string will be). + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM + | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, errorMessageID, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPSTR) &messageBuffer, 0, NULL); + + //Copy the error message into a std::string. + std::string message(messageBuffer, size); + + //Free the Win32's string's buffer. + LocalFree(messageBuffer); + + return message; +} + +// Utilizes shared memory to store the value. It is deduplicated system-wide (for the single user). +template +class SharedMemoryBackend { + public: + enum class Status { + Success, + LargePageAllocationError, + FileMappingError, + MapViewError, + MutexCreateError, + MutexWaitError, + MutexReleaseError, + NotInitialized + }; + + static constexpr DWORD IS_INITIALIZED_VALUE = 1; + + SharedMemoryBackend() : + status(Status::NotInitialized) {}; + + SharedMemoryBackend(const std::string& shm_name, const T& value) : + status(Status::NotInitialized) { + + initialize(shm_name, value); + } + + bool is_valid() const { return status == Status::Success; } + + std::optional get_error_message() const { + switch (status) + { + case Status::Success : + return std::nullopt; + case Status::LargePageAllocationError : + return "Failed to allocate large page memory"; + case Status::FileMappingError : + return "Failed to create file mapping: " + last_error_message; + case Status::MapViewError : + return "Failed to map view: " + last_error_message; + case Status::MutexCreateError : + return "Failed to create mutex: " + last_error_message; + case Status::MutexWaitError : + return "Failed to wait on mutex: " + last_error_message; + case Status::MutexReleaseError : + return "Failed to release mutex: " + last_error_message; + case Status::NotInitialized : + return "Not initialized"; + default : + return "Unknown error"; + } + } 
+ + void* get() const { return is_valid() ? pMap : nullptr; } + + ~SharedMemoryBackend() { cleanup(); } + + SharedMemoryBackend(const SharedMemoryBackend&) = delete; + SharedMemoryBackend& operator=(const SharedMemoryBackend&) = delete; + + SharedMemoryBackend(SharedMemoryBackend&& other) noexcept : + pMap(other.pMap), + hMapFile(other.hMapFile), + status(other.status), + last_error_message(std::move(other.last_error_message)) { + + other.pMap = nullptr; + other.hMapFile = 0; + other.status = Status::NotInitialized; + } + + SharedMemoryBackend& operator=(SharedMemoryBackend&& other) noexcept { + if (this != &other) + { + cleanup(); + pMap = other.pMap; + hMapFile = other.hMapFile; + status = other.status; + last_error_message = std::move(other.last_error_message); + + other.pMap = nullptr; + other.hMapFile = 0; + other.status = Status::NotInitialized; + } + return *this; + } + + SystemWideSharedConstantAllocationStatus get_status() const { + return status == Status::Success ? SystemWideSharedConstantAllocationStatus::SharedMemory + : SystemWideSharedConstantAllocationStatus::NoAllocation; + } + + private: + void initialize(const std::string& shm_name, const T& value) { + const size_t total_size = sizeof(T) + sizeof(IS_INITIALIZED_VALUE); + + // Try allocating with large pages first. + hMapFile = windows_try_with_large_page_priviliges( + [&](size_t largePageSize) { + const size_t total_size_aligned = + (total_size + largePageSize - 1) / largePageSize * largePageSize; + + #if defined(_WIN64) + DWORD total_size_low = total_size_aligned & 0xFFFFFFFFu; + DWORD total_size_high = total_size_aligned >> 32u; + #else + DWORD total_size_low = total_size_aligned; + DWORD total_size_high = 0; + #endif + + return CreateFileMappingA(INVALID_HANDLE_VALUE, NULL, + PAGE_READWRITE | SEC_COMMIT | SEC_LARGE_PAGES, + total_size_high, total_size_low, shm_name.c_str()); + }, + []() { return (void*) nullptr; }); + + // Fallback to normal allocation if no large pages available. 
+ if (!hMapFile) + { + hMapFile = CreateFileMappingA(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, + static_cast(total_size), shm_name.c_str()); + } + + if (!hMapFile) + { + const DWORD err = GetLastError(); + last_error_message = GetLastErrorAsString(err); + status = Status::FileMappingError; + return; + } + + pMap = MapViewOfFile(hMapFile, FILE_MAP_ALL_ACCESS, 0, 0, total_size); + if (!pMap) + { + const DWORD err = GetLastError(); + last_error_message = GetLastErrorAsString(err); + status = Status::MapViewError; + cleanup_partial(); + return; + } + + // Use named mutex to ensure only one initializer + std::string mutex_name = shm_name + "$mutex"; + HANDLE hMutex = CreateMutexA(NULL, FALSE, mutex_name.c_str()); + if (!hMutex) + { + const DWORD err = GetLastError(); + last_error_message = GetLastErrorAsString(err); + status = Status::MutexCreateError; + cleanup_partial(); + return; + } + + DWORD wait_result = WaitForSingleObject(hMutex, INFINITE); + if (wait_result != WAIT_OBJECT_0) + { + const DWORD err = GetLastError(); + last_error_message = GetLastErrorAsString(err); + status = Status::MutexWaitError; + CloseHandle(hMutex); + cleanup_partial(); + return; + } + + // Crucially, we place the object first to ensure alignment. 
+ volatile DWORD* is_initialized = + std::launder(reinterpret_cast(reinterpret_cast(pMap) + sizeof(T))); + T* object = std::launder(reinterpret_cast(pMap)); + + if (*is_initialized != IS_INITIALIZED_VALUE) + { + // First time initialization, message for debug purposes + new (object) T{value}; + *is_initialized = IS_INITIALIZED_VALUE; + } + + BOOL release_result = ReleaseMutex(hMutex); + CloseHandle(hMutex); + + if (!release_result) + { + const DWORD err = GetLastError(); + last_error_message = GetLastErrorAsString(err); + status = Status::MutexReleaseError; + cleanup_partial(); + return; + } + + status = Status::Success; + } + + void cleanup_partial() { + if (pMap != nullptr) + { + UnmapViewOfFile(pMap); + pMap = nullptr; + } + if (hMapFile) + { + CloseHandle(hMapFile); + hMapFile = 0; + } + } + + void cleanup() { + if (pMap != nullptr) + { + UnmapViewOfFile(pMap); + pMap = nullptr; + } + if (hMapFile) + { + CloseHandle(hMapFile); + hMapFile = 0; + } + } + + void* pMap = nullptr; + HANDLE hMapFile = 0; + Status status = Status::NotInitialized; + std::string last_error_message; +}; + +#elif defined(__linux__) && !defined(__ANDROID__) + +template +class SharedMemoryBackend { + public: + SharedMemoryBackend() = default; + + SharedMemoryBackend(const std::string& shm_name, const T& value) : + shm1(shm::create_shared(shm_name, value)) {} + + void* get() const { + const T* ptr = &shm1->get(); + return reinterpret_cast(const_cast(ptr)); + } + + bool is_valid() const { return shm1 && shm1->is_open() && shm1->is_initialized(); } + + SystemWideSharedConstantAllocationStatus get_status() const { + return is_valid() ? 
SystemWideSharedConstantAllocationStatus::SharedMemory + : SystemWideSharedConstantAllocationStatus::NoAllocation; + } + + std::optional get_error_message() const { + if (!shm1) + return "Shared memory not initialized"; + + if (!shm1->is_open()) + return "Shared memory is not open"; + + if (!shm1->is_initialized()) + return "Not initialized"; + + return std::nullopt; + } + + private: + std::optional> shm1; +}; + +#else + +// For systems that don't have shared memory, or support is troublesome. +// The way fallback is done is that we need a dummy backend. + +template +class SharedMemoryBackend { + public: + SharedMemoryBackend() = default; + + SharedMemoryBackend([[maybe_unused]] const std::string& shm_name, + [[maybe_unused]] const T& value) {} + + void* get() const { return nullptr; } + + bool is_valid() const { return false; } + + SystemWideSharedConstantAllocationStatus get_status() const { + return SystemWideSharedConstantAllocationStatus::NoAllocation; + } + + std::optional get_error_message() const { return "Dummy SharedMemoryBackend"; } +}; + +#endif + +template +struct SharedMemoryBackendFallback { + SharedMemoryBackendFallback() = default; + + SharedMemoryBackendFallback(const std::string&, const T& value) : + fallback_object(make_unique_large_page(value)) {} + + void* get() const { return fallback_object.get(); } + + SharedMemoryBackendFallback(const SharedMemoryBackendFallback&) = delete; + SharedMemoryBackendFallback& operator=(const SharedMemoryBackendFallback&) = delete; + + SharedMemoryBackendFallback(SharedMemoryBackendFallback&& other) noexcept : + fallback_object(std::move(other.fallback_object)) {} + + SharedMemoryBackendFallback& operator=(SharedMemoryBackendFallback&& other) noexcept { + fallback_object = std::move(other.fallback_object); + return *this; + } + + SystemWideSharedConstantAllocationStatus get_status() const { + return fallback_object == nullptr ? 
SystemWideSharedConstantAllocationStatus::NoAllocation + : SystemWideSharedConstantAllocationStatus::LocalMemory; + } + + std::optional get_error_message() const { + if (fallback_object == nullptr) + return "Not initialized"; + + return "Shared memory not supported by the OS. Local allocation fallback."; + } + + private: + LargePagePtr fallback_object; +}; + +// Platform-independent wrapper +template +struct SystemWideSharedConstant { + private: + static std::string createHashString(const std::string& input) { + char buf[1024]; + std::snprintf(buf, sizeof(buf), "%016" PRIx64, hash_string(input)); + return buf; + } + + public: + // We can't run the destructor because it may be in a completely different process. + // The object stored must also be obviously in-line but we can't check for that, other than some basic checks that cover most cases. + static_assert(std::is_trivially_destructible_v); + static_assert(std::is_trivially_move_constructible_v); + static_assert(std::is_trivially_copy_constructible_v); + + SystemWideSharedConstant() = default; + + + // Content is addressed by its hash. An additional discriminator can be added to account for differences + // that are not present in the content, for example NUMA node allocation. 
+ SystemWideSharedConstant(const T& value, std::size_t discriminator = 0) { + std::size_t content_hash = std::hash{}(value); + std::size_t executable_hash = hash_string(getExecutablePathHash()); + + char buf[1024]; + std::snprintf(buf, sizeof(buf), "Local\\sf_%zu$%zu$%zu", content_hash, executable_hash, + discriminator); + std::string shm_name = buf; + +#if defined(__linux__) && !defined(__ANDROID__) + // POSIX shared memory names must start with a slash + shm_name = "/sf_" + createHashString(shm_name); + + // hash name and make sure it is not longer than SF_MAX_SEM_NAME_LEN + if (shm_name.size() > SF_MAX_SEM_NAME_LEN) + { + shm_name = shm_name.substr(0, SF_MAX_SEM_NAME_LEN - 1); + } +#endif + + SharedMemoryBackend shm_backend(shm_name, value); + + if (shm_backend.is_valid()) + { + backend = std::move(shm_backend); + } + else + { + backend = SharedMemoryBackendFallback(shm_name, value); + } + } + + SystemWideSharedConstant(const SystemWideSharedConstant&) = delete; + SystemWideSharedConstant& operator=(const SystemWideSharedConstant&) = delete; + + SystemWideSharedConstant(SystemWideSharedConstant&& other) noexcept : + backend(std::move(other.backend)) {} + + SystemWideSharedConstant& operator=(SystemWideSharedConstant&& other) noexcept { + backend = std::move(other.backend); + return *this; + } + + const T& operator*() const { return *std::launder(reinterpret_cast(get_ptr())); } + + bool operator==(std::nullptr_t) const noexcept { return get_ptr() == nullptr; } + + bool operator!=(std::nullptr_t) const noexcept { return get_ptr() != nullptr; } + + SystemWideSharedConstantAllocationStatus get_status() const { + return std::visit( + [](const auto& end) -> SystemWideSharedConstantAllocationStatus { + if constexpr (std::is_same_v, std::monostate>) + { + return SystemWideSharedConstantAllocationStatus::NoAllocation; + } + else + { + return end.get_status(); + } + }, + backend); + } + + std::optional get_error_message() const { + return std::visit( + [](const auto& end) 
-> std::optional { + if constexpr (std::is_same_v, std::monostate>) + { + return std::nullopt; + } + else + { + return end.get_error_message(); + } + }, + backend); + } + + private: + auto get_ptr() const { + return std::visit( + [](const auto& end) -> void* { + if constexpr (std::is_same_v, std::monostate>) + { + return nullptr; + } + else + { + return end.get(); + } + }, + backend); + } + + std::variant, SharedMemoryBackendFallback> backend; +}; + + +} // namespace Stockfish + +#endif // #ifndef SHM_H_INCLUDED diff --git a/src/shm_linux.h b/src/shm_linux.h new file mode 100644 index 0000000000000000000000000000000000000000..29c9e90f5aa81c2b37b763da80051232eb92ffa4 --- /dev/null +++ b/src/shm_linux.h @@ -0,0 +1,672 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef SHM_LINUX_H_INCLUDED +#define SHM_LINUX_H_INCLUDED + +#if !defined(__linux__) || defined(__ANDROID__) + #error shm_linux.h should not be included on this platform. 
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#define SF_MAX_SEM_NAME_LEN NAME_MAX + +#include "misc.h" + +namespace Stockfish::shm { + +namespace detail { + +struct ShmHeader { + static constexpr uint32_t SHM_MAGIC = 0xAD5F1A12; + pthread_mutex_t mutex; + std::atomic ref_count{0}; + std::atomic initialized{false}; + uint32_t magic = SHM_MAGIC; +}; + +class SharedMemoryBase { + public: + virtual ~SharedMemoryBase() = default; + virtual void close(bool skip_unmap = false) noexcept = 0; + virtual const std::string& name() const noexcept = 0; +}; + +class SharedMemoryRegistry { + private: + static std::mutex registry_mutex_; + static std::vector active_instances_; + + public: + static void register_instance(SharedMemoryBase* instance) { + std::scoped_lock lock(registry_mutex_); + active_instances_.push_back(instance); + } + + static void unregister_instance(SharedMemoryBase* instance) { + std::scoped_lock lock(registry_mutex_); + active_instances_.erase( + std::remove(active_instances_.begin(), active_instances_.end(), instance), + active_instances_.end()); + } + + static void cleanup_all(bool skip_unmap = false) noexcept { + std::scoped_lock lock(registry_mutex_); + for (auto* instance : active_instances_) + instance->close(skip_unmap); + active_instances_.clear(); + } +}; + +inline std::mutex SharedMemoryRegistry::registry_mutex_; +inline std::vector SharedMemoryRegistry::active_instances_; + +class CleanupHooks { + private: + static std::once_flag register_once_; + + static void handle_signal(int sig) noexcept { + // Search threads may still be running, so skip munmap (but still perform + // other cleanup actions). The memory mappings will be released on exit. 
+ SharedMemoryRegistry::cleanup_all(true); + + // Invoke the default handler, which will exit + struct sigaction sa; + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + if (sigaction(sig, &sa, nullptr) == -1) + _Exit(128 + sig); + + raise(sig); + } + + static void register_signal_handlers() noexcept { + std::atexit([]() { SharedMemoryRegistry::cleanup_all(true); }); + + constexpr int signals[] = {SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGABRT, SIGFPE, + SIGSEGV, SIGTERM, SIGBUS, SIGSYS, SIGXCPU, SIGXFSZ}; + + struct sigaction sa; + sa.sa_handler = handle_signal; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + + for (int sig : signals) + sigaction(sig, &sa, nullptr); + } + + public: + static void ensure_registered() noexcept { + std::call_once(register_once_, register_signal_handlers); + } +}; + +inline std::once_flag CleanupHooks::register_once_; + + +inline int portable_fallocate(int fd, off_t offset, off_t length) { +#ifdef __APPLE__ + fstore_t store = {F_ALLOCATECONTIG, F_PEOFPOSMODE, offset, length, 0}; + int ret = fcntl(fd, F_PREALLOCATE, &store); + if (ret == -1) + { + store.fst_flags = F_ALLOCATEALL; + ret = fcntl(fd, F_PREALLOCATE, &store); + } + if (ret != -1) + ret = ftruncate(fd, offset + length); + return ret; +#else + return posix_fallocate(fd, offset, length); +#endif +} + +} // namespace detail + +template +class SharedMemory: public detail::SharedMemoryBase { + static_assert(std::is_trivially_copyable_v, "T must be trivially copyable"); + static_assert(!std::is_pointer_v, "T cannot be a pointer type"); + + private: + std::string name_; + int fd_ = -1; + void* mapped_ptr_ = nullptr; + T* data_ptr_ = nullptr; + detail::ShmHeader* header_ptr_ = nullptr; + size_t total_size_ = 0; + std::string sentinel_base_; + std::string sentinel_path_; + + static constexpr size_t calculate_total_size() noexcept { + return sizeof(T) + sizeof(detail::ShmHeader); + } + + static std::string make_sentinel_base(const std::string& name) { + char buf[32]; 
+ // Using std::to_string here causes non-deterministic PGO builds. + // snprintf, being part of libc, is insensitive to the formatted values. + std::snprintf(buf, sizeof(buf), "sfshm_%016" PRIu64, hash_string(name)); + return buf; + } + + public: + explicit SharedMemory(const std::string& name) noexcept : + name_(name), + total_size_(calculate_total_size()), + sentinel_base_(make_sentinel_base(name)) {} + + ~SharedMemory() noexcept override { + detail::SharedMemoryRegistry::unregister_instance(this); + close(); + } + + SharedMemory(const SharedMemory&) = delete; + SharedMemory& operator=(const SharedMemory&) = delete; + + SharedMemory(SharedMemory&& other) noexcept : + name_(std::move(other.name_)), + fd_(other.fd_), + mapped_ptr_(other.mapped_ptr_), + data_ptr_(other.data_ptr_), + header_ptr_(other.header_ptr_), + total_size_(other.total_size_), + sentinel_base_(std::move(other.sentinel_base_)), + sentinel_path_(std::move(other.sentinel_path_)) { + + detail::SharedMemoryRegistry::unregister_instance(&other); + detail::SharedMemoryRegistry::register_instance(this); + other.reset(); + } + + SharedMemory& operator=(SharedMemory&& other) noexcept { + if (this != &other) + { + detail::SharedMemoryRegistry::unregister_instance(this); + close(); + + name_ = std::move(other.name_); + fd_ = other.fd_; + mapped_ptr_ = other.mapped_ptr_; + data_ptr_ = other.data_ptr_; + header_ptr_ = other.header_ptr_; + total_size_ = other.total_size_; + sentinel_base_ = std::move(other.sentinel_base_); + sentinel_path_ = std::move(other.sentinel_path_); + + detail::SharedMemoryRegistry::unregister_instance(&other); + detail::SharedMemoryRegistry::register_instance(this); + + other.reset(); + } + return *this; + } + + [[nodiscard]] bool open(const T& initial_value) noexcept { + detail::CleanupHooks::ensure_registered(); + + bool retried_stale = false; + + while (true) + { + if (is_open()) + return false; + + bool created_new = false; + fd_ = shm_open(name_.c_str(), O_CREAT | O_EXCL | 
O_RDWR, 0666); + + if (fd_ == -1) + { + fd_ = shm_open(name_.c_str(), O_RDWR, 0666); + if (fd_ == -1) + return false; + } + else + created_new = true; + + if (!lock_file(LOCK_EX)) + { + ::close(fd_); + reset(); + return false; + } + + bool invalid_header = false; + bool success = + created_new ? setup_new_region(initial_value) : setup_existing_region(invalid_header); + + if (!success) + { + if (created_new || invalid_header) + shm_unlink(name_.c_str()); + if (mapped_ptr_) + unmap_region(); + unlock_file(); + ::close(fd_); + reset(); + + if (!created_new && invalid_header && !retried_stale) + { + retried_stale = true; + continue; + } + return false; + } + + if (!lock_shared_mutex()) + { + if (created_new) + shm_unlink(name_.c_str()); + if (mapped_ptr_) + unmap_region(); + unlock_file(); + ::close(fd_); + reset(); + + if (!created_new && !retried_stale) + { + retried_stale = true; + continue; + } + return false; + } + + if (!create_sentinel_file_locked()) + { + unlock_shared_mutex(); + unmap_region(); + if (created_new) + shm_unlink(name_.c_str()); + unlock_file(); + ::close(fd_); + reset(); + return false; + } + + header_ptr_->ref_count.fetch_add(1, std::memory_order_acq_rel); + + unlock_shared_mutex(); + unlock_file(); + detail::SharedMemoryRegistry::register_instance(this); + return true; + } + } + + void close(bool skip_unmap = false) noexcept override { + if (fd_ == -1 && mapped_ptr_ == nullptr) + return; + + bool remove_region = false; + bool file_locked = lock_file(LOCK_EX); + bool mutex_locked = false; + + if (file_locked && header_ptr_ != nullptr) + mutex_locked = lock_shared_mutex(); + + if (mutex_locked) + { + if (header_ptr_) + { + header_ptr_->ref_count.fetch_sub(1, std::memory_order_acq_rel); + } + remove_sentinel_file(); + remove_region = !has_other_live_sentinels_locked(); + unlock_shared_mutex(); + } + else + { + remove_sentinel_file(); + decrement_refcount_relaxed(); + } + + if (skip_unmap) + mapped_ptr_ = nullptr; + else + unmap_region(); + + if 
(remove_region) + shm_unlink(name_.c_str()); + + if (file_locked) + unlock_file(); + + if (fd_ != -1) + { + ::close(fd_); + fd_ = -1; + } + + if (!skip_unmap) + reset(); + } + + const std::string& name() const noexcept override { return name_; } + + [[nodiscard]] bool is_open() const noexcept { return fd_ != -1 && mapped_ptr_ && data_ptr_; } + + [[nodiscard]] const T& get() const noexcept { return *data_ptr_; } + + [[nodiscard]] const T* operator->() const noexcept { return data_ptr_; } + + [[nodiscard]] const T& operator*() const noexcept { return *data_ptr_; } + + [[nodiscard]] uint32_t ref_count() const noexcept { + return header_ptr_ ? header_ptr_->ref_count.load(std::memory_order_acquire) : 0; + } + + [[nodiscard]] bool is_initialized() const noexcept { + return header_ptr_ ? header_ptr_->initialized.load(std::memory_order_acquire) : false; + } + + static void cleanup_all_instances() noexcept { detail::SharedMemoryRegistry::cleanup_all(); } + + private: + void reset() noexcept { + fd_ = -1; + mapped_ptr_ = nullptr; + data_ptr_ = nullptr; + header_ptr_ = nullptr; + sentinel_path_.clear(); + } + + void unmap_region() noexcept { + if (mapped_ptr_) + { + munmap(mapped_ptr_, total_size_); + mapped_ptr_ = nullptr; + data_ptr_ = nullptr; + header_ptr_ = nullptr; + } + } + + [[nodiscard]] bool lock_file(int operation) noexcept { + if (fd_ == -1) + return false; + + while (flock(fd_, operation) == -1) + { + if (errno == EINTR) + continue; + return false; + } + return true; + } + + void unlock_file() noexcept { + if (fd_ == -1) + return; + + while (flock(fd_, LOCK_UN) == -1) + { + if (errno == EINTR) + continue; + break; + } + } + + std::string sentinel_full_path(pid_t pid) const { + char buf[1024]; + // See above snprintf comment + std::snprintf(buf, sizeof(buf), "/dev/shm/%s.%ld", sentinel_base_.c_str(), long(pid)); + return buf; + } + + void decrement_refcount_relaxed() noexcept { + if (!header_ptr_) + return; + + uint32_t expected = 
header_ptr_->ref_count.load(std::memory_order_relaxed); + while (expected != 0 + && !header_ptr_->ref_count.compare_exchange_weak( + expected, expected - 1, std::memory_order_acq_rel, std::memory_order_relaxed)) + {} + } + + bool create_sentinel_file_locked() noexcept { + if (!header_ptr_) + return false; + + const pid_t self_pid = getpid(); + sentinel_path_ = sentinel_full_path(self_pid); + + for (int attempt = 0; attempt < 2; ++attempt) + { + int fd = ::open(sentinel_path_.c_str(), O_CREAT | O_EXCL | O_WRONLY | O_CLOEXEC, 0600); + if (fd != -1) + { + ::close(fd); + return true; + } + + if (errno == EEXIST) + { + ::unlink(sentinel_path_.c_str()); + decrement_refcount_relaxed(); + continue; + } + + break; + } + + sentinel_path_.clear(); + return false; + } + + void remove_sentinel_file() noexcept { + if (!sentinel_path_.empty()) + { + ::unlink(sentinel_path_.c_str()); + sentinel_path_.clear(); + } + } + + static bool pid_is_alive(pid_t pid) noexcept { + if (pid <= 0) + return false; + + if (kill(pid, 0) == 0) + return true; + + return errno == EPERM; + } + + [[nodiscard]] bool initialize_shared_mutex() noexcept { + if (!header_ptr_) + return false; + + pthread_mutexattr_t attr; + if (pthread_mutexattr_init(&attr) != 0) + return false; + + bool success = pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) == 0; +#if _POSIX_C_SOURCE >= 200809L + if (success) + success = pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) == 0; +#endif + + if (success) + success = pthread_mutex_init(&header_ptr_->mutex, &attr) == 0; + + pthread_mutexattr_destroy(&attr); + return success; + } + + [[nodiscard]] bool lock_shared_mutex() noexcept { + if (!header_ptr_) + return false; + + while (true) + { + int rc = pthread_mutex_lock(&header_ptr_->mutex); + if (rc == 0) + return true; + +#if _POSIX_C_SOURCE >= 200809L + if (rc == EOWNERDEAD) + { + if (pthread_mutex_consistent(&header_ptr_->mutex) == 0) + return true; + return false; + } +#endif + + if (rc == EINTR) + 
continue; + + return false; + } + } + + void unlock_shared_mutex() noexcept { + if (header_ptr_) + pthread_mutex_unlock(&header_ptr_->mutex); + } + + bool has_other_live_sentinels_locked() const noexcept { + DIR* dir = opendir("/dev/shm"); + if (!dir) + return false; + + std::string prefix = sentinel_base_ + "."; + bool found = false; + + while (dirent* entry = readdir(dir)) + { + std::string name = entry->d_name; + if (name.rfind(prefix, 0) != 0) + continue; + + auto pid_str = name.substr(prefix.size()); + char* end = nullptr; + long value = std::strtol(pid_str.c_str(), &end, 10); + if (!end || *end != '\0') + continue; + + pid_t pid = static_cast(value); + if (pid_is_alive(pid)) + { + found = true; + break; + } + + std::string stale_path = std::string("/dev/shm/") + name; + ::unlink(stale_path.c_str()); + const_cast(this)->decrement_refcount_relaxed(); + } + + closedir(dir); + return found; + } + + [[nodiscard]] bool setup_new_region(const T& initial_value) noexcept { + if (ftruncate(fd_, static_cast(total_size_)) == -1) + return false; + + if (detail::portable_fallocate(fd_, 0, static_cast(total_size_)) != 0) + return false; + + mapped_ptr_ = mmap(nullptr, total_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0); + if (mapped_ptr_ == MAP_FAILED) + { + mapped_ptr_ = nullptr; + return false; + } + + data_ptr_ = static_cast(mapped_ptr_); + header_ptr_ = + reinterpret_cast(static_cast(mapped_ptr_) + sizeof(T)); + + new (header_ptr_) detail::ShmHeader{}; + new (data_ptr_) T{initial_value}; + + if (!initialize_shared_mutex()) + return false; + + header_ptr_->ref_count.store(0, std::memory_order_release); + header_ptr_->initialized.store(true, std::memory_order_release); + return true; + } + + [[nodiscard]] bool setup_existing_region(bool& invalid_header) noexcept { + invalid_header = false; + + struct stat st; + fstat(fd_, &st); + if (static_cast(st.st_size) < total_size_) + { + invalid_header = true; + return false; + } + + mapped_ptr_ = mmap(nullptr, total_size_, 
PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0); + if (mapped_ptr_ == MAP_FAILED) + { + mapped_ptr_ = nullptr; + return false; + } + + data_ptr_ = static_cast(mapped_ptr_); + header_ptr_ = std::launder( + reinterpret_cast(static_cast(mapped_ptr_) + sizeof(T))); + + if (!header_ptr_->initialized.load(std::memory_order_acquire) + || header_ptr_->magic != detail::ShmHeader::SHM_MAGIC) + { + invalid_header = true; + unmap_region(); + return false; + } + + return true; + } +}; + +template +[[nodiscard]] std::optional> create_shared(const std::string& name, + const T& initial_value) noexcept { + SharedMemory shm(name); + if (shm.open(initial_value)) + return shm; + return std::nullopt; +} + +} // namespace Stockfish::shm + +#endif // #ifndef SHM_LINUX_H_INCLUDED diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9fe6df9dca032134f7750ec84260b5658e25864b --- /dev/null +++ b/src/syzygy/tbprobe.cpp @@ -0,0 +1,1776 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#include "tbprobe.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../bitboard.h" +#include "../misc.h" +#include "../movegen.h" +#include "../position.h" +#include "../search.h" +#include "../types.h" +#include "../ucioption.h" + +#ifndef _WIN32 + #include + #include + #include +#else + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX // Disable macros min() and max() + #endif + #include +#endif + +using namespace Stockfish::Tablebases; + +int Stockfish::Tablebases::MaxCardinality; + +namespace Stockfish { + +namespace { + +constexpr int TBPIECES = 7; // Max number of supported pieces +constexpr int MAX_DTZ = + 1 << 18; // Max DTZ supported times 2, large enough to deal with the syzygy TB limit. + +enum { + BigEndian, + LittleEndian +}; +enum TBType { + WDL, + DTZ +}; // Used as template parameter + +// Each table has a set of flags: all of them refer to DTZ tables, the last one to WDL tables +enum TBFlag { + STM = 1, + Mapped = 2, + WinPlies = 4, + LossPlies = 8, + Wide = 16, + SingleValue = 128 +}; + +inline WDLScore operator-(WDLScore d) { return WDLScore(-int(d)); } +inline Square operator^(Square s, int i) { return Square(int(s) ^ i); } + +constexpr std::string_view PieceToChar = " PNBRQK pnbrqk"; + +int MapPawns[SQUARE_NB]; +int MapB1H1H7[SQUARE_NB]; +int MapA1D1D4[SQUARE_NB]; +int MapKK[10][SQUARE_NB]; // [MapA1D1D4][SQUARE_NB] + +int Binomial[6][SQUARE_NB]; // [k][n] k elements from a set of n elements +int LeadPawnIdx[6][SQUARE_NB]; // [leadPawnsCnt][SQUARE_NB] +int LeadPawnsSize[6][4]; // [leadPawnsCnt][FILE_A..FILE_D] + +// Comparison function to sort leading pawns in ascending MapPawns[] order +bool pawns_comp(Square i, Square j) { return MapPawns[i] < MapPawns[j]; } +int off_A1H8(Square sq) { return int(rank_of(sq)) - file_of(sq); } + +constexpr Value WDL_to_value[] = {-VALUE_MATE 
+ MAX_PLY + 1, VALUE_DRAW - 2, VALUE_DRAW, + VALUE_DRAW + 2, VALUE_MATE - MAX_PLY - 1}; + +template +inline void swap_endian(T& x) { + static_assert(std::is_unsigned_v, "Argument of swap_endian not unsigned"); + + uint8_t tmp, *c = (uint8_t*) &x; + for (int i = 0; i < Half; ++i) + tmp = c[i], c[i] = c[End - i], c[End - i] = tmp; +} +template<> +inline void swap_endian(uint8_t&) {} + +template +T number(void* addr) { + T v; + + if (uintptr_t(addr) & (alignof(T) - 1)) // Unaligned pointer (very rare) + std::memcpy(&v, addr, sizeof(T)); + else + v = *((T*) addr); + + if (LE != IsLittleEndian) + swap_endian(v); + return v; +} + +// DTZ tables don't store valid scores for moves that reset the rule50 counter +// like captures and pawn moves but we can easily recover the correct dtz of the +// previous move if we know the position's WDL score. +int dtz_before_zeroing(WDLScore wdl) { + return wdl == WDLWin ? 1 + : wdl == WDLCursedWin ? 101 + : wdl == WDLBlessedLoss ? -101 + : wdl == WDLLoss ? -1 + : 0; +} + +// Return the sign of a number (-1, 0, 1) +template +int sign_of(T val) { + return (T(0) < val) - (val < T(0)); +} + +// Numbers in little-endian used by sparseIndex[] to point into blockLength[] +struct SparseEntry { + char block[4]; // Number of block + char offset[2]; // Offset within the block +}; + +static_assert(sizeof(SparseEntry) == 6, "SparseEntry must be 6 bytes"); + +using Sym = uint16_t; // Huffman symbol + +struct LR { + enum Side { + Left, + Right + }; + + uint8_t lr[3]; // The first 12 bits is the left-hand symbol, the second 12 + // bits is the right-hand symbol. If the symbol has length 1, + // then the left-hand symbol is the stored value. + template + Sym get() { + return S == Left ? ((lr[1] & 0xF) << 8) | lr[0] + : S == Right ? 
(lr[2] << 4) | (lr[1] >> 4) + : (assert(false), Sym(-1)); + } +}; + +static_assert(sizeof(LR) == 3, "LR tree entry must be 3 bytes"); + +// Tablebases data layout is structured as following: +// +// TBFile: memory maps/unmaps the physical .rtbw and .rtbz files +// TBTable: one object for each file with corresponding indexing information +// TBTables: has ownership of TBTable objects, keeping a list and a hash + +// class TBFile memory maps/unmaps the single .rtbw and .rtbz files. Files are +// memory mapped for best performance. Files are mapped at first access: at init +// time only existence of the file is checked. +class TBFile: public std::ifstream { + + std::string fname; + + public: + // Look for and open the file among the Paths directories where the .rtbw + // and .rtbz files can be found. Multiple directories are separated by ";" + // on Windows and by ":" on Unix-based operating systems. + // + // Example: + // C:\tb\wdl345;C:\tb\wdl6;D:\tb\dtz345;D:\tb\dtz6 + static std::string Paths; + + TBFile(const std::string& f) { + +#ifndef _WIN32 + constexpr char SepChar = ':'; +#else + constexpr char SepChar = ';'; +#endif + std::stringstream ss(Paths); + std::string path; + + while (std::getline(ss, path, SepChar)) + { + fname = path + "/" + f; + std::ifstream::open(fname); + if (is_open()) + return; + } + } + + // Memory map the file and check it. 
+ uint8_t* map(void** baseAddress, uint64_t* mapping, TBType type) { + if (is_open()) + close(); // Need to re-open to get native file descriptor + +#ifndef _WIN32 + struct stat statbuf; + int fd = ::open(fname.c_str(), O_RDONLY); + + if (fd == -1) + return *baseAddress = nullptr, nullptr; + + fstat(fd, &statbuf); + + if (statbuf.st_size % 64 != 16) + { + std::cerr << "Corrupt tablebase file " << fname << std::endl; + exit(EXIT_FAILURE); + } + + *mapping = statbuf.st_size; + *baseAddress = mmap(nullptr, statbuf.st_size, PROT_READ, MAP_SHARED, fd, 0); + #if defined(MADV_RANDOM) + madvise(*baseAddress, statbuf.st_size, MADV_RANDOM); + #endif + ::close(fd); + + if (*baseAddress == MAP_FAILED) + { + std::cerr << "Could not mmap() " << fname << std::endl; + exit(EXIT_FAILURE); + } +#else + // Note FILE_FLAG_RANDOM_ACCESS is only a hint to Windows and as such may get ignored. + HANDLE fd = CreateFileA(fname.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, + OPEN_EXISTING, FILE_FLAG_RANDOM_ACCESS, nullptr); + + if (fd == INVALID_HANDLE_VALUE) + return *baseAddress = nullptr, nullptr; + + DWORD size_high; + DWORD size_low = GetFileSize(fd, &size_high); + + if (size_low % 64 != 16) + { + std::cerr << "Corrupt tablebase file " << fname << std::endl; + exit(EXIT_FAILURE); + } + + HANDLE mmap = CreateFileMapping(fd, nullptr, PAGE_READONLY, size_high, size_low, nullptr); + CloseHandle(fd); + + if (!mmap) + { + std::cerr << "CreateFileMapping() failed" << std::endl; + exit(EXIT_FAILURE); + } + + *mapping = uint64_t(mmap); + *baseAddress = MapViewOfFile(mmap, FILE_MAP_READ, 0, 0, 0); + + if (!*baseAddress) + { + std::cerr << "MapViewOfFile() failed, name = " << fname + << ", error = " << GetLastError() << std::endl; + exit(EXIT_FAILURE); + } +#endif + uint8_t* data = (uint8_t*) *baseAddress; + + constexpr uint8_t Magics[][4] = {{0xD7, 0x66, 0x0C, 0xA5}, {0x71, 0xE8, 0x23, 0x5D}}; + + if (memcmp(data, Magics[type == WDL], 4)) + { + std::cerr << "Corrupted table in file " << fname 
<< std::endl; + unmap(*baseAddress, *mapping); + return *baseAddress = nullptr, nullptr; + } + + return data + 4; // Skip Magics's header + } + + static void unmap(void* baseAddress, uint64_t mapping) { + +#ifndef _WIN32 + munmap(baseAddress, mapping); +#else + UnmapViewOfFile(baseAddress); + CloseHandle((HANDLE) mapping); +#endif + } +}; + +std::string TBFile::Paths; + +// struct PairsData contains low-level indexing information to access TB data. +// There are 8, 4, or 2 PairsData records for each TBTable, according to the type +// of table and if positions have pawns or not. It is populated at first access. +struct PairsData { + uint8_t flags; // Table flags, see enum TBFlag + uint8_t maxSymLen; // Maximum length in bits of the Huffman symbols + uint8_t minSymLen; // Minimum length in bits of the Huffman symbols + uint32_t blocksNum; // Number of blocks in the TB file + size_t sizeofBlock; // Block size in bytes + size_t span; // About every span values there is a SparseIndex[] entry + Sym* lowestSym; // lowestSym[l] is the symbol of length l with the lowest value + LR* btree; // btree[sym] stores the left and right symbols that expand sym + uint16_t* blockLength; // Number of stored positions (minus one) for each block: 1..65536 + uint32_t blockLengthSize; // Size of blockLength[] table: padded so it's bigger than blocksNum + SparseEntry* sparseIndex; // Partial indices into blockLength[] + size_t sparseIndexSize; // Size of SparseIndex[] table + uint8_t* data; // Start of Huffman compressed data + std::vector + base64; // base64[l - min_sym_len] is the 64bit-padded lowest symbol of length l + std::vector + symlen; // Number of values (-1) represented by a given Huffman symbol: 1..256 + Piece pieces[TBPIECES]; // Position pieces: the order of pieces defines the groups + uint64_t groupIdx[TBPIECES + 1]; // Start index used for the encoding of the group's pieces + int groupLen[TBPIECES + 1]; // Number of pieces in a given group: KRKN -> (3, 1) + uint16_t 
map_idx[4]; // WDLWin, WDLLoss, WDLCursedWin, WDLBlessedLoss (used in DTZ) +}; + +// struct TBTable contains indexing information to access the corresponding TBFile. +// There are 2 types of TBTable, corresponding to a WDL or a DTZ file. TBTable +// is populated at init time but the nested PairsData records are populated at +// first access, when the corresponding file is memory mapped. +template +struct TBTable { + using Ret = std::conditional_t; + + static constexpr int Sides = Type == WDL ? 2 : 1; + + std::atomic_bool ready; + void* baseAddress; + uint8_t* map; + uint64_t mapping; + Key key; + Key key2; + int pieceCount; + bool hasPawns; + bool hasUniquePieces; + uint8_t pawnCount[2]; // [Lead color / other color] + PairsData items[Sides][4]; // [wtm / btm][FILE_A..FILE_D or 0] + + PairsData* get(int stm, int f) { return &items[stm % Sides][hasPawns ? f : 0]; } + + TBTable() : + ready(false), + baseAddress(nullptr) {} + explicit TBTable(const std::string& code); + explicit TBTable(const TBTable& wdl); + + ~TBTable() { + if (baseAddress) + TBFile::unmap(baseAddress, mapping); + } +}; + +template<> +TBTable::TBTable(const std::string& code) : + TBTable() { + + StateInfo st; + Position pos; + + key = pos.set(code, WHITE, &st).material_key(); + pieceCount = pos.count(); + hasPawns = pos.pieces(PAWN); + + hasUniquePieces = false; + for (Color c : {WHITE, BLACK}) + for (PieceType pt = PAWN; pt < KING; ++pt) + if (popcount(pos.pieces(c, pt)) == 1) + hasUniquePieces = true; + + // Set the leading color. In case both sides have pawns the leading color + // is the side with fewer pawns because this leads to better compression. + bool c = !pos.count(BLACK) + || (pos.count(WHITE) && pos.count(BLACK) >= pos.count(WHITE)); + + pawnCount[0] = pos.count(c ? WHITE : BLACK); + pawnCount[1] = pos.count(c ? 
BLACK : WHITE); + + key2 = pos.set(code, BLACK, &st).material_key(); +} + +template<> +TBTable::TBTable(const TBTable& wdl) : + TBTable() { + + // Use the corresponding WDL table to avoid recalculating all from scratch + key = wdl.key; + key2 = wdl.key2; + pieceCount = wdl.pieceCount; + hasPawns = wdl.hasPawns; + hasUniquePieces = wdl.hasUniquePieces; + pawnCount[0] = wdl.pawnCount[0]; + pawnCount[1] = wdl.pawnCount[1]; +} + +// class TBTables creates and keeps ownership of the TBTable objects, one for +// each TB file found. It supports a fast, hash-based, table lookup. Populated +// at init time, accessed at probe time. +class TBTables { + + struct Entry { + Key key; + TBTable* wdl; + TBTable* dtz; + + template + TBTable* get() const { + return (TBTable*) (Type == WDL ? (void*) wdl : (void*) dtz); + } + }; + + static constexpr int Size = 1 << 12; // 4K table, indexed by key's 12 lsb + static constexpr int Overflow = 1; // Number of elements allowed to map to the last bucket + + Entry hashTable[Size + Overflow]; + + std::deque> wdlTable; + std::deque> dtzTable; + size_t foundDTZFiles = 0; + size_t foundWDLFiles = 0; + + void insert(Key key, TBTable* wdl, TBTable* dtz) { + uint32_t homeBucket = uint32_t(key) & (Size - 1); + Entry entry{key, wdl, dtz}; + + // Ensure last element is empty to avoid overflow when looking up + for (uint32_t bucket = homeBucket; bucket < Size + Overflow - 1; ++bucket) + { + Key otherKey = hashTable[bucket].key; + if (otherKey == key || !hashTable[bucket].get()) + { + hashTable[bucket] = entry; + return; + } + + // Robin Hood hashing: If we've probed for longer than this element, + // insert here and search for a new spot for the other element instead. + uint32_t otherHomeBucket = uint32_t(otherKey) & (Size - 1); + if (otherHomeBucket > homeBucket) + { + std::swap(entry, hashTable[bucket]); + key = otherKey; + homeBucket = otherHomeBucket; + } + } + std::cerr << "TB hash table size too low!" 
<< std::endl; + exit(EXIT_FAILURE); + } + + public: + template + TBTable* get(Key key) { + for (const Entry* entry = &hashTable[uint32_t(key) & (Size - 1)];; ++entry) + { + if (entry->key == key || !entry->get()) + return entry->get(); + } + } + + void clear() { + memset(hashTable, 0, sizeof(hashTable)); + wdlTable.clear(); + dtzTable.clear(); + foundDTZFiles = 0; + foundWDLFiles = 0; + } + + void info() const { + sync_cout << "info string Found " << foundWDLFiles << " WDL and " << foundDTZFiles + << " DTZ tablebase files (up to " << MaxCardinality << "-man)." << sync_endl; + } + + void add(const std::vector& pieces); +}; + +TBTables TBTables; + +// If the corresponding file exists two new objects TBTable and TBTable +// are created and added to the lists and hash table. Called at init time. +void TBTables::add(const std::vector& pieces) { + + std::string code; + + for (PieceType pt : pieces) + code += PieceToChar[pt]; + code.insert(code.find('K', 1), "v"); + + TBFile file_dtz(code + ".rtbz"); // KRK -> KRvK + if (file_dtz.is_open()) + { + file_dtz.close(); + foundDTZFiles++; + } + + TBFile file(code + ".rtbw"); // KRK -> KRvK + + if (!file.is_open()) // Only WDL file is checked + return; + + file.close(); + foundWDLFiles++; + + MaxCardinality = std::max(int(pieces.size()), MaxCardinality); + + wdlTable.emplace_back(code); + dtzTable.emplace_back(wdlTable.back()); + + // Insert into the hash keys for both colors: KRvK with KR white and black + insert(wdlTable.back().key, &wdlTable.back(), &dtzTable.back()); + insert(wdlTable.back().key2, &wdlTable.back(), &dtzTable.back()); +} + +// TB tables are compressed with canonical Huffman code. The compressed data is divided into +// blocks of size d->sizeofBlock, and each block stores a variable number of symbols. +// Each symbol represents either a WDL or a (remapped) DTZ value, or a pair of other symbols +// (recursively). If you keep expanding the symbols in a block, you end up with up to 65536 +// WDL or DTZ values. 
Each symbol represents up to 256 values and will correspond after +// Huffman coding to at least 1 bit. So a block of 32 bytes corresponds to at most +// 32 x 8 x 256 = 65536 values. This maximum is only reached for tables that consist mostly +// of draws or mostly of wins, but such tables are actually quite common. In principle, the +// blocks in WDL tables are 64 bytes long (and will be aligned on cache lines). But for +// mostly-draw or mostly-win tables this can leave many 64-byte blocks only half-filled, so +// in such cases blocks are 32 bytes long. The blocks of DTZ tables are up to 1024 bytes long. +// The generator picks the size that leads to the smallest table. The "book" of symbols and +// Huffman codes are the same for all blocks in the table. A non-symmetric pawnless TB file +// will have one table for wtm and one for btm, a TB file with pawns will have tables per +// file a,b,c,d also, in this case, one set for wtm and one for btm. +int decompress_pairs(PairsData* d, uint64_t idx) { + + // Special case where all table positions store the same value + if (d->flags & TBFlag::SingleValue) + return d->minSymLen; + + // First we need to locate the right block that stores the value at index "idx". + // Because each block n stores blockLength[n] + 1 values, the index i of the block + // that contains the value at position idx is: + // + // for (i = -1, sum = 0; sum <= idx; i++) + // sum += blockLength[i + 1] + 1; + // + // This can be slow, so we use SparseIndex[] populated with a set of SparseEntry that + // point to known indices into blockLength[]. 
Namely SparseIndex[k] is a SparseEntry + // that stores the blockLength[] index and the offset within that block of the value + // with index I(k), where: + // + // I(k) = k * d->span + d->span / 2 (1) + + // First step is to get the 'k' of the I(k) nearest to our idx, using definition (1) + uint32_t k = uint32_t(idx / d->span); + + // Then we read the corresponding SparseIndex[] entry + uint32_t block = number(&d->sparseIndex[k].block); + int offset = number(&d->sparseIndex[k].offset); + + // Now compute the difference idx - I(k). From the definition of k, we know that + // + // idx = k * d->span + idx % d->span (2) + // + // So from (1) and (2) we can compute idx - I(K): + int diff = int(idx % d->span - d->span / 2); + + // Sum the above to offset to find the offset corresponding to our idx + offset += diff; + + // Move to the previous/next block, until we reach the correct block that contains idx, + // that is when 0 <= offset <= d->blockLength[block] + while (offset < 0) + offset += d->blockLength[--block] + 1; + + while (offset > d->blockLength[block]) + offset -= d->blockLength[block++] + 1; + + // Finally, we find the start address of our block of canonical Huffman symbols + uint32_t* ptr = (uint32_t*) (d->data + (uint64_t(block) * d->sizeofBlock)); + + // Read the first 64 bits in our block, this is a (truncated) sequence of + // unknown number of symbols of unknown length but we know the first one + // is at the beginning of this 64-bit sequence. + uint64_t buf64 = number(ptr); + ptr += 2; + int buf64Size = 64; + Sym sym; + + while (true) + { + int len = 0; // This is the symbol length - d->min_sym_len + + // Now get the symbol length. For any symbol s64 of length l right-padded + // to 64 bits we know that d->base64[l-1] >= s64 >= d->base64[l] so we + // can find the symbol length iterating through base64[]. 
+ while (buf64 < d->base64[len]) + ++len; + + // All the symbols of a given length are consecutive integers (numerical + // sequence property), so we can compute the offset of our symbol of + // length len, stored at the beginning of buf64. + sym = Sym((buf64 - d->base64[len]) >> (64 - len - d->minSymLen)); + + // Now add the value of the lowest symbol of length len to get our symbol + sym += number(&d->lowestSym[len]); + + // If our offset is within the number of values represented by symbol sym, + // we are done. + if (offset < d->symlen[sym] + 1) + break; + + // ...otherwise update the offset and continue to iterate + offset -= d->symlen[sym] + 1; + len += d->minSymLen; // Get the real length + buf64 <<= len; // Consume the just processed symbol + buf64Size -= len; + + if (buf64Size <= 32) + { // Refill the buffer + buf64Size += 32; + buf64 |= uint64_t(number(ptr++)) << (64 - buf64Size); + } + } + + // Now we have our symbol that expands into d->symlen[sym] + 1 symbols. + // We binary-search for our value recursively expanding into the left and + // right child symbols until we reach a leaf node where symlen[sym] + 1 == 1 + // that will store the value we need. + while (d->symlen[sym]) + { + Sym left = d->btree[sym].get(); + + // If a symbol contains 36 sub-symbols (d->symlen[sym] + 1 = 36) and + // expands in a pair (d->symlen[left] = 23, d->symlen[right] = 11), then + // we know that, for instance, the tenth value (offset = 10) will be on + // the left side because in Recursive Pairing child symbols are adjacent. 
+ if (offset < d->symlen[left] + 1) + sym = left; + else + { + offset -= d->symlen[left] + 1; + sym = d->btree[sym].get(); + } + } + + return d->btree[sym].get(); +} + +bool check_dtz_stm(TBTable*, int, File) { return true; } + +bool check_dtz_stm(TBTable* entry, int stm, File f) { + + auto flags = entry->get(stm, f)->flags; + return (flags & TBFlag::STM) == stm || ((entry->key == entry->key2) && !entry->hasPawns); +} + +// DTZ scores are sorted by frequency of occurrence and then assigned the +// values 0, 1, 2, ... in order of decreasing frequency. This is done for each +// of the four WDLScore values. The mapping information necessary to reconstruct +// the original values are stored in the TB file and read during map[] init. +WDLScore map_score(TBTable*, File, int value, WDLScore) { return WDLScore(value - 2); } + +int map_score(TBTable* entry, File f, int value, WDLScore wdl) { + + constexpr int WDLMap[] = {1, 3, 0, 2, 0}; + + auto flags = entry->get(0, f)->flags; + + uint8_t* map = entry->map; + uint16_t* idx = entry->get(0, f)->map_idx; + if (flags & TBFlag::Mapped) + { + if (flags & TBFlag::Wide) + value = ((uint16_t*) map)[idx[WDLMap[wdl + 2]] + value]; + else + value = map[idx[WDLMap[wdl + 2]] + value]; + } + + // DTZ tables store distance to zero in number of moves or plies. We + // want to return plies, so we have to convert to plies when needed. + if ((wdl == WDLWin && !(flags & TBFlag::WinPlies)) + || (wdl == WDLLoss && !(flags & TBFlag::LossPlies)) || wdl == WDLCursedWin + || wdl == WDLBlessedLoss) + value *= 2; + + return value + 1; +} + +// A temporary fix for the compiler bug with vectorization. (#4450) +#if defined(__clang__) && defined(__clang_major__) && __clang_major__ >= 15 + #define DISABLE_CLANG_LOOP_VEC _Pragma("clang loop vectorize(disable)") +#else + #define DISABLE_CLANG_LOOP_VEC +#endif + +// Compute a unique index out of a position and use it to probe the TB file. 
To +// encode k pieces of the same type and color, first sort the pieces by square in +// ascending order s1 <= s2 <= ... <= sk then compute the unique index as: +// +// idx = Binomial[1][s1] + Binomial[2][s2] + ... + Binomial[k][sk] +// +template +Ret do_probe_table(const Position& pos, T* entry, WDLScore wdl, ProbeState* result) { + + Square squares[TBPIECES]; + Piece pieces[TBPIECES]; + uint64_t idx; + int next = 0, size = 0, leadPawnsCnt = 0; + PairsData* d; + Bitboard b, leadPawns = 0; + File tbFile = FILE_A; + + // A given TB entry like KRK has associated two material keys: KRvk and Kvkr. + // If both sides have the same pieces keys are equal. In this case TB tables + // only stores the 'white to move' case, so if the position to lookup has black + // to move, we need to switch the color and flip the squares before to lookup. + bool symmetricBlackToMove = (entry->key == entry->key2 && pos.side_to_move()); + + // TB files are calculated for white as the stronger side. For instance, we + // have KRvK, not KvKR. A position where the stronger side is white will have + // its material key == entry->key, otherwise we have to switch the color and + // flip the squares before to lookup. + bool blackStronger = (pos.material_key() != entry->key); + + int flipColor = (symmetricBlackToMove || blackStronger) * 8; + int flipSquares = (symmetricBlackToMove || blackStronger) * 56; + int stm = (symmetricBlackToMove || blackStronger) ^ pos.side_to_move(); + + // For pawns, TB files store 4 separate tables according if leading pawn is on + // file a, b, c or d after reordering. The leading pawn is the one with maximum + // MapPawns[] value, that is the one most toward the edges and with lowest rank. + if (entry->hasPawns) + { + + // In all the 4 tables, pawns are at the beginning of the piece sequence and + // their color is the reference one. So we just pick the first one. 
+ Piece pc = Piece(entry->get(0, 0)->pieces[0] ^ flipColor); + + assert(type_of(pc) == PAWN); + + leadPawns = b = pos.pieces(color_of(pc), PAWN); + do + squares[size++] = pop_lsb(b) ^ flipSquares; + while (b); + + leadPawnsCnt = size; + + std::swap(squares[0], *std::max_element(squares, squares + leadPawnsCnt, pawns_comp)); + + tbFile = File(edge_distance(file_of(squares[0]))); + } + + // DTZ tables are one-sided, i.e. they store positions only for white to + // move or only for black to move, so check for side to move to be stm, + // early exit otherwise. + if (!check_dtz_stm(entry, stm, tbFile)) + return *result = CHANGE_STM, Ret(); + + // Now we are ready to get all the position pieces (but the lead pawns) and + // directly map them to the correct color and square. + b = pos.pieces() ^ leadPawns; + do + { + Square s = pop_lsb(b); + squares[size] = s ^ flipSquares; + pieces[size++] = Piece(pos.piece_on(s) ^ flipColor); + } while (b); + + assert(size >= 2); + + d = entry->get(stm, tbFile); + + // Then we reorder the pieces to have the same sequence as the one stored + // in pieces[i]: the sequence that ensures the best compression. + for (int i = leadPawnsCnt; i < size - 1; ++i) + for (int j = i + 1; j < size; ++j) + if (d->pieces[i] == pieces[j]) + { + std::swap(pieces[i], pieces[j]); + std::swap(squares[i], squares[j]); + break; + } + + // Now we map again the squares so that the square of the lead piece is in + // the triangle A1-D1-D4. + if (file_of(squares[0]) > FILE_D) + { + DISABLE_CLANG_LOOP_VEC + for (int i = 0; i < size; ++i) + squares[i] = flip_file(squares[i]); + } + + // Encode leading pawns starting with the one with minimum MapPawns[] and + // proceeding in ascending order. 
+ if (entry->hasPawns) + { + idx = LeadPawnIdx[leadPawnsCnt][squares[0]]; + + std::stable_sort(squares + 1, squares + leadPawnsCnt, pawns_comp); + + for (int i = 1; i < leadPawnsCnt; ++i) + idx += Binomial[i][MapPawns[squares[i]]]; + + goto encode_remaining; // With pawns we have finished special treatments + } + + // In positions without pawns, we further flip the squares to ensure leading + // piece is below RANK_5. + if (rank_of(squares[0]) > RANK_4) + { + DISABLE_CLANG_LOOP_VEC + for (int i = 0; i < size; ++i) + squares[i] = flip_rank(squares[i]); + } + + // Look for the first piece of the leading group not on the A1-D4 diagonal + // and ensure it is mapped below the diagonal. + DISABLE_CLANG_LOOP_VEC + for (int i = 0; i < d->groupLen[0]; ++i) + { + if (!off_A1H8(squares[i])) + continue; + + if (off_A1H8(squares[i]) > 0) // A1-H8 diagonal flip: SQ_A3 -> SQ_C1 + { + DISABLE_CLANG_LOOP_VEC + for (int j = i; j < size; ++j) + squares[j] = Square(((squares[j] >> 3) | (squares[j] << 3)) & 63); + } + break; + } + + // Encode the leading group. + // + // Suppose we have KRvK. Let's say the pieces are on square numbers wK, wR + // and bK (each 0...63). The simplest way to map this position to an index + // is like this: + // + // index = wK * 64 * 64 + wR * 64 + bK; + // + // But this way the TB is going to have 64*64*64 = 262144 positions, with + // lots of positions being equivalent (because they are mirrors of each + // other) and lots of positions being invalid (two pieces on one square, + // adjacent kings, etc.). + // Usually the first step is to take the wK and bK together. There are just + // 462 ways legal and not-mirrored ways to place the wK and bK on the board. 
+ // Once we have placed the wK and bK, there are 62 squares left for the wR + // Mapping its square from 0..63 to available squares 0..61 can be done like: + // + // wR -= (wR > wK) + (wR > bK); + // + // In words: if wR "comes later" than wK, we deduct 1, and the same if wR + // "comes later" than bK. In case of two same pieces like KRRvK we want to + // place the two Rs "together". If we have 62 squares left, we can place two + // Rs "together" in 62 * 61 / 2 ways (we divide by 2 because rooks can be + // swapped and still get the same position.) + // + // In case we have at least 3 unique pieces (including kings) we encode them + // together. + if (entry->hasUniquePieces) + { + + int adjust1 = squares[1] > squares[0]; + int adjust2 = (squares[2] > squares[0]) + (squares[2] > squares[1]); + + // First piece is below a1-h8 diagonal. MapA1D1D4[] maps the b1-d1-d3 + // triangle to 0...5. There are 63 squares for second piece and 62 + // (mapped to 0...61) for the third. + if (off_A1H8(squares[0])) + idx = (MapA1D1D4[squares[0]] * 63 + (squares[1] - adjust1)) * 62 + squares[2] - adjust2; + + // First piece is on a1-h8 diagonal, second below: map this occurrence to + // 6 to differentiate from the above case, rank_of() maps a1-d4 diagonal + // to 0...3 and finally MapB1H1H7[] maps the b1-h1-h7 triangle to 0..27. + else if (off_A1H8(squares[1])) + idx = (6 * 63 + rank_of(squares[0]) * 28 + MapB1H1H7[squares[1]]) * 62 + squares[2] + - adjust2; + + // First two pieces are on a1-h8 diagonal, third below + else if (off_A1H8(squares[2])) + idx = 6 * 63 * 62 + 4 * 28 * 62 + rank_of(squares[0]) * 7 * 28 + + (rank_of(squares[1]) - adjust1) * 28 + MapB1H1H7[squares[2]]; + + // All 3 pieces on the diagonal a1-h8 + else + idx = 6 * 63 * 62 + 4 * 28 * 62 + 4 * 7 * 28 + rank_of(squares[0]) * 7 * 6 + + (rank_of(squares[1]) - adjust1) * 6 + (rank_of(squares[2]) - adjust2); + } + else + // We don't have at least 3 unique pieces, like in KRRvKBB, just map + // the kings. 
+ idx = MapKK[MapA1D1D4[squares[0]]][squares[1]]; + +encode_remaining: + idx *= d->groupIdx[0]; + Square* groupSq = squares + d->groupLen[0]; + + // Encode remaining pawns and then pieces according to square, in ascending order + bool remainingPawns = entry->hasPawns && entry->pawnCount[1]; + + while (d->groupLen[++next]) + { + std::stable_sort(groupSq, groupSq + d->groupLen[next]); + uint64_t n = 0; + + // Map down a square if "comes later" than a square in the previous + // groups (similar to what was done earlier for leading group pieces). + for (int i = 0; i < d->groupLen[next]; ++i) + { + auto f = [&](Square s) { return groupSq[i] > s; }; + auto adjust = std::count_if(squares, groupSq, f); + n += Binomial[i + 1][groupSq[i] - adjust - 8 * remainingPawns]; + } + + remainingPawns = false; + idx += n * d->groupIdx[next]; + groupSq += d->groupLen[next]; + } + + // Now that we have the index, decompress the pair and get the score + return map_score(entry, tbFile, decompress_pairs(d, idx), wdl); +} + +// Group together pieces that will be encoded together. The general rule is that +// a group contains pieces of the same type and color. The exception is the leading +// group that, in case of positions without pawns, can be formed by 3 different +// pieces (default) or by the king pair when there is not a unique piece apart +// from the kings. When there are pawns, pawns are always first in pieces[]. +// +// As example KRKN -> KRK + N, KNNK -> KK + NN, KPPKP -> P + PP + K + K +// +// The actual grouping depends on the TB generator and can be inferred from the +// sequence of pieces in piece[] array. +template +void set_groups(T& e, PairsData* d, int order[], File f) { + + int n = 0, firstLen = e.hasPawns ? 0 : e.hasUniquePieces ? 3 : 2; + d->groupLen[n] = 1; + + // Number of pieces per group is stored in groupLen[], for instance in KRKN + // the encoder will default on '111', so groupLen[] will be (3, 1). 
+ for (int i = 1; i < e.pieceCount; ++i) + if (--firstLen > 0 || d->pieces[i] == d->pieces[i - 1]) + d->groupLen[n]++; + else + d->groupLen[++n] = 1; + + d->groupLen[++n] = 0; // Zero-terminated + + // The sequence in pieces[] defines the groups, but not the order in which + // they are encoded. If the pieces in a group g can be combined on the board + // in N(g) different ways, then the position encoding will be of the form: + // + // g1 * N(g2) * N(g3) + g2 * N(g3) + g3 + // + // This ensures unique encoding for the whole position. The order of the + // groups is a per-table parameter and could not follow the canonical leading + // pawns/pieces -> remaining pawns -> remaining pieces. In particular the + // first group is at order[0] position and the remaining pawns, when present, + // are at order[1] position. + bool pp = e.hasPawns && e.pawnCount[1]; // Pawns on both sides + int next = pp ? 2 : 1; + int freeSquares = 64 - d->groupLen[0] - (pp ? d->groupLen[1] : 0); + uint64_t idx = 1; + + for (int k = 0; next < n || k == order[0] || k == order[1]; ++k) + if (k == order[0]) // Leading pawns or pieces + { + d->groupIdx[0] = idx; + idx *= e.hasPawns ? LeadPawnsSize[d->groupLen[0]][f] : e.hasUniquePieces ? 31332 : 462; + } + else if (k == order[1]) // Remaining pawns + { + d->groupIdx[1] = idx; + idx *= Binomial[d->groupLen[1]][48 - d->groupLen[0]]; + } + else // Remaining pieces + { + d->groupIdx[next] = idx; + idx *= Binomial[d->groupLen[next]][freeSquares]; + freeSquares -= d->groupLen[next++]; + } + + d->groupIdx[n] = idx; +} + +// In Recursive Pairing each symbol represents a pair of children symbols. So +// read d->btree[] symbols data and expand each one in his left and right child +// symbol until reaching the leaves that represent the symbol value. 
+// Walk the Huffman decoding tree rooted at symbol 's' and return the expanded
+// length (in positions) of the symbol. Marks nodes in 'visited' so shared
+// subtrees are computed once; the tree is acyclic so marking up-front is safe.
+uint8_t set_symlen(PairsData* d, Sym s, std::vector<bool>& visited) {
+
+    visited[s] = true;  // We can set it now because tree is acyclic
+    Sym sr     = d->btree[s].get<LR::Right>();
+
+    if (sr == 0xFFF)
+        return 0;
+
+    Sym sl = d->btree[s].get<LR::Left>();
+
+    if (!visited[sl])
+        d->symlen[sl] = set_symlen(d, sl, visited);
+
+    if (!visited[sr])
+        d->symlen[sr] = set_symlen(d, sr, visited);
+
+    return d->symlen[sl] + d->symlen[sr] + 1;
+}
+
+// Parse the per-table compression header at 'data' into 'd' (block sizes,
+// sparse index geometry, canonical Huffman base table and symbol lengths).
+// Returns the pointer advanced past the parsed region.
+uint8_t* set_sizes(PairsData* d, uint8_t* data) {
+
+    d->flags = *data++;
+
+    if (d->flags & TBFlag::SingleValue)
+    {
+        d->blocksNum = d->blockLengthSize = 0;
+        d->span = d->sparseIndexSize = 0;  // Broken MSVC zero-init
+        d->minSymLen = *data++;            // Here we store the single value
+        return data;
+    }
+
+    // groupLen[] is a zero-terminated list of group lengths, the last groupIdx[]
+    // element stores the biggest index that is the tb size.
+    uint64_t tbSize = d->groupIdx[std::find(d->groupLen, d->groupLen + 7, 0) - d->groupLen];
+
+    d->sizeofBlock     = 1ULL << *data++;
+    d->span            = 1ULL << *data++;
+    d->sparseIndexSize = size_t((tbSize + d->span - 1) / d->span);  // Round up
+    auto padding       = number<uint8_t, LittleEndian>(data++);
+    d->blocksNum       = number<uint32_t, LittleEndian>(data);
+    data += sizeof(uint32_t);
+    d->blockLengthSize = d->blocksNum + padding;  // Padded to ensure SparseIndex[]
+                                                  // does not point out of range.
+    d->maxSymLen = *data++;
+    d->minSymLen = *data++;
+    d->lowestSym = (Sym*) data;
+    d->base64.resize(d->maxSymLen - d->minSymLen + 1);
+
+    // See https://en.wikipedia.org/wiki/Huffman_coding
+    // The canonical code is ordered such that longer symbols (in terms of
+    // the number of bits of their Huffman code) have a lower numeric value,
+    // so that d->lowestSym[i] >= d->lowestSym[i+1] (when read as LittleEndian).
+    // Starting from this we compute a base64[] table indexed by symbol length
+    // and containing 64 bit values so that d->base64[i] >= d->base64[i+1].
+
+    // Implementation note: we first cast the unsigned size_t "base64.size()"
+    // to a signed int "base64_size" variable and then we are able to subtract 2,
+    // avoiding unsigned overflow warnings.
+
+    int base64_size = static_cast<int>(d->base64.size());
+    for (int i = base64_size - 2; i >= 0; --i)
+    {
+        d->base64[i] = (d->base64[i + 1] + number<Sym, LittleEndian>(&d->lowestSym[i])
+                        - number<Sym, LittleEndian>(&d->lowestSym[i + 1]))
+                     / 2;
+
+        assert(d->base64[i] * 2 >= d->base64[i + 1]);
+    }
+
+    // Now left-shift by an amount so that d->base64[i] gets shifted 1 bit more
+    // than d->base64[i+1] and given the above assert condition, we ensure that
+    // d->base64[i] >= d->base64[i+1]. Moreover for any symbol s64 of length i
+    // and right-padded to 64 bits holds d->base64[i-1] >= s64 >= d->base64[i].
+    for (int i = 0; i < base64_size; ++i)
+        d->base64[i] <<= 64 - i - d->minSymLen;  // Right-padding to 64 bits
+
+    data += base64_size * sizeof(Sym);
+    d->symlen.resize(number<uint16_t, LittleEndian>(data));
+    data += sizeof(uint16_t);
+    d->btree = (LR*) data;
+
+    // The compression scheme used is "Recursive Pairing", that replaces the most
+    // frequent adjacent pair of symbols in the source message by a new symbol,
+    // reevaluating the frequencies of all of the symbol pairs with respect to
+    // the extended alphabet, and then repeating the process.
+    // See https://web.archive.org/web/20201106232444/http://www.larsson.dogma.net/dcc99.pdf
+    std::vector<bool> visited(d->symlen.size());
+
+    for (Sym sym = 0; sym < d->symlen.size(); ++sym)
+        if (!visited[sym])
+            d->symlen[sym] = set_symlen(d, sym, visited);
+
+    return data + d->symlen.size() * sizeof(LR) + (d->symlen.size() & 1);
+}
+
+// WDL tables have no DTZ map: nothing to parse.
+uint8_t* set_dtz_map(TBTable<WDL>&, uint8_t* data, File) { return data; }
+
+// Parse the DTZ value-remap tables for each file slice and record, per WDL
+// outcome, the offset of its map inside e.map. Returns the advanced pointer.
+uint8_t* set_dtz_map(TBTable<DTZ>& e, uint8_t* data, File maxFile) {
+
+    e.map = data;
+
+    for (File f = FILE_A; f <= maxFile; ++f)
+    {
+        auto flags = e.get(0, f)->flags;
+        if (flags & TBFlag::Mapped)
+        {
+            if (flags & TBFlag::Wide)
+            {
+                data += uintptr_t(data) & 1;  // Word alignment, we may have a mixed table
+                for (int i = 0; i < 4; ++i)
+                {  // Sequence like 3,x,x,x,1,x,0,2,x,x
+                    e.get(0, f)->map_idx[i] = uint16_t((uint16_t*) data - (uint16_t*) e.map + 1);
+                    data += 2 * number<uint16_t, LittleEndian>(data) + 2;
+                }
+            }
+            else
+            {
+                for (int i = 0; i < 4; ++i)
+                {
+                    e.get(0, f)->map_idx[i] = uint16_t(data - e.map + 1);
+                    data += *data + 1;
+                }
+            }
+        }
+    }
+
+    return data += uintptr_t(data) & 1;  // Word alignment
+}
+
+// Populate entry's PairsData records with data from the just memory-mapped file.
+// Called at first access.
+template<typename T>
+void set(T& e, uint8_t* data) {
+
+    PairsData* d;
+
+    enum {
+        Split    = 1,
+        HasPawns = 2
+    };
+
+    assert(e.hasPawns == bool(*data & HasPawns));
+    assert((e.key != e.key2) == bool(*data & Split));
+
+    data++;  // First byte stores flags
+
+    const int  sides   = T::Sides == 2 && (e.key != e.key2) ? 2 : 1;
+    const File maxFile = e.hasPawns ? FILE_D : FILE_A;
+
+    bool pp = e.hasPawns && e.pawnCount[1];  // Pawns on both sides
+
+    assert(!pp || e.pawnCount[0]);
+
+    for (File f = FILE_A; f <= maxFile; ++f)
+    {
+
+        for (int i = 0; i < sides; i++)
+            *e.get(i, f) = PairsData();
+
+        int order[][2] = {{*data & 0xF, pp ? *(data + 1) & 0xF : 0xF},
+                          {*data >> 4, pp ? *(data + 1) >> 4 : 0xF}};
+        data += 1 + pp;
+
+        for (int k = 0; k < e.pieceCount; ++k, ++data)
+            for (int i = 0; i < sides; i++)
+                e.get(i, f)->pieces[k] = Piece(i ? *data >> 4 : *data & 0xF);
+
+        for (int i = 0; i < sides; ++i)
+            set_groups(e, e.get(i, f), order[i], f);
+    }
+
+    data += uintptr_t(data) & 1;  // Word alignment
+
+    for (File f = FILE_A; f <= maxFile; ++f)
+        for (int i = 0; i < sides; i++)
+            data = set_sizes(e.get(i, f), data);
+
+    data = set_dtz_map(e, data, maxFile);
+
+    for (File f = FILE_A; f <= maxFile; ++f)
+        for (int i = 0; i < sides; i++)
+        {
+            (d = e.get(i, f))->sparseIndex = (SparseEntry*) data;
+            data += d->sparseIndexSize * sizeof(SparseEntry);
+        }
+
+    for (File f = FILE_A; f <= maxFile; ++f)
+        for (int i = 0; i < sides; i++)
+        {
+            (d = e.get(i, f))->blockLength = (uint16_t*) data;
+            data += d->blockLengthSize * sizeof(uint16_t);
+        }
+
+    for (File f = FILE_A; f <= maxFile; ++f)
+        for (int i = 0; i < sides; i++)
+        {
+            data = (uint8_t*) ((uintptr_t(data) + 0x3F) & ~0x3F);  // 64 byte alignment
+            (d = e.get(i, f))->data = data;
+            data += d->blocksNum * d->sizeofBlock;
+        }
+}
+
+// If the TB file corresponding to the given position is already memory-mapped
+// then return its base address, otherwise, try to memory map and init it. Called
+// at every probe, memory map, and init only at first access. Function is thread
+// safe and can be called concurrently.
+template<TBType Type>
+void* mapped(TBTable<Type>& e, const Position& pos) {
+
+    static std::mutex mutex;
+    // Because TB is the only usage of materialKey, check it here in debug mode
+    assert(pos.material_key_is_ok());
+
+    // Use 'acquire' to avoid a thread reading 'ready' == true while
+    // another is still working. (compiler reordering may cause this).
+    if (e.ready.load(std::memory_order_acquire))
+        return e.baseAddress;  // Could be nullptr if file does not exist
+
+    std::scoped_lock lk(mutex);
+
+    if (e.ready.load(std::memory_order_relaxed))  // Recheck under lock
+        return e.baseAddress;
+
+    // Pieces strings in decreasing order for each color, like ("KPP","KR")
+    std::string fname, w, b;
+    for (PieceType pt = KING; pt >= PAWN; --pt)
+    {
+        w += std::string(popcount(pos.pieces(WHITE, pt)), PieceToChar[pt]);
+        b += std::string(popcount(pos.pieces(BLACK, pt)), PieceToChar[pt]);
+    }
+
+    fname =
+      (e.key == pos.material_key() ? w + 'v' + b : b + 'v' + w) + (Type == WDL ? ".rtbw" : ".rtbz");
+
+    uint8_t* data = TBFile(fname).map(&e.baseAddress, &e.mapping, Type);
+
+    if (data)
+        set(e, data);
+
+    e.ready.store(true, std::memory_order_release);
+    return e.baseAddress;
+}
+
+template<TBType Type, typename Ret = typename TBTable<Type>::Ret>
+Ret probe_table(const Position& pos, ProbeState* result, WDLScore wdl = WDLDraw) {
+
+    if (pos.count<ALL_PIECES>() == 2)  // KvK
+        return Ret(WDLDraw);
+
+    TBTable<Type>* entry = TBTables.get<Type>(pos.material_key());
+
+    if (!entry || !mapped(*entry, pos))
+        return *result = FAIL, Ret();
+
+    return do_probe_table(pos, entry, wdl, result);
+}
+
+// For a position where the side to move has a winning capture it is not necessary
+// to store a winning value so the generator treats such positions as "don't care"
+// and tries to assign to it a value that improves the compression ratio. Similarly,
+// if the side to move has a drawing capture, then the position is at least drawn.
+// If the position is won, then the TB needs to store a win value. But if the
+// position is drawn, the TB may store a loss value if that is better for compression.
+// All of this means that during probing, the engine must look at captures and probe
+// their results and must probe the position itself. The "best" result of these
+// probes is the correct result for the position.
+// DTZ tables do not store values when a following move is a zeroing winning move +// (winning capture or winning pawn move). Also, DTZ store wrong values for positions +// where the best move is an ep-move (even if losing). So in all these cases set +// the state to ZEROING_BEST_MOVE. +template +WDLScore search(Position& pos, ProbeState* result) { + + WDLScore value, bestValue = WDLLoss; + StateInfo st; + + auto moveList = MoveList(pos); + size_t totalCount = moveList.size(), moveCount = 0; + + for (const Move move : moveList) + { + if (!pos.capture(move) && (!CheckZeroingMoves || type_of(pos.moved_piece(move)) != PAWN)) + continue; + + moveCount++; + + pos.do_move(move, st); + value = -search(pos, result); + pos.undo_move(move); + + if (*result == FAIL) + return WDLDraw; + + if (value > bestValue) + { + bestValue = value; + + if (value >= WDLWin) + { + *result = ZEROING_BEST_MOVE; // Winning DTZ-zeroing move + return value; + } + } + } + + // In case we have already searched all the legal moves we don't have to probe + // the TB because the stored score could be wrong. For instance TB tables + // do not contain information on position with ep rights, so in this case + // the result of probe_wdl_table is wrong. Also in case of only capture + // moves, for instance here 4K3/4q3/6p1/2k5/6p1/8/8/8 w - - 0 7, we have to + // return with ZEROING_BEST_MOVE set. + bool noMoreMoves = (moveCount && moveCount == totalCount); + + if (noMoreMoves) + value = bestValue; + else + { + value = probe_table(pos, result); + + if (*result == FAIL) + return WDLDraw; + } + + // DTZ stores a "don't care" value if bestValue is a win + if (bestValue >= value) + return *result = (bestValue > WDLDraw || noMoreMoves ? ZEROING_BEST_MOVE : OK), bestValue; + + return *result = OK, value; +} + +} // namespace + + +// Called at startup and after every change to +// "SyzygyPath" UCI option to (re)create the various tables. It is not thread +// safe, nor it needs to be. 
+void Tablebases::init(const std::string& paths) {
+
+    TBTables.clear();
+    MaxCardinality = 0;
+    TBFile::Paths  = paths;
+
+    if (paths.empty())
+        return;
+
+    // MapB1H1H7[] encodes a square below a1-h8 diagonal to 0..27
+    int code = 0;
+    for (Square s = SQ_A1; s <= SQ_H8; ++s)
+        if (off_A1H8(s) < 0)
+            MapB1H1H7[s] = code++;
+
+    // MapA1D1D4[] encodes a square in the a1-d1-d4 triangle to 0..9
+    std::vector<Square> diagonal;
+    code = 0;
+    for (Square s = SQ_A1; s <= SQ_D4; ++s)
+        if (off_A1H8(s) < 0 && file_of(s) <= FILE_D)
+            MapA1D1D4[s] = code++;
+
+        else if (!off_A1H8(s) && file_of(s) <= FILE_D)
+            diagonal.push_back(s);
+
+    // Diagonal squares are encoded as last ones
+    for (auto s : diagonal)
+        MapA1D1D4[s] = code++;
+
+    // MapKK[] encodes all the 462 possible legal positions of two kings where
+    // the first is in the a1-d1-d4 triangle. If the first king is on the a1-d4
+    // diagonal, the other one shall not be above the a1-h8 diagonal.
+    std::vector<std::pair<int, Square>> bothOnDiagonal;
+    code = 0;
+    for (int idx = 0; idx < 10; idx++)
+        for (Square s1 = SQ_A1; s1 <= SQ_D4; ++s1)
+            if (MapA1D1D4[s1] == idx && (idx || s1 == SQ_B1))  // SQ_B1 is mapped to 0
+            {
+                for (Square s2 = SQ_A1; s2 <= SQ_H8; ++s2)
+                    if ((PseudoAttacks[KING][s1] | s1) & s2)
+                        continue;  // Illegal position
+
+                    else if (!off_A1H8(s1) && off_A1H8(s2) > 0)
+                        continue;  // First on diagonal, second above
+
+                    else if (!off_A1H8(s1) && !off_A1H8(s2))
+                        bothOnDiagonal.emplace_back(idx, s2);
+
+                    else
+                        MapKK[idx][s2] = code++;
+            }
+
+    // Legal positions with both kings on a diagonal are encoded as last ones
+    for (auto p : bothOnDiagonal)
+        MapKK[p.first][p.second] = code++;
+
+    // Binomial[] stores the Binomial Coefficients using Pascal rule. There
+    // are Binomial[k][n] ways to choose k elements from a set of n elements.
+    Binomial[0][0] = 1;
+
+    for (int n = 1; n < 64; n++)              // Squares
+        for (int k = 0; k < 6 && k <= n; ++k) // Pieces
+            Binomial[k][n] =
+              (k > 0 ? Binomial[k - 1][n - 1] : 0) + (k < n ? Binomial[k][n - 1] : 0);
+
+    // MapPawns[s] encodes squares a2-h7 to 0..47. This is the number of possible
+    // available squares when the leading one is in 's'. Moreover the pawn with
+    // highest MapPawns[] is the leading pawn, the one nearest the edge, and
+    // among pawns with the same file, the one with the lowest rank.
+    int availableSquares = 47;  // Available squares when lead pawn is in a2
+
+    // Init the tables for the encoding of leading pawns group: with 7-men TB we
+    // can have up to 5 leading pawns (KPPPPPK).
+    for (int leadPawnsCnt = 1; leadPawnsCnt <= 5; ++leadPawnsCnt)
+        for (File f = FILE_A; f <= FILE_D; ++f)
+        {
+            // Restart the index at every file because TB table is split
+            // by file, so we can reuse the same index for different files.
+            int idx = 0;
+
+            // Sum all possible combinations for a given file, starting with
+            // the leading pawn on rank 2 and increasing the rank.
+            for (Rank r = RANK_2; r <= RANK_7; ++r)
+            {
+                Square sq = make_square(f, r);
+
+                // Compute MapPawns[] at first pass.
+                // If sq is the leading pawn square, any other pawn cannot be
+                // below or more toward the edge of sq. There are 47 available
+                // squares when sq = a2 and reduced by 2 for any rank increase
+                // due to mirroring: sq == a3 -> no a2, h2, so MapPawns[a3] = 45
+                if (leadPawnsCnt == 1)
+                {
+                    MapPawns[sq]            = availableSquares--;
+                    MapPawns[flip_file(sq)] = availableSquares--;
+                }
+                LeadPawnIdx[leadPawnsCnt][sq] = idx;
+                idx += Binomial[leadPawnsCnt - 1][MapPawns[sq]];
+            }
+            // After a file is traversed, store the cumulated per-file index
+            LeadPawnsSize[leadPawnsCnt][f] = idx;
+        }
+
+    // Add entries in TB tables if the corresponding ".rtbw" file exists
+    for (PieceType p1 = PAWN; p1 < KING; ++p1)
+    {
+        TBTables.add({KING, p1, KING});
+
+        for (PieceType p2 = PAWN; p2 <= p1; ++p2)
+        {
+            TBTables.add({KING, p1, p2, KING});
+            TBTables.add({KING, p1, KING, p2});
+
+            for (PieceType p3 = PAWN; p3 < KING; ++p3)
+                TBTables.add({KING, p1, p2, KING, p3});
+
+            for (PieceType p3 = PAWN; p3 <= p2; ++p3)
+            {
+                TBTables.add({KING, p1, p2, p3, KING});
+
+                for (PieceType p4 = PAWN; p4 <= p3; ++p4)
+                {
+                    TBTables.add({KING, p1, p2, p3, p4, KING});
+
+                    for (PieceType p5 = PAWN; p5 <= p4; ++p5)
+                        TBTables.add({KING, p1, p2, p3, p4, p5, KING});
+
+                    for (PieceType p5 = PAWN; p5 < KING; ++p5)
+                        TBTables.add({KING, p1, p2, p3, p4, KING, p5});
+                }
+
+                for (PieceType p4 = PAWN; p4 < KING; ++p4)
+                {
+                    TBTables.add({KING, p1, p2, p3, KING, p4});
+
+                    for (PieceType p5 = PAWN; p5 <= p4; ++p5)
+                        TBTables.add({KING, p1, p2, p3, KING, p4, p5});
+                }
+            }
+
+            for (PieceType p3 = PAWN; p3 <= p1; ++p3)
+                for (PieceType p4 = PAWN; p4 <= (p1 == p3 ? p2 : p3); ++p4)
+                    TBTables.add({KING, p1, p2, KING, p3, p4});
+        }
+    }
+
+    TBTables.info();
+}
+
+// Probe the WDL table for a particular position.
+// If *result != FAIL, the probe was successful.
+// The return value is from the point of view of the side to move:
+// -2 : loss
+// -1 : loss, but draw under 50-move rule
+//  0 : draw
+//  1 : win, but draw under 50-move rule
+//  2 : win
+WDLScore Tablebases::probe_wdl(Position& pos, ProbeState* result) {
+
+    *result = OK;
+    return search<false>(pos, result);
+}
+
+// Probe the DTZ table for a particular position.
+// If *result != FAIL, the probe was successful.
+// The return value is from the point of view of the side to move:
+//         n < -100 : loss, but draw under 50-move rule
+// -100 <= n < -1   : loss in n ply (assuming 50-move counter == 0)
+//        -1        : loss, the side to move is mated
+//         0        : draw
+//     1 < n <= 100 : win in n ply (assuming 50-move counter == 0)
+//   100 < n        : win, but draw under 50-move rule
+//
+// The return value n can be off by 1: a return value -n can mean a loss
+// in n+1 ply and a return value +n can mean a win in n+1 ply. This
+// cannot happen for tables with positions exactly on the "edge" of
+// the 50-move rule.
+//
+// This implies that if dtz > 0 is returned, the position is certainly
+// a win if dtz + 50-move-counter <= 99. Care must be taken that the engine
+// picks moves that preserve dtz + 50-move-counter <= 99.
+//
+// If n = 100 immediately after a capture or pawn move, then the position
+// is also certainly a win, and during the whole phase until the next
+// capture or pawn move, the inequality to be preserved is
+// dtz + 50-move-counter <= 100.
+//
+// In short, if a move is available resulting in dtz + 50-move-counter <= 99,
+// then do not accept moves leading to dtz + 50-move-counter == 100.
+int Tablebases::probe_dtz(Position& pos, ProbeState* result) {
+
+    *result      = OK;
+    WDLScore wdl = search<true>(pos, result);
+
+    if (*result == FAIL || wdl == WDLDraw)  // DTZ tables don't store draws
+        return 0;
+
+    // DTZ stores a 'don't care value in this case, or even a plain wrong
+    // one as in case the best move is a losing ep, so it cannot be probed.
+    if (*result == ZEROING_BEST_MOVE)
+        return dtz_before_zeroing(wdl);
+
+    int dtz = probe_table<DTZ>(pos, result, wdl);
+
+    if (*result == FAIL)
+        return 0;
+
+    if (*result != CHANGE_STM)
+        return (dtz + 100 * (wdl == WDLBlessedLoss || wdl == WDLCursedWin)) * sign_of(wdl);
+
+    // DTZ stores results for the other side, so we need to do a 1-ply search and
+    // find the winning move that minimizes DTZ.
+    StateInfo st;
+    int       minDTZ = 0xFFFF;
+
+    for (const Move move : MoveList<LEGAL>(pos))
+    {
+        bool zeroing = pos.capture(move) || type_of(pos.moved_piece(move)) == PAWN;
+
+        pos.do_move(move, st);
+
+        // For zeroing moves we want the dtz of the move _before_ doing it,
+        // otherwise we will get the dtz of the next move sequence. Search the
+        // position after the move to get the score sign (because even in a
+        // winning position we could make a losing capture or go for a draw).
+        dtz = zeroing ? -dtz_before_zeroing(search<false>(pos, result)) : -probe_dtz(pos, result);
+
+        // If the move mates, force minDTZ to 1
+        if (dtz == 1 && pos.checkers() && MoveList<LEGAL>(pos).size() == 0)
+            minDTZ = 1;
+
+        // Convert result from 1-ply search. Zeroing moves are already accounted
+        // by dtz_before_zeroing() that returns the DTZ of the previous move.
+        if (!zeroing)
+            dtz += sign_of(dtz);
+
+        // Skip the draws and if we are winning only pick positive dtz
+        if (dtz < minDTZ && sign_of(dtz) == sign_of(wdl))
+            minDTZ = dtz;
+
+        pos.undo_move(move);
+
+        if (*result == FAIL)
+            return 0;
+    }
+
+    // When there are no legal moves, the position is mate: we return -1
+    return minDTZ == 0xFFFF ? -1 : minDTZ;
+}
+
+
+// Use the DTZ tables to rank root moves.
+//
+// A return value false indicates that not all probes were successful.
+bool Tablebases::root_probe(Position& pos, + Search::RootMoves& rootMoves, + bool rule50, + bool rankDTZ, + const std::function& time_abort) { + + ProbeState result = OK; + StateInfo st; + + // Obtain 50-move counter for the root position + int cnt50 = pos.rule50_count(); + + // Check whether a position was repeated since the last zeroing move. + bool rep = pos.has_repeated(); + + int dtz, bound = rule50 ? (MAX_DTZ / 2 - 100) : 1; + + // Probe and rank each move + for (auto& m : rootMoves) + { + pos.do_move(m.pv[0], st); + + // Calculate dtz for the current move counting from the root position + if (pos.rule50_count() == 0) + { + // In case of a zeroing move, dtz is one of -101/-1/0/1/101 + WDLScore wdl = -probe_wdl(pos, &result); + dtz = dtz_before_zeroing(wdl); + } + else if ((rule50 && pos.is_draw(1)) || pos.is_repetition(1)) + { + // In case a root move leads to a draw by repetition or 50-move rule, + // we set dtz to zero. Note: since we are only 1 ply from the root, + // this must be a true 3-fold repetition inside the game history. + dtz = 0; + } + else + { + // Otherwise, take dtz for the new position and correct by 1 ply + dtz = -probe_dtz(pos, &result); + dtz = dtz > 0 ? dtz + 1 : dtz < 0 ? dtz - 1 : dtz; + } + + // Make sure that a mating move is assigned a dtz value of 1 + if (pos.checkers() && dtz == 2 && MoveList(pos).size() == 0) + dtz = 1; + + pos.undo_move(m.pv[0]); + + if (time_abort() || result == FAIL) + return false; + + // Better moves are ranked higher. Certain wins are ranked equally. + // Losing moves are ranked equally unless a 50-move draw is in sight. + int r = dtz > 0 ? (dtz + cnt50 <= 99 && !rep ? MAX_DTZ - (rankDTZ ? dtz : 0) + : MAX_DTZ / 2 - (dtz + cnt50)) + : dtz < 0 ? (-dtz * 2 + cnt50 < 100 ? -MAX_DTZ - (rankDTZ ? dtz : 0) + : -MAX_DTZ / 2 + (-dtz + cnt50)) + : 0; + m.tbRank = r; + + // Determine the score to be displayed for this move. 
Assign at least + // 1 cp to cursed wins and let it grow to 49 cp as the positions gets + // closer to a real win. + m.tbScore = r >= bound ? VALUE_MATE - MAX_PLY - 1 + : r > 0 ? Value((std::max(3, r - (MAX_DTZ / 2 - 200)) * int(PawnValue)) / 200) + : r == 0 ? VALUE_DRAW + : r > -bound + ? Value((std::min(-3, r + (MAX_DTZ / 2 - 200)) * int(PawnValue)) / 200) + : -VALUE_MATE + MAX_PLY + 1; + } + + return true; +} + + +// Use the WDL tables to rank root moves. +// This is a fallback for the case that some or all DTZ tables are missing. +// +// A return value false indicates that not all probes were successful. +bool Tablebases::root_probe_wdl(Position& pos, Search::RootMoves& rootMoves, bool rule50) { + + static const int WDL_to_rank[] = {-MAX_DTZ, -MAX_DTZ + 101, 0, MAX_DTZ - 101, MAX_DTZ}; + + ProbeState result = OK; + StateInfo st; + WDLScore wdl; + + + // Probe and rank each move + for (auto& m : rootMoves) + { + pos.do_move(m.pv[0], st); + + if (pos.is_draw(1)) + wdl = WDLDraw; + else + wdl = -probe_wdl(pos, &result); + + pos.undo_move(m.pv[0]); + + if (result == FAIL) + return false; + + m.tbRank = WDL_to_rank[wdl + 2]; + + if (!rule50) + wdl = wdl > WDLDraw ? WDLWin : wdl < WDLDraw ? 
WDLLoss : WDLDraw; + m.tbScore = WDL_to_value[wdl + 2]; + } + + return true; +} + +Config Tablebases::rank_root_moves(const OptionsMap& options, + Position& pos, + Search::RootMoves& rootMoves, + bool rankDTZ, + const std::function& time_abort) { + Config config; + + if (rootMoves.empty()) + return config; + + config.rootInTB = false; + config.useRule50 = bool(options["Syzygy50MoveRule"]); + config.probeDepth = int(options["SyzygyProbeDepth"]); + config.cardinality = int(options["SyzygyProbeLimit"]); + + bool dtz_available = true; + + // Tables with fewer pieces than SyzygyProbeLimit are searched with + // probeDepth == DEPTH_ZERO + if (config.cardinality > MaxCardinality) + { + config.cardinality = MaxCardinality; + config.probeDepth = 0; + } + + if (config.cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING)) + { + // Rank moves using DTZ tables, bail out if time_abort flags zeitnot + config.rootInTB = + root_probe(pos, rootMoves, options["Syzygy50MoveRule"], rankDTZ, time_abort); + + if (!config.rootInTB && !time_abort()) + { + // DTZ tables are missing; try to rank moves using WDL tables + dtz_available = false; + config.rootInTB = root_probe_wdl(pos, rootMoves, options["Syzygy50MoveRule"]); + } + } + + if (config.rootInTB) + { + // Sort moves according to TB rank + std::stable_sort( + rootMoves.begin(), rootMoves.end(), + [](const Search::RootMove& a, const Search::RootMove& b) { return a.tbRank > b.tbRank; }); + + // Probe during search only if DTZ is not available and we are winning + if (dtz_available || rootMoves[0].tbScore <= VALUE_DRAW) + config.cardinality = 0; + } + else + { + // Clean up if root_probe() and root_probe_wdl() have failed + for (auto& m : rootMoves) + m.tbRank = 0; + } + + return config; +} +} // namespace Stockfish diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h new file mode 100644 index 0000000000000000000000000000000000000000..7b60d6e20546c221c5e1a4f26c5b32bf04396115 --- /dev/null +++ b/src/syzygy/tbprobe.h 
@@ -0,0 +1,85 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef TBPROBE_H +#define TBPROBE_H + +#include +#include +#include + + +namespace Stockfish { +class Position; +class OptionsMap; + +using Depth = int; + +namespace Search { +struct RootMove; +using RootMoves = std::vector; +} +} + +namespace Stockfish::Tablebases { + +struct Config { + int cardinality = 0; + bool rootInTB = false; + bool useRule50 = false; + Depth probeDepth = 0; +}; + +enum WDLScore { + WDLLoss = -2, // Loss + WDLBlessedLoss = -1, // Loss, but draw under 50-move rule + WDLDraw = 0, // Draw + WDLCursedWin = 1, // Win, but draw under 50-move rule + WDLWin = 2, // Win +}; + +// Possible states after a probing operation +enum ProbeState { + FAIL = 0, // Probe failed (missing file table) + OK = 1, // Probe successful + CHANGE_STM = -1, // DTZ should check the other side + ZEROING_BEST_MOVE = 2 // Best move zeroes DTZ (capture or pawn move) +}; + +extern int MaxCardinality; + + +void init(const std::string& paths); +WDLScore probe_wdl(Position& pos, ProbeState* result); +int probe_dtz(Position& pos, ProbeState* result); +bool root_probe(Position& pos, + Search::RootMoves& rootMoves, + bool rule50, + bool rankDTZ, + const std::function& time_abort); +bool root_probe_wdl(Position& pos, Search::RootMoves& 
rootMoves, bool rule50); +Config rank_root_moves( + const OptionsMap& options, + Position& pos, + Search::RootMoves& rootMoves, + bool rankDTZ = false, + const std::function& time_abort = []() { return false; }); + +} // namespace Stockfish::Tablebases + +#endif diff --git a/src/thread.cpp b/src/thread.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a2f59d5b100f787c0c8abf5efab51bd775f4a93c --- /dev/null +++ b/src/thread.cpp @@ -0,0 +1,456 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "thread.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bitboard.h" +#include "history.h" +#include "memory.h" +#include "movegen.h" +#include "search.h" +#include "syzygy/tbprobe.h" +#include "timeman.h" +#include "types.h" +#include "uci.h" +#include "ucioption.h" + +namespace Stockfish { + +// Constructor launches the thread and waits until it goes to sleep +// in idle_loop(). Note that 'searching' and 'exit' should be already set. 
+Thread::Thread(Search::SharedState&                    sharedState,
+               std::unique_ptr<Search::ISearchManager> sm,
+               size_t                                  n,
+               size_t                                  numaN,
+               size_t                                  totalNumaCount,
+               OptionalThreadToNumaNodeBinder          binder) :
+    idx(n),
+    idxInNuma(numaN),
+    totalNuma(totalNumaCount),
+    nthreads(sharedState.options["Threads"]),
+    stdThread(&Thread::idle_loop, this) {
+
+    wait_for_search_finished();
+
+    run_custom_job([this, &binder, &sharedState, &sm, n]() {
+        // Use the binder to [maybe] bind the threads to a NUMA node before doing
+        // the Worker allocation. Ideally we would also allocate the SearchManager
+        // here, but that's minor.
+        this->numaAccessToken = binder();
+        this->worker          = make_unique_large_page<Search::Worker>(
+          sharedState, std::move(sm), n, idxInNuma, totalNuma, this->numaAccessToken);
+    });
+
+    wait_for_search_finished();
+}
+
+
+// Destructor wakes up the thread in idle_loop() and waits
+// for its termination. Thread should be already waiting.
+Thread::~Thread() {
+
+    assert(!searching);
+
+    exit = true;
+    start_searching();
+    stdThread.join();
+}
+
+// Wakes up the thread that will start the search
+void Thread::start_searching() {
+    assert(worker != nullptr);
+    run_custom_job([this]() { worker->start_searching(); });
+}
+
+// Clears the histories for the thread worker (usually before a new game)
+void Thread::clear_worker() {
+    assert(worker != nullptr);
+    run_custom_job([this]() { worker->clear(); });
+}
+
+// Blocks on the condition variable until the thread has finished searching
+void Thread::wait_for_search_finished() {
+
+    std::unique_lock<std::mutex> lk(mutex);
+    cv.wait(lk, [&] { return !searching; });
+}
+
+// Launching a function in the thread
+void Thread::run_custom_job(std::function<void()> f) {
+    {
+        std::unique_lock<std::mutex> lk(mutex);
+        cv.wait(lk, [&] { return !searching; });
+        jobFunc   = std::move(f);
+        searching = true;
+    }
+    cv.notify_one();
+}
+
+void Thread::ensure_network_replicated() { worker->ensure_network_replicated(); }
+
+// Thread gets parked here, blocked on the condition variable
+// when the thread has no work to do.
+
+void Thread::idle_loop() {
+    while (true)
+    {
+        std::unique_lock<std::mutex> lk(mutex);
+        searching = false;
+        cv.notify_one();  // Wake up anyone waiting for search finished
+        cv.wait(lk, [&] { return searching; });
+
+        if (exit)
+            return;
+
+        std::function<void()> job = std::move(jobFunc);
+        jobFunc                   = nullptr;
+
+        lk.unlock();
+
+        if (job)
+            job();
+    }
+}
+
+Search::SearchManager* ThreadPool::main_manager() { return main_thread()->worker->main_manager(); }
+
+uint64_t ThreadPool::nodes_searched() const { return accumulate(&Search::Worker::nodes); }
+uint64_t ThreadPool::tb_hits() const { return accumulate(&Search::Worker::tbHits); }
+
+static size_t next_power_of_two(uint64_t count) { return count > 1 ? (2ULL << msb(count - 1)) : 1; }
+
+// Creates/destroys threads to match the requested number.
+// Created and launched threads will immediately go to sleep in idle_loop.
+// Upon resizing, threads are recreated to allow for binding if necessary.
+void ThreadPool::set(const NumaConfig&                           numaConfig,
+                     Search::SharedState                         sharedState,
+                     const Search::SearchManager::UpdateContext& updateContext) {
+
+    if (threads.size() > 0)  // destroy any existing thread(s)
+    {
+        main_thread()->wait_for_search_finished();
+
+        threads.clear();
+
+        boundThreadToNumaNode.clear();
+    }
+
+    const size_t requested = sharedState.options["Threads"];
+
+    if (requested > 0)  // create new thread(s)
+    {
+        // Binding threads may be problematic when there's multiple NUMA nodes and
+        // multiple Stockfish instances running. In particular, if each instance
+        // runs a single thread then they would all be mapped to the first NUMA node.
+        // This is undesirable, and so the default behaviour (i.e. when the user does not
+        // change the NumaConfig UCI setting) is to not bind the threads to processors
+        // unless we know for sure that we span NUMA nodes and replication is required.
+        const std::string numaPolicy(sharedState.options["NumaPolicy"]);
+        const bool        doBindThreads = [&]() {
+            if (numaPolicy == "none")
+                return false;
+
+            if (numaPolicy == "auto")
+                return numaConfig.suggests_binding_threads(requested);
+
+            // numaPolicy == "system", or explicitly set by the user
+            return true;
+        }();
+
+        std::map<NumaIndex, uint64_t> counts;
+        boundThreadToNumaNode = doBindThreads
+                                ? numaConfig.distribute_threads_among_numa_nodes(requested)
+                                : std::vector<NumaIndex>{};
+
+        if (boundThreadToNumaNode.empty())
+            counts[0] = requested;  // Pretend all threads are part of numa node 0
+        else
+        {
+            for (size_t i = 0; i < boundThreadToNumaNode.size(); ++i)
+                counts[boundThreadToNumaNode[i]]++;
+        }
+
+        sharedState.sharedHistories.clear();
+        for (auto pair : counts)
+        {
+            NumaIndex numaIndex = pair.first;
+            uint64_t  count     = pair.second;
+            auto      f         = [&]() {
+                sharedState.sharedHistories.try_emplace(numaIndex, next_power_of_two(count));
+            };
+            if (doBindThreads)
+                numaConfig.execute_on_numa_node(numaIndex, f);
+            else
+                f();
+        }
+
+        auto threadsPerNode = counts;
+        counts.clear();
+
+        while (threads.size() < requested)
+        {
+            const size_t    threadId      = threads.size();
+            const NumaIndex numaId        = doBindThreads ? boundThreadToNumaNode[threadId] : 0;
+            auto            create_thread = [&]() {
+                auto manager = threadId == 0
+                               ? std::unique_ptr<Search::ISearchManager>(
+                                   std::make_unique<Search::SearchManager>(updateContext))
+                               : std::make_unique<Search::NullSearchManager>();
+
+                // When not binding threads we want to force all access to happen
+                // from the same NUMA node, because in case of NUMA replicated memory
+                // accesses we don't want to trash cache in case the threads get scheduled
+                // on the same NUMA node.
+                auto binder = doBindThreads ? OptionalThreadToNumaNodeBinder(numaConfig, numaId)
+                                            : OptionalThreadToNumaNodeBinder(numaId);
+
+                threads.emplace_back(std::make_unique<Thread>(sharedState, std::move(manager),
+                                                              threadId, counts[numaId]++,
+                                                              threadsPerNode[numaId], binder));
+            };
+
+            // Ensure the worker thread inherits the intended NUMA affinity at creation.
+            if (doBindThreads)
+                numaConfig.execute_on_numa_node(numaId, create_thread);
+            else
+                create_thread();
+        }
+
+        clear();
+
+        main_thread()->wait_for_search_finished();
+    }
+}
+
+
+// Sets threadPool data to initial values
+void ThreadPool::clear() {
+    if (threads.size() == 0)
+        return;
+
+    for (auto&& th : threads)
+        th->clear_worker();
+
+    for (auto&& th : threads)
+        th->wait_for_search_finished();
+
+    // These two affect the time taken on the first move of a game:
+    main_manager()->bestPreviousAverageScore = VALUE_INFINITE;
+    main_manager()->previousTimeReduction    = 0.85;
+
+    main_manager()->callsCnt           = 0;
+    main_manager()->bestPreviousScore  = VALUE_INFINITE;
+    main_manager()->originalTimeAdjust = -1;
+    main_manager()->tm.clear();
+}
+
+void ThreadPool::run_on_thread(size_t threadId, std::function<void()> f) {
+    assert(threads.size() > threadId);
+    threads[threadId]->run_custom_job(std::move(f));
+}
+
+void ThreadPool::wait_on_thread(size_t threadId) {
+    assert(threads.size() > threadId);
+    threads[threadId]->wait_for_search_finished();
+}
+
+size_t ThreadPool::num_threads() const { return threads.size(); }
+
+
+// Wakes up main thread waiting in idle_loop() and returns immediately.
+// Main thread will wake up other threads and start the search.
+void ThreadPool::start_thinking(const OptionsMap& options, + Position& pos, + StateListPtr& states, + Search::LimitsType limits) { + + main_thread()->wait_for_search_finished(); + + main_manager()->stopOnPonderhit = stop = false; + main_manager()->ponder = limits.ponderMode; + + increaseDepth = true; + + Search::RootMoves rootMoves; + const auto legalmoves = MoveList(pos); + + for (const auto& uciMove : limits.searchmoves) + { + auto move = UCIEngine::to_move(pos, uciMove); + + if (std::find(legalmoves.begin(), legalmoves.end(), move) != legalmoves.end()) + rootMoves.emplace_back(move); + } + + if (rootMoves.empty()) + for (const auto& m : legalmoves) + rootMoves.emplace_back(m); + + Tablebases::Config tbConfig = Tablebases::rank_root_moves(options, pos, rootMoves); + + // After ownership transfer 'states' becomes empty, so if we stop the search + // and call 'go' again without setting a new position states.get() == nullptr. + assert(states.get() || setupStates.get()); + + if (states.get()) + setupStates = std::move(states); // Ownership transfer, states is now empty + + // We use Position::set() to set root position across threads. But there are + // some StateInfo fields (previous, pliesFromNull, capturedPiece) that cannot + // be deduced from a fen string, so set() clears them and they are set from + // setupStates->back() later. The rootState is per thread, earlier states are + // shared since they are read-only. 
+ for (auto&& th : threads) + { + th->run_custom_job([&]() { + th->worker->limits = limits; + th->worker->nodes = th->worker->tbHits = th->worker->bestMoveChanges = 0; + th->worker->nmpMinPly = 0; + th->worker->rootDepth = th->worker->completedDepth = 0; + th->worker->rootMoves = rootMoves; + th->worker->rootPos.set(pos.fen(), pos.is_chess960(), &th->worker->rootState); + th->worker->rootState = setupStates->back(); + th->worker->tbConfig = tbConfig; + }); + } + + for (auto&& th : threads) + th->wait_for_search_finished(); + + main_thread()->start_searching(); +} + +Thread* ThreadPool::get_best_thread() const { + + Thread* bestThread = threads.front().get(); + Value minScore = VALUE_NONE; + + std::unordered_map votes( + 2 * std::min(size(), bestThread->worker->rootMoves.size())); + + // Find the minimum score of all threads + for (auto&& th : threads) + minScore = std::min(minScore, th->worker->rootMoves[0].score); + + // Vote according to score and depth, and select the best thread + auto thread_voting_value = [minScore](Thread* th) { + return (th->worker->rootMoves[0].score - minScore + 14) * int(th->worker->completedDepth); + }; + + for (auto&& th : threads) + votes[th->worker->rootMoves[0].pv[0]] += thread_voting_value(th.get()); + + for (auto&& th : threads) + { + const auto bestThreadScore = bestThread->worker->rootMoves[0].score; + const auto newThreadScore = th->worker->rootMoves[0].score; + + const auto& bestThreadPV = bestThread->worker->rootMoves[0].pv; + const auto& newThreadPV = th->worker->rootMoves[0].pv; + + const auto bestThreadMoveVote = votes[bestThreadPV[0]]; + const auto newThreadMoveVote = votes[newThreadPV[0]]; + + const bool bestThreadInProvenWin = is_win(bestThreadScore); + const bool newThreadInProvenWin = is_win(newThreadScore); + + const bool bestThreadInProvenLoss = + bestThreadScore != -VALUE_INFINITE && is_loss(bestThreadScore); + const bool newThreadInProvenLoss = + newThreadScore != -VALUE_INFINITE && is_loss(newThreadScore); + + // 
We make sure not to pick a thread with truncated principal variation + const bool betterVotingValue = + thread_voting_value(th.get()) * int(newThreadPV.size() > 2) + > thread_voting_value(bestThread) * int(bestThreadPV.size() > 2); + + if (bestThreadInProvenWin) + { + // Make sure we pick the shortest mate / TB conversion + if (newThreadScore > bestThreadScore) + bestThread = th.get(); + } + else if (bestThreadInProvenLoss) + { + // Make sure we pick the shortest mated / TB conversion + if (newThreadInProvenLoss && newThreadScore < bestThreadScore) + bestThread = th.get(); + } + else if (newThreadInProvenWin || newThreadInProvenLoss + || (!is_loss(newThreadScore) + && (newThreadMoveVote > bestThreadMoveVote + || (newThreadMoveVote == bestThreadMoveVote && betterVotingValue)))) + bestThread = th.get(); + } + + return bestThread; +} + + +// Start non-main threads. +// Will be invoked by main thread after it has started searching. +void ThreadPool::start_searching() { + + for (auto&& th : threads) + if (th != threads.front()) + th->start_searching(); +} + + +// Wait for non-main threads +void ThreadPool::wait_for_search_finished() const { + + for (auto&& th : threads) + if (th != threads.front()) + th->wait_for_search_finished(); +} + +std::vector ThreadPool::get_bound_thread_count_by_numa_node() const { + std::vector counts; + + if (!boundThreadToNumaNode.empty()) + { + NumaIndex highestNumaNode = 0; + for (NumaIndex n : boundThreadToNumaNode) + if (n > highestNumaNode) + highestNumaNode = n; + + counts.resize(highestNumaNode + 1, 0); + + for (NumaIndex n : boundThreadToNumaNode) + counts[n] += 1; + } + + return counts; +} + +void ThreadPool::ensure_network_replicated() { + for (auto&& th : threads) + th->ensure_network_replicated(); +} + +} // namespace Stockfish diff --git a/src/thread.h b/src/thread.h new file mode 100644 index 0000000000000000000000000000000000000000..d6032d295027dae03929ea8c21a3963158887f8e --- /dev/null +++ b/src/thread.h @@ -0,0 +1,181 @@ +/* 
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef THREAD_H_INCLUDED +#define THREAD_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memory.h" +#include "numa.h" +#include "position.h" +#include "search.h" +#include "thread_win32_osx.h" + +namespace Stockfish { + + +class OptionsMap; +using Value = int; + +// Sometimes we don't want to actually bind the threads, but the recipient still +// needs to think it runs on *some* NUMA node, such that it can access structures +// that rely on NUMA node knowledge. This class encapsulates this optional process +// such that the recipient does not need to know whether the binding happened or not. +class OptionalThreadToNumaNodeBinder { + public: + OptionalThreadToNumaNodeBinder(NumaIndex n) : + numaConfig(nullptr), + numaId(n) {} + + OptionalThreadToNumaNodeBinder(const NumaConfig& cfg, NumaIndex n) : + numaConfig(&cfg), + numaId(n) {} + + NumaReplicatedAccessToken operator()() const { + if (numaConfig != nullptr) + return numaConfig->bind_current_thread_to_numa_node(numaId); + else + return NumaReplicatedAccessToken(numaId); + } + + private: + const NumaConfig* numaConfig; + NumaIndex numaId; +}; + +// Abstraction of a thread. It contains a pointer to the worker and a native thread. 
+// After construction, the native thread is started with idle_loop() +// waiting for a signal to start searching. +// When the signal is received, the thread starts searching and when +// the search is finished, it goes back to idle_loop() waiting for a new signal. +class Thread { + public: + Thread(Search::SharedState&, + std::unique_ptr, + size_t, + size_t, + size_t, + OptionalThreadToNumaNodeBinder); + virtual ~Thread(); + + void idle_loop(); + void start_searching(); + void clear_worker(); + void run_custom_job(std::function f); + + void ensure_network_replicated(); + + // Thread has been slightly altered to allow running custom jobs, so + // this name is no longer correct. However, this class (and ThreadPool) + // require further work to make them properly generic while maintaining + // appropriate specificity regarding search, from the point of view of an + // outside user, so renaming of this function is left for whenever that happens. + void wait_for_search_finished(); + size_t id() const { return idx; } + + LargePagePtr worker; + std::function jobFunc; + + private: + std::mutex mutex; + std::condition_variable cv; + size_t idx, idxInNuma, totalNuma, nthreads; + bool exit = false, searching = true; // Set before starting std::thread + NativeThread stdThread; + NumaReplicatedAccessToken numaAccessToken; +}; + + +// ThreadPool struct handles all the threads-related stuff like init, starting, +// parking and, most importantly, launching a thread. All the access to threads +// is done through this class. 
+class ThreadPool { + public: + ThreadPool() {} + + ~ThreadPool() { + // destroy any existing thread(s) + if (threads.size() > 0) + { + main_thread()->wait_for_search_finished(); + + threads.clear(); + } + } + + ThreadPool(const ThreadPool&) = delete; + ThreadPool(ThreadPool&&) = delete; + + ThreadPool& operator=(const ThreadPool&) = delete; + ThreadPool& operator=(ThreadPool&&) = delete; + + void start_thinking(const OptionsMap&, Position&, StateListPtr&, Search::LimitsType); + void run_on_thread(size_t threadId, std::function f); + void wait_on_thread(size_t threadId); + size_t num_threads() const; + void clear(); + void set(const NumaConfig& numaConfig, + Search::SharedState, + const Search::SearchManager::UpdateContext&); + + Search::SearchManager* main_manager(); + Thread* main_thread() const { return threads.front().get(); } + uint64_t nodes_searched() const; + uint64_t tb_hits() const; + Thread* get_best_thread() const; + void start_searching(); + void wait_for_search_finished() const; + + std::vector get_bound_thread_count_by_numa_node() const; + + void ensure_network_replicated(); + + std::atomic_bool stop, increaseDepth; + + auto cbegin() const noexcept { return threads.cbegin(); } + auto begin() noexcept { return threads.begin(); } + auto end() noexcept { return threads.end(); } + auto cend() const noexcept { return threads.cend(); } + auto size() const noexcept { return threads.size(); } + auto empty() const noexcept { return threads.empty(); } + + private: + StateListPtr setupStates; + std::vector> threads; + std::vector boundThreadToNumaNode; + + uint64_t accumulate(std::atomic Search::Worker::* member) const { + + uint64_t sum = 0; + for (auto&& th : threads) + sum += (th->worker.get()->*member).load(std::memory_order_relaxed); + return sum; + } +}; + +} // namespace Stockfish + +#endif // #ifndef THREAD_H_INCLUDED diff --git a/src/thread_win32_osx.h b/src/thread_win32_osx.h new file mode 100644 index 
/*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)

  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

#ifndef THREAD_WIN32_OSX_H_INCLUDED
#define THREAD_WIN32_OSX_H_INCLUDED

#include <thread>

// On OSX threads other than the main thread are created with a reduced stack
// size of 512KB by default, this is too low for deep searches, which require
// somewhat more than 1MB stack, so adjust it to TH_STACK_SIZE.
// The implementation calls pthread_create() with the stack size parameter
// equal to the Linux 8MB default, on platforms that support it.

#if defined(__APPLE__) || defined(__MINGW32__) || defined(__MINGW64__) || defined(USE_PTHREADS)

    #include <functional>
    #include <pthread.h>

namespace Stockfish {

// Minimal std::thread look-alike backed by pthreads so the stack size can be
// set explicitly at creation time.
class NativeThread {
    pthread_t thread;

    static constexpr size_t TH_STACK_SIZE = 8 * 1024 * 1024;

   public:
    template<class Function, class... Args>
    explicit NativeThread(Function&& fun, Args&&... args) {
        // The callable is heap-allocated and handed to the thread entry,
        // which takes ownership and deletes it after invocation.
        auto func = new std::function<void()>(
          std::bind(std::forward<Function>(fun), std::forward<Args>(args)...));

        pthread_attr_t attr_storage, *attr = &attr_storage;
        pthread_attr_init(attr);
        pthread_attr_setstacksize(attr, TH_STACK_SIZE);

        auto start_routine = [](void* ptr) -> void* {
            auto f = reinterpret_cast<std::function<void()>*>(ptr);
            // Call the function
            (*f)();
            delete f;
            return nullptr;
        };

        pthread_create(&thread, attr, start_routine, func);

        // POSIX requires destroying an initialized attribute object once it is
        // no longer needed; pthread_create() has already copied its contents.
        pthread_attr_destroy(attr);
    }

    void join() { pthread_join(thread, nullptr); }
};

}  // namespace Stockfish

#else  // Default case: use STL classes

namespace Stockfish {

using NativeThread = std::thread;

}  // namespace Stockfish

#endif

#endif  // #ifndef THREAD_WIN32_OSX_H_INCLUDED
+*/ + +#include "timeman.h" + +#include +#include +#include +#include + +#include "search.h" +#include "ucioption.h" + +namespace Stockfish { + +TimePoint TimeManagement::optimum() const { return optimumTime; } +TimePoint TimeManagement::maximum() const { return maximumTime; } + +void TimeManagement::clear() { + availableNodes = -1; // When in 'nodes as time' mode +} + +void TimeManagement::advance_nodes_time(std::int64_t nodes) { + assert(useNodesTime); + availableNodes = std::max(int64_t(0), availableNodes - nodes); +} + +// Called at the beginning of the search and calculates +// the bounds of time allowed for the current game ply. We currently support: +// 1) x basetime (+ z increment) +// 2) x moves in y seconds (+ z increment) +void TimeManagement::init(Search::LimitsType& limits, + Color us, + int ply, + const OptionsMap& options, + double& originalTimeAdjust) { + TimePoint npmsec = TimePoint(options["nodestime"]); + + // If we have no time, we don't need to fully initialize TM. + // startTime is used by movetime and useNodesTime is used in elapsed calls. + startTime = limits.startTime; + useNodesTime = npmsec != 0; + + if (limits.time[us] == 0) + return; + + TimePoint moveOverhead = TimePoint(options["Move Overhead"]); + + // optScale is a percentage of available time to use for the current move. + // maxScale is a multiplier applied to optimumTime. + double optScale, maxScale; + + // If we have to play in 'nodes as time' mode, then convert from time + // to nodes, and use resulting values in time management formulas. + // WARNING: to avoid time losses, the given npmsec (nodes per millisecond) + // must be much lower than the real engine speed. 
+ if (useNodesTime) + { + if (availableNodes == -1) // Only once at game start + availableNodes = npmsec * limits.time[us]; // Time is in msec + + // Convert from milliseconds to nodes + limits.time[us] = TimePoint(availableNodes); + limits.inc[us] *= npmsec; + limits.npmsec = npmsec; + moveOverhead *= npmsec; + } + + // These numbers are used where multiplications, divisions or comparisons + // with constants are involved. + const int64_t scaleFactor = useNodesTime ? npmsec : 1; + const TimePoint scaledTime = limits.time[us] / scaleFactor; + + // Maximum move horizon + int centiMTG = limits.movestogo ? std::min(limits.movestogo * 100, 5000) : 5051; + + // If less than one second, gradually reduce mtg + if (scaledTime < 1000) + centiMTG = int(scaledTime * 5.051); + + // Make sure timeLeft is > 0 since we may use it as a divisor + TimePoint timeLeft = + std::max(TimePoint(1), + limits.time[us] + + (limits.inc[us] * (centiMTG - 100) - moveOverhead * (200 + centiMTG)) / 100); + + // x basetime (+ z increment) + // If there is a healthy increment, timeLeft can exceed the actual available + // game time for the current move, so also cap to a percentage of available game time. + if (limits.movestogo == 0) + { + // Extra time according to timeLeft + if (originalTimeAdjust < 0) + originalTimeAdjust = 0.3128 * std::log10(timeLeft) - 0.4354; + + // Calculate time constants based on current time left. 
+ double logTimeInSec = std::log10(scaledTime / 1000.0); + double optConstant = std::min(0.0032116 + 0.000321123 * logTimeInSec, 0.00508017); + double maxConstant = std::max(3.3977 + 3.03950 * logTimeInSec, 2.94761); + + optScale = std::min(0.0121431 + std::pow(ply + 2.94693, 0.461073) * optConstant, + 0.213035 * limits.time[us] / timeLeft) + * originalTimeAdjust; + + maxScale = std::min(6.67704, maxConstant + ply / 11.9847); + } + + // x moves in y seconds (+ z increment) + else + { + optScale = + std::min((0.88 + ply / 116.4) / (centiMTG / 100.0), 0.88 * limits.time[us] / timeLeft); + maxScale = 1.3 + 0.11 * (centiMTG / 100.0); + } + + // Limit the maximum possible time for this move + optimumTime = TimePoint(optScale * timeLeft); + maximumTime = + TimePoint(std::min(0.825179 * limits.time[us] - moveOverhead, maxScale * optimumTime)) - 10; + + if (options["Ponder"]) + optimumTime += optimumTime / 4; +} + +} // namespace Stockfish diff --git a/src/timeman.h b/src/timeman.h new file mode 100644 index 0000000000000000000000000000000000000000..08e8da10dcc7b5de65dbad53f43751cfc8a527fe --- /dev/null +++ b/src/timeman.h @@ -0,0 +1,67 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef TIMEMAN_H_INCLUDED +#define TIMEMAN_H_INCLUDED + +#include + +#include "misc.h" + +namespace Stockfish { + +class OptionsMap; +enum Color : uint8_t; + +namespace Search { +struct LimitsType; +} + +// The TimeManagement class computes the optimal time to think depending on +// the maximum available time, the game move number, and other parameters. +class TimeManagement { + public: + void init(Search::LimitsType& limits, + Color us, + int ply, + const OptionsMap& options, + double& originalTimeAdjust); + + TimePoint optimum() const; + TimePoint maximum() const; + template + TimePoint elapsed(FUNC nodes) const { + return useNodesTime ? TimePoint(nodes()) : elapsed_time(); + } + TimePoint elapsed_time() const { return now() - startTime; }; + + void clear(); + void advance_nodes_time(std::int64_t nodes); + + private: + TimePoint startTime; + TimePoint optimumTime; + TimePoint maximumTime; + + std::int64_t availableNodes = -1; // When in 'nodes as time' mode + bool useNodesTime = false; // True if we are in 'nodes as time' mode +}; + +} // namespace Stockfish + +#endif // #ifndef TIMEMAN_H_INCLUDED diff --git a/src/tt.cpp b/src/tt.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ef602809f79a6e17112de220e9cf6b2a47b12df5 --- /dev/null +++ b/src/tt.cpp @@ -0,0 +1,251 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "tt.h" + +#include +#include +#include +#include +#include + +#include "memory.h" +#include "misc.h" +#include "syzygy/tbprobe.h" +#include "thread.h" + +namespace Stockfish { + + +// TTEntry struct is the 10 bytes transposition table entry, defined as below: +// +// key 16 bit +// depth 8 bit +// generation 5 bit +// pv node 1 bit +// bound type 2 bit +// move 16 bit +// value 16 bit +// evaluation 16 bit +// +// These fields are in the same order as accessed by TT::probe(), since memory is fastest sequentially. +// Equally, the store order in save() matches this order. + +struct TTEntry { + + // Convert internal bitfields to external types + TTData read() const { + return TTData{Move(move16), Value(value16), + Value(eval16), Depth(depth8 + DEPTH_ENTRY_OFFSET), + Bound(genBound8 & 0x3), bool(genBound8 & 0x4)}; + } + + bool is_occupied() const; + void save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8); + // The returned age is a multiple of TranspositionTable::GENERATION_DELTA + uint8_t relative_age(const uint8_t generation8) const; + + private: + friend class TranspositionTable; + + uint16_t key16; + uint8_t depth8; + uint8_t genBound8; + Move move16; + int16_t value16; + int16_t eval16; +}; + +// `genBound8` is where most of the details are. We use the following constants to manipulate 5 leading generation bits +// and 3 trailing miscellaneous bits. + +// These bits are reserved for other things. 
+static constexpr unsigned GENERATION_BITS = 3; +// increment for generation field +static constexpr int GENERATION_DELTA = (1 << GENERATION_BITS); +// cycle length +static constexpr int GENERATION_CYCLE = 255 + GENERATION_DELTA; +// mask to pull out generation number +static constexpr int GENERATION_MASK = (0xFF << GENERATION_BITS) & 0xFF; + +// DEPTH_ENTRY_OFFSET exists because 1) we use `bool(depth8)` as the occupancy check, but +// 2) we need to store negative depths for QS. (`depth8` is the only field with "spare bits": +// we sacrifice the ability to store depths greater than 1<<8 less the offset, as asserted in `save`.) +bool TTEntry::is_occupied() const { return bool(depth8); } + +// Populates the TTEntry with a new node's data, possibly +// overwriting an old position. The update is not atomic and can be racy. +void TTEntry::save( + Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8) { + + // Preserve the old ttmove if we don't have a new one + if (m || uint16_t(k) != key16) + move16 = m; + + // Overwrite less valuable entries (cheapest checks first) + if (b == BOUND_EXACT || uint16_t(k) != key16 || d - DEPTH_ENTRY_OFFSET + 2 * pv > depth8 - 4 + || relative_age(generation8)) + { + assert(d > DEPTH_ENTRY_OFFSET); + assert(d < 256 + DEPTH_ENTRY_OFFSET); + + key16 = uint16_t(k); + depth8 = uint8_t(d - DEPTH_ENTRY_OFFSET); + genBound8 = uint8_t(generation8 | uint8_t(pv) << 2 | b); + value16 = int16_t(v); + eval16 = int16_t(ev); + } +} + + +uint8_t TTEntry::relative_age(const uint8_t generation8) const { + // Due to our packed storage format for generation and its cyclic + // nature we add GENERATION_CYCLE (256 is the modulus, plus what + // is needed to keep the unrelated lowest n bits from affecting + // the result) to calculate the entry age correctly even after + // generation8 overflows into the next cycle. 
+ return (GENERATION_CYCLE + generation8 - genBound8) & GENERATION_MASK; +} + + +// TTWriter is but a very thin wrapper around the pointer +TTWriter::TTWriter(TTEntry* tte) : + entry(tte) {} + +void TTWriter::write( + Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8) { + entry->save(k, v, pv, b, d, m, ev, generation8); +} + + +// A TranspositionTable is an array of Cluster, of size clusterCount. Each cluster consists of ClusterSize number +// of TTEntry. Each non-empty TTEntry contains information on exactly one position. The size of a Cluster should +// divide the size of a cache line for best performance, as the cacheline is prefetched when possible. + +static constexpr int ClusterSize = 3; + +struct Cluster { + TTEntry entry[ClusterSize]; + char padding[2]; // Pad to 32 bytes +}; + +static_assert(sizeof(Cluster) == 32, "Suboptimal Cluster size"); + + +// Sets the size of the transposition table, +// measured in megabytes. Transposition table consists +// of clusters and each cluster consists of ClusterSize number of TTEntry. +void TranspositionTable::resize(size_t mbSize, ThreadPool& threads) { + aligned_large_pages_free(table); + + clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster); + + table = static_cast(aligned_large_pages_alloc(clusterCount * sizeof(Cluster))); + + if (!table) + { + std::cerr << "Failed to allocate " << mbSize << "MB for transposition table." << std::endl; + exit(EXIT_FAILURE); + } + + clear(threads); +} + + +// Initializes the entire transposition table to zero, +// in a multi-threaded way. +void TranspositionTable::clear(ThreadPool& threads) { + generation8 = 0; + const size_t threadCount = threads.num_threads(); + + for (size_t i = 0; i < threadCount; ++i) + { + threads.run_on_thread(i, [this, i, threadCount]() { + // Each thread will zero its part of the hash table + const size_t stride = clusterCount / threadCount; + const size_t start = stride * i; + const size_t len = i + 1 != threadCount ? 
stride : clusterCount - start; + + std::memset(&table[start], 0, len * sizeof(Cluster)); + }); + } + + for (size_t i = 0; i < threadCount; ++i) + threads.wait_on_thread(i); +} + + +// Returns an approximation of the hashtable +// occupation during a search. The hash is x permill full, as per UCI protocol. +// Only counts entries which match the current generation. +int TranspositionTable::hashfull(int maxAge) const { + int maxAgeInternal = maxAge << GENERATION_BITS; + int cnt = 0; + for (int i = 0; i < 1000; ++i) + for (int j = 0; j < ClusterSize; ++j) + cnt += table[i].entry[j].is_occupied() + && table[i].entry[j].relative_age(generation8) <= maxAgeInternal; + + return cnt / ClusterSize; +} + + +void TranspositionTable::new_search() { + // increment by delta to keep lower bits as is + generation8 += GENERATION_DELTA; +} + + +uint8_t TranspositionTable::generation() const { return generation8; } + + +// Looks up the current position in the transposition +// table. It returns true if the position is found. +// Otherwise, it returns false and a pointer to an empty or least valuable TTEntry +// to be replaced later. The replace value of an entry is calculated as its depth +// minus 8 times its relative age. TTEntry t1 is considered more valuable than +// TTEntry t2 if its replace value is greater than that of t2. +std::tuple TranspositionTable::probe(const Key key) const { + + TTEntry* const tte = first_entry(key); + const uint16_t key16 = uint16_t(key); // Use the low 16 bits as key inside the cluster + + for (int i = 0; i < ClusterSize; ++i) + if (tte[i].key16 == key16) + // This gap is the main place for read races. + // After `read()` completes that copy is final, but may be self-inconsistent. 
+ return {tte[i].is_occupied(), tte[i].read(), TTWriter(&tte[i])}; + + // Find an entry to be replaced according to the replacement strategy + TTEntry* replace = tte; + for (int i = 1; i < ClusterSize; ++i) + if (replace->depth8 - replace->relative_age(generation8) + > tte[i].depth8 - tte[i].relative_age(generation8)) + replace = &tte[i]; + + return {false, + TTData{Move::none(), VALUE_NONE, VALUE_NONE, DEPTH_ENTRY_OFFSET, BOUND_NONE, false}, + TTWriter(replace)}; +} + + +TTEntry* TranspositionTable::first_entry(const Key key) const { + return &table[mul_hi64(key, clusterCount)].entry[0]; +} + +} // namespace Stockfish diff --git a/src/tt.h b/src/tt.h new file mode 100644 index 0000000000000000000000000000000000000000..38f6c8f4f6263db985ac2cc3b6d68c7b79674cb0 --- /dev/null +++ b/src/tt.h @@ -0,0 +1,110 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef TT_H_INCLUDED +#define TT_H_INCLUDED + +#include +#include +#include + +#include "memory.h" +#include "types.h" + +namespace Stockfish { + +class ThreadPool; +struct TTEntry; +struct Cluster; + +// There is only one global hash table for the engine and all its threads. 
For chess in particular, we even allow racy +// updates between threads to and from the TT, as taking the time to synchronize access would cost thinking time and +// thus elo. As a hash table, collisions are possible and may cause chess playing issues (bizarre blunders, faulty mate +// reports, etc). Fixing these also loses elo; however such risk decreases quickly with larger TT size. +// +// `probe` is the primary method: given a board position, we lookup its entry in the table, and return a tuple of: +// 1) whether the entry already has this position +// 2) a copy of the prior data (if any) (may be inconsistent due to read races) +// 3) a writer object to this entry +// The copied data and the writer are separated to maintain clear boundaries between local vs global objects. + + +// A copy of the data already in the entry (possibly collided). `probe` may be racy, resulting in inconsistent data. +struct TTData { + Move move; + Value value, eval; + Depth depth; + Bound bound; + bool is_pv; + + TTData() = delete; + + // clang-format off + TTData(Move m, Value v, Value ev, Depth d, Bound b, bool pv) : + move(m), + value(v), + eval(ev), + depth(d), + bound(b), + is_pv(pv) {}; + // clang-format on +}; + + +// This is used to make racy writes to the global TT. 
+struct TTWriter { + public: + void write(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8); + + private: + friend class TranspositionTable; + TTEntry* entry; + TTWriter(TTEntry* tte); +}; + + +class TranspositionTable { + + public: + ~TranspositionTable() { aligned_large_pages_free(table); } + + void resize(size_t mbSize, ThreadPool& threads); // Set TT size + void clear(ThreadPool& threads); // Re-initialize memory, multithreaded + int hashfull(int maxAge = 0) + const; // Approximate what fraction of entries (permille) have been written to during this root search + + void + new_search(); // This must be called at the beginning of each root search to track entry aging + uint8_t generation() const; // The current age, used when writing new data to the TT + std::tuple + probe(const Key key) const; // The main method, whose retvals separate local vs global objects + TTEntry* first_entry(const Key key) + const; // This is the hash function; its only external use is memory prefetching. + + private: + friend struct TTEntry; + + size_t clusterCount; + Cluster* table = nullptr; + + uint8_t generation8 = 0; // Size must be not bigger than TTEntry::genBound8 +}; + +} // namespace Stockfish + +#endif // #ifndef TT_H_INCLUDED diff --git a/src/tune.cpp b/src/tune.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f930c267e22ecb6b68712c1f89da3b03720bf77a --- /dev/null +++ b/src/tune.cpp @@ -0,0 +1,126 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "tune.h" + +#include +#include +#include +#include +#include +#include + +#include "ucioption.h" + +using std::string; + +namespace Stockfish { + +bool Tune::update_on_last; +const Option* LastOption = nullptr; +OptionsMap* Tune::options; +namespace { +std::map TuneResults; + +std::optional on_tune(const Option& o) { + + if (!Tune::update_on_last || LastOption == &o) + Tune::read_options(); + + return std::nullopt; +} +} + +void Tune::make_option(OptionsMap* opts, const string& n, int v, const SetRange& r) { + + // Do not generate option when there is nothing to tune (ie. min = max) + if (r(v).first == r(v).second) + return; + + if (TuneResults.count(n)) + v = TuneResults[n]; + + opts->add(n, Option(v, r(v).first, r(v).second, on_tune)); + LastOption = &((*opts)[n]); + + // Print formatted parameters, ready to be copy-pasted in Fishtest + std::cout << n << "," // + << v << "," // + << r(v).first << "," // + << r(v).second << "," // + << (r(v).second - r(v).first) / 20.0 << "," // + << "0.0020" << std::endl; +} + +string Tune::next(string& names, bool pop) { + + string name; + + do + { + string token = names.substr(0, names.find(',')); + + if (pop) + names.erase(0, token.size() + 1); + + std::stringstream ws(token); + name += (ws >> token, token); // Remove trailing whitespace + + } while (std::count(name.begin(), name.end(), '(') - std::count(name.begin(), name.end(), ')')); + + return name; +} + + +template<> +void Tune::Entry::init_option() { + make_option(options, name, value, range); +} + +template<> +void Tune::Entry::read_option() { + if (options->count(name)) + value = int((*options)[name]); +} + 
+// Instead of a variable here we have a PostUpdate function: just call it +template<> +void Tune::Entry::init_option() {} +template<> +void Tune::Entry::read_option() { + value(); +} + +} // namespace Stockfish + + +// Init options with tuning session results instead of default values. Useful to +// get correct bench signature after a tuning session or to test tuned values. +// Just copy fishtest tuning results in a result.txt file and extract the +// values with: +// +// cat results.txt | sed 's/^param: \([^,]*\), best: \([^,]*\).*/ TuneResults["\1"] = int(round(\2));/' +// +// Then paste the output below, as the function body + + +namespace Stockfish { + +void Tune::read_results() { /* ...insert your values here... */ } + +} // namespace Stockfish diff --git a/src/tune.h b/src/tune.h new file mode 100644 index 0000000000000000000000000000000000000000..4ce6e759fde3b7920e3649dce438e185eb6bd42f --- /dev/null +++ b/src/tune.h @@ -0,0 +1,192 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef TUNE_H_INCLUDED +#define TUNE_H_INCLUDED + +#include +#include +#include +#include // IWYU pragma: keep +#include +#include + +namespace Stockfish { + +class OptionsMap; + +using Range = std::pair; // Option's min-max values +using RangeFun = Range(int); + +// Default Range function, to calculate Option's min-max values +inline Range default_range(int v) { return v > 0 ? Range(0, 2 * v) : Range(2 * v, 0); } + +struct SetRange { + explicit SetRange(RangeFun f) : + fun(f) {} + SetRange(int min, int max) : + fun(nullptr), + range(min, max) {} + Range operator()(int v) const { return fun ? fun(v) : range; } + + RangeFun* fun; + Range range; +}; + +#define SetDefaultRange SetRange(default_range) + + +// Tune class implements the 'magic' code that makes the setup of a fishtest tuning +// session as easy as it can be. Mainly you have just to remove const qualifiers +// from the variables you want to tune and flag them for tuning, so if you have: +// +// const Value myValue[][2] = { { V(100), V(20) }, { V(7), V(78) } }; +// +// If you have a my_post_update() function to run after values have been updated, +// and a my_range() function to set custom Option's min-max values, then you just +// remove the 'const' qualifiers and write somewhere below in the file: +// +// TUNE(SetRange(my_range), myValue, my_post_update); +// +// You can also set the range directly, and restore the default at the end +// +// TUNE(SetRange(-100, 100), myValue, SetDefaultRange); +// +// In case update function is slow and you have many parameters, you can add: +// +// UPDATE_ON_LAST(); +// +// And the values update, including post update function call, will be done only +// once, after the engine receives the last UCI option, that is the one defined +// and created as the last one, so the GUI should send the options in the same +// order in which have been defined. 
+ +class Tune { + + using PostUpdate = void(); // Post-update function + + Tune() { read_results(); } + Tune(const Tune&) = delete; + void operator=(const Tune&) = delete; + void read_results(); + + static Tune& instance() { + static Tune t; + return t; + } // Singleton + + // Use polymorphism to accommodate Entry of different types in the same vector + struct EntryBase { + virtual ~EntryBase() = default; + virtual void init_option() = 0; + virtual void read_option() = 0; + }; + + template + struct Entry: public EntryBase { + + static_assert(!std::is_const_v, "Parameter cannot be const!"); + + static_assert(std::is_same_v || std::is_same_v, + "Parameter type not supported!"); + + Entry(const std::string& n, T& v, const SetRange& r) : + name(n), + value(v), + range(r) {} + void operator=(const Entry&) = delete; // Because 'value' is a reference + void init_option() override; + void read_option() override; + + std::string name; + T& value; + SetRange range; + }; + + // Our facility to fill the container, each Entry corresponds to a parameter + // to tune. We use variadic templates to deal with an unspecified number of + // entries, each one of a possible different type. + static std::string next(std::string& names, bool pop = true); + + int add(const SetRange&, std::string&&) { return 0; } + + template + int add(const SetRange& range, std::string&& names, T& value, Args&&... args) { + list.push_back(std::unique_ptr(new Entry(next(names), value, range))); + return add(range, std::move(names), args...); + } + + // Template specialization for arrays: recursively handle multi-dimensional arrays + template + int add(const SetRange& range, std::string&& names, T (&value)[N], Args&&... 
args) { + for (size_t i = 0; i < N; i++) + add(range, next(names, i == N - 1) + "[" + std::to_string(i) + "]", value[i]); + return add(range, std::move(names), args...); + } + + // Template specialization for SetRange + template + int add(const SetRange&, std::string&& names, SetRange& value, Args&&... args) { + return add(value, (next(names), std::move(names)), args...); + } + + static void make_option(OptionsMap* options, const std::string& n, int v, const SetRange& r); + + std::vector> list; + + public: + template + static int add(const std::string& names, Args&&... args) { + return instance().add(SetDefaultRange, names.substr(1, names.size() - 2), + args...); // Remove trailing parenthesis + } + static void init(OptionsMap& o) { + options = &o; + for (auto& e : instance().list) + e->init_option(); + read_options(); + } // Deferred, due to UCIEngine::Options access + static void read_options() { + for (auto& e : instance().list) + e->read_option(); + } + + static bool update_on_last; + static OptionsMap* options; +}; + +template +constexpr void tune_check_args(Args&&...) { + static_assert((!std::is_fundamental_v && ...), "TUNE macro arguments wrong"); +} + +// Some macro magic :-) we define a dummy int variable that the compiler initializes calling Tune::add() +#define STRINGIFY(x) #x +#define UNIQUE2(x, y) x##y +#define UNIQUE(x, y) UNIQUE2(x, y) // Two indirection levels to expand __LINE__ +#define TUNE(...) 
\ + int UNIQUE(p, __LINE__) = []() -> int { \ + tune_check_args(__VA_ARGS__); \ + return Tune::add(STRINGIFY((__VA_ARGS__)), __VA_ARGS__); \ + }(); + +#define UPDATE_ON_LAST() bool UNIQUE(p, __LINE__) = Tune::update_on_last = true + +} // namespace Stockfish + +#endif // #ifndef TUNE_H_INCLUDED diff --git a/src/types.h b/src/types.h new file mode 100644 index 0000000000000000000000000000000000000000..bfaa658e9c9b7d3b87e38ed5ab47c27ab07bd574 --- /dev/null +++ b/src/types.h @@ -0,0 +1,492 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef TYPES_H_INCLUDED + #define TYPES_H_INCLUDED + +// When compiling with provided Makefile (e.g. for Linux and OSX), configuration +// is done automatically. To get started type 'make help'. +// +// When Makefile is not used (e.g. with Microsoft Visual Studio) some switches +// need to be set manually: +// +// -DNDEBUG | Disable debugging mode. Always use this for release. +// +// -DNO_PREFETCH | Disable use of prefetch asm-instruction. You may need this to +// | run on some very old machines. +// +// -DUSE_POPCNT | Add runtime support for use of popcnt asm-instruction. Works +// | only in 64-bit mode and requires hardware with popcnt support. +// +// -DUSE_PEXT | Add runtime support for use of pext asm-instruction. 
Works +// | only in 64-bit mode and requires hardware with pext support. + + #include + #include + #include + #include + #include "misc.h" + + #if defined(_MSC_VER) + // Disable some silly and noisy warnings from MSVC compiler + #pragma warning(disable: 4127) // Conditional expression is constant + #pragma warning(disable: 4146) // Unary minus operator applied to unsigned type + #pragma warning(disable: 4800) // Forcing value to bool 'true' or 'false' + #endif + +// Predefined macros hell: +// +// __GNUC__ Compiler is GCC, Clang or ICX +// __clang__ Compiler is Clang or ICX +// __INTEL_LLVM_COMPILER Compiler is ICX +// _MSC_VER Compiler is MSVC +// _WIN32 Building on Windows (any) +// _WIN64 Building on Windows 64 bit + +// Enforce minimum GCC version + #if defined(__GNUC__) && !defined(__clang__) \ + && (__GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ < 3)) + #error "Stockfish requires GCC 9.3 or later for correct compilation" + #endif + + // Enforce minimum Clang version + #if defined(__clang__) && (__clang_major__ < 10) + #error "Stockfish requires Clang 10.0 or later for correct compilation" + #endif + + #define ASSERT_ALIGNED(ptr, alignment) assert(reinterpret_cast(ptr) % alignment == 0) + + #if defined(_WIN64) && defined(_MSC_VER) // No Makefile used + #include // Microsoft header for _BitScanForward64() + #define IS_64BIT + #endif + + #if defined(USE_POPCNT) && defined(_MSC_VER) + #include // Microsoft header for _mm_popcnt_u64() + #endif + + #if !defined(NO_PREFETCH) && defined(_MSC_VER) + #include // Microsoft header for _mm_prefetch() + #endif + + #if defined(USE_PEXT) + #include // Header for _pext_u64() intrinsic + #define pext(b, m) _pext_u64(b, m) + #else + #define pext(b, m) 0 + #endif + +namespace Stockfish { + + #ifdef USE_POPCNT +constexpr bool HasPopCnt = true; + #else +constexpr bool HasPopCnt = false; + #endif + + #ifdef USE_PEXT +constexpr bool HasPext = true; + #else +constexpr bool HasPext = false; + #endif + + #ifdef IS_64BIT +constexpr 
bool Is64Bit = true; + #else +constexpr bool Is64Bit = false; + #endif + +using Key = uint64_t; +using Bitboard = uint64_t; + +constexpr int MAX_MOVES = 256; +constexpr int MAX_PLY = 246; + +enum Color : uint8_t { + WHITE, + BLACK, + COLOR_NB = 2 +}; + +enum CastlingRights : uint8_t { + NO_CASTLING, + WHITE_OO, + WHITE_OOO = WHITE_OO << 1, + BLACK_OO = WHITE_OO << 2, + BLACK_OOO = WHITE_OO << 3, + + KING_SIDE = WHITE_OO | BLACK_OO, + QUEEN_SIDE = WHITE_OOO | BLACK_OOO, + WHITE_CASTLING = WHITE_OO | WHITE_OOO, + BLACK_CASTLING = BLACK_OO | BLACK_OOO, + ANY_CASTLING = WHITE_CASTLING | BLACK_CASTLING, + + CASTLING_RIGHT_NB = 16 +}; + +enum Bound : uint8_t { + BOUND_NONE, + BOUND_UPPER, + BOUND_LOWER, + BOUND_EXACT = BOUND_UPPER | BOUND_LOWER +}; + +// Value is used as an alias for int, this is done to differentiate between a search +// value and any other integer value. The values used in search are always supposed +// to be in the range (-VALUE_NONE, VALUE_NONE] and should not exceed this range. 
using Value = int;

constexpr Value VALUE_ZERO     = 0;
constexpr Value VALUE_DRAW     = 0;
constexpr Value VALUE_NONE     = 32002;  // Sentinel: "no value"; outside the valid search range
constexpr Value VALUE_INFINITE = 32001;  // Search window bound, strictly above any mate score

constexpr Value VALUE_MATE             = 32000;
constexpr Value VALUE_MATE_IN_MAX_PLY  = VALUE_MATE - MAX_PLY;
constexpr Value VALUE_MATED_IN_MAX_PLY = -VALUE_MATE_IN_MAX_PLY;

// Tablebase scores sit just below the mate range so mates always rank higher
constexpr Value VALUE_TB                 = VALUE_MATE_IN_MAX_PLY - 1;
constexpr Value VALUE_TB_WIN_IN_MAX_PLY  = VALUE_TB - MAX_PLY;
constexpr Value VALUE_TB_LOSS_IN_MAX_PLY = -VALUE_TB_WIN_IN_MAX_PLY;


// True unless the value is the VALUE_NONE sentinel
constexpr bool is_valid(Value value) { return value != VALUE_NONE; }

// True if the value is a proven win (tablebase win or mate score)
constexpr bool is_win(Value value) {
    assert(is_valid(value));
    return value >= VALUE_TB_WIN_IN_MAX_PLY;
}

// True if the value is a proven loss (tablebase loss or mated score)
constexpr bool is_loss(Value value) {
    assert(is_valid(value));
    return value <= VALUE_TB_LOSS_IN_MAX_PLY;
}

// True if the value is a proven win or loss (as opposed to a heuristic eval)
constexpr bool is_decisive(Value value) { return is_win(value) || is_loss(value); }

// In the code, we make the assumption that these values
// are such that non_pawn_material() can be used to uniquely
// identify the material on the board.
constexpr Value PawnValue   = 208;
constexpr Value KnightValue = 781;
constexpr Value BishopValue = 825;
constexpr Value RookValue   = 1276;
constexpr Value QueenValue  = 2538;


// clang-format off
enum PieceType : std::uint8_t {
    NO_PIECE_TYPE, PAWN, KNIGHT, BISHOP, ROOK, QUEEN, KING,
    ALL_PIECES = 0,  // aliases NO_PIECE_TYPE: bitboard index meaning "all piece types"
    PIECE_TYPE_NB = 8
};

// Piece encodes color in bit 3 and piece type in bits 0-2
enum Piece : std::uint8_t {
    NO_PIECE,
    W_PAWN = PAWN,     W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
    B_PAWN = PAWN + 8, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
    PIECE_NB = 16
};
// clang-format on

// Indexed by Piece; kings (and empty/unused slots) score zero
constexpr Value PieceValue[PIECE_NB] = {
  VALUE_ZERO, PawnValue, KnightValue, BishopValue, RookValue, QueenValue, VALUE_ZERO, VALUE_ZERO,
  VALUE_ZERO, PawnValue, KnightValue, BishopValue, RookValue, QueenValue, VALUE_ZERO, VALUE_ZERO};

using Depth = int;

// The following DEPTH_ constants are used for transposition table entries
// and quiescence search move generation stages. In regular search, the
// depth stored in the transposition table is literal: the search depth
// (effort) used to make the corresponding transposition table value. In
// quiescence search, however, the transposition table entries only store
// the current quiescence move generation stage (which should thus compare
// lower than any regular search depth).
constexpr Depth DEPTH_QS = 0;
// For transposition table entries where no searching at all was done
// (whether regular or qsearch) we use DEPTH_UNSEARCHED, which should thus
// compare lower than any quiescence or regular depth. DEPTH_ENTRY_OFFSET
// is used only for the transposition table entry occupancy check (see tt.cpp),
// and should thus be lower than DEPTH_UNSEARCHED.
+constexpr Depth DEPTH_UNSEARCHED = -2; +constexpr Depth DEPTH_ENTRY_OFFSET = -3; + +// clang-format off +enum Square : uint8_t { + SQ_A1, SQ_B1, SQ_C1, SQ_D1, SQ_E1, SQ_F1, SQ_G1, SQ_H1, + SQ_A2, SQ_B2, SQ_C2, SQ_D2, SQ_E2, SQ_F2, SQ_G2, SQ_H2, + SQ_A3, SQ_B3, SQ_C3, SQ_D3, SQ_E3, SQ_F3, SQ_G3, SQ_H3, + SQ_A4, SQ_B4, SQ_C4, SQ_D4, SQ_E4, SQ_F4, SQ_G4, SQ_H4, + SQ_A5, SQ_B5, SQ_C5, SQ_D5, SQ_E5, SQ_F5, SQ_G5, SQ_H5, + SQ_A6, SQ_B6, SQ_C6, SQ_D6, SQ_E6, SQ_F6, SQ_G6, SQ_H6, + SQ_A7, SQ_B7, SQ_C7, SQ_D7, SQ_E7, SQ_F7, SQ_G7, SQ_H7, + SQ_A8, SQ_B8, SQ_C8, SQ_D8, SQ_E8, SQ_F8, SQ_G8, SQ_H8, + SQ_NONE, + + SQUARE_ZERO = 0, + SQUARE_NB = 64 +}; +// clang-format on + +enum Direction : int8_t { + NORTH = 8, + EAST = 1, + SOUTH = -NORTH, + WEST = -EAST, + + NORTH_EAST = NORTH + EAST, + SOUTH_EAST = SOUTH + EAST, + SOUTH_WEST = SOUTH + WEST, + NORTH_WEST = NORTH + WEST +}; + +enum File : uint8_t { + FILE_A, + FILE_B, + FILE_C, + FILE_D, + FILE_E, + FILE_F, + FILE_G, + FILE_H, + FILE_NB +}; + +enum Rank : uint8_t { + RANK_1, + RANK_2, + RANK_3, + RANK_4, + RANK_5, + RANK_6, + RANK_7, + RANK_8, + RANK_NB +}; + +// Keep track of what a move changes on the board (used by NNUE) +struct DirtyPiece { + Piece pc; // this is never allowed to be NO_PIECE + Square from, to; // to should be SQ_NONE for promotions + + // if {add,remove}_sq is SQ_NONE, {add,remove}_pc is allowed to be + // uninitialized + // castling uses add_sq and remove_sq to remove and add the rook + Square remove_sq, add_sq; + Piece remove_pc, add_pc; +}; + +// Keep track of what threats change on the board (used by NNUE) +struct DirtyThreat { + static constexpr int PcSqOffset = 0; + static constexpr int ThreatenedSqOffset = 8; + static constexpr int ThreatenedPcOffset = 16; + static constexpr int PcOffset = 20; + + DirtyThreat() { /* don't initialize data */ } + DirtyThreat(uint32_t raw) : + data(raw) {} + DirtyThreat(Piece pc, Piece threatened_pc, Square pc_sq, Square threatened_sq, bool add) { + data = 
(uint32_t(add) << 31) | (pc << PcOffset) | (threatened_pc << ThreatenedPcOffset) + | (threatened_sq << ThreatenedSqOffset) | (pc_sq << PcSqOffset); + } + + Piece pc() const { return static_cast(data >> PcOffset & 0xf); } + Piece threatened_pc() const { return static_cast(data >> ThreatenedPcOffset & 0xf); } + Square threatened_sq() const { return static_cast(data >> ThreatenedSqOffset & 0xff); } + Square pc_sq() const { return static_cast(data >> PcSqOffset & 0xff); } + bool add() const { return data >> 31; } + uint32_t raw() const { return data; } + + private: + uint32_t data; +}; + +// A piece can be involved in at most 8 outgoing attacks and 16 incoming attacks. +// Moving a piece also can reveal at most 8 discovered attacks. +// This implies that a non-castling move can change at most (8 + 16) * 3 + 8 = 80 features. +// By similar logic, a castling move can change at most (5 + 1 + 3 + 9) * 2 = 36 features. +// Thus, 80 should work as an upper bound. Finally, 16 entries are added to accommodate +// unmasked vector stores near the end of the list. 
+ +using DirtyThreatList = ValueList; + +struct DirtyThreats { + DirtyThreatList list; + Color us; + Square prevKsq, ksq; + + Bitboard threatenedSqs, threateningSqs; +}; + + #define ENABLE_INCR_OPERATORS_ON(T) \ + constexpr T& operator++(T& d) { return d = T(int(d) + 1); } \ + constexpr T& operator--(T& d) { return d = T(int(d) - 1); } + +ENABLE_INCR_OPERATORS_ON(PieceType) +ENABLE_INCR_OPERATORS_ON(Square) +ENABLE_INCR_OPERATORS_ON(File) +ENABLE_INCR_OPERATORS_ON(Rank) + + #undef ENABLE_INCR_OPERATORS_ON + +constexpr Direction operator+(Direction d1, Direction d2) { return Direction(int(d1) + int(d2)); } +constexpr Direction operator*(int i, Direction d) { return Direction(i * int(d)); } + +// Additional operators to add a Direction to a Square +constexpr Square operator+(Square s, Direction d) { return Square(int(s) + int(d)); } +constexpr Square operator-(Square s, Direction d) { return Square(int(s) - int(d)); } +constexpr Square& operator+=(Square& s, Direction d) { return s = s + d; } +constexpr Square& operator-=(Square& s, Direction d) { return s = s - d; } + +// Toggle color +constexpr Color operator~(Color c) { return Color(c ^ BLACK); } + +// Swap A1 <-> A8 +constexpr Square flip_rank(Square s) { return Square(s ^ SQ_A8); } + +// Swap A1 <-> H1 +constexpr Square flip_file(Square s) { return Square(s ^ SQ_H1); } + +// Swap color of piece B_KNIGHT <-> W_KNIGHT +constexpr Piece operator~(Piece pc) { return Piece(pc ^ 8); } + +constexpr CastlingRights operator&(Color c, CastlingRights cr) { + return CastlingRights((c == WHITE ? 
WHITE_CASTLING : BLACK_CASTLING) & cr); +} + +constexpr Value mate_in(int ply) { return VALUE_MATE - ply; } + +constexpr Value mated_in(int ply) { return -VALUE_MATE + ply; } + +constexpr Square make_square(File f, Rank r) { return Square((r << 3) + f); } + +constexpr Piece make_piece(Color c, PieceType pt) { return Piece((c << 3) + pt); } + +constexpr PieceType type_of(Piece pc) { return PieceType(pc & 7); } + +constexpr Color color_of(Piece pc) { + assert(pc != NO_PIECE); + return Color(pc >> 3); +} + +constexpr bool is_ok(Square s) { return s >= SQ_A1 && s <= SQ_H8; } + +constexpr File file_of(Square s) { return File(s & 7); } + +constexpr Rank rank_of(Square s) { return Rank(s >> 3); } + +constexpr Square relative_square(Color c, Square s) { return Square(s ^ (c * 56)); } + +constexpr Rank relative_rank(Color c, Rank r) { return Rank(r ^ (c * 7)); } + +constexpr Rank relative_rank(Color c, Square s) { return relative_rank(c, rank_of(s)); } + +constexpr Direction pawn_push(Color c) { return c == WHITE ? NORTH : SOUTH; } + + +// Based on a congruential pseudo-random number generator +constexpr Key make_key(uint64_t seed) { + return seed * 6364136223846793005ULL + 1442695040888963407ULL; +} + + +enum MoveType : uint16_t { + NORMAL, + PROMOTION = 1 << 14, + EN_PASSANT = 2 << 14, + CASTLING = 3 << 14 +}; + +// A move needs 16 bits to be stored +// +// bit 0- 5: destination square (from 0 to 63) +// bit 6-11: origin square (from 0 to 63) +// bit 12-13: promotion piece type - 2 (from KNIGHT-2 to QUEEN-2) +// bit 14-15: special move flag: promotion (1), en passant (2), castling (3) +// NOTE: en passant bit is set only when a pawn can be captured +// +// Special cases are Move::none() and Move::null(). We can sneak these in because +// in any normal move the destination square and origin square are always different, +// but Move::none() and Move::null() have the same origin and destination square. 
+ +class Move { + public: + Move() = default; + constexpr explicit Move(std::uint16_t d) : + data(d) {} + + constexpr Move(Square from, Square to) : + data((from << 6) + to) {} + + template + static constexpr Move make(Square from, Square to, PieceType pt = KNIGHT) { + return Move(T + ((pt - KNIGHT) << 12) + (from << 6) + to); + } + + constexpr Square from_sq() const { + assert(is_ok()); + return Square((data >> 6) & 0x3F); + } + + constexpr Square to_sq() const { + assert(is_ok()); + return Square(data & 0x3F); + } + + // Same as to_sq() but without assertion, for branchless code paths + // where the result is masked/ignored when move is not ok + constexpr Square to_sq_unchecked() const { return Square(data & 0x3F); } + + constexpr MoveType type_of() const { return MoveType(data & (3 << 14)); } + + constexpr PieceType promotion_type() const { return PieceType(((data >> 12) & 3) + KNIGHT); } + + constexpr bool is_ok() const { return none().data != data && null().data != data; } + + static constexpr Move null() { return Move(65); } + static constexpr Move none() { return Move(0); } + + constexpr bool operator==(const Move& m) const { return data == m.data; } + constexpr bool operator!=(const Move& m) const { return data != m.data; } + + constexpr explicit operator bool() const { return data != 0; } + + constexpr std::uint16_t raw() const { return data; } + + struct MoveHash { + std::size_t operator()(const Move& m) const { return make_key(m.data); } + }; + + protected: + std::uint16_t data; +}; + +template +struct is_all_same { + static constexpr bool value = (std::is_same_v && ...); +}; + +template +constexpr auto is_all_same_v = is_all_same::value; + +} // namespace Stockfish + +#endif // #ifndef TYPES_H_INCLUDED + +#include "tune.h" // Global visibility to tuning setup diff --git a/src/uci.cpp b/src/uci.cpp new file mode 100644 index 0000000000000000000000000000000000000000..385dfeb4ee04dbbe7453f4c7d3bdd7ecba16238e --- /dev/null +++ b/src/uci.cpp @@ -0,0 +1,658 
@@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "uci.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "benchmark.h" +#include "engine.h" +#include "memory.h" +#include "movegen.h" +#include "position.h" +#include "score.h" +#include "search.h" +#include "types.h" +#include "ucioption.h" + +namespace Stockfish { + +constexpr auto BenchmarkCommand = "speedtest"; + +constexpr auto StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"; +template +struct overload: Ts... { + using Ts::operator()...; +}; + +template +overload(Ts...) 
-> overload; + +void UCIEngine::print_info_string(std::string_view str) { + sync_cout_start(); + for (auto& line : split(str, "\n")) + { + if (!is_whitespace(line)) + { + std::cout << "info string " << line << '\n'; + } + } + sync_cout_end(); +} + +UCIEngine::UCIEngine(int argc, char** argv) : + engine(argv[0]), + cli(argc, argv) { + + engine.get_options().add_info_listener([](const std::optional& str) { + if (str.has_value()) + print_info_string(*str); + }); + + init_search_update_listeners(); +} + +void UCIEngine::init_search_update_listeners() { + engine.set_on_iter([](const auto& i) { on_iter(i); }); + engine.set_on_update_no_moves([](const auto& i) { on_update_no_moves(i); }); + engine.set_on_update_full( + [this](const auto& i) { on_update_full(i, engine.get_options()["UCI_ShowWDL"]); }); + engine.set_on_bestmove([](const auto& bm, const auto& p) { on_bestmove(bm, p); }); + engine.set_on_verify_networks([](const auto& s) { print_info_string(s); }); +} + +void UCIEngine::loop() { + std::string token, cmd; + + for (int i = 1; i < cli.argc; ++i) + cmd += std::string(cli.argv[i]) + " "; + + do + { + if (cli.argc == 1 + && !getline(std::cin, cmd)) // Wait for an input or an end-of-file (EOF) indication + cmd = "quit"; + + std::istringstream is(cmd); + + token.clear(); // Avoid a stale if getline() returns nothing or a blank line + is >> std::skipws >> token; + + if (token == "quit" || token == "stop") + engine.stop(); + + // The GUI sends 'ponderhit' to tell that the user has played the expected move. + // So, 'ponderhit' is sent if pondering was done on the same move that the user + // has played. The search should continue, but should also switch from pondering + // to the normal search. 
+ else if (token == "ponderhit") + engine.set_ponderhit(false); + + else if (token == "uci") + { + sync_cout << "id name " << engine_info(true) << "\n" + << engine.get_options() << sync_endl; + + sync_cout << "uciok" << sync_endl; + } + + else if (token == "setoption") + setoption(is); + else if (token == "go") + { + // send info strings after the go command is sent for old GUIs and python-chess + print_info_string(engine.numa_config_information_as_string()); + print_info_string(engine.thread_allocation_information_as_string()); + go(is); + } + else if (token == "position") + position(is); + else if (token == "ucinewgame") + engine.search_clear(); + else if (token == "isready") + sync_cout << "readyok" << sync_endl; + + // Add custom non-UCI commands, mainly for debugging purposes. + // These commands must not be used during a search! + else if (token == "flip") + engine.flip(); + else if (token == "bench") + bench(is); + else if (token == BenchmarkCommand) + benchmark(is); + else if (token == "d") + sync_cout << engine.visualize() << sync_endl; + else if (token == "eval") + engine.trace_eval(); + else if (token == "compiler") + sync_cout << compiler_info() << sync_endl; + else if (token == "export_net") + { + std::pair, std::string> files[2]; + + if (is >> std::skipws >> files[0].second) + files[0].first = files[0].second; + + if (is >> std::skipws >> files[1].second) + files[1].first = files[1].second; + + engine.save_network(files); + } + else if (token == "--help" || token == "help" || token == "--license" || token == "license") + sync_cout + << "\nStockfish is a powerful chess engine for playing and analyzing." + "\nIt is released as free software licensed under the GNU GPLv3 License." + "\nStockfish is normally used with a graphical user interface (GUI) and implements" + "\nthe Universal Chess Interface (UCI) protocol to communicate with a GUI, an API, etc." 
+ "\nFor any further information, visit https://github.com/official-stockfish/Stockfish#readme" + "\nor read the corresponding README.md and Copying.txt files distributed along with this program.\n" + << sync_endl; + else if (!token.empty() && token[0] != '#') + sync_cout << "Unknown command: '" << cmd << "'. Type help for more information." + << sync_endl; + + } while (token != "quit" && cli.argc == 1); // The command-line arguments are one-shot +} + +Search::LimitsType UCIEngine::parse_limits(std::istream& is) { + Search::LimitsType limits; + std::string token; + + limits.startTime = now(); // The search starts as early as possible + + while (is >> token) + if (token == "searchmoves") // Needs to be the last command on the line + while (is >> token) + limits.searchmoves.push_back(to_lower(token)); + + else if (token == "wtime") + is >> limits.time[WHITE]; + else if (token == "btime") + is >> limits.time[BLACK]; + else if (token == "winc") + is >> limits.inc[WHITE]; + else if (token == "binc") + is >> limits.inc[BLACK]; + else if (token == "movestogo") + is >> limits.movestogo; + else if (token == "depth") + is >> limits.depth; + else if (token == "nodes") + is >> limits.nodes; + else if (token == "movetime") + is >> limits.movetime; + else if (token == "mate") + is >> limits.mate; + else if (token == "perft") + is >> limits.perft; + else if (token == "infinite") + limits.infinite = 1; + else if (token == "ponder") + limits.ponderMode = true; + + return limits; +} + +void UCIEngine::go(std::istringstream& is) { + + Search::LimitsType limits = parse_limits(is); + + if (limits.perft) + perft(limits); + else + engine.go(limits); +} + +void UCIEngine::bench(std::istream& args) { + std::string token; + uint64_t num, nodes = 0, cnt = 1; + uint64_t nodesSearched = 0; + const auto& options = engine.get_options(); + + engine.set_on_update_full([&](const auto& i) { + nodesSearched = i.nodes; + on_update_full(i, options["UCI_ShowWDL"]); + }); + + std::vector list = 
Benchmark::setup_bench(engine.fen(), args); + + num = count_if(list.begin(), list.end(), + [](const std::string& s) { return s.find("go ") == 0 || s.find("eval") == 0; }); + + TimePoint elapsed = now(); + + for (const auto& cmd : list) + { + std::istringstream is(cmd); + is >> std::skipws >> token; + + if (token == "go" || token == "eval") + { + std::cerr << "\nPosition: " << cnt++ << '/' << num << " (" << engine.fen() << ")" + << std::endl; + if (token == "go") + { + Search::LimitsType limits = parse_limits(is); + + if (limits.perft) + nodesSearched = perft(limits); + else + { + engine.go(limits); + engine.wait_for_search_finished(); + } + + nodes += nodesSearched; + nodesSearched = 0; + } + else + engine.trace_eval(); + } + else if (token == "setoption") + setoption(is); + else if (token == "position") + position(is); + else if (token == "ucinewgame") + { + engine.search_clear(); // search_clear may take a while + elapsed = now(); + } + } + + elapsed = now() - elapsed + 1; // Ensure positivity to avoid a 'divide by zero' + + dbg_print(); + + std::cerr << "\n===========================" // + << "\nTotal time (ms) : " << elapsed // + << "\nNodes searched : " << nodes // + << "\nNodes/second : " << 1000 * nodes / elapsed << std::endl; + + // reset callback, to not capture a dangling reference to nodesSearched + engine.set_on_update_full([&](const auto& i) { on_update_full(i, options["UCI_ShowWDL"]); }); +} + +void UCIEngine::benchmark(std::istream& args) { + // Probably not very important for a test this long, but include for completeness and sanity. 
+ static constexpr int NUM_WARMUP_POSITIONS = 3; + + std::string token; + uint64_t nodes = 0, cnt = 1; + uint64_t nodesSearched = 0; + + engine.set_on_update_full([&](const Engine::InfoFull& i) { nodesSearched = i.nodes; }); + + engine.set_on_iter([](const auto&) {}); + engine.set_on_update_no_moves([](const auto&) {}); + engine.set_on_bestmove([](const auto&, const auto&) {}); + engine.set_on_verify_networks([](const auto&) {}); + + Benchmark::BenchmarkSetup setup = Benchmark::setup_benchmark(args); + + const auto numGoCommands = count_if(setup.commands.begin(), setup.commands.end(), + [](const std::string& s) { return s.find("go ") == 0; }); + + TimePoint totalTime = 0; + + // Set options once at the start. + auto ss = std::istringstream("name Threads value " + std::to_string(setup.threads)); + setoption(ss); + ss = std::istringstream("name Hash value " + std::to_string(setup.ttSize)); + setoption(ss); + ss = std::istringstream("name UCI_Chess960 value false"); + setoption(ss); + + // Warmup + for (const auto& cmd : setup.commands) + { + std::istringstream is(cmd); + is >> std::skipws >> token; + + if (token == "go") + { + // One new line is produced by the search, so omit it here + std::cerr << "\rWarmup position " << cnt++ << '/' << NUM_WARMUP_POSITIONS; + + Search::LimitsType limits = parse_limits(is); + + // Run with silenced network verification + engine.go(limits); + engine.wait_for_search_finished(); + } + else if (token == "position") + position(is); + else if (token == "ucinewgame") + { + engine.search_clear(); // search_clear may take a while + } + + if (cnt > NUM_WARMUP_POSITIONS) + break; + } + + std::cerr << "\n"; + + cnt = 1; + nodes = 0; + + int numHashfullReadings = 0; + constexpr int hashfullAges[] = {0, 999}; // Only normal hashfull and touched hash. 
+    constexpr int hashfullAgeCount = std::size(hashfullAges);
+    int totalHashfull[hashfullAgeCount] = {0};
+    int maxHashfull[hashfullAgeCount] = {0};
+
+    auto updateHashfullReadings = [&]() {
+        numHashfullReadings += 1;
+
+        for (int i = 0; i < hashfullAgeCount; ++i)
+        {
+            const int hashfull = engine.get_hashfull(hashfullAges[i]);
+            maxHashfull[i] = std::max(maxHashfull[i], hashfull);
+            totalHashfull[i] += hashfull;
+        }
+    };
+
+    engine.search_clear();  // search_clear may take a while
+
+    for (const auto& cmd : setup.commands)
+    {
+        std::istringstream is(cmd);
+        is >> std::skipws >> token;
+
+        if (token == "go")
+        {
+            // One new line is produced by the search, so omit it here
+            std::cerr << "\rPosition " << cnt++ << '/' << numGoCommands;
+
+            Search::LimitsType limits = parse_limits(is);
+
+            nodesSearched = 0;
+            TimePoint elapsed = now();
+
+            // Run with silenced network verification
+            engine.go(limits);
+            engine.wait_for_search_finished();
+
+            totalTime += now() - elapsed;
+
+            updateHashfullReadings();
+
+            nodes += nodesSearched;
+        }
+        else if (token == "position")
+            position(is);
+        else if (token == "ucinewgame")
+        {
+            engine.search_clear();  // search_clear may take a while
+        }
+    }
+
+    totalTime = std::max<TimePoint>(totalTime, 1);  // Ensure positivity to avoid a 'divide by zero'
+
+    dbg_print();
+
+    std::cerr << "\n";
+
+    static_assert(
+      std::size(hashfullAges) == 2 && hashfullAges[0] == 0 && hashfullAges[1] == 999,
+      "Hardcoded for display. Would complicate the code needlessly in the current state.");
+
+    std::string threadBinding = engine.thread_binding_information_as_string();
+    if (threadBinding.empty())
+        threadBinding = "none";
+
+    // clang-format off
+
+    std::cerr << "==========================="
+              << "\nVersion : "
+              << engine_version_info()
+              // "\nCompiled by : "
+              << compiler_info()
+              << "Large pages : " << (has_large_pages() ?
"yes" : "no")
+              << "\nUser invocation : " << BenchmarkCommand << " "
+              << setup.originalInvocation << "\nFilled invocation : " << BenchmarkCommand
+              << " " << setup.filledInvocation
+              << "\nAvailable processors : " << engine.get_numa_config_as_string()
+              << "\nThread count : " << setup.threads
+              << "\nThread binding : " << threadBinding
+              << "\nTT size [MiB] : " << setup.ttSize
+              << "\nHash max, avg [per mille] : "
+              << "\n single search : " << maxHashfull[0] << ", "
+              << totalHashfull[0] / numHashfullReadings
+              << "\n single game : " << maxHashfull[1] << ", "
+              << totalHashfull[1] / numHashfullReadings
+              << "\nTotal nodes searched : " << nodes
+              << "\nTotal search time [s] : " << totalTime / 1000.0
+              << "\nNodes/second : " << 1000 * nodes / totalTime << std::endl;
+
+    // clang-format on
+
+    init_search_update_listeners();
+}
+
+void UCIEngine::setoption(std::istringstream& is) {
+    engine.wait_for_search_finished();
+    engine.get_options().setoption(is);
+}
+
+std::uint64_t UCIEngine::perft(const Search::LimitsType& limits) {
+    auto nodes = engine.perft(engine.fen(), limits.perft, engine.get_options()["UCI_Chess960"]);
+    sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
+    return nodes;
+}
+
+void UCIEngine::position(std::istringstream& is) {
+    std::string token, fen;
+
+    is >> token;
+
+    if (token == "startpos")
+    {
+        fen = StartFEN;
+        is >> token;  // Consume the "moves" token, if any
+    }
+    else if (token == "fen")
+        while (is >> token && token != "moves")
+            fen += token + " ";
+    else
+        return;
+
+    std::vector<std::string> moves;
+
+    while (is >> token)
+    {
+        moves.push_back(token);
+    }
+
+    engine.set_position(fen, moves);
+}
+
+namespace {
+
+struct WinRateParams {
+    double a;
+    double b;
+};
+
+WinRateParams win_rate_params(const Position& pos) {
+
+    int material = pos.count<PAWN>() + 3 * pos.count<KNIGHT>() + 3 * pos.count<BISHOP>()
+                 + 5 * pos.count<ROOK>() + 9 * pos.count<QUEEN>();
+
+    // The fitted model only uses data for material counts in [17, 78], and is anchored at count 58.
double m = std::clamp(material, 17, 78) / 58.0;
+
+    // Return a = p_a(material) and b = p_b(material), see github.com/official-stockfish/WDL_model
+    constexpr double as[] = {-72.32565836, 185.93832038, -144.58862193, 416.44950446};
+    constexpr double bs[] = {83.86794042, -136.06112997, 69.98820887, 47.62901433};
+
+    double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
+    double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
+
+    return {a, b};
+}
+
+// The win rate model is 1 / (1 + exp((a - eval) / b)), where a = p_a(material) and b = p_b(material).
+// It fits the LTC fishtest statistics rather accurately.
+int win_rate_model(Value v, const Position& pos) {
+
+    auto [a, b] = win_rate_params(pos);
+
+    // Return the win rate in per mille units, rounded to the nearest integer.
+    return int(0.5 + 1000 / (1 + std::exp((a - double(v)) / b)));
+}
+}
+
+std::string UCIEngine::format_score(const Score& s) {
+    constexpr int TB_CP = 20000;
+    if (s.is<Score::InternalUnits>()) {
+        return std::string("cp ") + std::to_string(s.get<Score::InternalUnits>().value);
+    } else if (s.is<Score::Mate>()) {
+        auto mate = s.get<Score::Mate>();
+        auto m = (mate.plies > 0 ? (mate.plies + 1) : mate.plies) / 2;
+        return std::string("mate ") + std::to_string(m);
+    } else if (s.is<Score::Tablebase>()) {
+        auto tb = s.get<Score::Tablebase>();
+        return std::string("cp ") + std::to_string((tb.win ? TB_CP - tb.plies : -TB_CP - tb.plies));
+    }
+    return "cp 0";
+}
+
+// Turns a Value to an integer centipawn number,
+// without treatment of mate and similar special scores.
+int UCIEngine::to_cp(Value v, const Position& pos) {
+
+    // In general, the score can be defined via the WDL as
+    // (log(1/L - 1) - log(1/W - 1)) / (log(1/L - 1) + log(1/W - 1)).
+    // Based on our win_rate_model, this simply yields v / a.
+
+    auto [a, b] = win_rate_params(pos);
+
+    return int(std::round(100 * int(v) / a));
+}
+
+std::string UCIEngine::wdl(Value v, const Position& pos) {
+    std::stringstream ss;
+
+    int wdl_w = win_rate_model(v, pos);
+    int wdl_l = win_rate_model(-v, pos);
+    int wdl_d = 1000 - wdl_w - wdl_l;
+    ss << wdl_w << " " << wdl_d << " " << wdl_l;
+
+    return ss.str();
+}
+
+std::string UCIEngine::square(Square s) {
+    return std::string{char('a' + file_of(s)), char('1' + rank_of(s))};
+}
+
+std::string UCIEngine::move(Move m, bool chess960) {
+    if (m == Move::none())
+        return "(none)";
+
+    if (m == Move::null())
+        return "0000";
+
+    Square from = m.from_sq();
+    Square to = m.to_sq();
+
+    if (m.type_of() == CASTLING && !chess960)
+        to = make_square(to > from ? FILE_G : FILE_C, rank_of(from));
+
+    std::string move = square(from) + square(to);
+
+    if (m.type_of() == PROMOTION)
+        move += " pnbrqk"[m.promotion_type()];
+
+    return move;
+}
+
+
+std::string UCIEngine::to_lower(std::string str) {
+    std::transform(str.begin(), str.end(), str.begin(), [](auto c) { return std::tolower(c); });
+
+    return str;
+}
+
+Move UCIEngine::to_move(const Position& pos, std::string str) {
+    str = to_lower(str);
+
+    for (const auto& m : MoveList<LEGAL>(pos))
+        if (str == move(m, pos.is_chess960()))
+            return m;
+
+    return Move::none();
+}
+
+void UCIEngine::on_update_no_moves(const Engine::InfoShort& info) {
+    sync_cout << "info depth " << info.depth << " score " << format_score(info.score) << sync_endl;
+}
+
+void UCIEngine::on_update_full(const Engine::InfoFull& info, bool showWDL) {
+    std::stringstream ss;
+
+    ss << "info";
+    ss << " depth " << info.depth  //
+       << " seldepth " << info.selDepth  //
+       << " multipv " << info.multiPV  //
+       << " score " << format_score(info.score);  //
+
+    if (!info.bound.empty())
+        ss << " " << info.bound;
+
+    if (showWDL)
+        ss << " wdl " << info.wdl;
+
+    ss << " nodes " << info.nodes  //
+       << " nps " << info.nps  //
+       << " hashfull " << info.hashfull  //
+       << " tbhits " <<
info.tbHits  //
+       << " time " << info.timeMs  //
+       << " pv " << info.pv;  //
+
+    sync_cout << ss.str() << sync_endl;
+}
+
+void UCIEngine::on_iter(const Engine::InfoIter& info) {
+    std::stringstream ss;
+
+    ss << "info";
+    ss << " depth " << info.depth  //
+       << " currmove " << info.currmove  //
+       << " currmovenumber " << info.currmovenumber;  //
+
+    sync_cout << ss.str() << sync_endl;
+}
+
+void UCIEngine::on_bestmove(std::string_view bestmove, std::string_view ponder) {
+    sync_cout << "bestmove " << bestmove;
+    if (!ponder.empty())
+        std::cout << " ponder " << ponder;
+    std::cout << sync_endl;
+}
+
+}  // namespace Stockfish
diff --git a/src/uci.h b/src/uci.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebc04fc3c70bb2230c38facc5b55047a46ffae65
--- /dev/null
+++ b/src/uci.h
@@ -0,0 +1,80 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#ifndef UCI_H_INCLUDED
+#define UCI_H_INCLUDED
+
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <string_view>
+
+#include "engine.h"
+#include "misc.h"
+#include "search.h"
+
+namespace Stockfish {
+
+class Position;
+class Move;
+class Score;
+enum Square : uint8_t;
+using Value = int;
+
+class UCIEngine {
+   public:
+    UCIEngine(int argc, char** argv);
+
+    void loop();
+
+    static int to_cp(Value v, const Position& pos);
+    static std::string format_score(const Score& s);
+    static std::string square(Square s);
+    static std::string move(Move m, bool chess960);
+    static std::string wdl(Value v, const Position& pos);
+    static std::string to_lower(std::string str);
+    static Move to_move(const Position& pos, std::string str);
+
+    static Search::LimitsType parse_limits(std::istream& is);
+
+    auto& engine_options() { return engine.get_options(); }
+
+   private:
+    Engine engine;
+    CommandLine cli;
+
+    static void print_info_string(std::string_view str);
+
+    void go(std::istringstream& is);
+    void bench(std::istream& args);
+    void benchmark(std::istream& args);
+    void position(std::istringstream& is);
+    void setoption(std::istringstream& is);
+    std::uint64_t perft(const Search::LimitsType&);
+
+    static void on_update_no_moves(const Engine::InfoShort& info);
+    static void on_update_full(const Engine::InfoFull& info, bool showWDL);
+    static void on_iter(const Engine::InfoIter& info);
+    static void on_bestmove(std::string_view bestmove, std::string_view ponder);
+
+    void init_search_update_listeners();
+};
+
+}  // namespace Stockfish
+
+#endif  // #ifndef UCI_H_INCLUDED
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8db7967497e5277efc43bd693732e33ef8c344a6
--- /dev/null
+++ b/src/ucioption.cpp
@@ -0,0 +1,213 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it
under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#include "ucioption.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <utility>
+
+#include "misc.h"
+
+namespace Stockfish {
+
+bool CaseInsensitiveLess::operator()(const std::string& s1, const std::string& s2) const {
+
+    return std::lexicographical_compare(
+      s1.begin(), s1.end(), s2.begin(), s2.end(),
+      [](char c1, char c2) { return std::tolower(c1) < std::tolower(c2); });
+}
+
+void OptionsMap::add_info_listener(InfoListener&& message_func) { info = std::move(message_func); }
+
+void OptionsMap::setoption(std::istringstream& is) {
+    std::string token, name, value;
+
+    is >> token;  // Consume the "name" token
+
+    // Read the option name (can contain spaces)
+    while (is >> token && token != "value")
+        name += (name.empty() ? "" : " ") + token;
+
+    // Read the option value (can contain spaces)
+    while (is >> token)
+        value += (value.empty() ?
"" : " ") + token; + + if (options_map.count(name)) + options_map[name] = value; + else + sync_cout << "No such option: " << name << sync_endl; +} + +const Option& OptionsMap::operator[](const std::string& name) const { + auto it = options_map.find(name); + assert(it != options_map.end()); + return it->second; +} + +// Inits options and assigns idx in the correct printing order +void OptionsMap::add(const std::string& name, const Option& option) { + if (!options_map.count(name)) + { + static size_t insert_order = 0; + + options_map[name] = option; + + options_map[name].parent = this; + options_map[name].idx = insert_order++; + } + else + { + std::cerr << "Option \"" << name << "\" was already added!" << std::endl; + std::exit(EXIT_FAILURE); + } +} + + +std::size_t OptionsMap::count(const std::string& name) const { return options_map.count(name); } + +Option::Option(const OptionsMap* map) : + parent(map) {} + +Option::Option(const char* v, OnChange f) : + type("string"), + min(0), + max(0), + on_change(std::move(f)) { + defaultValue = currentValue = v; +} + +Option::Option(bool v, OnChange f) : + type("check"), + min(0), + max(0), + on_change(std::move(f)) { + defaultValue = currentValue = (v ? "true" : "false"); +} + +Option::Option(OnChange f) : + type("button"), + min(0), + max(0), + on_change(std::move(f)) {} + +Option::Option(int v, int minv, int maxv, OnChange f) : + type("spin"), + min(minv), + max(maxv), + on_change(std::move(f)) { + defaultValue = currentValue = std::to_string(v); +} + +Option::Option(const char* v, const char* cur, OnChange f) : + type("combo"), + min(0), + max(0), + on_change(std::move(f)) { + defaultValue = v; + currentValue = cur; +} + +Option::operator int() const { + assert(type == "check" || type == "spin"); + return (type == "spin" ? 
std::stoi(currentValue) : currentValue == "true");
+}
+
+Option::operator std::string() const {
+    assert(type == "string");
+    return currentValue;
+}
+
+bool Option::operator==(const char* s) const {
+    assert(type == "combo");
+    return !CaseInsensitiveLess()(currentValue, s) && !CaseInsensitiveLess()(s, currentValue);
+}
+
+bool Option::operator!=(const char* s) const { return !(*this == s); }
+
+
+// Updates currentValue and triggers on_change() action. It's up to
+// the GUI to check for option's limits, but we could receive the new value
+// from the user by console window, so let's check the bounds anyway.
+Option& Option::operator=(const std::string& v) {
+
+    assert(!type.empty());
+
+    if ((type != "button" && type != "string" && v.empty())
+        || (type == "check" && v != "true" && v != "false")
+        || (type == "spin" && (std::stoi(v) < min || std::stoi(v) > max)))
+        return *this;
+
+    if (type == "combo")
+    {
+        OptionsMap comboMap;  // To have case insensitive compare
+        std::string token;
+        std::istringstream ss(defaultValue);
+        while (ss >> token)
+            comboMap.add(token, Option());
+        if (!comboMap.count(v) || v == "var")
+            return *this;
+    }
+
+    if (type == "string")
+        currentValue = v == "<empty>" ? "" : v;
+    else if (type != "button")
+        currentValue = v;
+
+    if (on_change)
+    {
+        const auto ret = on_change(*this);
+
+        if (ret && parent != nullptr && parent->info != nullptr)
+            parent->info(ret);
+    }
+
+    return *this;
+}
+
+std::ostream& operator<<(std::ostream& os, const OptionsMap& om) {
+    for (size_t idx = 0; idx < om.options_map.size(); ++idx)
+        for (const auto& it : om.options_map)
+            if (it.second.idx == idx)
+            {
+                const Option& o = it.second;
+                os << "\noption name " << it.first << " type " << o.type;
+
+                if (o.type == "check" || o.type == "combo")
+                    os << " default " << o.defaultValue;
+
+                else if (o.type == "string")
+                {
+                    std::string defaultValue = o.defaultValue.empty() ?
"<empty>" : o.defaultValue;
+                    os << " default " << defaultValue;
+                }
+
+                else if (o.type == "spin")
+                    os << " default " << stoi(o.defaultValue) << " min " << o.min << " max "
+                       << o.max;
+
+                break;
+            }
+
+    return os;
+}
+}
diff --git a/src/ucioption.h b/src/ucioption.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f6d7541cff0a1e3603f62291c61ad96876466b9
--- /dev/null
+++ b/src/ucioption.h
@@ -0,0 +1,106 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#ifndef UCIOPTION_H_INCLUDED
+#define UCIOPTION_H_INCLUDED
+
+#include <cstddef>
+#include <functional>
+#include <iosfwd>
+#include <map>
+#include <optional>
+#include <string>
+
+namespace Stockfish {
+// Define a custom comparator, because the UCI options should be case-insensitive
+struct CaseInsensitiveLess {
+    bool operator()(const std::string&, const std::string&) const;
+};
+
+class OptionsMap;
+
+// The Option class implements each option as specified by the UCI protocol
+class Option {
+   public:
+    using OnChange = std::function<std::optional<std::string>(const Option&)>;
+
+    Option(const OptionsMap*);
+    Option(OnChange = nullptr);
+    Option(bool v, OnChange = nullptr);
+    Option(const char* v, OnChange = nullptr);
+    Option(int v, int minv, int maxv, OnChange = nullptr);
+    Option(const char* v, const char* cur, OnChange = nullptr);
+
+    Option& operator=(const std::string&);
+    operator int() const;
+    operator std::string() const;
+    bool operator==(const char*) const;
+    bool operator!=(const char*) const;
+
+    friend std::ostream& operator<<(std::ostream&, const OptionsMap&);
+
+    int operator<<(const Option&) = delete;
+
+   private:
+    friend class OptionsMap;
+    friend class Engine;
+    friend class Tune;
+
+
+    std::string defaultValue, currentValue, type;
+    int min, max;
+    size_t idx;
+    OnChange on_change;
+    const OptionsMap* parent = nullptr;
+};
+
+class OptionsMap {
+   public:
+    using InfoListener = std::function<void(std::optional<std::string>)>;
+
+    OptionsMap() = default;
+    OptionsMap(const OptionsMap&) = delete;
+    OptionsMap(OptionsMap&&) = delete;
+    OptionsMap& operator=(const OptionsMap&) = delete;
+    OptionsMap& operator=(OptionsMap&&) = delete;
+
+    void add_info_listener(InfoListener&&);
+
+    void setoption(std::istringstream&);
+
+    const Option& operator[](const std::string&) const;
+
+    void add(const std::string&, const Option& option);
+
+    std::size_t count(const std::string&) const;
+
+   private:
+    friend class Engine;
+    friend class Option;
+
+    friend std::ostream& operator<<(std::ostream&, const OptionsMap&);
+
+    // The options container is defined as a
std::map
+    using OptionsStore = std::map<std::string, Option, CaseInsensitiveLess>;
+
+    OptionsStore options_map;
+    InfoListener info;
+};
+
+}
+#endif  // #ifndef UCIOPTION_H_INCLUDED