/*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)

  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
namespace Stockfish::Eval::NNUE::SIMD {
// If vector instructions are enabled, we update and refresh the
// accumulator tile by tile such that each tile fits in the CPU's
// vector registers.
//
// NOTE(review): the architecture-selection preprocessor guards were lost in
// this copy — without them the aliases below are conflicting redefinitions.
// Restored following the USE_* macro convention of the build system.
#if defined(USE_AVX512)

using vec_t      = __m512i;
using vec_i8_t   = __m256i;
using vec128_t   = __m128i;
using psqt_vec_t = __m256i;
using vec_uint_t = __m512i;
// Inverse permuted at load time

#elif defined(USE_AVX2)

using vec_t      = __m256i;
using vec_i8_t   = __m128i;
using vec128_t   = __m128i;
using psqt_vec_t = __m256i;
using vec_uint_t = __m256i;
// Inverse permuted at load time

#elif defined(USE_SSE2)

using vec_t      = __m128i;
using vec_i8_t   = std::uint64_t;  // for the correct size -- will be loaded into an xmm reg
using vec128_t   = __m128i;
using psqt_vec_t = __m128i;
using vec_uint_t = __m128i;

    #if !defined(__x86_64__)
// The 64-bit-argument form of this intrinsic is unavailable when targeting
// 32-bit x86, so provide an equivalent via a 64-bit load.
inline __m128i _mm_cvtsi64_si128(int64_t val) {
    return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&val));
}
    #endif

// Widen the eight packed int8 lanes held in `x` to eight int16 lanes,
// sign-extending each byte.
// Credit: Yoshie2000
inline __m128i vec_convert_8_16(uint64_t x) {
    __m128i v8   = _mm_cvtsi64_si128(static_cast<int64_t>(x));
    // Per-byte sign mask: 0xFF where the byte is negative, 0x00 otherwise.
    __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(), v8);
    // Interleaving value bytes with their sign bytes yields sign-extended i16.
    return _mm_unpacklo_epi8(v8, sign);
}

#elif defined(USE_NEON)

using vec_i8x8_t __attribute__((may_alias))  = int8x8_t;
using vec_i16x8_t __attribute__((may_alias)) = int16x8_t;
using vec_i8x16_t __attribute__((may_alias)) = int8x16_t;
using vec_u16x8_t __attribute__((may_alias)) = uint16x8_t;
using vec_i32x4_t __attribute__((may_alias)) = int32x4_t;

using vec_t __attribute__((may_alias))       = int16x8_t;
using vec_i8_t __attribute__((may_alias))    = int8x16_t;
using psqt_vec_t __attribute__((may_alias))  = int32x4_t;
using vec128_t __attribute__((may_alias))    = uint16x8_t;
using vec_uint_t __attribute__((may_alias))  = uint32x4_t;

static constexpr std::uint32_t Mask[4] = {1, 2, 4, 8};

    #if !defined(__aarch64__)
// Single instruction doesn't exist on 32-bit ARM
inline int16x8_t vmovl_high_s8(int8x16_t val) { return vmovl_s8(vget_high_s8(val)); }
    #endif

#endif
| struct Vec16Wrapper { | |
| using type = vec_t; | |
| static type add(const type& lhs, const type& rhs) { return vec_add_16(lhs, rhs); } | |
| static type sub(const type& lhs, const type& rhs) { return vec_sub_16(lhs, rhs); } | |
| using type = BiasType; | |
| static type add(const type& lhs, const type& rhs) { return lhs + rhs; } | |
| static type sub(const type& lhs, const type& rhs) { return lhs - rhs; } | |
| }; | |
| struct Vec32Wrapper { | |
| using type = psqt_vec_t; | |
| static type add(const type& lhs, const type& rhs) { return vec_add_psqt_32(lhs, rhs); } | |
| static type sub(const type& lhs, const type& rhs) { return vec_sub_psqt_32(lhs, rhs); } | |
| using type = PSQTWeightType; | |
| static type add(const type& lhs, const type& rhs) { return lhs + rhs; } | |
| static type sub(const type& lhs, const type& rhs) { return lhs - rhs; } | |
| }; | |
// Operation tags consumed by fused<>() to describe a chain of accumulator
// updates — one tag per additional operand.
enum UpdateOperation {
    Add,
    Sub
};
| template<typename VecWrapper, | |
| UpdateOperation... ops, | |
| std::enable_if_t<sizeof...(ops) == 0, bool> = true> | |
| typename VecWrapper::type fused(const typename VecWrapper::type& in) { | |
| return in; | |
| } | |
| template<typename VecWrapper, | |
| UpdateOperation update_op, | |
| UpdateOperation... ops, | |
| typename T, | |
| typename... Ts, | |
| std::enable_if_t<is_all_same_v<typename VecWrapper::type, T, Ts...>, bool> = true, | |
| std::enable_if_t<sizeof...(ops) == sizeof...(Ts), bool> = true> | |
| typename VecWrapper::type | |
| fused(const typename VecWrapper::type& in, const T& operand, const Ts&... operands) { | |
| switch (update_op) | |
| { | |
| case Add : | |
| return fused<VecWrapper, ops...>(VecWrapper::add(in, operand), operands...); | |
| case Sub : | |
| return fused<VecWrapper, ops...>(VecWrapper::sub(in, operand), operands...); | |
| default : | |
| static_assert(update_op == Add || update_op == Sub, | |
| "Only Add and Sub are currently supported."); | |
| return typename VecWrapper::type(); | |
| } | |
| } | |
| [[maybe_unused]] static int m512_hadd(__m512i sum, int bias) { | |
| return _mm512_reduce_add_epi32(sum) + bias; | |
| } | |
| [[maybe_unused]] static void m512_add_dpbusd_epi32(__m512i& acc, __m512i a, __m512i b) { | |
| acc = _mm512_dpbusd_epi32(acc, a, b); | |
| __m512i product0 = _mm512_maddubs_epi16(a, b); | |
| product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1)); | |
| acc = _mm512_add_epi32(acc, product0); | |
| } | |
| [[maybe_unused]] static int m256_hadd(__m256i sum, int bias) { | |
| __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); | |
| sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); | |
| sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); | |
| return _mm_cvtsi128_si32(sum128) + bias; | |
| } | |
| [[maybe_unused]] static void m256_add_dpbusd_epi32(__m256i& acc, __m256i a, __m256i b) { | |
| acc = _mm256_dpbusd_epi32(acc, a, b); | |
| __m256i product0 = _mm256_maddubs_epi16(a, b); | |
| product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1)); | |
| acc = _mm256_add_epi32(acc, product0); | |
| } | |
| [[maybe_unused]] static int m128_hadd(__m128i sum, int bias) { | |
| sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC | |
| sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB | |
| return _mm_cvtsi128_si32(sum) + bias; | |
| } | |
| [[maybe_unused]] static void m128_add_dpbusd_epi32(__m128i& acc, __m128i a, __m128i b) { | |
| __m128i product0 = _mm_maddubs_epi16(a, b); | |
| product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1)); | |
| acc = _mm_add_epi32(acc, product0); | |
| } | |
// acc += four-way dot products of the int8 lanes of `a` and `b`, accumulated
// into the four int32 lanes, using the ARM dot-product extension (vdotq_s32).
[[maybe_unused]] static void
dotprod_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) {
    acc = vdotq_s32(acc, a, b);
}
| [[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) { | |
| return vaddvq_s32(s); | |
| return s[0] + s[1] + s[2] + s[3]; | |
| } | |
| [[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) { | |
| return neon_m128_reduce_add_epi32(sum) + bias; | |
| } | |
| [[maybe_unused]] static void neon_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) { | |
| int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b)); | |
| int16x8_t product1 = vmull_high_s8(a, b); | |
| int16x8_t sum = vpaddq_s16(product0, product1); | |
| acc = vpadalq_s16(acc, sum); | |
| } | |
| // Compute optimal SIMD register count for feature transformer accumulation. | |
| template<IndexType TransformedFeatureWidth, IndexType HalfDimensions, IndexType PSQTBuckets> | |
| class SIMDTiling { | |
| // We use __m* types as template arguments, which causes GCC to emit warnings | |
| // about losing some attribute information. This is irrelevant to us as we | |
| // only take their size, so the following pragma are harmless. | |
| template<typename SIMDRegisterType, typename LaneType, int NumLanes, int MaxRegisters> | |
| static constexpr int BestRegisterCount() { | |
| constexpr std::size_t RegisterSize = sizeof(SIMDRegisterType); | |
| constexpr std::size_t LaneSize = sizeof(LaneType); | |
| static_assert(RegisterSize >= LaneSize); | |
| static_assert(MaxRegisters <= NumRegistersSIMD); | |
| static_assert(MaxRegisters > 0); | |
| static_assert(NumRegistersSIMD > 0); | |
| static_assert(RegisterSize % LaneSize == 0); | |
| static_assert((NumLanes * LaneSize) % RegisterSize == 0); | |
| const int ideal = (NumLanes * LaneSize) / RegisterSize; | |
| if (ideal <= MaxRegisters) | |
| return ideal; | |
| // Look for the largest divisor of the ideal register count that is smaller than MaxRegisters | |
| for (int divisor = MaxRegisters; divisor > 1; --divisor) | |
| if (ideal % divisor == 0) | |
| return divisor; | |
| return 1; | |
| } | |
| public: | |
| static constexpr int NumRegs = | |
| BestRegisterCount<vec_t, WeightType, TransformedFeatureWidth, NumRegistersSIMD>(); | |
| static constexpr int NumPsqtRegs = | |
| BestRegisterCount<psqt_vec_t, PSQTWeightType, PSQTBuckets, NumRegistersSIMD>(); | |
| static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2; | |
| static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4; | |
| static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions"); | |
| static_assert(PSQTBuckets % PsqtTileHeight == 0, "PsqtTileHeight must divide PSQTBuckets"); | |
| }; | |
}  // namespace Stockfish::Eval::NNUE::SIMD