/*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)

  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "nnue_accumulator.h"

#include <cassert>
#include <cstdint>
#include <new>
#include <type_traits>

#include "../bitboard.h"
#include "../misc.h"
#include "../position.h"
#include "../types.h"
#include "features/half_ka_v2_hm.h"
#include "nnue_architecture.h"
#include "nnue_common.h"
#include "nnue_feature_transformer.h"  // IWYU pragma: keep
#include "simd.h"

// NOTE(review): the angle-bracketed parts of this file (system header names,
// template parameter/argument lists and cast target types) were reconstructed
// from how the names are used below. Confirm them against nnue_accumulator.h,
// nnue_feature_transformer.h and simd.h before relying on this file.

namespace Stockfish::Eval::NNUE {

using namespace SIMD;

namespace {

// Forward declarations of the file-local update helpers defined at the bottom
// of this file. They are used by the AccumulatorStack members that follow.

template<IndexType TransformedFeatureDimensions>
void double_inc_update(Color                                                   perspective,
                       const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
                       const Square                                            ksq,
                       AccumulatorState<PSQFeatureSet>&                        middle_state,
                       AccumulatorState<PSQFeatureSet>&                        target_state,
                       const AccumulatorState<PSQFeatureSet>&                  computed);

template<IndexType TransformedFeatureDimensions>
void double_inc_update(Color                                                   perspective,
                       const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
                       const Square                                            ksq,
                       AccumulatorState<ThreatFeatureSet>&                     middle_state,
                       AccumulatorState<ThreatFeatureSet>&                     target_state,
                       const AccumulatorState<ThreatFeatureSet>&               computed,
                       const DirtyPiece&                                       dp2);

template<bool Forward, typename FeatureSet, IndexType TransformedFeatureDimensions>
void update_accumulator_incremental(
  Color                                                   perspective,
  const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
  const Square                                            ksq,
  AccumulatorState<FeatureSet>&                           target_state,
  const AccumulatorState<FeatureSet>&                     computed);

template<IndexType Dimensions>
void update_accumulator_refresh_cache(Color                                 perspective,
                                      const FeatureTransformer<Dimensions>& featureTransformer,
                                      const Position&                       pos,
                                      AccumulatorState<PSQFeatureSet>&      accumulatorState,
                                      AccumulatorCaches::Cache<Dimensions>& cache);

template<IndexType Dimensions>
void update_threats_accumulator_full(Color                                 perspective,
                                     const FeatureTransformer<Dimensions>& featureTransformer,
                                     const Position&                       pos,
                                     AccumulatorState<ThreatFeatureSet>&   accumulatorState);
}

// Returns the accumulator state at the top of the stack (index size - 1)
// for the feature set T.
template<typename T>
const AccumulatorState<T>& AccumulatorStack::latest() const noexcept {
    return accumulators<T>()[size - 1];
}

// Explicit template instantiations
template const AccumulatorState<PSQFeatureSet>&
AccumulatorStack::latest<PSQFeatureSet>() const noexcept;
template const AccumulatorState<ThreatFeatureSet>&
AccumulatorStack::latest<ThreatFeatureSet>() const noexcept;

// Mutable counterpart of latest().
template<typename T>
AccumulatorState<T>& AccumulatorStack::mut_latest() noexcept {
    return mut_accumulators<T>()[size - 1];
}

// Selects the per-feature-set backing array (read-only view).
template<typename T>
const std::array<AccumulatorState<T>, AccumulatorStack::MaxSize>&
AccumulatorStack::accumulators() const noexcept {
    static_assert(std::is_same_v<T, PSQFeatureSet> || std::is_same_v<T, ThreatFeatureSet>,
                  "Invalid Feature Set Type");

    if constexpr (std::is_same_v<T, PSQFeatureSet>)
        return psq_accumulators;

    if constexpr (std::is_same_v<T, ThreatFeatureSet>)
        return threat_accumulators;
}

// Selects the per-feature-set backing array (mutable view).
template<typename T>
std::array<AccumulatorState<T>, AccumulatorStack::MaxSize>&
AccumulatorStack::mut_accumulators() noexcept {
    static_assert(std::is_same_v<T, PSQFeatureSet> || std::is_same_v<T, ThreatFeatureSet>,
                  "Invalid Feature Set Type");

    if constexpr (std::is_same_v<T, PSQFeatureSet>)
        return psq_accumulators;

    if constexpr (std::is_same_v<T, ThreatFeatureSet>)
        return threat_accumulators;
}

// Resets the bottom entry of both stacks and shrinks the stack to that
// single entry.
void AccumulatorStack::reset() noexcept {
    psq_accumulators[0].reset({});
    threat_accumulators[0].reset({});
    size = 1;
}

// Pushes one entry onto both stacks and returns references to the new
// entry's diffs so that the caller can fill them in. The threat diff is
// re-initialised in place with placement new.
std::pair<DirtyPiece&, DirtyThreats&> AccumulatorStack::push() noexcept {
    assert(size < MaxSize);
    auto& dp  = psq_accumulators[size].reset();
    auto& dts = threat_accumulators[size].reset();
    new (&dts) DirtyThreats;
    size++;
    return {dp, dts};
}

// Drops the top entry. The bottom entry is never popped.
void AccumulatorStack::pop() noexcept {
    assert(size > 1);
    size--;
}

// Brings the top-of-stack accumulators up to date for both perspectives.
// Threat accumulators are only evaluated when Dimensions equals
// TransformedFeatureDimensionsBig.
template<IndexType Dimensions>
void AccumulatorStack::evaluate(const Position&                       pos,
                                const FeatureTransformer<Dimensions>& featureTransformer,
                                AccumulatorCaches::Cache<Dimensions>& cache) noexcept {
    constexpr bool UseThreats = (Dimensions == TransformedFeatureDimensionsBig);

    evaluate_side<PSQFeatureSet>(WHITE, pos, featureTransformer, cache);

    if (UseThreats)
        evaluate_side<ThreatFeatureSet>(WHITE, pos, featureTransformer, cache);

    evaluate_side<PSQFeatureSet>(BLACK, pos, featureTransformer, cache);

    if (UseThreats)
        evaluate_side<ThreatFeatureSet>(BLACK, pos, featureTransformer, cache);
}

// Brings the top-of-stack accumulator of one feature set up to date for one
// perspective. If the last usable entry is already computed, the entries
// above it are updated incrementally going forward. Otherwise the top entry
// is recomputed (via the refresh cache for PSQ features, from scratch for
// threat features) and the entries below it, down to the last usable one,
// are updated going backward.
template<typename FeatureSet, IndexType Dimensions>
void AccumulatorStack::evaluate_side(Color                                 perspective,
                                     const Position&                       pos,
                                     const FeatureTransformer<Dimensions>& featureTransformer,
                                     AccumulatorCaches::Cache<Dimensions>& cache) noexcept {

    const auto last_usable_accum =
      find_last_usable_accumulator<FeatureSet, Dimensions>(perspective);

    if ((accumulators<FeatureSet>()[last_usable_accum].template acc<Dimensions>())
          .computed[perspective])
        forward_update_incremental<FeatureSet>(perspective, pos, featureTransformer,
                                               last_usable_accum);
    else
    {
        if constexpr (std::is_same_v<FeatureSet, PSQFeatureSet>)
            update_accumulator_refresh_cache(perspective, featureTransformer, pos,
                                             mut_latest<PSQFeatureSet>(), cache);
        else
            update_threats_accumulator_full(perspective, featureTransformer, pos,
                                            mut_latest<ThreatFeatureSet>());

        backward_update_incremental<FeatureSet>(perspective, pos, featureTransformer,
                                                last_usable_accum);
    }
}

// Find the most recent usable accumulator (scanning from the top of the stack
// downwards). This is either a computed accumulator or the accumulator state
// whose diff requires a full refresh. Returns 0 if neither is found above the
// bottom entry.
template<typename FeatureSet, IndexType Dimensions>
std::size_t AccumulatorStack::find_last_usable_accumulator(Color perspective) const noexcept {

    for (std::size_t curr_idx = size - 1; curr_idx > 0; curr_idx--)
    {
        if ((accumulators<FeatureSet>()[curr_idx].template acc<Dimensions>()).computed[perspective])
            return curr_idx;

        if (FeatureSet::requires_refresh(accumulators<FeatureSet>()[curr_idx].diff, perspective))
            return curr_idx;
    }

    return 0;
}

// Updates every entry in (begin, size) from its predecessor. The entry at
// `begin` must already be computed. Where possible, two consecutive moves are
// fused into a single update that skips the intermediate entry.
template<typename FeatureSet, IndexType Dimensions>
void AccumulatorStack::forward_update_incremental(
  Color                                 perspective,
  const Position&                       pos,
  const FeatureTransformer<Dimensions>& featureTransformer,
  const std::size_t                     begin) noexcept {

    assert(begin < accumulators<FeatureSet>().size());
    assert((accumulators<FeatureSet>()[begin].template acc<Dimensions>()).computed[perspective]);

    const Square ksq = pos.square<KING>(perspective);

    for (std::size_t next = begin + 1; next < size; next++)
    {
        if (next + 1 < size)
        {
            // The piece diffs always live in the PSQ stack, whichever
            // feature set is being updated.
            DirtyPiece& dp1 = mut_accumulators<PSQFeatureSet>()[next].diff;
            DirtyPiece& dp2 = mut_accumulators<PSQFeatureSet>()[next + 1].diff;

            auto& accumulators = mut_accumulators<FeatureSet>();

            if constexpr (std::is_same_v<FeatureSet, ThreatFeatureSet>)
            {
                // Fuse when the square removed by the second move is among the
                // threatening squares recorded for the first move.
                if (dp2.remove_sq != SQ_NONE
                    && (accumulators[next].diff.threateningSqs & square_bb(dp2.remove_sq)))
                {
                    double_inc_update(perspective, featureTransformer, ksq, accumulators[next],
                                      accumulators[next + 1], accumulators[next - 1], dp2);
                    next++;
                    continue;
                }
            }

            if constexpr (std::is_same_v<FeatureSet, PSQFeatureSet>)
            {
                // Fuse when the second move removes the piece that the first
                // move just placed. The add and the remove cancel out, so
                // they are masked for the update and restored afterwards.
                if (dp1.to != SQ_NONE && dp1.to == dp2.remove_sq)
                {
                    const Square captureSq = dp1.to;
                    dp1.to = dp2.remove_sq = SQ_NONE;
                    double_inc_update(perspective, featureTransformer, ksq, accumulators[next],
                                      accumulators[next + 1], accumulators[next - 1]);
                    dp1.to = dp2.remove_sq = captureSq;
                    next++;
                    continue;
                }
            }
        }

        update_accumulator_incremental<true>(perspective, featureTransformer, ksq,
                                             mut_accumulators<FeatureSet>()[next],
                                             accumulators<FeatureSet>()[next - 1]);
    }

    // `template` is required here: the type of latest<FeatureSet>() is dependent.
    assert((latest<FeatureSet>().template acc<Dimensions>()).computed[perspective]);
}

// Updates every entry in [end, size - 1) from its successor, walking from the
// top of the stack downwards. The top entry must already be computed.
template<typename FeatureSet, IndexType Dimensions>
void AccumulatorStack::backward_update_incremental(
  Color                                 perspective,
  const Position&                       pos,
  const FeatureTransformer<Dimensions>& featureTransformer,
  const std::size_t                     end) noexcept {

    assert(end < accumulators<FeatureSet>().size());
    assert(end < size);
    assert((latest<FeatureSet>().template acc<Dimensions>()).computed[perspective]);

    const Square ksq = pos.square<KING>(perspective);

    // Signed index so that the loop terminates correctly when end == 0.
    for (std::int64_t next = std::int64_t(size) - 2; next >= std::int64_t(end); next--)
        update_accumulator_incremental<false>(perspective, featureTransformer, ksq,
                                              mut_accumulators<FeatureSet>()[next],
                                              accumulators<FeatureSet>()[next + 1]);

    assert((accumulators<FeatureSet>()[end].template acc<Dimensions>()).computed[perspective]);
}

// Explicit template instantiations
template void AccumulatorStack::evaluate<TransformedFeatureDimensionsBig>(
  const Position&                                            pos,
  const FeatureTransformer<TransformedFeatureDimensionsBig>& featureTransformer,
  AccumulatorCaches::Cache<TransformedFeatureDimensionsBig>& cache) noexcept;
template void AccumulatorStack::evaluate<TransformedFeatureDimensionsSmall>(
  const Position&                                              pos,
  const FeatureTransformer<TransformedFeatureDimensionsSmall>& featureTransformer,
  AccumulatorCaches::Cache<TransformedFeatureDimensionsSmall>& cache) noexcept;


namespace {

// Computes out = fused<ops...>(in, rows...) over Width elements, one vector
// at a time. All rows must have the same element type as in/out.
template<typename VectorWrapper,
         IndexType Width,
         UpdateOperation... ops,
         typename ElementType,
         typename... Ts,
         std::enable_if_t<is_all_same_v<ElementType, Ts...>, bool> = true>
void fused_row_reduce(const ElementType* in, ElementType* out, const Ts* const... rows) {
    constexpr IndexType size = Width * sizeof(ElementType) / sizeof(typename VectorWrapper::type);

    auto* vecIn  = reinterpret_cast<const typename VectorWrapper::type*>(in);
    auto* vecOut = reinterpret_cast<typename VectorWrapper::type*>(out);

    for (IndexType i = 0; i < size; ++i)
        vecOut[i] = fused<VectorWrapper, ops...>(
          vecIn[i], reinterpret_cast<const typename VectorWrapper::type*>(rows)[i]...);
}

// Bundles everything needed to derive the accumulator `to` from the
// accumulator `from` for one perspective.
template<typename FeatureSet, IndexType Dimensions>
struct AccumulatorUpdateContext {
    Color                                 perspective;
    const FeatureTransformer<Dimensions>& featureTransformer;
    const AccumulatorState<FeatureSet>&   from;
    AccumulatorState<FeatureSet>&         to;

    AccumulatorUpdateContext(Color                                 persp,
                             const FeatureTransformer<Dimensions>& ft,
                             const AccumulatorState<FeatureSet>&   accF,
                             AccumulatorState<FeatureSet>&         accT) noexcept :
        perspective{persp},
        featureTransformer{ft},
        from{accF},
        to{accT} {}

    // Applies a fixed, compile-time sequence of operations (`ops`) using the
    // weight rows selected by `indices`, to both the main accumulation and
    // the PSQT accumulation. The i-th operation uses the i-th index.
    template<UpdateOperation... ops,
             typename... Ts,
             std::enable_if_t<is_all_same_v<IndexType, Ts...>, bool> = true>
    void apply(const Ts... indices) {
        auto to_weight_vector = [&](const IndexType index) {
            return &featureTransformer.weights[index * Dimensions];
        };

        auto to_psqt_weight_vector = [&](const IndexType index) {
            return &featureTransformer.psqtWeights[index * PSQTBuckets];
        };

        fused_row_reduce<Vec16Wrapper, Dimensions, ops...>(
          (from.template acc<Dimensions>()).accumulation[perspective].data(),
          (to.template acc<Dimensions>()).accumulation[perspective].data(),
          to_weight_vector(indices)...);

        fused_row_reduce<Vec32Wrapper, PSQTBuckets, ops...>(
          (from.template acc<Dimensions>()).psqtAccumulation[perspective].data(),
          (to.template acc<Dimensions>()).psqtAccumulation[perspective].data(),
          to_psqt_weight_vector(indices)...);
    }

    // Applies variable-length lists of added and removed feature indices
    // using the threat weights: to = from - sum(removed) + sum(added).
    void apply(const typename FeatureSet::IndexList& added,
               const typename FeatureSet::IndexList& removed) {
        const auto& fromAcc = from.template acc<Dimensions>().accumulation[perspective];
        auto&       toAcc   = to.template acc<Dimensions>().accumulation[perspective];

        const auto& fromPsqtAcc = from.template acc<Dimensions>().psqtAccumulation[perspective];
        auto&       toPsqtAcc   = to.template acc<Dimensions>().psqtAccumulation[perspective];

#ifdef VECTOR
        using Tiling = SIMDTiling<Dimensions, Dimensions, PSQTBuckets>;
        vec_t      acc[Tiling::NumRegs];
        psqt_vec_t psqt[Tiling::NumPsqtRegs];

        const auto* threatWeights = &featureTransformer.threatWeights[0];

        // Process the accumulator one tile at a time; threatWeights is
        // advanced by one tile height per iteration.
        for (IndexType j = 0; j < Dimensions / Tiling::TileHeight; ++j)
        {
            auto* fromTile = reinterpret_cast<const vec_t*>(&fromAcc[j * Tiling::TileHeight]);
            auto* toTile   = reinterpret_cast<vec_t*>(&toAcc[j * Tiling::TileHeight]);

            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
                acc[k] = fromTile[k];

            for (int i = 0; i < removed.ssize(); ++i)
            {
                size_t       index  = removed[i];
                const size_t offset = Dimensions * index;
                auto*        column = reinterpret_cast<const vec_i8_t*>(&threatWeights[offset]);

    #ifdef USE_NEON
                // Each 8-bit weight vector is widened into two 16-bit vectors.
                for (IndexType k = 0; k < Tiling::NumRegs; k += 2)
                {
                    acc[k]     = vec_sub_16(acc[k], vmovl_s8(vget_low_s8(column[k / 2])));
                    acc[k + 1] = vec_sub_16(acc[k + 1], vmovl_high_s8(column[k / 2]));
                }
    #else
                for (IndexType k = 0; k < Tiling::NumRegs; ++k)
                    acc[k] = vec_sub_16(acc[k], vec_convert_8_16(column[k]));
    #endif
            }

            for (int i = 0; i < added.ssize(); ++i)
            {
                size_t       index  = added[i];
                const size_t offset = Dimensions * index;
                auto*        column = reinterpret_cast<const vec_i8_t*>(&threatWeights[offset]);

    #ifdef USE_NEON
                for (IndexType k = 0; k < Tiling::NumRegs; k += 2)
                {
                    acc[k]     = vec_add_16(acc[k], vmovl_s8(vget_low_s8(column[k / 2])));
                    acc[k + 1] = vec_add_16(acc[k + 1], vmovl_high_s8(column[k / 2]));
                }
    #else
                for (IndexType k = 0; k < Tiling::NumRegs; ++k)
                    acc[k] = vec_add_16(acc[k], vec_convert_8_16(column[k]));
    #endif
            }

            for (IndexType k = 0; k < Tiling::NumRegs; k++)
                vec_store(&toTile[k], acc[k]);

            threatWeights += Tiling::TileHeight;
        }

        for (IndexType j = 0; j < PSQTBuckets / Tiling::PsqtTileHeight; ++j)
        {
            auto* fromTilePsqt =
              reinterpret_cast<const psqt_vec_t*>(&fromPsqtAcc[j * Tiling::PsqtTileHeight]);
            auto* toTilePsqt =
              reinterpret_cast<psqt_vec_t*>(&toPsqtAcc[j * Tiling::PsqtTileHeight]);

            for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
                psqt[k] = fromTilePsqt[k];

            for (int i = 0; i < removed.ssize(); ++i)
            {
                size_t       index      = removed[i];
                const size_t offset     = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
                auto*        columnPsqt = reinterpret_cast<const psqt_vec_t*>(
                  &featureTransformer.threatPsqtWeights[offset]);

                for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
                    psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
            }

            for (int i = 0; i < added.ssize(); ++i)
            {
                size_t       index      = added[i];
                const size_t offset     = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
                auto*        columnPsqt = reinterpret_cast<const psqt_vec_t*>(
                  &featureTransformer.threatPsqtWeights[offset]);

                for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
            }

            for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
                vec_store_psqt(&toTilePsqt[k], psqt[k]);
        }

#else

        // Scalar fallback: copy, then subtract removed and add added rows.
        toAcc     = fromAcc;
        toPsqtAcc = fromPsqtAcc;

        for (const auto index : removed)
        {
            const IndexType offset = Dimensions * index;

            for (IndexType j = 0; j < Dimensions; ++j)
                toAcc[j] -= featureTransformer.threatWeights[offset + j];

            for (std::size_t k = 0; k < PSQTBuckets; ++k)
                toPsqtAcc[k] -= featureTransformer.threatPsqtWeights[index * PSQTBuckets + k];
        }

        for (const auto index : added)
        {
            const IndexType offset = Dimensions * index;

            for (IndexType j = 0; j < Dimensions; ++j)
                toAcc[j] += featureTransformer.threatWeights[offset + j];

            for (std::size_t k = 0; k < PSQTBuckets; ++k)
                toPsqtAcc[k] += featureTransformer.threatPsqtWeights[index * PSQTBuckets + k];
        }

#endif
    }
};

// Factory that lets FeatureSet and Dimensions be deduced from the arguments.
template<typename FeatureSet, IndexType Dimensions>
auto make_accumulator_update_context(Color                                 perspective,
                                     const FeatureTransformer<Dimensions>& featureTransformer,
                                     const AccumulatorState<FeatureSet>&   accumulatorFrom,
                                     AccumulatorState<FeatureSet>& accumulatorTo) noexcept {
    return AccumulatorUpdateContext<FeatureSet, Dimensions>{perspective, featureTransformer,
                                                            accumulatorFrom, accumulatorTo};
}

// PSQ features: computes target_state directly from `computed` by combining
// the diffs of middle_state and target_state, skipping middle_state (which is
// left uncomputed). The caller is expected to have masked the add/remove pair
// that cancels between the two moves.
template<IndexType TransformedFeatureDimensions>
void double_inc_update(Color                                                   perspective,
                       const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
                       const Square                                            ksq,
                       AccumulatorState<PSQFeatureSet>&                        middle_state,
                       AccumulatorState<PSQFeatureSet>&                        target_state,
                       const AccumulatorState<PSQFeatureSet>&                  computed) {

    assert(computed.acc<TransformedFeatureDimensions>().computed[perspective]);
    assert(!middle_state.acc<TransformedFeatureDimensions>().computed[perspective]);
    assert(!target_state.acc<TransformedFeatureDimensions>().computed[perspective]);

    PSQFeatureSet::IndexList removed, added;
    PSQFeatureSet::append_changed_indices(perspective, ksq, middle_state.diff, removed, added);
    // you can't capture a piece that was just involved in castling since the rook ends up
    // in a square that the king passed
    assert(added.size() < 2);
    PSQFeatureSet::append_changed_indices(perspective, ksq, target_state.diff, removed, added);

    [[maybe_unused]] const int addedSize   = added.ssize();
    [[maybe_unused]] const int removedSize = removed.ssize();

    assert(addedSize == 1);
    assert(removedSize == 2 || removedSize == 3);

    // Workaround compiler warning for uninitialized variables, replicated on
    // profile builds on windows with gcc 14.2.0.
    // Also helps with optimizations on some compilers.
    sf_assume(addedSize == 1);
    sf_assume(removedSize == 2 || removedSize == 3);

    auto updateContext =
      make_accumulator_update_context(perspective, featureTransformer, computed, target_state);

    if (removedSize == 2)
    {
        updateContext.template apply<Add, Sub, Sub>(added[0], removed[0], removed[1]);
    }
    else
    {
        updateContext.template apply<Add, Sub, Sub, Sub>(added[0], removed[0], removed[1],
                                                         removed[2]);
    }

    target_state.acc<TransformedFeatureDimensions>().computed[perspective] = true;
}

// Threat features: computes target_state directly from `computed` by
// collecting the changed indices of both diffs into one added/removed pair.
// The square removed by the second move (dp2.remove_sq) is passed to the
// feature set through fusedData; middle_state is left uncomputed.
template<IndexType TransformedFeatureDimensions>
void double_inc_update(Color                                                   perspective,
                       const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
                       const Square                                            ksq,
                       AccumulatorState<ThreatFeatureSet>&                     middle_state,
                       AccumulatorState<ThreatFeatureSet>&                     target_state,
                       const AccumulatorState<ThreatFeatureSet>&               computed,
                       const DirtyPiece&                                       dp2) {

    assert(computed.acc<TransformedFeatureDimensions>().computed[perspective]);
    assert(!middle_state.acc<TransformedFeatureDimensions>().computed[perspective]);
    assert(!target_state.acc<TransformedFeatureDimensions>().computed[perspective]);

    ThreatFeatureSet::FusedUpdateData fusedData;

    fusedData.dp2removed = dp2.remove_sq;

    ThreatFeatureSet::IndexList removed, added;
    // NOTE(review): pfBase/pfStride look like a base pointer and row stride
    // handed to the feature set for prefetching - confirm in the feature set.
    const auto* pfBase   = &featureTransformer.threatWeights[0];
    auto        pfStride = static_cast<IndexType>(TransformedFeatureDimensions);
    ThreatFeatureSet::append_changed_indices(perspective, ksq, middle_state.diff, removed, added,
                                             &fusedData, true, pfBase, pfStride);
    ThreatFeatureSet::append_changed_indices(perspective, ksq, target_state.diff, removed, added,
                                             &fusedData, false, pfBase, pfStride);

    auto updateContext =
      make_accumulator_update_context(perspective, featureTransformer, computed, target_state);

    updateContext.apply(added, removed);

    target_state.acc<TransformedFeatureDimensions>().computed[perspective] = true;
}

// Computes target_state from the neighbouring, already computed state.
// Forward == true:  `computed` is the predecessor and target_state.diff is
//                   applied as is.
// Forward == false: `computed` is the successor and computed.diff is undone,
//                   which is done by swapping the added/removed lists.
template<bool Forward, typename FeatureSet, IndexType TransformedFeatureDimensions>
void update_accumulator_incremental(
  Color                                                   perspective,
  const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
  const Square                                            ksq,
  AccumulatorState<FeatureSet>&                           target_state,
  const AccumulatorState<FeatureSet>&                     computed) {

    assert((computed.template acc<TransformedFeatureDimensions>()).computed[perspective]);
    assert(!(target_state.template acc<TransformedFeatureDimensions>()).computed[perspective]);

    // The size must be enough to contain the largest possible update.
    // That might depend on the feature set and generally relies on the
    // feature set's update cost calculation to be correct and never allow
    // updates with more added/removed features than MaxActiveDimensions.
    // In this case, the maximum size of both feature addition and removal
    // is 2, since we are incrementally updating one move at a time.
    typename FeatureSet::IndexList removed, added;
    if constexpr (std::is_same_v<FeatureSet, ThreatFeatureSet>)
    {
        const auto* pfBase   = &featureTransformer.threatWeights[0];
        auto        pfStride = static_cast<IndexType>(TransformedFeatureDimensions);
        if constexpr (Forward)
            FeatureSet::append_changed_indices(perspective, ksq, target_state.diff, removed, added,
                                               nullptr, false, pfBase, pfStride);
        else
            FeatureSet::append_changed_indices(perspective, ksq, computed.diff, added, removed,
                                               nullptr, false, pfBase, pfStride);
    }
    else
    {
        if constexpr (Forward)
            FeatureSet::append_changed_indices(perspective, ksq, target_state.diff, removed, added);
        else
            FeatureSet::append_changed_indices(perspective, ksq, computed.diff, added, removed);
    }

    auto updateContext =
      make_accumulator_update_context(perspective, featureTransformer, computed, target_state);

    if constexpr (std::is_same_v<FeatureSet, ThreatFeatureSet>)
        updateContext.apply(added, removed);
    else
    {
        [[maybe_unused]] const int addedSize   = added.ssize();
        [[maybe_unused]] const int removedSize = removed.ssize();

        assert(addedSize == 1 || addedSize == 2);
        assert(removedSize == 1 || removedSize == 2);
        assert((Forward && addedSize <= removedSize) || (!Forward && addedSize >= removedSize));

        // Workaround compiler warning for uninitialized variables, replicated
        // on profile builds on windows with gcc 14.2.0.
        // Also helps with optimizations on some compilers.
        sf_assume(addedSize == 1 || addedSize == 2);
        sf_assume(removedSize == 1 || removedSize == 2);

        if (!(removedSize == 1 || removedSize == 2) || !(addedSize == 1 || addedSize == 2))
            sf_unreachable();

        // Dispatch to the fused update matching the (added, removed) counts.
        if ((Forward && removedSize == 1) || (!Forward && addedSize == 1))
        {
            assert(addedSize == 1 && removedSize == 1);
            updateContext.template apply<Add, Sub>(added[0], removed[0]);
        }
        else if (Forward && addedSize == 1)
        {
            assert(removedSize == 2);
            updateContext.template apply<Add, Sub, Sub>(added[0], removed[0], removed[1]);
        }
        else if (!Forward && removedSize == 1)
        {
            assert(addedSize == 2);
            updateContext.template apply<Add, Add, Sub>(added[0], added[1], removed[0]);
        }
        else
        {
            assert(addedSize == 2 && removedSize == 2);
            updateContext.template apply<Add, Add, Sub, Sub>(added[0], added[1], removed[0],
                                                             removed[1]);
        }
    }

    (target_state.template acc<TransformedFeatureDimensions>()).computed[perspective] = true;
}

// Returns a bitboard with a bit set for every square whose piece differs
// between oldPieces and newPieces.
Bitboard get_changed_pieces(const std::array<Piece, SQUARE_NB>& oldPieces,
                            const std::array<Piece, SQUARE_NB>& newPieces) {
#if defined(USE_AVX512) || defined(USE_AVX2)
    static_assert(sizeof(Piece) == 1);
    Bitboard sameBB = 0;

    // Compare 32 squares per iteration and collect one "equal" bit per square.
    for (int i = 0; i < 64; i += 32)
    {
        const __m256i old_v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&oldPieces[i]));
        const __m256i new_v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&newPieces[i]));
        const __m256i       cmpEqual  = _mm256_cmpeq_epi8(old_v, new_v);
        const std::uint32_t equalMask = _mm256_movemask_epi8(cmpEqual);
        sameBB |= static_cast<Bitboard>(equalMask) << i;
    }
    return ~sameBB;
#elif defined(USE_NEON)
    uint8x16x4_t old_v = vld4q_u8(reinterpret_cast<const uint8_t*>(oldPieces.data()));
    uint8x16x4_t new_v = vld4q_u8(reinterpret_cast<const uint8_t*>(newPieces.data()));
    auto         cmp   = [=](const int i) { return vceqq_u8(old_v.val[i], new_v.val[i]); };

    // Pack the per-byte equality results into a single 64-bit mask.
    uint8x16_t cmp0_1 = vsriq_n_u8(cmp(1), cmp(0), 1);
    uint8x16_t cmp2_3 = vsriq_n_u8(cmp(3), cmp(2), 1);
    uint8x16_t merged = vsriq_n_u8(cmp2_3, cmp0_1, 2);
    merged            = vsriq_n_u8(merged, merged, 4);
    uint8x8_t sameBB  = vshrn_n_u16(vreinterpretq_u16_u8(merged), 4);

    return ~vget_lane_u64(vreinterpret_u64_u8(sameBB), 0);
#else
    Bitboard changed = 0;

    for (Square sq = SQUARE_ZERO; sq < SQUARE_NB; ++sq)
        changed |= static_cast<Bitboard>(oldPieces[sq] != newPieces[sq]) << sq;

    return changed;
#endif
}

// Recomputes the PSQ accumulator of `accumulatorState` for one perspective
// using the refresh cache entry for the current king square. Only the pieces
// that differ between the cached board and `pos` are applied to the cached
// accumulation; the cache entry is updated and then copied to the accumulator.
template<IndexType Dimensions>
void update_accumulator_refresh_cache(Color                                 perspective,
                                      const FeatureTransformer<Dimensions>& featureTransformer,
                                      const Position&                       pos,
                                      AccumulatorState<PSQFeatureSet>&      accumulatorState,
                                      AccumulatorCaches::Cache<Dimensions>& cache) {

    using Tiling [[maybe_unused]] = SIMDTiling<Dimensions, Dimensions, PSQTBuckets>;

    const Square             ksq   = pos.square<KING>(perspective);
    auto&                    entry = cache[ksq][perspective];
    PSQFeatureSet::IndexList removed, added;

    const Bitboard changedBB = get_changed_pieces(entry.pieces, pos.piece_array());
    Bitboard       removedBB = changedBB & entry.pieceBB;
    Bitboard       addedBB   = changedBB & pos.pieces();

    while (removedBB)
    {
        Square sq = pop_lsb(removedBB);
        removed.push_back(PSQFeatureSet::make_index(perspective, sq, entry.pieces[sq], ksq));
    }
    while (addedBB)
    {
        Square sq = pop_lsb(addedBB);
        added.push_back(PSQFeatureSet::make_index(perspective, sq, pos.piece_on(sq), ksq));
    }

    // The cache entry now mirrors the current position.
    entry.pieceBB = pos.pieces();
    entry.pieces  = pos.piece_array();

    auto& accumulator                 = accumulatorState.acc<Dimensions>();
    accumulator.computed[perspective] = true;

#ifdef VECTOR
    vec_t      acc[Tiling::NumRegs];
    psqt_vec_t psqt[Tiling::NumPsqtRegs];

    const auto* weights = &featureTransformer.weights[0];

    for (IndexType j = 0; j < Dimensions / Tiling::TileHeight; ++j)
    {
        auto* accTile =
          reinterpret_cast<vec_t*>(&accumulator.accumulation[perspective][j * Tiling::TileHeight]);
        auto* entryTile = reinterpret_cast<vec_t*>(&entry.accumulation[j * Tiling::TileHeight]);

        for (IndexType k = 0; k < Tiling::NumRegs; ++k)
            acc[k] = entryTile[k];

        // Pair up removals and additions so that each pair is one fused step;
        // the leftovers of the longer list are handled by the loops below.
        int i = 0;
        for (; i < std::min(removed.ssize(), added.ssize()); ++i)
        {
            size_t       indexR  = removed[i];
            const size_t offsetR = Dimensions * indexR;
            auto*        columnR = reinterpret_cast<const vec_t*>(&weights[offsetR]);
            size_t       indexA  = added[i];
            const size_t offsetA = Dimensions * indexA;
            auto*        columnA = reinterpret_cast<const vec_t*>(&weights[offsetA]);

            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
                acc[k] = fused<Vec16Wrapper, Add, Sub>(acc[k], columnA[k], columnR[k]);
        }
        for (; i < removed.ssize(); ++i)
        {
            size_t       index  = removed[i];
            const size_t offset = Dimensions * index;
            auto*        column = reinterpret_cast<const vec_t*>(&weights[offset]);

            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
                acc[k] = vec_sub_16(acc[k], column[k]);
        }
        for (; i < added.ssize(); ++i)
        {
            size_t       index  = added[i];
            const size_t offset = Dimensions * index;
            auto*        column = reinterpret_cast<const vec_t*>(&weights[offset]);

            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
                acc[k] = vec_add_16(acc[k], column[k]);
        }

        // Write the result to both the cache entry and the accumulator.
        for (IndexType k = 0; k < Tiling::NumRegs; k++)
            vec_store(&entryTile[k], acc[k]);
        for (IndexType k = 0; k < Tiling::NumRegs; k++)
            vec_store(&accTile[k], acc[k]);

        weights += Tiling::TileHeight;
    }

    for (IndexType j = 0; j < PSQTBuckets / Tiling::PsqtTileHeight; ++j)
    {
        auto* accTilePsqt = reinterpret_cast<psqt_vec_t*>(
          &accumulator.psqtAccumulation[perspective][j * Tiling::PsqtTileHeight]);
        auto* entryTilePsqt =
          reinterpret_cast<psqt_vec_t*>(&entry.psqtAccumulation[j * Tiling::PsqtTileHeight]);

        for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
            psqt[k] = entryTilePsqt[k];

        for (int i = 0; i < removed.ssize(); ++i)
        {
            size_t       index  = removed[i];
            const size_t offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
            auto*        columnPsqt =
              reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offset]);

            for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
                psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
        }
        for (int i = 0; i < added.ssize(); ++i)
        {
            size_t       index  = added[i];
            const size_t offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
            auto*        columnPsqt =
              reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offset]);

            for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
                psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
        }

        for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
            vec_store_psqt(&entryTilePsqt[k], psqt[k]);
        for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
            vec_store_psqt(&accTilePsqt[k], psqt[k]);
    }

#else

    for (const auto index : removed)
    {
        const IndexType offset = Dimensions * index;
        for (IndexType j = 0; j < Dimensions; ++j)
            entry.accumulation[j] -= featureTransformer.weights[offset + j];

        for (std::size_t k = 0; k < PSQTBuckets; ++k)
            entry.psqtAccumulation[k] -= featureTransformer.psqtWeights[index * PSQTBuckets + k];
    }
    for (const auto index : added)
    {
        const IndexType offset = Dimensions * index;
        for (IndexType j = 0; j < Dimensions; ++j)
            entry.accumulation[j] += featureTransformer.weights[offset + j];

        for (std::size_t k = 0; k < PSQTBuckets; ++k)
            entry.psqtAccumulation[k] += featureTransformer.psqtWeights[index * PSQTBuckets + k];
    }

    // The accumulator of the refresh entry has been updated.
    // Now copy its content to the actual accumulator we were refreshing.
    accumulator.accumulation[perspective]     = entry.accumulation;
    accumulator.psqtAccumulation[perspective] = entry.psqtAccumulation;
#endif
}

// Recomputes the threat accumulator of `accumulatorState` for one perspective
// from scratch: starts from zero and adds the threat weights of every active
// feature index of `pos`.
template<IndexType Dimensions>
void update_threats_accumulator_full(Color                                 perspective,
                                     const FeatureTransformer<Dimensions>& featureTransformer,
                                     const Position&                       pos,
                                     AccumulatorState<ThreatFeatureSet>&   accumulatorState) {
    using Tiling [[maybe_unused]] = SIMDTiling<Dimensions, Dimensions, PSQTBuckets>;

    ThreatFeatureSet::IndexList active;
    ThreatFeatureSet::append_active_indices(perspective, pos, active);

    auto& accumulator                 = accumulatorState.acc<Dimensions>();
    accumulator.computed[perspective] = true;

#ifdef VECTOR
    vec_t      acc[Tiling::NumRegs];
    psqt_vec_t psqt[Tiling::NumPsqtRegs];

    const auto* threatWeights = &featureTransformer.threatWeights[0];

    for (IndexType j = 0; j < Dimensions / Tiling::TileHeight; ++j)
    {
        auto* accTile =
          reinterpret_cast<vec_t*>(&accumulator.accumulation[perspective][j * Tiling::TileHeight]);

        for (IndexType k = 0; k < Tiling::NumRegs; ++k)
            acc[k] = vec_zero();

        int i = 0;

        for (; i < active.ssize(); ++i)
        {
            size_t       index  = active[i];
            const size_t offset = Dimensions * index;
            auto*        column = reinterpret_cast<const vec_i8_t*>(&threatWeights[offset]);

    #ifdef USE_NEON
            // Each 8-bit weight vector is widened into two 16-bit vectors.
            for (IndexType k = 0; k < Tiling::NumRegs; k += 2)
            {
                acc[k]     = vec_add_16(acc[k], vmovl_s8(vget_low_s8(column[k / 2])));
                acc[k + 1] = vec_add_16(acc[k + 1], vmovl_high_s8(column[k / 2]));
            }
    #else
            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
                acc[k] = vec_add_16(acc[k], vec_convert_8_16(column[k]));
    #endif
        }

        for (IndexType k = 0; k < Tiling::NumRegs; k++)
            vec_store(&accTile[k], acc[k]);

        threatWeights += Tiling::TileHeight;
    }

    for (IndexType j = 0; j < PSQTBuckets / Tiling::PsqtTileHeight; ++j)
    {
        auto* accTilePsqt = reinterpret_cast<psqt_vec_t*>(
          &accumulator.psqtAccumulation[perspective][j * Tiling::PsqtTileHeight]);

        for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
            psqt[k] = vec_zero_psqt();

        for (int i = 0; i < active.ssize(); ++i)
        {
            size_t       index  = active[i];
            const size_t offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
            auto*        columnPsqt =
              reinterpret_cast<const psqt_vec_t*>(&featureTransformer.threatPsqtWeights[offset]);

            for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
                psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
        }

        for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
            vec_store_psqt(&accTilePsqt[k], psqt[k]);
    }

#else

    for (IndexType j = 0; j < Dimensions; ++j)
        accumulator.accumulation[perspective][j] = 0;

    for (std::size_t k = 0; k < PSQTBuckets; ++k)
        accumulator.psqtAccumulation[perspective][k] = 0;

    for (const auto index : active)
    {
        const IndexType offset = Dimensions * index;

        for (IndexType j = 0; j < Dimensions; ++j)
            accumulator.accumulation[perspective][j] +=
              featureTransformer.threatWeights[offset + j];

        for (std::size_t k = 0; k < PSQTBuckets; ++k)
            accumulator.psqtAccumulation[perspective][k] +=
              featureTransformer.threatPsqtWeights[index * PSQTBuckets + k];
    }

#endif
}

}

}