import hashlib
import json
import random
import site
import string
import sys
import unicodedata
from dataclasses import dataclass, field
from pathlib import Path
from typing import Sequence

_VENDOR_ROOT = Path(__file__).resolve().parent.parent / ".vendor"
for _vendor_path in (_VENDOR_ROOT / "python", _VENDOR_ROOT / "sitepkgs"):
    if _vendor_path.exists():
        vendor_text = str(_vendor_path)
        if vendor_text not in sys.path:
            sys.path.insert(0, vendor_text)

try:
    import numpy as np
except ModuleNotFoundError:
    user_site = site.getusersitepackages()
    if user_site and user_site not in sys.path:
        sys.path.append(user_site)
    try:
        import numpy as np
    except ModuleNotFoundError:
        np = None

if np is not None and not hasattr(np, "asarray"):
    np = None

from .checkpoint import read_safetensor_file, write_safetensor_file
from .config import ReframrConfig
from .embeddings import EmbeddingModel, fit_ppmi_embedding_from_tokens
from .hippo import (
    AnalyticalMemoryUnit,
    analytical_embedding_drive,
    analytical_embedding_drive_fast,
)
from .linalg import Vector, dot, mean, norm, softmax, zeros_vector
from .reasoning import TOOL_PROTOCOL_TOKENS, reasoning_prefix
from .reservoir import apply_readout, ridge_regression_readout
from .sparse_context import HashedSparseAttention
from .ternary import apply_ternary_mask, derive_ternary_mask_from_states
from .tokenizer import NativeTokenizer

ASSOCIATIVE_BLEND = 0.42
TRANSITION_BLEND = 0.08
COPY_BLEND = 0.04
BASE_BLEND = 0.34
FAST_ASSOCIATIVE_BLEND = 0.06
FAST_TRANSITION_BLEND = 0.14
FAST_COPY_BLEND = 0.12
FAST_BASE_BLEND = 0.72
FAST_PREFERENCE_BLEND = 0.15
FAST_ANSWER_BLEND = 0.16
FAST_SOURCE_EVIDENCE_BLEND = 0.52
PROMPT_READOUT_LOGIT_ZSCORE_SCALE = 0.48
PROMPT_START_READOUT_CONFIDENCE_FLOOR = 0.45
ASSOCIATIVE_TOP_K = 12
ANSWER_TOP_K = 48
ANSWER_START_TOP_K = 32
MIN_COMPLETE_ANSWER_WORDS = 6
MIN_COMPLETE_MULTI_SENTENCE_WORDS = 4
ANSWER_SEQUENCE_MATCH_FLOOR = 0.27
ANSWER_START_CONFIDENCE_FLOOR = 0.45
ANSWER_START_MATCH_SUPPORT_FLOOR = 0.18
ANSWER_SEQUENCE_DISTRIBUTED_LOCK_FLOOR = 0.45
ANSWER_SEQUENCE_LOCK_FLOOR = 0.55
ANSWER_SEQUENCE_SPIKE_CONFIDENCE = 0.80
READOUT_LOGIT_ZSCORE_SCALE = 0.22
TRACE_IDENTITY_SCALE = 0.78
TRACE_IDENTITY_HASHES = (
    (1103515245, 12345, 214013, 2531011),
    (1664525, 1013904223, 22695477, 1),
    (69069, 362437, 134775813, 17),
    (134775813, 97, 1103515245, 31),
    (22695477, 911, 1664525, 73),
    (214013, 2531011, 69069, 19),
    (48271, 0, 69621, 11),
    (16807, 37, 40692, 101),
    (279470273, 173, 1299709, 53),
    (39916801, 29, 2147483629, 7),
)
PROMPT_ENVELOPE_TERMS = frozenset(
    {"system", "instruction", "user", "human", "assistant", "question", "answer"}
)
NGRAM_KEY_SEPARATOR = "\u0001"
TRANSITION_ORDERS = (10, 8, 6, 5, 4, 3, 2, 1)
DEFAULT_GENERATION_TEMPERATURE = 0.82
DEFAULT_GENERATION_TOP_K = 24
DEFAULT_GENERATION_TOP_P = 0.92
DEFAULT_REPETITION_PENALTY = 1.18
ANSWER_SEQUENCE_MAX_TOKENS = 192
ANSWER_SEQUENCE_EAGER_OVERLAP_CACHE_LIMIT = 8192
ANSWER_SEQUENCE_VARIATION_TEMPERATURE = 0.65
ANSWER_SEQUENCE_VARIATION_MATCH_LIMIT = 4
ANSWER_SEQUENCE_CREATIVE_TEMPERATURE = 1.10
ANSWER_REPLAY_PREFIX_TEMPERATURE = 0.95
ANSWER_REPLAY_PREFIX_MIN_TOKENS = 64
ANSWER_REPLAY_PREFIX_PENALTY = 0.18
CREATIVE_EARLY_POOL_TEMPERATURE = 1.05
CREATIVE_EARLY_POOL_WORD_LIMIT = 6
CREATIVE_EARLY_POOL_MAX = 8
TOOL_CALL_CONTEXT_TERMS = frozenset(
    {
        "current",
        "latest",
        "today",
        "yesterday",
        "tonight",
        "now",
        "fresh",
        "recent",
        "web",
        "search",
        "real-time",
        "price",
        "weather",
        "election",
        "news",
        "official",
        "result",
        "live",
    }
)
RUNTIME_GENERATION_HISTORY_LIMIT = 8
AVOID_SEQUENCE_MIN_TOKENS = 6
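
# Illustrative sketch (assumption, not code from this module): the FAST_* blend
# weights above are consumed much later by the probability-mixing helpers. A
# reduced, hypothetical version of that mixing step would look roughly like
# this, where each *_prior is a per-token probability vector over the vocab:
#
#     mixed = (
#         FAST_BASE_BLEND * base_probabilities
#         + FAST_ASSOCIATIVE_BLEND * associative_prior
#         + FAST_TRANSITION_BLEND * transition_prior
#         + FAST_COPY_BLEND * copy_prior
#     )
#     mixed /= mixed.sum()  # renormalize; the blend weights need not sum to 1.0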
WORD_COMPLETION_OVERFLOW_TOKENS = 16
ANSWER_FINGERPRINT_WORDS = 4
SPARSE_CONTEXT_MIN_TOKENS = 16
SPARSE_CONTEXT_TOP_K = 64
SPARSE_CONTEXT_HASH_BITS = 12
SPARSE_CONTEXT_PROBE_RADIUS = 1
SPARSE_CONTEXT_CANDIDATE_MULTIPLIER = 16
SPARSE_CONTEXT_TRACE_BLEND = 0.35
RUNTIME_ARRAY_DTYPE = np.float32 if np is not None else None


@dataclass(frozen=True, slots=True)
class CharacterCountFact:
    character: str
    word: str
    count: int
    surface_seed: int
    focused: bool


@dataclass(frozen=True, slots=True)
class GenerationTokenMeta:
    rendered: str
    stripped: str
    starts_new_word: bool
    punctuation_piece: bool
    structural_punctuation: bool
    structural_symbol: bool
    word_joiner: bool
    alphanumeric: str
    common_connector: bool


def _normalize_vector(values: Vector) -> Vector:
    total = sum(values)
    if total <= 0.0:
        return [0.0 for _ in values]
    return [value / total for value in values]


def _encode_ngram_key(tokens: tuple[str, ...]) -> str:
    return NGRAM_KEY_SEPARATOR.join(tokens)


def _decode_ngram_key(key: str) -> tuple[str, ...]:
    return tuple(part for part in key.split(NGRAM_KEY_SEPARATOR) if part)


def _last_index(values: list[str], target: str) -> int | None:
    for index in range(len(values) - 1, -1, -1):
        if values[index] == target:
            return index
    return None


def _first_index(values: list[str], target: str) -> int | None:
    for index, value in enumerate(values):
        if value == target:
            return index
    return None


@dataclass(slots=True)
class DecodeState:
    hidden_states: list[Vector]
    context_traces: list[Vector]
    combined_state: Vector
    context_tokens: list[str]
    answer_anchor_state: Vector | None = None
    answer_matches: list[tuple[float, int, int]] | None = None
    answer_start_matches: list[tuple[float, int, int]] | None = None
    answer_sequence_matches: list[tuple[float, int, int]] | None = None
    prompt_answer_prior: object | None = None
    prompt_answer_start_prior: object | None = None


@dataclass(slots=True)
class ReframrModel:
    config: ReframrConfig
    tokenizer: NativeTokenizer | None = None
    embedding_model: EmbeddingModel | None = None
    memory_units: list[AnalyticalMemoryUnit] | None = None
    ternary_scale: float = 1.0
    ternary_mask: list[int] | None = None
    ternary_mask_array: object | None = None
    readout_weights: list[list[float]] | None = None
    readout_weights_array: object | None = None
    readout_bias: Vector | None = None
    readout_bias_array: object | None = None
    prompt_answer_weights: list[list[float]] | None = None
    prompt_answer_weights_array: object | None = None
    prompt_answer_bias: Vector | None = None
    prompt_answer_bias_array: object | None = None
    prompt_answer_start_weights: list[list[float]] | None = None
    prompt_answer_start_weights_array: object | None = None
    prompt_answer_start_bias: Vector | None = None
    prompt_answer_start_bias_array: object | None = None
    trace_token_weights: Vector | None = None
    trace_token_weights_array: object | None = None
    trace_embedding_table_array: object | None = None
    preference_bias: Vector | None = None
    preference_bias_array: object | None = None
    preference_valid_mask_array: object | None = None
    state_offset: Vector | None = None
    state_offset_array: object | None = None
    associative_keys: list[Vector] | None = None
    associative_keys_array: object | None = None
    associative_key_norms: list[float] | None = None
    associative_key_norms_array: object | None = None
    associative_values: list[int] | None = None
    associative_values_array: object | None = None
    associative_valid_mask_array: object | None = None
    answer_keys: list[Vector] | None = None
    answer_keys_array: object | None = None
    answer_key_norms: list[float] | None = None
    answer_key_norms_array: object | None = None
    answer_similarity_keys_array: object | None = None
    answer_similarity_key_norms_array: object | None = None
    answer_similarity_mask_array: object | None = None
    answer_values: list[int] | None = None
    answer_values_array: object | None = None
    answer_valid_mask_array: object | None = None
    answer_start_keys: list[Vector] | None = None
    answer_start_keys_array: object | None = None
    answer_start_key_norms: list[float] | None = None
    answer_start_key_norms_array: object | None = None
    answer_start_similarity_keys_array: object | None = None
    answer_start_similarity_key_norms_array: object | None = None
    answer_start_values: list[int] | None = None
    answer_start_values_array: object | None = None
    answer_start_valid_mask_array: object | None = None
    answer_sequence_keys: list[Vector] | None = None
    answer_sequence_keys_array: object | None = None
    answer_sequence_key_norms: list[float] | None = None
    answer_sequence_key_norms_array: object | None = None
    answer_sequence_similarity_keys_array: object | None = None
    answer_sequence_similarity_key_norms_array: object | None = None
    answer_sequence_prompt_tokens: list[list[int]] | None = None
    answer_sequence_prompt_tokens_array: object | None = None
    answer_sequence_tokens: list[list[int]] | None = None
    answer_sequence_tokens_array: object | None = None
    answer_sequence_token_id_rows: list[list[int]] | None = None
    answer_sequence_prompt_weight_maps: list[dict[int, float]] | None = None
    answer_sequence_prompt_weight_norms: list[float] | None = None
    answer_sequence_prompt_bigram_sets: list[set[tuple[int, int]]] | None = None
    answer_sequence_prompt_trigram_sets: list[set[tuple[int, int, int]]] | None = None
    answer_sequence_prompt_number_sets: list[set[str]] | None = None
    answer_sequence_prompt_inverted_index: dict[int, list[int]] | None = None
    answer_sequence_prompt_specificity: dict[int, float] | None = None
    prompt_overlap_valid_token_mask_array: object | None = None
    answer_fingerprint_hashes: set[tuple[int, ...]] | None = None
    answer_fingerprint_token_lengths: set[int] | None = None
    answer_fingerprint_token_sequences_by_length: dict[int, set[tuple[int, ...]]] | None = None
    answer_sequence_prefixes_by_length: dict[int, set[tuple[int, ...]]] | None = None
    transition_tables: dict[int, dict[tuple[str, ...], dict[str, float]]] | None = None
    transition_id_tables: dict[int, dict[tuple[int, ...], tuple[object, object]]] | None = None
    transition_tensor_cache: dict[str, object] | None = None
    transition_built_orders: set[int] | None = None
    generation_token_meta_cache: dict[str, GenerationTokenMeta] | None = None
    runtime_generation_history: dict[str, list[str]] = field(default_factory=dict, repr=False)

    def fit(self, text: str) -> "ReframrModel":
        self.generation_token_meta_cache = None
        self.answer_sequence_prefixes_by_length = None
        self.tokenizer = NativeTokenizer.train(
            text,
            vocab_size=self.config.tokenizer_vocab_size,
            min_pair_frequency=self.config.tokenizer_min_pair_frequency,
            lowercase=self.config.lowercase,
        )
        tokens = self.tokenizer.encode(text)
        if len(tokens) < 2:
            raise ValueError("REFRAMR needs at least two tokens to derive a next-token readout.")
        self.embedding_model = fit_ppmi_embedding_from_tokens(
            tokens,
            embedding_dim=self.config.embedding_dim,
            window_size=self.config.window_size,
            min_frequency=self.config.min_frequency,
            max_vocab=self.config.max_vocab,
            required_tokens=self.tokenizer.vocab,
        )
        self.memory_units = [
            AnalyticalMemoryUnit(self.config.state_dim, timescale)
            for timescale in self.config.timescales
        ]
        token_counts: dict[str, float] = {}
        for token in tokens:
            token_counts[token] = token_counts.get(token, 0.0) + 1.0
        self.trace_token_weights = self._derive_trace_token_weights_from_counts(token_counts)
        raw_states, targets, target_ids = self._collect_training_examples(tokens)
        self.ternary_scale, self.ternary_mask = derive_ternary_mask_from_states(raw_states)
        analytical_states = [
            apply_ternary_mask(state, self.ternary_mask, self.ternary_scale)
            for state in raw_states
        ]
        self.associative_keys = [state[:] for state in analytical_states]
        self.associative_key_norms = [norm(state) for state in analytical_states]
        self.associative_values = target_ids[:]
        self.answer_keys = []
        self.answer_key_norms = []
        self.answer_values = []
        self.answer_start_keys = []
        self.answer_start_key_norms = []
        self.answer_start_values = []
        self.answer_sequence_keys = []
        self.answer_sequence_key_norms = []
        self.answer_sequence_prompt_tokens = []
        self.answer_sequence_tokens = []
        self.prompt_answer_weights = []
        self.prompt_answer_bias = [0.0 for _ in self.embedding_model.id_to_token]
        self.prompt_answer_start_weights = []
        self.prompt_answer_start_bias = [0.0 for _ in self.embedding_model.id_to_token]
        self.transition_tables = self._build_transition_tables(tokens)
        self._fit_answer_memory_from_text(text)
        self._refresh_answer_fingerprint_hashes()
        self.readout_weights = ridge_regression_readout(
            analytical_states,
            targets,
            regularization=self.config.regularization,
        )
        self.readout_bias = [0.0 for _ in self.embedding_model.id_to_token]
        self.preference_bias = [0.0 for _ in self.embedding_model.id_to_token]
        self.state_offset = [0.0 for _ in analytical_states[0]] if analytical_states else []
        self._refresh_numeric_caches()
        return self

    def _fit_answer_memory_from_text(self, text: str) -> None:
        assert self.tokenizer is not None
        assert self.embedding_model is not None
        if (
            self.answer_keys is None
            or self.answer_key_norms is None
            or self.answer_values is None
            or self.answer_start_keys is None
            or self.answer_start_key_norms is None
            or self.answer_start_values is None
            or self.answer_sequence_keys is None
            or self.answer_sequence_key_norms is None
            or self.answer_sequence_prompt_tokens is None
            or self.answer_sequence_tokens is None
        ):
            return
        for line in text.splitlines():
            if "" not in line:
                continue
            prompt_text, answer_text = line.split("", 1)
            prompt_text = prompt_text.strip()
            answer_text = answer_text.strip()
            if not prompt_text or not answer_text:
                continue
            prompt_tokens = self.tokenizer.encode(prompt_text) + [""]
            answer_tokens = [
                token
                for token in self.tokenizer.encode(answer_text)
                if token in self.embedding_model.token_to_id
                and (
                    token not in self.tokenizer.special_tokens
                    or token in TOOL_PROTOCOL_TOKENS
                )
            ]
            if not prompt_tokens or not answer_tokens:
                continue
            key = self._encode_context(prompt_tokens)
            key_norm = norm(key)
            if key_norm <= 0.0:
                continue
            answer_ids = [
                self.embedding_model.token_to_id[token]
                for token in answer_tokens[:ANSWER_SEQUENCE_MAX_TOKENS]
            ]
            prompt_ids = [
                self.embedding_model.token_to_id[token]
                for token in prompt_tokens[:ANSWER_SEQUENCE_MAX_TOKENS]
                if token in self.embedding_model.token_to_id
                and (
                    token not in self.tokenizer.special_tokens
                    or token in TOOL_PROTOCOL_TOKENS
                )
            ]
            if not answer_ids:
                continue
            self.answer_keys.append(key[:])
            self.answer_key_norms.append(key_norm)
            self.answer_values.append(answer_ids[0])
            self.answer_start_keys.append(key[:])
            self.answer_start_key_norms.append(key_norm)
            self.answer_start_values.append(answer_ids[0])
            self.answer_sequence_keys.append(key[:])
            self.answer_sequence_key_norms.append(key_norm)
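            # The rows appended below are right-padded with -1 sentinels to a
            # fixed ANSWER_SEQUENCE_MAX_TOKENS width so they can be stacked into
            # rectangular arrays later; consumers filter the -1 entries back out.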
            self.answer_sequence_prompt_tokens.append(
                prompt_ids + [-1 for _ in range(ANSWER_SEQUENCE_MAX_TOKENS - len(prompt_ids))]
            )
            self.answer_sequence_tokens.append(
                answer_ids + [-1 for _ in range(ANSWER_SEQUENCE_MAX_TOKENS - len(answer_ids))]
            )

    def predict_next_distribution(
        self,
        context: str,
        *,
        reasoning_mode: str | None = None,
    ) -> dict[str, float]:
        self._require_fit()
        assert self.tokenizer is not None
        assert self.embedding_model is not None
        probabilities = self.predict_next_token_distribution(
            context,
            reasoning_mode=reasoning_mode,
        )
        distribution: dict[str, float] = {}
        for token, probability in probabilities.items():
            rendered = self._render_token(token)
            distribution[rendered] = distribution.get(rendered, 0.0) + probability
        return distribution

    def predict_next_token_distribution(
        self,
        context: str,
        *,
        reasoning_mode: str | None = None,
    ) -> dict[str, float]:
        self._require_fit()
        assert self.tokenizer is not None
        assert self.embedding_model is not None
        assert self.readout_weights is not None
        active_mode = reasoning_mode or self.config.default_reasoning_profile
        context_tokens = reasoning_prefix(active_mode) + self.tokenizer.encode(context)
        return self._predict_next_token_distribution_from_tokens(context_tokens)

    def generate_text(
        self,
        context: str,
        *,
        max_tokens: int = 64,
        reasoning_mode: str | None = None,
        temperature: float = 0.0,
        top_k: int = DEFAULT_GENERATION_TOP_K,
        top_p: float = DEFAULT_GENERATION_TOP_P,
        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
        avoid_texts: Sequence[str] | None = None,
    ) -> str:
        character_count_response = self._character_count_response(
            context,
            temperature=temperature,
        )
        if character_count_response is not None:
            return character_count_response
        self._require_fit()
        self._ensure_numeric_caches()
        assert self.tokenizer is not None
        runtime_avoid_texts = self._runtime_avoid_texts(
            context,
            avoid_texts,
            temperature=temperature,
        )
        avoid_token_sequences = self._avoid_text_token_sequences(runtime_avoid_texts)
        if (
            np is not None
            and self.readout_weights_array is not None
            and self.embedding_model is not None
            and len(self.embedding_model.id_to_token) >= 1024
        ):
            generated_text = self._generate_text_fast(
                context,
                max_tokens=max_tokens,
                reasoning_mode=reasoning_mode,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                avoid_token_sequences=avoid_token_sequences,
            )
            self._remember_runtime_generation(
                context,
                generated_text,
                temperature=temperature,
            )
            return generated_text
        active_mode = reasoning_mode or self.config.default_reasoning_profile
        _, context_tokens = self._generation_prompt_tokens(context, active_mode)
        decode_state = self._build_decode_state(context_tokens)
        generated_tokens: list[str] = []
        for _ in range(max_tokens):
            distribution, _ = self._score_next_token_from_state(
                decode_state,
                include_trace=False,
                generated_tokens=generated_tokens,
                temperature=temperature,
                avoid_token_sequences=avoid_token_sequences,
            )
            forced_source_token = self._source_evidence_next_token(
                decode_state.context_tokens,
                generated_tokens,
            )
            next_token = forced_source_token or self._select_generation_token(
                distribution,
                context_tokens=decode_state.context_tokens,
                generated_tokens=generated_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                avoid_token_sequences=avoid_token_sequences,
                preserve_dominant_candidates=(
                    self._answer_decode_has_continuation(decode_state, generated_tokens)
                    or self._source_evidence_has_continuation(
                        decode_state.context_tokens,
                        generated_tokens,
                    )
                ),
            )
            if not next_token:
                break
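            # Commit the chosen token and advance the recurrent decode state,
            # then run the layered stop checks: memorized answer replay finished,
            # drift off a memorized answer path, source evidence fully emitted,
            # or the generic stop heuristic with no pending continuation left.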
            generated_tokens.append(next_token)
            self._advance_decode_state(decode_state, next_token)
            if self._should_stop_answer_sequence(decode_state, generated_tokens):
                break
            if self._should_stop_after_answer_path_drift(decode_state, generated_tokens):
                break
            if self._source_evidence_is_complete(decode_state.context_tokens, generated_tokens):
                break
            if (
                self._should_stop_generation(generated_tokens)
                and not self._answer_decode_has_continuation(decode_state, generated_tokens)
                and not self._source_evidence_has_continuation(
                    decode_state.context_tokens,
                    generated_tokens,
                )
            ):
                break
        overflow_budget = max(WORD_COMPLETION_OVERFLOW_TOKENS, max_tokens)
        while generated_tokens and overflow_budget > 0:
            has_answer_continuation = self._answer_decode_has_continuation(
                decode_state,
                generated_tokens,
            )
            has_source_continuation = self._source_evidence_has_continuation(
                decode_state.context_tokens,
                generated_tokens,
            )
            if (
                self._starts_new_word(generated_tokens[-1])
                and not has_answer_continuation
                and not has_source_continuation
            ):
                break
            distribution, _ = self._score_next_token_from_state(
                decode_state,
                include_trace=False,
                generated_tokens=generated_tokens,
                temperature=temperature,
                avoid_token_sequences=avoid_token_sequences,
            )
            forced_source_token = self._source_evidence_next_token(
                decode_state.context_tokens,
                generated_tokens,
            )
            next_token = forced_source_token or self._select_generation_token(
                distribution,
                context_tokens=decode_state.context_tokens,
                generated_tokens=generated_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                avoid_token_sequences=avoid_token_sequences,
                preserve_dominant_candidates=has_answer_continuation or has_source_continuation,
            )
            if not next_token:
                break
            if (
                self._starts_new_word(next_token)
                and not has_answer_continuation
                and not has_source_continuation
            ):
                break
            generated_tokens.append(next_token)
            self._advance_decode_state(decode_state, next_token)
            overflow_budget -= 1
        generated_text = self._finalize_generated_text(
            self._normalize_generated_tool_protocol_text(
                self._decode_tokens(generated_tokens),
                context=context,
            )
        )
        self._remember_runtime_generation(
            context,
            generated_text,
            temperature=temperature,
        )
        return generated_text

    @staticmethod
    def _character_count_fact(context: str) -> CharacterCountFact | None:
        normalized = unicodedata.normalize("NFKC", context).strip()
        tokens = ReframrModel._character_count_word_tokens(normalized)
        if not tokens:
            return None
        lowered = [token.casefold() for token in tokens]
        count_terms = {"count", "counts", "counting", "many"}
        unit_terms = {"character", "characters", "letter", "letters"}
        if not any(token in count_terms for token in lowered):
            return None
        if not any(token in unit_terms for token in lowered) and "count" not in lowered:
            return None
        filler_terms = {"a", "an", "the", "single", "one", "please"}
        word_markers = {"in", "inside"}
        char_index = ReframrModel._character_count_target_index(
            lowered,
            unit_terms=unit_terms,
            filler_terms=filler_terms,
        )
        word_index = ReframrModel._character_count_word_index(
            lowered,
            char_index=char_index,
            filler_terms=filler_terms,
            word_markers=word_markers,
        )
        if char_index is None or word_index is None:
            return None
        character = tokens[char_index]
        word = tokens[word_index]
        if len(character) != 1 or not word:
            return None
        order_offset = 0 if char_index < word_index else 1
        surface_seed = (
            (char_index + 1) * 7 + (word_index + 1) * 3 + len(tokens) + order_offset
        ) % 4
        structural_terms = (
            count_terms
            | unit_terms
            | filler_terms
            | word_markers
            | {
                "for", "of", "to", "how", "do", "does",
                "there", "are", "is", "appear", "appears", "times", "word",
            }
        )
        extra_content_tokens = [
            token
            for index, token in enumerate(lowered)
            if index not in {char_index, word_index} and token not in structural_terms
        ]
        return CharacterCountFact(
            character=character,
            word=word,
            count=word.casefold().count(character.casefold()),
            surface_seed=surface_seed,
            focused=not extra_content_tokens,
        )

    @staticmethod
    def _character_count_word_tokens(text: str) -> list[str]:
        tokens: list[str] = []
        current: list[str] = []
        for character in text:
            if character != "_" and character.isalnum():
                current.append(character)
                continue
            if current:
                tokens.append("".join(current))
                current = []
        if current:
            tokens.append("".join(current))
        return tokens

    @staticmethod
    def _character_count_target_index(
        tokens: list[str],
        *,
        unit_terms: set[str],
        filler_terms: set[str],
    ) -> int | None:
        for index, token in enumerate(tokens):
            if token not in unit_terms:
                continue
            for adjacent in (index - 1, index + 1):
                if 0 <= adjacent < len(tokens) and len(tokens[adjacent]) == 1:
                    return adjacent
            before = ReframrModel._nearest_content_index(tokens, index - 1, -1, filler_terms)
            after = ReframrModel._nearest_content_index(tokens, index + 1, 1, filler_terms)
            for candidate in (before, after):
                if candidate is not None and len(tokens[candidate]) == 1:
                    return candidate
        for index, token in enumerate(tokens):
            if token not in {"count", "counts", "counting"}:
                continue
            candidate = ReframrModel._nearest_content_index(tokens, index + 1, 1, filler_terms)
            if candidate is not None and tokens[candidate] in unit_terms:
                candidate = ReframrModel._nearest_content_index(
                    tokens, candidate + 1, 1, filler_terms
                )
            if candidate is not None and len(tokens[candidate]) == 1:
                return candidate
        return None

    @staticmethod
    def _character_count_word_index(
        tokens: list[str],
        *,
        char_index: int | None,
        filler_terms: set[str],
        word_markers: set[str],
    ) -> int | None:
        for index, token in enumerate(tokens):
            if token != "word":
                continue
            candidate = ReframrModel._nearest_content_index(tokens, index + 1, 1, filler_terms)
            if candidate is not None and candidate != char_index and len(tokens[candidate]) > 1:
                return candidate
        for index, token in enumerate(tokens):
            if token not in word_markers:
                continue
            candidate = ReframrModel._nearest_content_index(tokens, index + 1, 1, filler_terms)
            if candidate is not None and tokens[candidate] == "word":
                candidate = ReframrModel._nearest_content_index(
                    tokens, candidate + 1, 1, filler_terms
                )
            if candidate is not None and candidate != char_index and len(tokens[candidate]) > 1:
                return candidate
        skipped_terms = {
            "how", "many", "do", "does", "count", "counts", "counting",
            "letter", "letters", "character", "characters", "word",
            "there", "are", "is", "appear", "appears", "times",
        } | filler_terms | word_markers
        for index in range(len(tokens) - 1, -1, -1):
            if index == char_index:
                continue
            if len(tokens[index]) <= 1 or tokens[index] in skipped_terms:
                continue
            return index
        return None

    @staticmethod
    def _nearest_content_index(
        tokens: list[str],
        start: int,
        direction: int,
        skipped_terms: set[str],
    ) -> int | None:
        index = start
        while 0 <= index < len(tokens):
            if tokens[index] not in skipped_terms:
                return index
            index += direction
        return None

    @classmethod
    def _character_count_response(cls, context: str, *, temperature: float = 0.0) -> str | None:
        fact = cls._character_count_fact(context)
        if fact is None:
            return None
        if not fact.focused:
            return None
        return cls._render_character_count_fact(fact, temperature=temperature)

    @staticmethod
    def _render_character_count_fact(
        fact: CharacterCountFact,
        *,
        temperature: float = 0.0,
    ) -> str:
        character_label = f"'{fact.character}'"
        word_label = f"'{fact.word}'"
        character_noun = "character" if fact.count == 1 else "characters"
        return f"{word_label} has {fact.count} {character_label} {character_noun}."

    @classmethod
    def _runtime_source_grounded_response(cls, context: str) -> str | None:
        return None

    @classmethod
    def _runtime_source_records(cls, context: str) -> list[tuple[str, str, str]]:
        records: list[tuple[str, str, str]] = []
        marker = ""
        search_from = 0
        while True:
            source_start = context.find(marker, search_from)
            if source_start < 0:
                break
            content_start = source_start + len(marker)
            content_end = cls._runtime_source_record_end(context, content_start)
            raw_record = context[content_start:content_end].strip()
            record = cls._parse_runtime_source_record(raw_record)
            if record is not None:
                records.append(record)
            search_from = max(content_end, content_start + 1)
        return records

    @staticmethod
    def _runtime_source_record_end(context: str, start: int) -> int:
        boundaries = [
            position
            for marker in (
                "\n",
                "",
                "",
                "",
                "",
                "",
                "",
            )
            if (position := context.find(marker, start)) >= 0
        ]
        return min(boundaries) if boundaries else len(context)

    @staticmethod
    def _parse_runtime_source_record(raw_record: str) -> tuple[str, str, str] | None:
        if not raw_record:
            return None
        pieces = [piece.strip() for piece in raw_record.split("|", 2)]
        if len(pieces) >= 3:
            title, url, snippet = pieces[0], pieces[1], pieces[2]
        else:
            title, url, snippet = "the provided source", "", pieces[-1]
        title = ReframrModel._clean_runtime_source_field(title) or "the provided source"
        url = ReframrModel._clean_runtime_source_field(url)
        snippet = ReframrModel._clean_runtime_source_field(snippet)
        if not snippet:
            return None
        return title, url, snippet

    @staticmethod
    def _clean_runtime_source_field(text: str) -> str:
        normalized = unicodedata.normalize("NFKC", text)
        cleaned = " ".join(normalized.split())
        return cleaned.strip(" \t\r\n|")

    def _generate_text_fast(
        self,
        context: str,
        *,
        max_tokens: int,
        reasoning_mode: str | None,
        temperature: float,
        top_k: int,
        top_p: float,
        repetition_penalty: float,
        avoid_token_sequences: Sequence[Sequence[str]] | None = None,
    ) -> str:
        assert self.tokenizer is not None
        active_mode = reasoning_mode or self.config.default_reasoning_profile
        _, context_tokens = self._generation_prompt_tokens(context, active_mode)
        decode_state = self._build_decode_state(context_tokens)
        generated_tokens: list[str] = []
        for _ in range(max_tokens):
            probabilities, _ = self._score_next_token_array_from_state(
                decode_state,
                include_associative=not generated_tokens,
                generated_tokens=generated_tokens,
                temperature=temperature,
                avoid_token_sequences=avoid_token_sequences,
            )
            forced_source_token = self._source_evidence_next_token(
                decode_state.context_tokens,
                generated_tokens,
            )
            next_token = forced_source_token or self._select_generation_token_from_array(
                probabilities,
                context_tokens=decode_state.context_tokens,
                generated_tokens=generated_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                avoid_token_sequences=avoid_token_sequences,
                preserve_dominant_candidates=(
                    self._answer_decode_has_continuation(decode_state, generated_tokens)
                    or self._source_evidence_has_continuation(
                        decode_state.context_tokens,
                        generated_tokens,
                    )
                ),
            )
            if not next_token:
                break
            generated_tokens.append(next_token)
            self._advance_decode_state(decode_state, next_token)
            if self._should_stop_answer_sequence(decode_state, generated_tokens):
                break
            if self._should_stop_after_answer_path_drift(decode_state, generated_tokens):
                break
            if self._source_evidence_is_complete(decode_state.context_tokens, generated_tokens):
                break
            if (
                self._should_stop_generation(generated_tokens)
                and not self._answer_decode_has_continuation(decode_state, generated_tokens)
                and not self._source_evidence_has_continuation(
                    decode_state.context_tokens,
                    generated_tokens,
                )
            ):
                break
        overflow_budget = max(WORD_COMPLETION_OVERFLOW_TOKENS, max_tokens)
        while generated_tokens and overflow_budget > 0:
            has_answer_continuation = self._answer_decode_has_continuation(
                decode_state,
                generated_tokens,
            )
            has_source_continuation = self._source_evidence_has_continuation(
                decode_state.context_tokens,
                generated_tokens,
            )
            if (
                self._starts_new_word(generated_tokens[-1])
                and not has_answer_continuation
                and not has_source_continuation
            ):
                break
            probabilities, _ = self._score_next_token_array_from_state(
                decode_state,
                include_associative=False,
                generated_tokens=generated_tokens,
                temperature=temperature,
                avoid_token_sequences=avoid_token_sequences,
            )
            forced_source_token = self._source_evidence_next_token(
                decode_state.context_tokens,
                generated_tokens,
            )
            next_token = forced_source_token or self._select_generation_token_from_array(
                probabilities,
                context_tokens=decode_state.context_tokens,
                generated_tokens=generated_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                avoid_token_sequences=avoid_token_sequences,
                preserve_dominant_candidates=has_answer_continuation or has_source_continuation,
            )
            if not next_token:
                break
            if (
                self._starts_new_word(next_token)
                and not has_answer_continuation
                and not has_source_continuation
            ):
                break
            generated_tokens.append(next_token)
            self._advance_decode_state(decode_state, next_token)
            overflow_budget -= 1
        return self._finalize_generated_text(
            self._normalize_generated_tool_protocol_text(
                self._decode_tokens(generated_tokens),
                context=context,
            )
        )

    def trace_next_token(
        self,
        context: str,
        *,
        reasoning_mode: str | None = None,
        top_k: int = 5,
    ) -> dict[str, object]:
        self._require_fit()
        assert self.tokenizer is not None
        active_mode = reasoning_mode or self.config.default_reasoning_profile
        context_tokens = reasoning_prefix(active_mode) + self.tokenizer.encode(context)
        _, trace = self._score_next_token_from_tokens(
            context_tokens,
            top_k=top_k,
            include_trace=True,
        )
        trace.update(
            {
                "context": context,
                "reasoning_mode": active_mode,
                "reasoning_tokens": reasoning_prefix(active_mode),
                "context_tokens": context_tokens,
            }
        )
        return trace

    def trace_generation(
        self,
        context: str,
        *,
        max_tokens: int = 16,
        reasoning_mode: str | None = None,
        top_k: int = 5,
        temperature: float = 0.0,
        top_p: float = DEFAULT_GENERATION_TOP_P,
        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
    ) -> dict[str, object]:
        character_count_response = self._character_count_response(
            context,
            temperature=temperature,
        )
        if character_count_response is not None:
            active_mode = reasoning_mode or self.config.default_reasoning_profile
            prompt = context if "" in context else f"{context} "
            return {
                "context": context,
                "prompt": prompt,
                "reasoning_mode": active_mode,
                "reasoning_tokens": reasoning_prefix(active_mode),
                "generation_policy": {
                    "temperature": temperature,
                    "top_k": max(DEFAULT_GENERATION_TOP_K, top_k),
                    "top_p": top_p,
                    "repetition_penalty": repetition_penalty,
                },
                "prompt_tokens": [],
                "generated_tokens": [],
                "generated_text": character_count_response,
                "generated_token_count": len(character_count_response.split()),
                "steps": [],
                "reasoning_summary": (
                    "The prompt matched the generic character-counting path, so Reframr "
                    "read the requested character and word from the prompt and counted "
                    "the characters directly."
                ),
            }
        self._require_fit()
        assert self.tokenizer is not None
        active_mode = reasoning_mode or self.config.default_reasoning_profile
        prompt, context_tokens = self._generation_prompt_tokens(context, active_mode)
        decode_state = self._build_decode_state(context_tokens)
        prompt_tokens = decode_state.context_tokens[:]
        generated_tokens: list[str] = []
        steps: list[dict[str, object]] = []
        for step_index in range(1, max_tokens + 1):
            distribution, trace = self._score_next_token_from_state(
                decode_state,
                top_k=top_k,
                include_trace=True,
                generated_tokens=generated_tokens,
                temperature=temperature,
            )
            forced_source_token = self._source_evidence_next_token(
                decode_state.context_tokens,
                generated_tokens,
            )
            next_token = forced_source_token or self._select_generation_token(
                distribution,
                context_tokens=decode_state.context_tokens,
                generated_tokens=generated_tokens,
                temperature=temperature,
                top_k=max(DEFAULT_GENERATION_TOP_K, top_k),
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                preserve_dominant_candidates=(
                    self._answer_decode_has_continuation(decode_state, generated_tokens)
                    or self._source_evidence_has_continuation(
                        decode_state.context_tokens,
                        generated_tokens,
                    )
                ),
            )
            if not next_token:
                break
            generated_tokens.append(next_token)
            self._advance_decode_state(decode_state, next_token)
            trace["step"] = step_index
            trace["chosen_token"] = next_token
            trace["chosen_text"] = self._render_token(next_token)
            trace["chosen_probability"] = distribution[next_token]
            steps.append(trace)
            if self._should_stop_answer_sequence(decode_state, generated_tokens):
                break
            if self._should_stop_after_answer_path_drift(decode_state, generated_tokens):
                break
            if self._source_evidence_is_complete(decode_state.context_tokens, generated_tokens):
                break
            if (
                self._should_stop_generation(generated_tokens)
                and not self._answer_decode_has_continuation(decode_state, generated_tokens)
                and not self._source_evidence_has_continuation(
                    decode_state.context_tokens,
                    generated_tokens,
                )
            ):
                break
        overflow_budget = max(WORD_COMPLETION_OVERFLOW_TOKENS, max_tokens)
        while generated_tokens and overflow_budget > 0:
            has_answer_continuation = self._answer_decode_has_continuation(
                decode_state,
                generated_tokens,
            )
            has_source_continuation = self._source_evidence_has_continuation(
                decode_state.context_tokens,
                generated_tokens,
            )
            if (
                self._starts_new_word(generated_tokens[-1])
                and not has_answer_continuation
                and not has_source_continuation
            ):
                break
            distribution, trace = self._score_next_token_from_state(
                decode_state,
                top_k=top_k,
                include_trace=True,
                generated_tokens=generated_tokens,
                temperature=temperature,
            )
            forced_source_token = self._source_evidence_next_token(
                decode_state.context_tokens,
                generated_tokens,
            )
            next_token = forced_source_token or self._select_generation_token(
                distribution,
                context_tokens=decode_state.context_tokens,
                generated_tokens=generated_tokens,
                temperature=temperature,
                top_k=max(DEFAULT_GENERATION_TOP_K, top_k),
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                preserve_dominant_candidates=has_answer_continuation or has_source_continuation,
            )
            if not next_token:
                break
            if (
                self._starts_new_word(next_token)
                and not has_answer_continuation
                and not has_source_continuation
            ):
                break
            generated_tokens.append(next_token)
            self._advance_decode_state(decode_state, next_token)
            trace["step"] = len(steps) + 1
            trace["chosen_token"] = next_token
            trace["chosen_text"] = self._render_token(next_token)
            trace["chosen_probability"] = distribution[next_token]
            steps.append(trace)
            if self._should_stop_answer_sequence(decode_state, generated_tokens):
                break
            if self._should_stop_after_answer_path_drift(decode_state, generated_tokens):
                break
            overflow_budget -= 1
        return {
            "context": context,
            "prompt": prompt,
            "reasoning_mode": active_mode,
            "reasoning_tokens": reasoning_prefix(active_mode),
            "generation_policy": {
                "temperature": temperature,
                "top_k": max(DEFAULT_GENERATION_TOP_K, top_k),
                "top_p": top_p,
                "repetition_penalty": repetition_penalty,
            },
            "prompt_tokens": prompt_tokens,
            "generated_tokens": generated_tokens,
            "generated_text": self._finalize_generated_text(
                self._normalize_generated_tool_protocol_text(
                    self._decode_tokens(generated_tokens),
                    context=context,
                )
            ),
            "generated_token_count": len(generated_tokens),
            "steps": steps,
        }

    def _generation_prompt_tokens(self, context: str, active_mode: str) -> tuple[str, list[str]]:
        assert self.tokenizer is not None
        prompt = context if "" in context else f"{context} "
        prefix = reasoning_prefix(active_mode)
        prompt_tokens = self.tokenizer.encode(prompt)
        if (
            "" in prompt_tokens
            and "" not in prompt_tokens
            and "" not in prefix
        ):
            prompt_tokens = [""] + prompt_tokens
        return prompt, prefix + prompt_tokens

    def _predict_next_token_distribution_from_tokens(
        self,
        context_tokens: list[str],
    ) -> dict[str, float]:
        decode_state = self._build_decode_state(context_tokens)
        return self._predict_next_token_distribution_from_state(decode_state)

    def _predict_next_token_distribution_from_state(
        self,
        decode_state: DecodeState,
    ) -> dict[str, float]:
        probabilities, _ = self._score_next_token_from_state(
            decode_state,
            include_trace=False,
        )
        return probabilities

    @staticmethod
    def _answer_memory_is_confident(
        *,
        answer_sequence_match_confidence: float,
        answer_start_confidence: float,
        generated_count: int,
    ) -> bool:
        if generated_count > 0:
            return answer_sequence_match_confidence >= ANSWER_SEQUENCE_MATCH_FLOOR
        if answer_sequence_match_confidence >= ANSWER_SEQUENCE_DISTRIBUTED_LOCK_FLOOR:
            return True
        if answer_sequence_match_confidence >= ANSWER_SEQUENCE_MATCH_FLOOR:
            return True
        if answer_start_confidence >= ANSWER_START_CONFIDENCE_FLOOR + ANSWER_SEQUENCE_MATCH_FLOOR:
            return True
        return (
            answer_sequence_match_confidence >= ANSWER_START_MATCH_SUPPORT_FLOOR
            and answer_start_confidence >= ANSWER_START_CONFIDENCE_FLOOR
            and answer_start_confidence
            <= answer_sequence_match_confidence + ANSWER_START_CONFIDENCE_FLOOR
        )

    @staticmethod
    def _answer_sequence_should_lock(
        *,
        answer_sequence_confidence: float,
        answer_sequence_match_confidence: float,
        has_answer_sequence_prior: bool,
    ) -> bool:
        if not has_answer_sequence_prior or answer_sequence_confidence <= 0.0:
            return False
        if answer_sequence_match_confidence >= ANSWER_SEQUENCE_LOCK_FLOOR:
            return True
        if (
            answer_sequence_match_confidence >= ANSWER_SEQUENCE_MATCH_FLOOR
            and 0.30 <= answer_sequence_confidence <= 0.65
        ):
            return True
        return (
            answer_sequence_match_confidence >= ANSWER_SEQUENCE_DISTRIBUTED_LOCK_FLOOR
            and answer_sequence_confidence <= ANSWER_SEQUENCE_SPIKE_CONFIDENCE
        )

    def _prompt_start_readout_is_confident(
        self,
        prior: object,
        tokens: Sequence[str] | None = None,
    ) -> bool:
        if self.tokenizer is None:
            return False
        if tokens is None:
            if self.embedding_model is None:
                return False
            tokens = self.embedding_model.id_to_token
        values = prior.tolist() if hasattr(prior, "tolist") else list(prior)
        if not values or not tokens:
            return False
        limit = min(len(values), len(tokens))
        if limit <= 0:
            return False
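        # Confidence gate: trust the start readout only when its argmax token
        # both clears PROMPT_START_READOUT_CONFIDENCE_FLOOR and looks like a
        # clean word start (alphanumeric, neither structural punctuation nor a
        # structural symbol).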
        best_index = max(range(limit), key=lambda index: float(values[index]))
        best_probability = float(values[best_index])
        if best_probability < PROMPT_START_READOUT_CONFIDENCE_FLOOR:
            return False
        meta = self._generation_token_meta(tokens[best_index])
        return (
            meta.starts_new_word
            and bool(meta.alphanumeric)
            and not meta.structural_punctuation
            and not meta.structural_symbol
        )

    def _locked_answer_sequence_matches(
        self,
        matches: list[tuple[float, int, int]],
        *,
        generated_tokens: list[str],
        temperature: float,
        answer_sequence_confidence: float,
        answer_sequence_match_confidence: float,
    ) -> list[tuple[float, int, int]]:
        if not matches:
            return []
        if generated_tokens:
            aligned_matches = [
                match
                for match in matches[:ANSWER_START_TOP_K]
                if self._answer_sequence_match_has_continuation(
                    match,
                    generated_tokens,
                )
            ]
            return aligned_matches[:ANSWER_SEQUENCE_VARIATION_MATCH_LIMIT] or matches[:1]
        best_similarity = matches[0][0]
        near_match_floor = max(ANSWER_SEQUENCE_MATCH_FLOOR, best_similarity - 0.08)
        varied = [
            match
            for match in matches[:ANSWER_SEQUENCE_VARIATION_MATCH_LIMIT]
            if match[0] >= near_match_floor
        ]
        if (
            temperature < ANSWER_SEQUENCE_VARIATION_TEMPERATURE
            and answer_sequence_match_confidence >= ANSWER_SEQUENCE_LOCK_FLOOR
            and len(varied) <= 1
        ):
            return matches[:1]
        return varied or matches[:1]

    @staticmethod
    def _answer_sequence_matches_are_ambiguous(
        matches: Sequence[tuple[float, int, int]],
    ) -> bool:
        if len(matches) < 2:
            return False
        best_similarity = float(matches[0][0])
        if best_similarity < ANSWER_SEQUENCE_MATCH_FLOOR:
            return False
        near_match_floor = max(ANSWER_SEQUENCE_MATCH_FLOOR, best_similarity - 0.08)
        return any(
            float(match[0]) >= near_match_floor
            for match in matches[1:ANSWER_SEQUENCE_VARIATION_MATCH_LIMIT]
        )

    def _answer_sequence_match_has_continuation(
        self,
        match: tuple[float, int, int],
        generated_tokens: list[str],
    ) -> bool:
        if (
            self.embedding_model is None
            or self.answer_sequence_tokens is None
            or not generated_tokens
        ):
            return False
        similarity, sequence_index, _ = match
        if (
            similarity < ANSWER_SEQUENCE_MATCH_FLOOR
            or sequence_index >= len(self.answer_sequence_tokens)
        ):
            return False
        generated_ids = [
            self.embedding_model.token_to_id[token]
            for token in generated_tokens
            if token in self.embedding_model.token_to_id
        ]
        if not generated_ids:
            return False
        row = self.answer_sequence_tokens[sequence_index]
        token_ids = [
            int(value)
            for value in (row.tolist() if hasattr(row, "tolist") else row)
            if int(value) >= 0
        ]
        if not token_ids:
            return False
        next_token_id = self._next_sequence_token_id(token_ids, generated_ids)
        if next_token_id is None:
            return False
        token = self.embedding_model.id_to_token[next_token_id]
        return self._allowed_answer_sequence_token(token, generated_tokens)

    def _allowed_answer_sequence_token(
        self,
        token: str,
        generated_tokens: list[str],
    ) -> bool:
        assert self.tokenizer is not None
        if token == self.tokenizer.unk_token:
            return False
        if token in self.tokenizer.special_tokens:
            return self._allowed_generation_token(token, generated_tokens)
        return True

    def _should_relax_answer_sequence_memory(
        self,
        matches: list[tuple[float, int, int]],
        answer_sequence_prior: Sequence[float],
        *,
        generated_tokens: list[str],
        temperature: float,
    ) -> bool:
        if temperature < ANSWER_SEQUENCE_CREATIVE_TEMPERATURE or not matches:
            return False
        if self._is_inside_tool_protocol_continuation(generated_tokens):
            return False
        if self._answer_sequence_prior_prefers_tool_protocol(answer_sequence_prior):
            return False
        return True

    def _answer_sequence_prior_prefers_tool_protocol(
        self,
        answer_sequence_prior: Sequence[float],
    ) -> bool:
        if self.embedding_model is None or not answer_sequence_prior:
            return False
        best_index = -1
        best_value = 0.0
        for index, value in enumerate(answer_sequence_prior):
            if value > best_value:
                best_index = index
                best_value = float(value)
        return (
            0 <= best_index < len(self.embedding_model.id_to_token)
            and best_value > 0.0
            and self.embedding_model.id_to_token[best_index] in TOOL_PROTOCOL_TOKENS
        )

    @staticmethod
    def _answer_start_blend_weights(
        *,
        answer_sequence_match_confidence: float,
        temperature: float = 0.0,
    ) -> dict[str, float]:
        if temperature >= ANSWER_SEQUENCE_CREATIVE_TEMPERATURE:
            return {
                "prompt_answer_start": 0.46,
                "prompt_answer": 0.24,
                "answer_sequence": 0.10,
                "answer_start": 0.20,
            }
        if answer_sequence_match_confidence >= ANSWER_SEQUENCE_LOCK_FLOOR:
            return {
                "prompt_answer_start": 0.35,
                "prompt_answer": 0.10,
                "answer_sequence": 0.45,
                "answer_start": 0.10,
            }
        if answer_sequence_match_confidence >= 0.40:
            return {
                "prompt_answer_start": 0.25,
                "prompt_answer": 0.12,
                "answer_sequence": 0.53,
                "answer_start": 0.10,
            }
        return {
            "prompt_answer_start": 0.08,
            "prompt_answer": 0.10,
            "answer_sequence": 0.02,
            "answer_start": 0.80,
        }

    def _score_next_token_from_tokens(
        self,
        context_tokens: list[str],
        *,
        top_k: int = 5,
        include_trace: bool = True,
    ) -> tuple[dict[str, float], dict[str, object]]:
        decode_state = self._build_decode_state(context_tokens)
        return self._score_next_token_from_state(
            decode_state,
            top_k=top_k,
            include_trace=include_trace,
        )

    def _score_next_token_from_state(
        self,
        decode_state: DecodeState,
        *,
        top_k: int = 5,
        include_trace: bool = True,
        generated_tokens: list[str] | None = None,
        temperature: float = 0.0,
        avoid_token_sequences: Sequence[Sequence[str]] | None = None,
    ) -> tuple[dict[str, float], dict[str, object]]:
        assert self.embedding_model is not None
        assert self.readout_weights is not None
        generated_tokens = generated_tokens or []
        state = self._masked_decode_state(decode_state)
        logits = self._apply_readout_fast(state)
        base_probabilities = self._calibrated_softmax(logits)
        if decode_state.answer_matches is None:
            decode_state.answer_matches = self._score_answer_matches(
                decode_state.answer_anchor_state,
                limit=max(ANSWER_TOP_K, top_k) if include_trace else ANSWER_TOP_K,
            )
        answer_matches = decode_state.answer_matches
        if decode_state.answer_start_matches is None:
            decode_state.answer_start_matches = self._score_answer_start_matches(
                decode_state.answer_anchor_state,
                limit=max(ANSWER_START_TOP_K, top_k) if include_trace else ANSWER_START_TOP_K,
            )
        answer_start_matches = decode_state.answer_start_matches
        if decode_state.answer_sequence_matches is None:
            decode_state.answer_sequence_matches = self._score_answer_sequence_matches(
                decode_state.answer_anchor_state,
                decode_state.context_tokens,
                limit=max(ANSWER_START_TOP_K, top_k) if include_trace else ANSWER_START_TOP_K,
            )
        answer_sequence_matches = self._filter_avoided_answer_sequence_matches(
            decode_state.answer_sequence_matches,
            avoid_token_sequences,
        )
        if not answer_start_matches and answer_sequence_matches:
            answer_start_matches = self._answer_start_matches_from_sequences(
                answer_sequence_matches
            )
            decode_state.answer_start_matches = answer_start_matches
        answer_prior = self._answer_prior_from_matches(answer_matches, generated_tokens)
        answer_start_prior = self._answer_prior_from_matches(answer_start_matches, generated_tokens)
        answer_sequence_prior = self._answer_sequence_prior_from_matches(
            answer_sequence_matches,
            generated_tokens,
            temperature=temperature,
        )
        answer_sequence_confidence = max(answer_sequence_prior) if answer_sequence_prior else 0.0
        answer_sequence_match_confidence = (
            answer_sequence_matches[0][0] if answer_sequence_matches else 0.0
        )
        answer_start_confidence = answer_start_matches[0][0] if answer_start_matches else 0.0
        prompt_copy_is_distinctive = (
            not generated_tokens
            and self._prompt_copy_evidence_is_distinctive(decode_state.context_tokens)
        )
        answer_memory_confident = self._answer_memory_is_confident(
            answer_sequence_match_confidence=answer_sequence_match_confidence,
            answer_start_confidence=answer_start_confidence,
            generated_count=len(generated_tokens),
        )
        if prompt_copy_is_distinctive and not answer_sequence_matches:
            answer_memory_confident = False
        has_answer_sequence_prior = any(value > 0.0 for value in answer_sequence_prior)
        if not answer_memory_confident:
            zero_prior = [0.0 for _ in self.embedding_model.id_to_token]
            answer_prior = zero_prior
            answer_start_prior = zero_prior
            answer_sequence_prior = zero_prior
            answer_sequence_confidence = 0.0
            has_answer_sequence_prior = False
        answer_locked = self._answer_sequence_should_lock(
            answer_sequence_confidence=answer_sequence_confidence,
            answer_sequence_match_confidence=answer_sequence_match_confidence,
            has_answer_sequence_prior=has_answer_sequence_prior,
        ) or (
            bool(generated_tokens)
            and temperature < ANSWER_SEQUENCE_CREATIVE_TEMPERATURE
            and self._answer_sequence_has_continuation(
                generated_tokens,
                answer_sequence_matches,
            )
        )
        if self._should_relax_answer_sequence_memory(
            answer_sequence_matches,
            answer_sequence_prior,
            generated_tokens=generated_tokens,
            temperature=temperature,
        ):
            answer_locked = False
        if decode_state.prompt_answer_prior is None:
            decode_state.prompt_answer_prior = self._prompt_answer_readout_prior(
                decode_state.answer_anchor_state,
                start=False,
            )
        prompt_answer_prior = decode_state.prompt_answer_prior
        prompt_answer_start_prior = (
            decode_state.prompt_answer_start_prior
            if not generated_tokens
            else [0.0 for _ in self.embedding_model.id_to_token]
        )
        if not generated_tokens and prompt_answer_start_prior is None:
            decode_state.prompt_answer_start_prior = self._prompt_answer_readout_prior(
                decode_state.answer_anchor_state,
                start=True,
            )
            prompt_answer_start_prior = decode_state.prompt_answer_start_prior
        prompt_start_readout_confident = (
            not generated_tokens
            and prompt_answer_start_prior is not None
            and self._prompt_start_readout_is_confident(prompt_answer_start_prior)
        )
        prompt_readout_supported = answer_memory_confident and (
            answer_sequence_match_confidence >= ANSWER_SEQUENCE_MATCH_FLOOR
            or answer_start_confidence >= ANSWER_START_CONFIDENCE_FLOOR
        )
        if prompt_start_readout_confident:
            prompt_readout_supported = True
        if not prompt_readout_supported:
            prompt_answer_prior = [0.0 for _ in self.embedding_model.id_to_token]
            prompt_answer_start_prior = [0.0 for _ in self.embedding_model.id_to_token]
        use_answer_start = not generated_tokens and (
            any(value > 0.0 for value in answer_start_prior)
            or any(value > 0.0 for value in prompt_answer_start_prior)
        )
        if answer_locked:
            locked_matches = self._locked_answer_sequence_matches(
                answer_sequence_matches,
                generated_tokens=generated_tokens,
                temperature=temperature,
                answer_sequence_confidence=answer_sequence_confidence,
                answer_sequence_match_confidence=answer_sequence_match_confidence,
            )
            answer_sequence_prior = self._answer_sequence_prior_from_matches(
                locked_matches,
                generated_tokens,
                temperature=temperature,
            )
            answer_prior = answer_sequence_prior
        elif use_answer_start:
            start_blend = self._answer_start_blend_weights(
                answer_sequence_match_confidence=answer_sequence_match_confidence,
                temperature=temperature,
            )
            answer_prior = self._weighted_prior_sum(
                [
                    (start_blend["prompt_answer_start"], prompt_answer_start_prior),
                    (start_blend["prompt_answer"], prompt_answer_prior),
                    (start_blend["answer_sequence"], answer_sequence_prior),
                    (start_blend["answer_start"], answer_start_prior),
                ],
            )
        elif any(value > 0.0 for value in answer_sequence_prior):
            sequence_weight = (
                0.10 if temperature >= ANSWER_SEQUENCE_CREATIVE_TEMPERATURE else 0.30
            )
            answer_prior = self._weighted_prior_sum(
                [
                    (0.55, prompt_answer_prior),
                    (sequence_weight, answer_sequence_prior),
                    (0.20, answer_prior),
                ],
            )
        elif any(value > 0.0 for value in prompt_answer_prior):
            answer_prior = self._weighted_prior_sum(
                [
                    (0.65, prompt_answer_prior),
                    (0.35, answer_prior),
                ],
            )
        answer_guided = max(answer_prior) >= 0.08 if answer_prior else False
        associative_matches = (
            []
            if use_answer_start or answer_guided
            else self._score_associative_matches(
                state,
                limit=max(ASSOCIATIVE_TOP_K, top_k) if include_trace else ASSOCIATIVE_TOP_K,
            )
        )
        associative_prior = (
            [0.0 for _ in self.embedding_model.id_to_token]
            if use_answer_start or answer_guided
            else self._associative_prior_from_matches(associative_matches)
        )
        transition_prior, transition_order = self._transition_prior_with_order(
            decode_state.context_tokens
        )
        copy_prior = self._copy_prior(decode_state.context_tokens)
        source_evidence_prior = self._source_evidence_prior(
            decode_state.context_tokens,
            generated_tokens,
        )
        preference_prior = self._preference_prior()
        probabilities, blend_weights = self._blend_probabilities(
            base_probabilities,
            answer_prior,
            associative_prior,
            transition_prior,
            copy_prior,
            source_evidence_prior,
            preference_prior,
            transition_order=transition_order,
            generated_count=len(generated_tokens),
            answer_locked=answer_locked,
            answer_guided_start=use_answer_start,
            copy_guided_start=prompt_copy_is_distinctive,
        )
        probabilities = self._focus_answer_start_probabilities(
            probabilities,
            answer_sequence_prior,
            generated_tokens=generated_tokens,
            answer_memory_confident=answer_memory_confident,
            has_answer_sequence_prior=has_answer_sequence_prior,
            sequence_focus_allowed=answer_sequence_match_confidence >= 0.40 or answer_locked,
            temperature=temperature,
        )
        distribution = {
            token: probabilities[index]
            for index, token in enumerate(self.embedding_model.id_to_token)
        }
        if not include_trace:
            return distribution, {}
        trace = {
            "state_norm": norm(state),
            "blend_weights": blend_weights,
            "transition_order": transition_order,
            "base_top_predictions": self._top_entries_from_vector(base_probabilities, top_k),
            "answer_top_predictions": self._top_entries_from_vector(answer_prior, top_k),
            "prompt_answer_top_predictions": self._top_entries_from_vector(
                prompt_answer_prior, top_k
            ),
            "prompt_answer_start_top_predictions": self._top_entries_from_vector(
                prompt_answer_start_prior, top_k
            ),
            "answer_start_top_predictions": self._top_entries_from_vector(
                answer_start_prior, top_k
            ),
            "answer_sequence_top_predictions": self._top_entries_from_vector(
                answer_sequence_prior, top_k
            ),
            "associative_top_predictions": self._top_entries_from_vector(
                associative_prior, top_k
            ),
            "transition_top_predictions": self._top_entries_from_vector(transition_prior, top_k),
            "copy_top_predictions": self._top_entries_from_vector(copy_prior, top_k),
            "source_evidence_top_predictions": self._top_entries_from_vector(
                source_evidence_prior, top_k
            ),
            "preference_top_predictions": self._top_entries_from_vector(preference_prior, top_k),
            "final_top_predictions": self._top_entries_from_vector(probabilities, top_k),
            "associative_matches": [
                {
                    "example_index": example_index,
                    "similarity": similarity,
                    **self._token_entry(token_id, similarity),
                }
                for similarity, token_id, example_index in associative_matches[:top_k]
            ],
            "answer_matches": [
                {
                    "example_index": example_index,
                    "similarity": similarity,
                    **self._token_entry(token_id, similarity),
                }
                for similarity, token_id, example_index in answer_matches[:top_k]
            ],
            "answer_start_matches": [
                {
                    "example_index": example_index,
                    "similarity": similarity,
                    **self._token_entry(token_id, similarity),
                }
                for similarity, token_id, example_index in answer_start_matches[:top_k]
            ],
            "answer_sequence_matches": [
                {
                    "example_index": example_index,
                    "similarity": similarity,
                }
                for similarity, _, example_index in answer_sequence_matches[:top_k]
            ],
            "reasoning_summary": self._build_reasoning_summary(
                transition_order,
                blend_weights,
            ),
        }
        return distribution, trace

    def _score_next_token_array_from_state(
        self,
        decode_state: DecodeState,
        *,
        include_associative: bool,
        generated_tokens: list[str] | None = None,
        temperature: float = 0.0,
        avoid_token_sequences: Sequence[Sequence[str]] | None = None,
    ) -> tuple[object, dict[str, float]]:
        assert np is not None
        assert self.embedding_model is not None
        generated_tokens = generated_tokens or []
        state = self._masked_decode_state_array(decode_state)
        logits = self._apply_readout_array(state)
        base_probabilities = self._calibrated_softmax_array(logits)
        if decode_state.answer_matches is None:
            decode_state.answer_matches = self._score_answer_matches(
                decode_state.answer_anchor_state
            )
        answer_prior = np.asarray(
            self._answer_prior_from_matches(
                decode_state.answer_matches,
                generated_tokens,
            ),
            dtype=np.float64,
        )
        if decode_state.answer_sequence_matches is None:
            decode_state.answer_sequence_matches = self._score_answer_sequence_matches(
                decode_state.answer_anchor_state,
                decode_state.context_tokens,
            )
        answer_sequence_matches = self._filter_avoided_answer_sequence_matches(
            decode_state.answer_sequence_matches,
            avoid_token_sequences,
        )
        if not decode_state.answer_start_matches and answer_sequence_matches:
            decode_state.answer_start_matches = self._answer_start_matches_from_sequences(
                answer_sequence_matches
            )
        answer_sequence_prior = np.asarray(
            self._answer_sequence_prior_from_matches(
                answer_sequence_matches,
                generated_tokens,
                temperature=temperature,
            ),
            dtype=np.float64,
        )
        answer_sequence_confidence = (
            float(answer_sequence_prior.max()) if answer_sequence_prior.size else 0.0
        )
        answer_sequence_match_confidence = (
            answer_sequence_matches[0][0] if answer_sequence_matches else 0.0
        )
        if not generated_tokens and decode_state.answer_start_matches is None:
            decode_state.answer_start_matches = self._score_answer_start_matches(
                decode_state.answer_anchor_state
            )
        answer_start_confidence = (
            decode_state.answer_start_matches[0][0]
            if not generated_tokens and decode_state.answer_start_matches
            else 0.0
        )
        prompt_copy_is_distinctive = (
            not generated_tokens
            and self._prompt_copy_evidence_is_distinctive(decode_state.context_tokens)
        )
        answer_memory_confident = self._answer_memory_is_confident(
            answer_sequence_match_confidence=answer_sequence_match_confidence,
            answer_start_confidence=answer_start_confidence,
            generated_count=len(generated_tokens),
        )
        if prompt_copy_is_distinctive and not answer_sequence_matches:
            answer_memory_confident = False
        has_answer_sequence_prior = bool(np.any(answer_sequence_prior > 0.0))
        if not answer_memory_confident:
            answer_prior = np.zeros_like(base_probabilities)
            answer_sequence_prior = np.zeros_like(base_probabilities)
            answer_sequence_confidence = 0.0
            has_answer_sequence_prior = False
        answer_locked = self._answer_sequence_should_lock(
            answer_sequence_confidence=answer_sequence_confidence,
            answer_sequence_match_confidence=answer_sequence_match_confidence,
            has_answer_sequence_prior=has_answer_sequence_prior,
        ) or (
            bool(generated_tokens)
            and temperature < ANSWER_SEQUENCE_CREATIVE_TEMPERATURE
            and self._answer_sequence_has_continuation(
                generated_tokens,
                answer_sequence_matches,
            )
        )
        if self._should_relax_answer_sequence_memory(
            answer_sequence_matches,
            answer_sequence_prior.tolist(),
            generated_tokens=generated_tokens,
            temperature=temperature,
        ):
            answer_locked = False
        if decode_state.prompt_answer_prior is None:
            decode_state.prompt_answer_prior = self._prompt_answer_readout_prior_array(
                decode_state.answer_anchor_state,
                start=False,
            )
        prompt_answer_prior = decode_state.prompt_answer_prior
        prompt_answer_start_prior = np.zeros_like(base_probabilities)
        use_answer_start = False
        if answer_locked:
            locked_matches = self._locked_answer_sequence_matches(
                answer_sequence_matches,
                generated_tokens=generated_tokens,
                temperature=temperature,
                answer_sequence_confidence=answer_sequence_confidence,
                answer_sequence_match_confidence=answer_sequence_match_confidence,
            )
            answer_sequence_prior = np.asarray(
                self._answer_sequence_prior_from_matches(
                    locked_matches,
                    generated_tokens,
                    temperature=temperature,
                ),
                dtype=np.float64,
            )
            answer_prior = answer_sequence_prior
        elif not generated_tokens:
            if decode_state.prompt_answer_start_prior is None:
                decode_state.prompt_answer_start_prior = self._prompt_answer_readout_prior_array(
                    decode_state.answer_anchor_state,
                    start=True,
                )
            prompt_answer_start_prior = (
                decode_state.prompt_answer_start_prior
                if decode_state.prompt_answer_start_prior is not None
                else np.zeros_like(base_probabilities)
            )
            prompt_start_readout_confident = self._prompt_start_readout_is_confident(
                prompt_answer_start_prior
            )
            prompt_readout_supported = answer_memory_confident and (
                answer_sequence_match_confidence >= ANSWER_SEQUENCE_MATCH_FLOOR
                or answer_start_confidence >= ANSWER_START_CONFIDENCE_FLOOR
            )
            if prompt_start_readout_confident:
                prompt_readout_supported = True
            if not prompt_readout_supported:
                prompt_answer_prior = np.zeros_like(base_probabilities)
                prompt_answer_start_prior = np.zeros_like(base_probabilities)
            answer_start_prior = np.asarray(
                self._answer_prior_from_matches(
                    decode_state.answer_start_matches,
                    generated_tokens,
                ),
                dtype=np.float64,
            )
            if not answer_memory_confident:
                answer_start_prior = np.zeros_like(base_probabilities)
            if np.any(answer_start_prior > 0.0) or np.any(prompt_answer_start_prior > 0.0):
                start_blend = self._answer_start_blend_weights(
                    answer_sequence_match_confidence=answer_sequence_match_confidence,
                    temperature=temperature,
                )
                answer_prior = self._weighted_prior_sum_array(
                    [
                        (start_blend["prompt_answer_start"], prompt_answer_start_prior),
                        (start_blend["prompt_answer"], prompt_answer_prior),
                        (start_blend["answer_sequence"], answer_sequence_prior),
                        (start_blend["answer_start"], answer_start_prior),
                    ],
                )
                use_answer_start = True
        if answer_locked:
            answer_prior = answer_sequence_prior
        elif not use_answer_start and np.any(answer_sequence_prior > 0.0):
            sequence_weight = (
                0.10 if temperature >= ANSWER_SEQUENCE_CREATIVE_TEMPERATURE else 0.30
            )
            answer_prior = self._weighted_prior_sum_array(
                [
                    (0.55, prompt_answer_prior),
                    (sequence_weight, answer_sequence_prior),
                    (0.20, answer_prior),
                ],
            )
        elif not use_answer_start and np.any(prompt_answer_prior > 0.0):
            answer_prior = self._weighted_prior_sum_array(
                [
                    (0.65, prompt_answer_prior),
                    (0.35, answer_prior),
                ],
            )
        answer_guided = bool(answer_prior.size and float(np.max(answer_prior)) >= 0.08)
        if include_associative and not use_answer_start and not answer_guided:
            associative_prior = np.asarray(
                self._associative_prior_from_matches(
                    self._score_associative_matches(state)
                ),
                dtype=np.float64,
            )
        else:
            associative_prior = np.zeros(len(self.embedding_model.id_to_token), dtype=np.float64)
        transition_prior, transition_order = self._transition_prior_array_with_order(
            decode_state.context_tokens
        )
        copy_prior = self._copy_prior_array(decode_state.context_tokens)
        source_evidence_prior = self._source_evidence_prior_array(
            decode_state.context_tokens,
            generated_tokens,
        )
        preference_prior = self._preference_prior_array()
        probabilities, blend_weights = self._blend_probability_arrays(
            base_probabilities,
            answer_prior,
            associative_prior,
            transition_prior,
            copy_prior,
            source_evidence_prior,
            preference_prior,
            transition_order=transition_order,
            generated_count=len(generated_tokens),
            answer_locked=answer_locked,
            answer_guided_start=use_answer_start,
        )
        probabilities = self._focus_answer_start_probability_array(
            probabilities,
            answer_sequence_prior,
            generated_tokens=generated_tokens,
            answer_memory_confident=answer_memory_confident,
            has_answer_sequence_prior=has_answer_sequence_prior,
            sequence_focus_allowed=answer_sequence_match_confidence >= 0.40 or answer_locked,
            temperature=temperature,
        )
        return probabilities, blend_weights

    @staticmethod
    def _focus_answer_start_probabilities(
        probabilities: Vector,
        answer_sequence_prior: Vector,
        *,
        generated_tokens: list[str],
        answer_memory_confident: bool,
        has_answer_sequence_prior: bool,
        sequence_focus_allowed: bool | None = None,
        temperature: float = 0.0,
    ) -> Vector:
        if sequence_focus_allowed is None:
            sequence_focus_allowed = has_answer_sequence_prior
        if temperature >= ANSWER_SEQUENCE_CREATIVE_TEMPERATURE:
            return probabilities
        if (
            generated_tokens
            or not answer_memory_confident
            or not has_answer_sequence_prior
            or not sequence_focus_allowed
        ):
            return probabilities
        if not probabilities or not answer_sequence_prior:
            return probabilities
        focused = [
            probability
            if index < len(answer_sequence_prior) and answer_sequence_prior[index] > 0.0
            else probability * 0.02
            for index, probability in enumerate(probabilities)
        ]
        total = sum(focused)
        if total <= 0.0:
            return probabilities
        return [value / total for value in focused]

    @staticmethod
    def _focus_answer_start_probability_array(
        probabilities: object,
        answer_sequence_prior: object,
        *,
        generated_tokens: list[str],
        answer_memory_confident: bool,
        has_answer_sequence_prior: bool,
        sequence_focus_allowed: bool | None = None,
        temperature: float = 0.0,
    ) -> object:
        if sequence_focus_allowed is None:
            sequence_focus_allowed = has_answer_sequence_prior
        if temperature >= ANSWER_SEQUENCE_CREATIVE_TEMPERATURE:
            return probabilities
        if (
            np is None
            or generated_tokens
            or not answer_memory_confident
            or not has_answer_sequence_prior
            or not sequence_focus_allowed
        ):
            return probabilities
        values = np.asarray(probabilities, dtype=np.float64)
        prior = np.asarray(answer_sequence_prior, dtype=np.float64)
        if values.size == 0 or prior.size != values.size or not np.any(prior > 0.0):
            return probabilities
        focused = values.copy()
        focused[prior <= 0.0] *= 0.02
        total = float(focused.sum())
        if total <= 0.0:
            return probabilities
        return focused / total

    def _calibrated_softmax(
        self,
        logits: Vector,
        *,
        scale: float = READOUT_LOGIT_ZSCORE_SCALE,
    ) -> Vector:
        if np is not None:
            return self._calibrated_softmax_array(
                np.asarray(logits, dtype=np.float64),
                scale=scale,
            ).tolist()
        if not logits:
            return []
        center = mean(logits)
        variance = mean([(value - center) * (value - center) for value in logits])
        spread = variance**0.5
        if spread <= 1e-12:
            return softmax(logits)
        calibrated = [
            max(-20.0, min(20.0, ((value - center) / spread) * scale))
            for value in logits
        ]
        return softmax(calibrated)

    def _calibrated_softmax_array(
        self,
        logits: object,
        *,
        scale: float = READOUT_LOGIT_ZSCORE_SCALE,
    ) -> object:
        assert np is not None
        values = np.asarray(logits, dtype=np.float64)
        if values.size == 0:
            return values
        spread = float(values.std())
        if spread > 1e-12:
            values = ((values - float(values.mean())) / spread) * scale
            values = np.clip(values, -20.0, 20.0)
        else:
            values = values - float(values.max())
        values = values - float(values.max())
        exponentials = np.exp(values)
        total = float(exponentials.sum())
        if total <= 0.0:
            return np.full(values.shape, 1.0 / max(1, values.size), dtype=np.float64)
        return exponentials / total

    def _weighted_prior_sum(self, sources: list[tuple[float, Vector]]) -> Vector:
        assert self.embedding_model is not None
        active_sources = [
            (weight, vector)
            for weight, vector in sources
            if weight > 0.0 and any(value > 0.0 for value in vector)
        ]
        if not active_sources:
            return [0.0 for _ in self.embedding_model.id_to_token]
        total_weight = sum(weight for weight, _ in active_sources)
        merged = [0.0 for _ in self.embedding_model.id_to_token]
        for weight, vector in active_sources:
            normalized_weight = weight / total_weight
            for index, value in enumerate(vector):
                merged[index] += normalized_weight * value
        return _normalize_vector(merged)

    def _weighted_prior_sum_array(self, sources: list[tuple[float, object]]) -> object:
        assert np is not None
        assert self.embedding_model is not None
        active_sources = [
            (weight, np.asarray(vector, dtype=np.float64))
            for weight, vector in sources
            if weight > 0.0 and np.any(np.asarray(vector, dtype=np.float64) > 0.0)
        ]
        if not active_sources:
            return np.zeros(len(self.embedding_model.id_to_token), dtype=np.float64)
        total_weight = sum(weight for weight, _ in active_sources)
        merged = np.zeros_like(active_sources[0][1], dtype=np.float64)
        for weight, vector in active_sources:
            merged += (weight / total_weight) * vector
        total = float(merged.sum())
        if total > 0.0:
            merged /= total
        return merged

    def _prompt_answer_readout_prior(
        self,
        answer_anchor_state: Vector | None,
        *,
        start: bool,
    ) -> Vector:
        assert self.embedding_model is not None
        if answer_anchor_state is None:
            return [0.0 for _ in self.embedding_model.id_to_token]
        weights = self.prompt_answer_start_weights if start else self.prompt_answer_weights
        bias = self.prompt_answer_start_bias if start else self.prompt_answer_bias
        if np is not None:
            return self._prompt_answer_readout_prior_array(
                answer_anchor_state,
                start=start,
            ).tolist()
        if not weights:
            return [0.0 for _ in self.embedding_model.id_to_token]
        state = self._center_state_vector(self._masked_combined_state(answer_anchor_state))
        logits = apply_readout(weights, state)
        if bias:
            logits = [value + bias[index] for index, value in enumerate(logits)]
        return self._calibrated_softmax(
            logits,
            scale=PROMPT_READOUT_LOGIT_ZSCORE_SCALE,
        )

    def _prompt_answer_readout_prior_array(
        self,
        answer_anchor_state: Vector | None,
        *,
        start: bool,
    ) -> object:
        assert np is not None
        assert self.embedding_model is not None
        if answer_anchor_state is None:
            return np.zeros(len(self.embedding_model.id_to_token), dtype=np.float64)
        weights = (
            self.prompt_answer_start_weights_array
            if start
            else self.prompt_answer_weights_array
        )
self.prompt_answer_weights_array ) bias = self.prompt_answer_start_bias_array if start else self.prompt_answer_bias_array if weights is None: return np.zeros(len(self.embedding_model.id_to_token), dtype=np.float64) state_array = self._center_state_array( self._masked_combined_state_array(answer_anchor_state) ) logits = weights @ state_array if bias is not None and bias.shape == logits.shape: logits = logits + bias return self._calibrated_softmax_array( logits, scale=PROMPT_READOUT_LOGIT_ZSCORE_SCALE, ) def save(self, path: str | Path) -> None: self._require_fit() assert self.tokenizer is not None assert self.embedding_model is not None assert self.ternary_mask is not None assert self.readout_weights is not None assert self.associative_keys is not None assert self.associative_values is not None assert self.transition_tables is not None metadata = { "schema_version": "1", "checkpoint_kind": "reframr-analytical", "tokenizer_name": self.tokenizer.name, "config": json.dumps(self.config.to_dict(), separators=(",", ":")), "tokenizer": json.dumps(self.tokenizer.to_dict(), separators=(",", ":")), "embedding_id_to_token": json.dumps(self.embedding_model.id_to_token, separators=(",", ":")), "tokenizer_vocab_size": str(self.tokenizer.vocab_size), "transition_table_format": "tensor-v1", } self._refresh_answer_fingerprint_hashes() if np is not None: self._refresh_numeric_caches() transition_tensors = self._transition_table_tensors() tensors = { "embedding_table": self.embedding_model.embeddings, "ternary_scale": [self.ternary_scale], "ternary_mask": self.ternary_mask, "readout_weights": self.readout_weights, "readout_bias": self.readout_bias or [0.0 for _ in self.embedding_model.id_to_token], "prompt_answer_weights": self.prompt_answer_weights if self.prompt_answer_weights is not None else [], "prompt_answer_bias": self.prompt_answer_bias or [0.0 for _ in self.embedding_model.id_to_token], "prompt_answer_start_weights": self.prompt_answer_start_weights if self.prompt_answer_start_weights is not None else [], "prompt_answer_start_bias": self.prompt_answer_start_bias or [0.0 for _ in self.embedding_model.id_to_token], "trace_token_weights": self.trace_token_weights or [1.0 for _ in self.embedding_model.id_to_token], "preference_bias": self.preference_bias or [0.0 for _ in self.embedding_model.id_to_token], "state_offset": self.state_offset or [0.0 for _ in range(self._combined_state_width())], "associative_keys": self.associative_keys, "associative_key_norms": self.associative_key_norms_array if self.associative_key_norms_array is not None else self.associative_key_norms or [], "associative_values": self.associative_values, "answer_keys": self.answer_keys if self.answer_keys is not None else [], "answer_key_norms": self.answer_key_norms_array if self.answer_key_norms_array is not None else self.answer_key_norms or [], "answer_similarity_keys": self.answer_similarity_keys_array if self.answer_similarity_keys_array is not None else [], "answer_similarity_key_norms": self.answer_similarity_key_norms_array if self.answer_similarity_key_norms_array is not None else [], "answer_values": self.answer_values if self.answer_values is not None else [], "answer_start_keys": self.answer_start_keys if self.answer_start_keys is not None else [], "answer_start_key_norms": self.answer_start_key_norms_array if self.answer_start_key_norms_array is not None else self.answer_start_key_norms or [], "answer_start_similarity_keys": self.answer_start_similarity_keys_array if self.answer_start_similarity_keys_array is not None 
else [], "answer_start_similarity_key_norms": self.answer_start_similarity_key_norms_array if self.answer_start_similarity_key_norms_array is not None else [], "answer_start_values": self.answer_start_values if self.answer_start_values is not None else [], "answer_sequence_keys": self.answer_sequence_keys if self.answer_sequence_keys is not None else [], "answer_sequence_key_norms": self.answer_sequence_key_norms_array if self.answer_sequence_key_norms_array is not None else self.answer_sequence_key_norms or [], "answer_sequence_similarity_keys": self.answer_sequence_similarity_keys_array if self.answer_sequence_similarity_keys_array is not None else [], "answer_sequence_similarity_key_norms": self.answer_sequence_similarity_key_norms_array if self.answer_sequence_similarity_key_norms_array is not None else [], "answer_sequence_prompt_tokens": self.answer_sequence_prompt_tokens if self.answer_sequence_prompt_tokens is not None else [], "answer_sequence_tokens": self.answer_sequence_tokens if self.answer_sequence_tokens is not None else [], "answer_fingerprint_hashes": self._answer_fingerprint_tensor(), **transition_tensors, } write_safetensor_file(path, tensors, metadata=metadata) @classmethod def load(cls, path: str | Path) -> "ReframrModel": checkpoint_path = Path(path) checkpoint = read_safetensor_file( checkpoint_path, arrays=np is not None and checkpoint_path.stat().st_size > 10_000_000, ) metadata = checkpoint.metadata config = ReframrConfig.from_dict(json.loads(metadata["config"])) model = cls(config) model.tokenizer = NativeTokenizer.from_dict(json.loads(metadata["tokenizer"])) id_to_token = [str(token) for token in json.loads(metadata["embedding_id_to_token"])] embedding_table = checkpoint.tensors["embedding_table"] if np is not None and hasattr(embedding_table, "shape"): embeddings = embedding_table.astype(RUNTIME_ARRAY_DTYPE, copy=False) else: embeddings = [[float(value) for value in row] for row in embedding_table] model.embedding_model = EmbeddingModel( token_to_id={token: index for index, token in enumerate(id_to_token)}, id_to_token=id_to_token, embeddings=embeddings, ppmi_matrix=[], ) model.memory_units = [ AnalyticalMemoryUnit(model.config.state_dim, timescale) for timescale in model.config.timescales ] model.ternary_scale = float(checkpoint.tensors["ternary_scale"][0]) model.ternary_mask = [int(value) for value in checkpoint.tensors["ternary_mask"]] readout_tensor = checkpoint.tensors["readout_weights"] model.readout_weights = ( readout_tensor.astype(RUNTIME_ARRAY_DTYPE, copy=False) if np is not None and hasattr(readout_tensor, "shape") else [[float(value) for value in row] for row in readout_tensor] ) readout_bias_tensor = checkpoint.tensors.get("readout_bias", []) model.readout_bias = [ float(value) for value in ( readout_bias_tensor.tolist() if hasattr(readout_bias_tensor, "tolist") else readout_bias_tensor ) ] if not model.readout_bias: model.readout_bias = [0.0 for _ in id_to_token] prompt_answer_tensor = checkpoint.tensors.get("prompt_answer_weights", []) model.prompt_answer_weights = ( prompt_answer_tensor.astype(RUNTIME_ARRAY_DTYPE, copy=False) if np is not None and hasattr(prompt_answer_tensor, "shape") and len(prompt_answer_tensor.shape) == 2 else [[float(value) for value in row] for row in prompt_answer_tensor] ) prompt_answer_bias_tensor = checkpoint.tensors.get("prompt_answer_bias", []) model.prompt_answer_bias = [ float(value) for value in ( prompt_answer_bias_tensor.tolist() if hasattr(prompt_answer_bias_tensor, "tolist") else prompt_answer_bias_tensor ) ] 
if not model.prompt_answer_bias: model.prompt_answer_bias = [0.0 for _ in id_to_token] prompt_answer_start_tensor = checkpoint.tensors.get("prompt_answer_start_weights", []) model.prompt_answer_start_weights = ( prompt_answer_start_tensor.astype(RUNTIME_ARRAY_DTYPE, copy=False) if np is not None and hasattr(prompt_answer_start_tensor, "shape") and len(prompt_answer_start_tensor.shape) == 2 else [[float(value) for value in row] for row in prompt_answer_start_tensor] ) prompt_answer_start_bias_tensor = checkpoint.tensors.get("prompt_answer_start_bias", []) model.prompt_answer_start_bias = [ float(value) for value in ( prompt_answer_start_bias_tensor.tolist() if hasattr(prompt_answer_start_bias_tensor, "tolist") else prompt_answer_start_bias_tensor ) ] if not model.prompt_answer_start_bias: model.prompt_answer_start_bias = [0.0 for _ in id_to_token] trace_weight_tensor = checkpoint.tensors.get("trace_token_weights", []) model.trace_token_weights = [ float(value) for value in ( trace_weight_tensor.tolist() if hasattr(trace_weight_tensor, "tolist") else trace_weight_tensor ) ] if not model.trace_token_weights: model.trace_token_weights = [ 1.0 if token in TOOL_PROTOCOL_TOKENS else 0.0 if token in model.tokenizer.special_tokens else 1.0 for token in id_to_token ] preference_bias_tensor = checkpoint.tensors.get("preference_bias", []) model.preference_bias = [ float(value) for value in ( preference_bias_tensor.tolist() if hasattr(preference_bias_tensor, "tolist") else preference_bias_tensor ) ] if not model.preference_bias: model.preference_bias = [0.0 for _ in id_to_token] state_offset_tensor = checkpoint.tensors.get("state_offset", []) model.state_offset = [ float(value) for value in ( state_offset_tensor.tolist() if hasattr(state_offset_tensor, "tolist") else state_offset_tensor ) ] if not model.state_offset: model.state_offset = [0.0 for _ in range(model._combined_state_width())] def _runtime_vector_tensor(name: str) -> object | None: tensor = checkpoint.tensors.get(name, []) if np is not None and hasattr(tensor, "shape"): if len(tensor.shape) == 1 and int(tensor.shape[0]) > 0: return tensor.astype(RUNTIME_ARRAY_DTYPE, copy=False) return None values = tensor.tolist() if hasattr(tensor, "tolist") else tensor return [float(value) for value in values] if values else None def _runtime_matrix_tensor(name: str) -> object | None: tensor = checkpoint.tensors.get(name, []) if ( np is not None and hasattr(tensor, "shape") and len(tensor.shape) == 2 and int(tensor.shape[0]) > 0 ): return tensor.astype(RUNTIME_ARRAY_DTYPE, copy=False) return None associative_tensor = checkpoint.tensors.get("associative_keys", []) model.associative_keys = ( associative_tensor.astype(RUNTIME_ARRAY_DTYPE, copy=False) if np is not None and hasattr(associative_tensor, "shape") else [[float(value) for value in row] for row in associative_tensor] ) cached_associative_key_norms = _runtime_vector_tensor("associative_key_norms") if cached_associative_key_norms is not None: model.associative_key_norms = cached_associative_key_norms elif np is not None and hasattr(model.associative_keys, "shape"): model.associative_key_norms = None else: model.associative_key_norms = [norm(key) for key in model.associative_keys] raw_associative_values = checkpoint.tensors.get("associative_values", []) model.associative_values = [ int(value) for value in ( raw_associative_values.tolist() if hasattr(raw_associative_values, "tolist") else raw_associative_values ) ] answer_tensor = checkpoint.tensors.get("answer_keys", []) if np is not None and 
hasattr(answer_tensor, "shape"): model.answer_keys = ( answer_tensor.astype(RUNTIME_ARRAY_DTYPE, copy=False) if len(answer_tensor.shape) == 2 else [] ) else: model.answer_keys = [[float(value) for value in row] for row in answer_tensor] if ( np is not None and hasattr(model.answer_keys, "shape") and len(model.answer_keys.shape) == 2 ): model.answer_key_norms = _runtime_vector_tensor("answer_key_norms") else: model.answer_key_norms = ( _runtime_vector_tensor("answer_key_norms") or [norm(key) for key in model.answer_keys] ) raw_answer_values = checkpoint.tensors.get("answer_values", []) model.answer_values = [ int(value) for value in ( raw_answer_values.tolist() if hasattr(raw_answer_values, "tolist") else raw_answer_values ) ] answer_start_tensor = checkpoint.tensors.get("answer_start_keys", []) if np is not None and hasattr(answer_start_tensor, "shape"): model.answer_start_keys = ( answer_start_tensor.astype(RUNTIME_ARRAY_DTYPE, copy=False) if len(answer_start_tensor.shape) == 2 else [] ) else: model.answer_start_keys = [ [float(value) for value in row] for row in answer_start_tensor ] if ( np is not None and hasattr(model.answer_start_keys, "shape") and len(model.answer_start_keys.shape) == 2 ): model.answer_start_key_norms = _runtime_vector_tensor("answer_start_key_norms") else: model.answer_start_key_norms = ( _runtime_vector_tensor("answer_start_key_norms") or [norm(key) for key in model.answer_start_keys] ) raw_answer_start_values = checkpoint.tensors.get("answer_start_values", []) model.answer_start_values = [ int(value) for value in ( raw_answer_start_values.tolist() if hasattr(raw_answer_start_values, "tolist") else raw_answer_start_values ) ] answer_sequence_tensor = checkpoint.tensors.get("answer_sequence_keys", []) if np is not None and hasattr(answer_sequence_tensor, "shape"): model.answer_sequence_keys = ( answer_sequence_tensor.astype(RUNTIME_ARRAY_DTYPE, copy=False) if len(answer_sequence_tensor.shape) == 2 else [] ) else: model.answer_sequence_keys = [ [float(value) for value in row] for row in answer_sequence_tensor ] if ( np is not None and hasattr(model.answer_sequence_keys, "shape") and len(model.answer_sequence_keys.shape) == 2 ): model.answer_sequence_key_norms = _runtime_vector_tensor("answer_sequence_key_norms") else: model.answer_sequence_key_norms = ( _runtime_vector_tensor("answer_sequence_key_norms") or [norm(key) for key in model.answer_sequence_keys] ) raw_answer_sequence_prompt_tokens = checkpoint.tensors.get("answer_sequence_prompt_tokens", []) if np is not None and hasattr(raw_answer_sequence_prompt_tokens, "shape"): model.answer_sequence_prompt_tokens = raw_answer_sequence_prompt_tokens.astype(int, copy=False) else: model.answer_sequence_prompt_tokens = [ [int(value) for value in row] for row in raw_answer_sequence_prompt_tokens ] raw_answer_sequence_tokens = checkpoint.tensors.get("answer_sequence_tokens", []) if np is not None and hasattr(raw_answer_sequence_tokens, "shape"): model.answer_sequence_tokens = raw_answer_sequence_tokens.astype(int, copy=False) else: model.answer_sequence_tokens = [ [int(value) for value in row] for row in raw_answer_sequence_tokens ] model.answer_sequence_token_id_rows = None raw_fingerprints = checkpoint.tensors.get("answer_fingerprint_hashes", []) model.answer_fingerprint_hashes = model._coerce_answer_fingerprint_hashes( raw_fingerprints ) model.answer_fingerprint_token_lengths = None model.answer_fingerprint_token_sequences_by_length = None if not model.answer_fingerprint_hashes: model._refresh_answer_fingerprint_hashes() 
model.answer_similarity_keys_array = _runtime_matrix_tensor("answer_similarity_keys") model.answer_similarity_key_norms_array = _runtime_vector_tensor("answer_similarity_key_norms") model.answer_start_similarity_keys_array = _runtime_matrix_tensor("answer_start_similarity_keys") model.answer_start_similarity_key_norms_array = _runtime_vector_tensor("answer_start_similarity_key_norms") model.answer_sequence_similarity_keys_array = _runtime_matrix_tensor("answer_sequence_similarity_keys") model.answer_sequence_similarity_key_norms_array = _runtime_vector_tensor("answer_sequence_similarity_key_norms") model.transition_id_tables = model._deserialize_transition_id_tables_from_tensors( checkpoint.tensors ) if model.transition_id_tables is not None: model.transition_tables = {order: {} for order in sorted(TRANSITION_ORDERS)} else: model.transition_tables = model._deserialize_transition_tables( json.loads(metadata.get("transition_tables", "{}")) ) model._refresh_numeric_caches() return model def _collect_training_examples( self, tokens: list[str], ) -> tuple[list[Vector], list[Vector], list[int]]: assert self.embedding_model is not None if np is not None: hidden_states = [ np.zeros(self.config.state_dim, dtype=np.float64) for _ in self.config.timescales ] context_traces = [ np.zeros(self.config.embedding_dim, dtype=np.float64) for _ in self.config.timescales ] zero_embedding: Vector | object = np.zeros(self.config.embedding_dim, dtype=np.float64) else: hidden_states = [zeros_vector(self.config.state_dim) for _ in self.config.timescales] context_traces = [zeros_vector(self.config.embedding_dim) for _ in self.config.timescales] zero_embedding = zeros_vector(self.config.embedding_dim) states: list[Vector] = [] labels: list[Vector] = [] label_ids: list[int] = [] token_ids = [ self.embedding_model.token_to_id.get(token, -1) for token in tokens ] example_count = max(0, len(tokens) - 1) stride = 1 if self.config.max_training_examples and example_count > self.config.max_training_examples: stride = max( 1, (example_count + self.config.max_training_examples - 1) // self.config.max_training_examples, ) for index in range(len(tokens) - 1): token = tokens[index] token_id = token_ids[index] embedding = ( self.embedding_model.embeddings[token_id] if token_id >= 0 else zero_embedding ) trace_embedding = self._trace_embedding_from_token_id(embedding, token_id) hidden_states, context_traces, combined_state = self._step_hidden_states_from_embedding( hidden_states, context_traces, embedding, trace_embedding=trace_embedding, ) if stride > 1 and index % stride != 0 and index != len(tokens) - 2: continue states.append(combined_state) next_token_id = token_ids[index + 1] labels.append(self._one_hot_from_id(next_token_id)) label_ids.append(next_token_id) if self.config.max_training_examples and len(states) > self.config.max_training_examples: states = states[: self.config.max_training_examples] labels = labels[: self.config.max_training_examples] label_ids = label_ids[: self.config.max_training_examples] return states, labels, label_ids def _is_punctuation_piece(self, piece: str) -> bool: return bool(piece) and all(character in string.punctuation for character in piece) def _encode_context(self, tokens: list[str]) -> Vector: return self._masked_decode_state(self._build_decode_state(tokens)) def _build_decode_state(self, tokens: list[str]) -> DecodeState: assert self.memory_units is not None state = DecodeState( hidden_states=( [ np.zeros(self.config.state_dim, dtype=np.float64) for _ in self.config.timescales ] if np is 
not None else [zeros_vector(self.config.state_dim) for _ in self.config.timescales] ), context_traces=( [ np.zeros(self.config.embedding_dim, dtype=np.float64) for _ in self.config.timescales ] if np is not None else [zeros_vector(self.config.embedding_dim) for _ in self.config.timescales] ), combined_state=self._zero_combined_state(), context_tokens=[], ) for token in tokens: self._advance_decode_state(state, token) self._apply_sparse_context_anchor(state) return state def _advance_decode_state(self, state: DecodeState, token: str) -> DecodeState: next_hidden_states, next_context_traces, combined_state = self._step_hidden_states( state.hidden_states, state.context_traces, token, ) state.hidden_states = next_hidden_states state.context_traces = next_context_traces state.combined_state = combined_state state.context_tokens.append(token) if token == "": state.answer_anchor_state = combined_state.copy() if hasattr(combined_state, "copy") else combined_state[:] state.answer_matches = None state.answer_start_matches = None state.answer_sequence_matches = None state.prompt_answer_prior = None state.prompt_answer_start_prior = None return state def _apply_sparse_context_anchor(self, state: DecodeState) -> None: if ( np is None or self.embedding_model is None or state.answer_anchor_state is None or not state.context_tokens ): return answer_index = _last_index(state.context_tokens, "") if answer_index is None or answer_index <= 0: return context_ids = self._long_context_sparse_token_ids(state.context_tokens[:answer_index]) if len(context_ids) < SPARSE_CONTEXT_MIN_TOKENS: return query_id = context_ids[-1] embeddings = np.asarray(self.embedding_model.embeddings, dtype=np.float32) if embeddings.ndim != 2 or embeddings.shape[0] == 0: return selector = HashedSparseAttention( embeddings, k_neighbors=min(SPARSE_CONTEXT_TOP_K, len(context_ids)), hash_bits=SPARSE_CONTEXT_HASH_BITS, probe_radius=SPARSE_CONTEXT_PROBE_RADIUS, candidate_multiplier=SPARSE_CONTEXT_CANDIDATE_MULTIPLIER, ) token_ids = np.asarray(context_ids, dtype=np.int64) selector.build_context_index(token_ids) selection = selector.select_positions_cached(query_id) if not selection.positions: return selected_ids = token_ids[np.asarray(selection.positions, dtype=np.int64)] selected_embeddings = embeddings[selected_ids] scores = np.asarray(selection.scores, dtype=np.float32) scores -= float(scores.max()) weights = np.exp(scores) weights /= max(float(weights.sum()), 1e-8) sparse_embedding = weights @ selected_embeddings blended_anchor = self._blend_sparse_embedding_into_combined_state( state.answer_anchor_state, sparse_embedding, state_dim=self.config.state_dim, embedding_dim=self.config.embedding_dim, timescale_count=len(self.config.timescales), blend=SPARSE_CONTEXT_TRACE_BLEND, ) state.answer_anchor_state = blended_anchor if state.context_tokens and state.context_tokens[-1] == "": state.combined_state = blended_anchor.copy() state.answer_matches = None state.answer_start_matches = None state.answer_sequence_matches = None state.prompt_answer_prior = None state.prompt_answer_start_prior = None def _long_context_sparse_token_ids(self, tokens: Sequence[str]) -> list[int]: assert self.embedding_model is not None special_tokens = self.tokenizer.special_tokens if self.tokenizer is not None else set() ids: list[int] = [] for token in tokens: if token in special_tokens and token not in TOOL_PROTOCOL_TOKENS: continue token_id = self._token_id_for_token(token) if token_id >= 0: ids.append(token_id) return ids @staticmethod def 
_blend_sparse_embedding_into_combined_state( combined_state: Vector, sparse_embedding: object, *, state_dim: int, embedding_dim: int, timescale_count: int, blend: float, ) -> Vector: if np is None: return combined_state state_array = np.asarray(combined_state, dtype=np.float32).copy() sparse_array = np.asarray(sparse_embedding, dtype=np.float32) if sparse_array.shape[0] != embedding_dim: return combined_state block_width = state_dim + embedding_dim expected_width = block_width * timescale_count if state_array.shape[0] != expected_width: return combined_state alpha = min(1.0, max(0.0, float(blend))) for block_index in range(timescale_count): trace_start = block_index * block_width + state_dim trace_end = trace_start + embedding_dim state_array[trace_start:trace_end] = ( (1.0 - alpha) * state_array[trace_start:trace_end] + alpha * sparse_array ) return state_array.tolist() def _masked_decode_state(self, state: DecodeState) -> Vector: assert self.ternary_mask is not None return apply_ternary_mask(state.combined_state, self.ternary_mask, self.ternary_scale) def _masked_combined_state(self, combined_state: Vector) -> Vector: assert self.ternary_mask is not None return apply_ternary_mask(combined_state, self.ternary_mask, self.ternary_scale) def _masked_decode_state_array(self, state: DecodeState) -> object: assert np is not None if self.ternary_mask_array is None: return np.asarray(self._masked_decode_state(state), dtype=RUNTIME_ARRAY_DTYPE) return ( np.asarray(state.combined_state, dtype=RUNTIME_ARRAY_DTYPE) * self.ternary_scale * self.ternary_mask_array ) def _masked_combined_state_array(self, combined_state: Vector) -> object: assert np is not None if self.ternary_mask_array is None: return np.asarray(self._masked_combined_state(combined_state), dtype=RUNTIME_ARRAY_DTYPE) return ( np.asarray(combined_state, dtype=RUNTIME_ARRAY_DTYPE) * self.ternary_scale * self.ternary_mask_array ) def _center_state_vector(self, state: Vector) -> Vector: if not self.state_offset or len(self.state_offset) != len(state): return state return [value - self.state_offset[index] for index, value in enumerate(state)] def _center_state_array(self, state: object) -> object: assert np is not None state_array = np.asarray(state, dtype=RUNTIME_ARRAY_DTYPE) if self.state_offset_array is None or self.state_offset_array.shape != state_array.shape: return state_array return state_array - self.state_offset_array def _zero_combined_state(self) -> Vector: return [0.0 for _ in range(self._combined_state_width())] def _combined_state_width(self) -> int: return (self.config.state_dim + self.config.embedding_dim) * len(self.config.timescales) def _derive_trace_token_weights_from_counts(self, token_counts: dict[str, float]) -> Vector: assert self.embedding_model is not None assert self.tokenizer is not None counts = [ float(token_counts.get(token, 0.0)) for token in self.embedding_model.id_to_token ] positive_counts = sorted(value for value in counts if value > 0.0) reference = ( positive_counts[len(positive_counts) // 2] if positive_counts else 1.0 ) weights: Vector = [] for token, count in zip(self.embedding_model.id_to_token, counts): if token in TOOL_PROTOCOL_TOKENS: weights.append(1.0) elif token in self.tokenizer.special_tokens: weights.append(0.0) elif count <= 0.0: weights.append(1.0) else: weight = (reference / count) ** 0.75 weights.append(max(0.08, min(4.8, weight))) return weights def _token_id_for_token(self, token: str) -> int: assert self.embedding_model is not None token_id = 
self.embedding_model.token_to_id.get(token) if token_id is None and token.lower() != token: token_id = self.embedding_model.token_to_id.get(token.lower()) return int(token_id) if token_id is not None else -1 def _trace_embedding_from_token_id( self, embedding: Vector | object, token_id: int, ) -> Vector | object: if token_id < 0: return embedding if self.trace_embedding_table_array is not None: return self.trace_embedding_table_array[token_id] weight = self.trace_token_weights[token_id] if self.trace_token_weights is not None else 1.0 dimension = self.config.embedding_dim if hasattr(embedding, "shape"): trace_embedding = embedding * weight for bucket_multiplier, bucket_offset, sign_multiplier, sign_offset in TRACE_IDENTITY_HASHES: bucket = (token_id * bucket_multiplier + bucket_offset) % dimension sign = 1.0 if ((token_id * sign_multiplier + sign_offset) & 1) == 0 else -1.0 trace_embedding[bucket] += weight * TRACE_IDENTITY_SCALE * sign return trace_embedding trace_values = [float(value) * weight for value in embedding] for bucket_multiplier, bucket_offset, sign_multiplier, sign_offset in TRACE_IDENTITY_HASHES: bucket = (token_id * bucket_multiplier + bucket_offset) % dimension sign = 1.0 if ((token_id * sign_multiplier + sign_offset) & 1) == 0 else -1.0 trace_values[bucket] += weight * TRACE_IDENTITY_SCALE * sign return trace_values def _build_trace_embedding_table_array(self, embedding_array: object) -> object | None: if np is None or self.trace_token_weights is None: return None values = np.asarray(embedding_array, dtype=np.float64) if values.size == 0 or len(values.shape) != 2: return None weights = np.asarray(self.trace_token_weights, dtype=np.float64) if weights.shape[0] != values.shape[0]: return None trace_values = values * weights[:, None] if values.shape[1] <= 0: return trace_values token_ids = np.arange(values.shape[0], dtype=np.int64) for bucket_multiplier, bucket_offset, sign_multiplier, sign_offset in TRACE_IDENTITY_HASHES: buckets = ((token_ids * bucket_multiplier + bucket_offset) % values.shape[1]).astype( np.int64, copy=False, ) signs = np.where( ((token_ids * sign_multiplier + sign_offset) & 1) == 0, 1.0, -1.0, ) np.add.at(trace_values, (token_ids, buckets), weights * TRACE_IDENTITY_SCALE * signs) return trace_values def _runtime_key_norms_array( self, key_array: object | None, key_norms: list[float] | None, ) -> object | None: assert np is not None if key_norms is not None and len(key_norms) > 0: return np.asarray(key_norms, dtype=RUNTIME_ARRAY_DTYPE) if key_array is None: return None keys = np.asarray(key_array, dtype=RUNTIME_ARRAY_DTYPE) if len(keys.shape) != 2 or keys.shape[0] == 0: return None return np.linalg.norm(keys, axis=1).astype(RUNTIME_ARRAY_DTYPE, copy=False) def _runtime_vector_cache(self, cached: object | None, length: int) -> object | None: assert np is not None if cached is None or not hasattr(cached, "shape"): return None array = np.asarray(cached, dtype=RUNTIME_ARRAY_DTYPE) if len(array.shape) != 1 or int(array.shape[0]) != int(length): return None return array def _runtime_matrix_cache( self, cached: object | None, rows: int, width: int, ) -> object | None: assert np is not None if cached is None or not hasattr(cached, "shape"): return None array = np.asarray(cached, dtype=RUNTIME_ARRAY_DTYPE) if ( len(array.shape) != 2 or int(array.shape[0]) != int(rows) or int(array.shape[1]) != int(width) ): return None return array def _refresh_numeric_caches(self) -> None: if np is None: self.ternary_mask_array = None self.readout_weights_array = None 
self.readout_bias_array = None self.prompt_answer_weights_array = None self.prompt_answer_bias_array = None self.prompt_answer_start_weights_array = None self.prompt_answer_start_bias_array = None self.trace_token_weights_array = None self.trace_embedding_table_array = None self.preference_bias_array = None self.preference_valid_mask_array = None self.state_offset_array = None self.associative_keys_array = None self.associative_key_norms_array = None self.associative_values_array = None self.associative_valid_mask_array = None self.answer_keys_array = None self.answer_key_norms_array = None self.answer_similarity_keys_array = None self.answer_similarity_key_norms_array = None self.answer_similarity_mask_array = None self.answer_values_array = None self.answer_valid_mask_array = None self.answer_start_keys_array = None self.answer_start_key_norms_array = None self.answer_start_similarity_keys_array = None self.answer_start_similarity_key_norms_array = None self.answer_start_values_array = None self.answer_start_valid_mask_array = None self.answer_sequence_keys_array = None self.answer_sequence_key_norms_array = None self.answer_sequence_similarity_keys_array = None self.answer_sequence_similarity_key_norms_array = None self.answer_sequence_prompt_tokens_array = None self.answer_sequence_tokens_array = None self.answer_sequence_prompt_weight_maps = None self.answer_sequence_prompt_weight_norms = None self.answer_sequence_prompt_bigram_sets = None self.answer_sequence_prompt_trigram_sets = None self.answer_sequence_prompt_number_sets = None self.answer_sequence_prompt_inverted_index = None self._refresh_answer_sequence_prompt_overlap_cache() self.prompt_overlap_valid_token_mask_array = None return cached_associative_key_norms_array = self.associative_key_norms_array cached_answer_key_norms_array = self.answer_key_norms_array cached_answer_similarity_keys_array = self.answer_similarity_keys_array cached_answer_similarity_key_norms_array = self.answer_similarity_key_norms_array cached_answer_start_key_norms_array = self.answer_start_key_norms_array cached_answer_start_similarity_keys_array = self.answer_start_similarity_keys_array cached_answer_start_similarity_key_norms_array = self.answer_start_similarity_key_norms_array cached_answer_sequence_key_norms_array = self.answer_sequence_key_norms_array cached_answer_sequence_similarity_keys_array = self.answer_sequence_similarity_keys_array cached_answer_sequence_similarity_key_norms_array = self.answer_sequence_similarity_key_norms_array self.ternary_mask_array = ( np.asarray(self.ternary_mask, dtype=RUNTIME_ARRAY_DTYPE) if self.ternary_mask is not None else None ) self.readout_weights_array = ( np.asarray(self.readout_weights, dtype=RUNTIME_ARRAY_DTYPE) if self.readout_weights is not None else None ) self.readout_bias_array = ( np.asarray(self.readout_bias, dtype=RUNTIME_ARRAY_DTYPE) if self.readout_bias is not None else None ) self.prompt_answer_weights_array = ( np.asarray(self.prompt_answer_weights, dtype=RUNTIME_ARRAY_DTYPE) if self.prompt_answer_weights is not None and len(self.prompt_answer_weights) > 0 else None ) self.prompt_answer_bias_array = ( np.asarray(self.prompt_answer_bias, dtype=RUNTIME_ARRAY_DTYPE) if self.prompt_answer_bias is not None else None ) self.prompt_answer_start_weights_array = ( np.asarray(self.prompt_answer_start_weights, dtype=RUNTIME_ARRAY_DTYPE) if self.prompt_answer_start_weights is not None and len(self.prompt_answer_start_weights) > 0 else None ) self.prompt_answer_start_bias_array = ( 
np.asarray(self.prompt_answer_start_bias, dtype=RUNTIME_ARRAY_DTYPE) if self.prompt_answer_start_bias is not None else None ) self.trace_token_weights_array = ( np.asarray(self.trace_token_weights, dtype=RUNTIME_ARRAY_DTYPE) if self.trace_token_weights is not None else None ) trace_embedding_table = ( self._build_trace_embedding_table_array(self.embedding_model.embeddings) if self.embedding_model is not None and self.trace_token_weights is not None else None ) self.trace_embedding_table_array = ( trace_embedding_table.astype(RUNTIME_ARRAY_DTYPE, copy=False) if trace_embedding_table is not None else None ) self.preference_bias_array = ( np.asarray(self.preference_bias, dtype=RUNTIME_ARRAY_DTYPE) if self.preference_bias is not None else None ) self.preference_valid_mask_array = ( np.asarray( [ self._eligible_preference_token(token) for token in self.embedding_model.id_to_token ], dtype=bool, ) if self.embedding_model is not None and self.tokenizer is not None else None ) self.state_offset_array = ( np.asarray(self.state_offset, dtype=RUNTIME_ARRAY_DTYPE) if self.state_offset is not None else None ) self.associative_keys_array = ( np.asarray(self.associative_keys, dtype=RUNTIME_ARRAY_DTYPE) if self.associative_keys is not None and len(self.associative_keys) > 0 else None ) associative_key_norms_cache = ( self._runtime_vector_cache( cached_associative_key_norms_array, int(self.associative_keys_array.shape[0]), ) if self.associative_keys_array is not None else None ) self.associative_key_norms_array = ( associative_key_norms_cache if associative_key_norms_cache is not None else self._runtime_key_norms_array( self.associative_keys_array, self.associative_key_norms, ) ) self.associative_values_array = ( np.asarray(self.associative_values, dtype=np.int64) if self.associative_values is not None and len(self.associative_values) > 0 else None ) self.associative_valid_mask_array = ( self.associative_values_array >= 0 if self.associative_values_array is not None else None ) self.answer_keys_array = ( np.asarray(self.answer_keys, dtype=RUNTIME_ARRAY_DTYPE) if self.answer_keys is not None and len(self.answer_keys) > 0 else None ) answer_key_norms_cache = ( self._runtime_vector_cache( cached_answer_key_norms_array, int(self.answer_keys_array.shape[0]), ) if self.answer_keys_array is not None else None ) self.answer_key_norms_array = ( answer_key_norms_cache if answer_key_norms_cache is not None else self._runtime_key_norms_array( self.answer_keys_array, self.answer_key_norms, ) ) self.answer_similarity_keys_array = None self.answer_similarity_key_norms_array = None self.answer_similarity_mask_array = None if self.answer_keys_array is not None and len(self.answer_keys_array.shape) == 2: width = int(self.answer_keys_array.shape[1]) block_width = self.config.state_dim + self.config.embedding_dim expected_width = block_width * len(self.config.timescales) if block_width > 0 and width == expected_width: mask = np.zeros(width, dtype=RUNTIME_ARRAY_DTYPE) for scale_index in range(len(self.config.timescales)): start = scale_index * block_width + self.config.state_dim end = start + self.config.embedding_dim mask[start:end] = 1.0 self.answer_similarity_mask_array = mask answer_similarity_keys_cache = self._runtime_matrix_cache( cached_answer_similarity_keys_array, int(self.answer_keys_array.shape[0]), width, ) answer_similarity_key_norms_cache = self._runtime_vector_cache( cached_answer_similarity_key_norms_array, int(self.answer_keys_array.shape[0]), ) if ( answer_similarity_keys_cache is not None and 
answer_similarity_key_norms_cache is not None ): self.answer_similarity_keys_array = answer_similarity_keys_cache self.answer_similarity_key_norms_array = answer_similarity_key_norms_cache else: self.answer_similarity_keys_array = self.answer_keys_array * mask[None, :] self.answer_similarity_key_norms_array = np.linalg.norm( self.answer_similarity_keys_array, axis=1, ).astype(RUNTIME_ARRAY_DTYPE, copy=False) self.answer_values_array = ( np.asarray(self.answer_values, dtype=np.int64) if self.answer_values is not None and len(self.answer_values) > 0 else None ) self.answer_valid_mask_array = ( self.answer_values_array >= 0 if self.answer_values_array is not None else None ) self.answer_start_keys_array = ( np.asarray(self.answer_start_keys, dtype=RUNTIME_ARRAY_DTYPE) if self.answer_start_keys is not None and len(self.answer_start_keys) > 0 else None ) answer_start_key_norms_cache = ( self._runtime_vector_cache( cached_answer_start_key_norms_array, int(self.answer_start_keys_array.shape[0]), ) if self.answer_start_keys_array is not None else None ) self.answer_start_key_norms_array = ( answer_start_key_norms_cache if answer_start_key_norms_cache is not None else self._runtime_key_norms_array( self.answer_start_keys_array, self.answer_start_key_norms, ) ) self.answer_start_similarity_keys_array = None self.answer_start_similarity_key_norms_array = None if ( self.answer_start_keys_array is not None and len(self.answer_start_keys_array.shape) == 2 and self.answer_similarity_mask_array is not None and int(self.answer_start_keys_array.shape[1]) == int(self.answer_similarity_mask_array.shape[0]) ): answer_start_similarity_keys_cache = self._runtime_matrix_cache( cached_answer_start_similarity_keys_array, int(self.answer_start_keys_array.shape[0]), int(self.answer_start_keys_array.shape[1]), ) answer_start_similarity_key_norms_cache = self._runtime_vector_cache( cached_answer_start_similarity_key_norms_array, int(self.answer_start_keys_array.shape[0]), ) if ( answer_start_similarity_keys_cache is not None and answer_start_similarity_key_norms_cache is not None ): self.answer_start_similarity_keys_array = answer_start_similarity_keys_cache self.answer_start_similarity_key_norms_array = answer_start_similarity_key_norms_cache else: self.answer_start_similarity_keys_array = ( self.answer_start_keys_array * self.answer_similarity_mask_array[None, :] ) self.answer_start_similarity_key_norms_array = np.linalg.norm( self.answer_start_similarity_keys_array, axis=1, ).astype(RUNTIME_ARRAY_DTYPE, copy=False) self.answer_start_values_array = ( np.asarray(self.answer_start_values, dtype=np.int64) if self.answer_start_values is not None and len(self.answer_start_values) > 0 else None ) self.answer_start_valid_mask_array = ( self.answer_start_values_array >= 0 if self.answer_start_values_array is not None else None ) self.answer_sequence_keys_array = ( np.asarray(self.answer_sequence_keys, dtype=RUNTIME_ARRAY_DTYPE) if self.answer_sequence_keys is not None and len(self.answer_sequence_keys) > 0 else None ) answer_sequence_key_norms_cache = ( self._runtime_vector_cache( cached_answer_sequence_key_norms_array, int(self.answer_sequence_keys_array.shape[0]), ) if self.answer_sequence_keys_array is not None else None ) self.answer_sequence_key_norms_array = ( answer_sequence_key_norms_cache if answer_sequence_key_norms_cache is not None else self._runtime_key_norms_array( self.answer_sequence_keys_array, self.answer_sequence_key_norms, ) ) self.answer_sequence_similarity_keys_array = None 
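        # Like the answer and answer-start keys above, answer-sequence keys are
        # masked by answer_similarity_mask_array: ones over the embedding_dim
        # trace slice of each (state_dim + embedding_dim) timescale block, zeros
        # over the hidden-state slice, so similarity scoring compares context
        # traces only and ignores the raw reservoir state.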
        self.answer_sequence_similarity_key_norms_array = None
        if (
            self.answer_sequence_keys_array is not None
            and len(self.answer_sequence_keys_array.shape) == 2
            and self.answer_similarity_mask_array is not None
            and int(self.answer_sequence_keys_array.shape[1])
            == int(self.answer_similarity_mask_array.shape[0])
        ):
            answer_sequence_similarity_keys_cache = self._runtime_matrix_cache(
                cached_answer_sequence_similarity_keys_array,
                int(self.answer_sequence_keys_array.shape[0]),
                int(self.answer_sequence_keys_array.shape[1]),
            )
            answer_sequence_similarity_key_norms_cache = self._runtime_vector_cache(
                cached_answer_sequence_similarity_key_norms_array,
                int(self.answer_sequence_keys_array.shape[0]),
            )
            if (
                answer_sequence_similarity_keys_cache is not None
                and answer_sequence_similarity_key_norms_cache is not None
            ):
                self.answer_sequence_similarity_keys_array = answer_sequence_similarity_keys_cache
                self.answer_sequence_similarity_key_norms_array = (
                    answer_sequence_similarity_key_norms_cache
                )
            else:
                self.answer_sequence_similarity_keys_array = (
                    self.answer_sequence_keys_array * self.answer_similarity_mask_array[None, :]
                )
                self.answer_sequence_similarity_key_norms_array = np.linalg.norm(
                    self.answer_sequence_similarity_keys_array,
                    axis=1,
                ).astype(RUNTIME_ARRAY_DTYPE, copy=False)
        self.answer_sequence_tokens_array = (
            np.asarray(self.answer_sequence_tokens, dtype=np.int64)
            if self.answer_sequence_tokens is not None and len(self.answer_sequence_tokens) > 0
            else None
        )
        self.answer_sequence_prompt_tokens_array = (
            np.asarray(self.answer_sequence_prompt_tokens, dtype=np.int64)
            if self.answer_sequence_prompt_tokens is not None
            and len(self.answer_sequence_prompt_tokens) > 0
            else None
        )
        self.prompt_overlap_valid_token_mask_array = None
        # The refresh handles the deferred case itself: when the prompt-token row
        # count exceeds ANSWER_SEQUENCE_EAGER_OVERLAP_CACHE_LIMIT it builds only
        # the inverted index and returns early, so a single unconditional call
        # suffices here.
        self._refresh_answer_sequence_prompt_overlap_cache()

    def _defer_answer_sequence_prompt_overlap_cache(self) -> bool:
        if self.answer_sequence_prompt_tokens is None:
            return False
        try:
            row_count = len(self.answer_sequence_prompt_tokens)
        except TypeError:
            return False
        return (
            row_count > ANSWER_SEQUENCE_EAGER_OVERLAP_CACHE_LIMIT
            and np is not None
            and self.answer_sequence_prompt_tokens_array is not None
        )

    def _prompt_overlap_valid_token_mask(self) -> object | None:
        if np is None or self.embedding_model is None:
            return None
        if (
            self.prompt_overlap_valid_token_mask_array is not None
            and int(self.prompt_overlap_valid_token_mask_array.shape[0])
            == len(self.embedding_model.id_to_token)
        ):
            return self.prompt_overlap_valid_token_mask_array
        mask = np.fromiter(
            (
                not self._should_skip_prompt_overlap_token(token)
                for token in self.embedding_model.id_to_token
            ),
            dtype=bool,
            count=len(self.embedding_model.id_to_token),
        )
        self.prompt_overlap_valid_token_mask_array = mask
        return mask

    def _answer_prompt_row_ids_from_array(
        self,
    ) -> tuple[dict[int, list[int]], list[list[int]] | None] | None:
        if (
            np is None
            or self.answer_sequence_prompt_tokens_array is None
            or self.trace_token_weights is None
            or self.embedding_model is None
        ):
            return None
        rows = np.asarray(self.answer_sequence_prompt_tokens_array, dtype=np.int64)
        if len(rows.shape) != 2 or rows.size == 0:
            return {}, ([] if rows.shape[0] <= ANSWER_SEQUENCE_EAGER_OVERLAP_CACHE_LIMIT else None)
        vocab_size = len(self.trace_token_weights)
        if vocab_size <= 0:
            return {}, ([] if rows.shape[0] <= ANSWER_SEQUENCE_EAGER_OVERLAP_CACHE_LIMIT else None)
        valid_token_mask = self._prompt_overlap_valid_token_mask()
        if valid_token_mask is None:
            return None
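        # What follows is a vectorized inverted-index build: lexsort the
        # (token, row) pairs by token id, drop duplicate pairs, then split at
        # token boundaries so each group holds the rows containing that token.
        # A small worked example (toy values, not real vocabulary ids):
        #
        #     rows = [[3, 5], [5, 9]]  ->  pairs (3,r0), (5,r0), (5,r1), (9,r1)
        #     inverted == {3: [0], 5: [0, 1], 9: [1]}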
bounded = (rows >= 0) & (rows < vocab_size) clipped = np.clip(rows, 0, max(0, vocab_size - 1)) bounded &= valid_token_mask[clipped] row_positions, column_positions = np.nonzero(bounded) if row_positions.size == 0: empty_rows = [[] for _ in range(int(rows.shape[0]))] if rows.shape[0] <= ANSWER_SEQUENCE_EAGER_OVERLAP_CACHE_LIMIT else None return {}, empty_rows token_values = rows[row_positions, column_positions].astype(np.int64, copy=False) order = np.lexsort((row_positions, token_values)) token_values = token_values[order] row_positions = row_positions[order] unique = np.ones(token_values.shape[0], dtype=bool) unique[1:] = (token_values[1:] != token_values[:-1]) | (row_positions[1:] != row_positions[:-1]) token_values = token_values[unique] row_positions = row_positions[unique] boundaries = np.flatnonzero(token_values[1:] != token_values[:-1]) + 1 token_groups = np.split(token_values, boundaries) row_groups = np.split(row_positions, boundaries) inverted = { int(token_group[0]): row_group.astype(np.int64, copy=False).tolist() for token_group, row_group in zip(token_groups, row_groups) if token_group.size } if rows.shape[0] > ANSWER_SEQUENCE_EAGER_OVERLAP_CACHE_LIMIT: return inverted, None row_id_lists: list[list[int]] = [[] for _ in range(int(rows.shape[0]))] for token_id, row_index in zip(token_values.tolist(), row_positions.tolist()): row_id_lists[int(row_index)].append(int(token_id)) return inverted, row_id_lists def _refresh_answer_sequence_prompt_overlap_cache(self) -> None: self.answer_sequence_prompt_weight_maps = None self.answer_sequence_prompt_weight_norms = None self.answer_sequence_prompt_bigram_sets = None self.answer_sequence_prompt_trigram_sets = None self.answer_sequence_prompt_number_sets = None self.answer_sequence_prompt_inverted_index = None self.answer_sequence_prompt_specificity = None if self.answer_sequence_prompt_tokens is None or self.trace_token_weights is None: return array_index = self._answer_prompt_row_ids_from_array() if array_index is not None: inverted, row_id_lists = array_index total_rows = ( int(self.answer_sequence_prompt_tokens_array.shape[0]) if self.answer_sequence_prompt_tokens_array is not None else len(row_id_lists or []) ) else: inverted = {} row_id_lists = [] for row in self.answer_sequence_prompt_tokens: row_values = row.tolist() if hasattr(row, "tolist") else row row_ids: list[int] = [] for raw_token_id in row_values: token_id = int(raw_token_id) if token_id < 0 or token_id >= len(self.trace_token_weights): continue if self.embedding_model is not None and self._should_skip_prompt_overlap_token( self.embedding_model.id_to_token[token_id] ): continue row_ids.append(token_id) sequence_index = len(row_id_lists) for token_id in set(row_ids): inverted.setdefault(token_id, []).append(sequence_index) row_id_lists.append(row_ids) total_rows = len(row_id_lists) specificity = { token_id: self._prompt_overlap_token_specificity(len(indices), total_rows) for token_id, indices in inverted.items() } self.answer_sequence_prompt_inverted_index = inverted self.answer_sequence_prompt_specificity = specificity if total_rows > ANSWER_SEQUENCE_EAGER_OVERLAP_CACHE_LIMIT: return if row_id_lists is None: return weight_maps: list[dict[int, float]] = [] weight_norms: list[float] = [] bigram_sets: list[set[tuple[int, int]]] = [] trigram_sets: list[set[tuple[int, int, int]]] = [] number_sets: list[set[str]] = [] for row_index, row_ids in enumerate(row_id_lists): row_weights: dict[int, float] = {} for token_id in row_ids: row_weights[token_id] = max( 
row_weights.get(token_id, 0.0), float(self.trace_token_weights[token_id]) * specificity.get(token_id, 1.0), ) weight_maps.append(row_weights) weight_norms.append(sum(value * value for value in row_weights.values()) ** 0.5) bigram_sets.append( { (row_ids[index], row_ids[index + 1]) for index in range(len(row_ids) - 1) } ) trigram_sets.append( { (row_ids[index], row_ids[index + 1], row_ids[index + 2]) for index in range(len(row_ids) - 2) } ) raw_row = self.answer_sequence_prompt_tokens[row_index] raw_values = raw_row.tolist() if hasattr(raw_row, "tolist") else raw_row raw_ids = [ int(value) for value in raw_values if 0 <= int(value) < len(self.embedding_model.id_to_token) ] number_sets.append(self._number_strings_from_token_ids(raw_ids)) self.answer_sequence_prompt_weight_maps = weight_maps self.answer_sequence_prompt_weight_norms = weight_norms self.answer_sequence_prompt_bigram_sets = bigram_sets self.answer_sequence_prompt_trigram_sets = trigram_sets self.answer_sequence_prompt_number_sets = number_sets @staticmethod def _prompt_overlap_token_specificity(document_frequency: int, total_documents: int) -> float: if document_frequency <= 0 or total_documents <= 0: return 1.0 coverage = min(1.0, document_frequency / total_documents) return max(0.02, 1.0 - (coverage ** 0.5)) def _number_strings_from_token_ids(self, token_ids: list[int]) -> set[str]: assert self.embedding_model is not None tokens = [ self.embedding_model.id_to_token[token_id] for token_id in token_ids if 0 <= token_id < len(self.embedding_model.id_to_token) ] return self._number_strings_from_tokens(tokens) def _number_strings_from_tokens(self, tokens: list[str]) -> set[str]: numbers: set[str] = set() current = "" for token in tokens: if self.tokenizer is not None and token in self.tokenizer.special_tokens: if current: numbers.add(current) current = "" continue rendered = self._render_token(token) digits = "".join(character for character in rendered if character.isdigit()) starts_number = self._starts_new_word(token) if self.tokenizer is not None else True if digits and starts_number: if current: numbers.add(current) current = digits elif digits and current: current += digits else: if current: numbers.add(current) current = "" if current: numbers.add(current) return numbers @staticmethod def _numeric_prompt_can_match(query_numbers: set[str], row_numbers: set[str]) -> bool: if not query_numbers: return True if not row_numbers: return False return query_numbers.issubset(row_numbers) def _vector_answer_sequence_candidate_indices( self, query_token_ids: object, ) -> list[int] | None: if ( np is None or self.answer_sequence_prompt_tokens_array is None or not hasattr(self.answer_sequence_prompt_tokens_array, "shape") ): return None query_ids = np.asarray(list(query_token_ids), dtype=np.int64) if query_ids.size == 0: return [] prompt_array = self.answer_sequence_prompt_tokens_array if len(prompt_array.shape) != 2 or prompt_array.shape[0] == 0: return None mask = np.isin(prompt_array, query_ids).any(axis=1) return [int(index) for index in np.flatnonzero(mask)] def _vector_answer_sequence_local_frequency( self, token_id: int, candidate_indices: list[int], ) -> int | None: if ( np is None or self.answer_sequence_prompt_tokens_array is None or not hasattr(self.answer_sequence_prompt_tokens_array, "shape") or not candidate_indices ): return None rows = self.answer_sequence_prompt_tokens_array[ np.asarray(candidate_indices, dtype=np.int64) ] return int(np.any(rows == int(token_id), axis=1).sum()) def _apply_readout_fast(self, state: Vector) 
-> Vector: if self.readout_weights_array is None or np is None: assert self.readout_weights is not None centered_state = self._center_state_vector(state) logits = apply_readout(self.readout_weights, centered_state) if self.readout_bias: logits = [ value + self.readout_bias[index] for index, value in enumerate(logits) ] return logits state_array = np.asarray(state, dtype=RUNTIME_ARRAY_DTYPE) if self.state_offset_array is not None and self.state_offset_array.shape == state_array.shape: state_array = state_array - self.state_offset_array logits = self.readout_weights_array @ state_array if self.readout_bias_array is not None and self.readout_bias_array.shape == logits.shape: logits = logits + self.readout_bias_array return logits.tolist() def _apply_readout_array(self, state: object) -> object: assert np is not None assert self.readout_weights_array is not None state_array = np.asarray(state, dtype=RUNTIME_ARRAY_DTYPE) if self.state_offset_array is not None and self.state_offset_array.shape == state_array.shape: state_array = state_array - self.state_offset_array logits = self.readout_weights_array @ state_array if self.readout_bias_array is not None and self.readout_bias_array.shape == logits.shape: logits = logits + self.readout_bias_array return logits def _step_hidden_states( self, hidden_states: list[Vector], context_traces: list[Vector], token: str, ) -> tuple[list[Vector], list[Vector], Vector]: assert self.embedding_model is not None assert self.tokenizer is not None token_id = self._token_id_for_token(token) embedding = self.embedding_model.vector(token) trace_embedding = self._trace_embedding_from_token_id(embedding, token_id) return self._step_hidden_states_from_embedding( hidden_states, context_traces, embedding, trace_embedding=trace_embedding, ) def _step_hidden_states_from_embedding( self, hidden_states: list[Vector], context_traces: list[Vector], embedding: Vector | object, *, trace_embedding: Vector | object | None = None, ) -> tuple[list[Vector], list[Vector], Vector]: assert self.memory_units is not None if trace_embedding is None: trace_embedding = embedding if np is not None and hidden_states and hasattr(hidden_states[0], "shape"): embedding_array = ( embedding if hasattr(embedding, "shape") else np.asarray(embedding, dtype=np.float64) ) trace_embedding_array = ( trace_embedding if hasattr(trace_embedding, "shape") else np.asarray(trace_embedding, dtype=np.float64) ) drive = analytical_embedding_drive_fast(embedding_array, self.config.state_dim) next_states: list[Vector] = [] next_traces: list[Vector] = [] combined_state: Vector = [] for unit, state, trace in zip(self.memory_units, hidden_states, context_traces): next_state = unit.step_vector_fast(state, drive) decay = 1.0 / (1.0 + unit.timescale) next_trace = trace + ((1.0 - decay) * trace_embedding_array) next_states.append(next_state) next_traces.append(next_trace) combined_state.extend(next_state.tolist()) combined_state.extend(next_trace.tolist()) return next_states, next_traces, combined_state embedding_vector = embedding.tolist() if hasattr(embedding, "tolist") else embedding trace_embedding_vector = ( trace_embedding.tolist() if hasattr(trace_embedding, "tolist") else trace_embedding ) drive = analytical_embedding_drive(embedding_vector, self.config.state_dim) next_states: list[Vector] = [] next_traces: list[Vector] = [] combined_state: Vector = [] for unit, state, trace in zip(self.memory_units, hidden_states, context_traces): next_state = unit.step_vector(state, drive) decay = 1.0 / (1.0 + unit.timescale) 
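            # decay = 1/(1 + timescale), so each step adds timescale/(1 + timescale)
            # of the trace embedding: a timescale-4 unit accumulates 0.8 of every
            # token while a timescale-1 unit accumulates only 0.5, i.e. slower
            # units weight context history more heavily. The trace itself is never
            # decayed here; it is a weighted running sum, not a leaky average.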
    def _step_hidden_states(
        self,
        hidden_states: list[Vector],
        context_traces: list[Vector],
        token: str,
    ) -> tuple[list[Vector], list[Vector], Vector]:
        assert self.embedding_model is not None
        assert self.tokenizer is not None
        token_id = self._token_id_for_token(token)
        embedding = self.embedding_model.vector(token)
        trace_embedding = self._trace_embedding_from_token_id(embedding, token_id)
        return self._step_hidden_states_from_embedding(
            hidden_states,
            context_traces,
            embedding,
            trace_embedding=trace_embedding,
        )

    def _step_hidden_states_from_embedding(
        self,
        hidden_states: list[Vector],
        context_traces: list[Vector],
        embedding: Vector | object,
        *,
        trace_embedding: Vector | object | None = None,
    ) -> tuple[list[Vector], list[Vector], Vector]:
        assert self.memory_units is not None
        if trace_embedding is None:
            trace_embedding = embedding
        if np is not None and hidden_states and hasattr(hidden_states[0], "shape"):
            embedding_array = (
                embedding
                if hasattr(embedding, "shape")
                else np.asarray(embedding, dtype=np.float64)
            )
            trace_embedding_array = (
                trace_embedding
                if hasattr(trace_embedding, "shape")
                else np.asarray(trace_embedding, dtype=np.float64)
            )
            drive = analytical_embedding_drive_fast(embedding_array, self.config.state_dim)
            next_states: list[Vector] = []
            next_traces: list[Vector] = []
            combined_state: Vector = []
            for unit, state, trace in zip(self.memory_units, hidden_states, context_traces):
                next_state = unit.step_vector_fast(state, drive)
                decay = 1.0 / (1.0 + unit.timescale)
                next_trace = trace + ((1.0 - decay) * trace_embedding_array)
                next_states.append(next_state)
                next_traces.append(next_trace)
                combined_state.extend(next_state.tolist())
                combined_state.extend(next_trace.tolist())
            return next_states, next_traces, combined_state
        embedding_vector = embedding.tolist() if hasattr(embedding, "tolist") else embedding
        trace_embedding_vector = (
            trace_embedding.tolist() if hasattr(trace_embedding, "tolist") else trace_embedding
        )
        drive = analytical_embedding_drive(embedding_vector, self.config.state_dim)
        next_states: list[Vector] = []
        next_traces: list[Vector] = []
        combined_state: Vector = []
        for unit, state, trace in zip(self.memory_units, hidden_states, context_traces):
            next_state = unit.step_vector(state, drive)
            decay = 1.0 / (1.0 + unit.timescale)
            next_trace = [
                previous + ((1.0 - decay) * value)
                for previous, value in zip(trace, trace_embedding_vector)
            ]
            next_states.append(next_state)
            next_traces.append(next_trace)
            combined_state.extend(next_state)
            combined_state.extend(next_trace)
        return next_states, next_traces, combined_state

    def _one_hot(self, token: str) -> Vector:
        assert self.embedding_model is not None
        return self._one_hot_from_id(self.embedding_model.token_to_id.get(token, -1))

    def _one_hot_from_id(self, token_id: int) -> Vector:
        assert self.embedding_model is not None
        vector = [0.0 for _ in self.embedding_model.id_to_token]
        if token_id >= 0:
            vector[token_id] = 1.0
        return vector
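    # Editor's illustrative sketch (hypothetical helper): each context trace is a
    # leaky accumulator over token embeddings, with a per-unit decay derived from
    # the unit's timescale: trace' = trace + (1 - 1 / (1 + timescale)) * embedding.
    @staticmethod
    def _sketch_trace_update(trace: list[float], embedding: list[float], timescale: float) -> list[float]:
        decay = 1.0 / (1.0 + timescale)  # large timescales keep more of the new signal
        return [previous + ((1.0 - decay) * value) for previous, value in zip(trace, embedding)]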
    def _blend_probabilities(
        self,
        base: Vector,
        answer: Vector,
        associative: Vector,
        transition: Vector,
        copy: Vector,
        source_evidence: Vector,
        preference: Vector,
        *,
        transition_order: int | None,
        generated_count: int = 0,
        answer_locked: bool = False,
        answer_guided_start: bool = False,
        copy_guided_start: bool = False,
    ) -> tuple[Vector, dict[str, float]]:
        base_weight = FAST_BASE_BLEND
        answer_weight = FAST_ANSWER_BLEND
        associative_weight = FAST_ASSOCIATIVE_BLEND
        transition_weight = FAST_TRANSITION_BLEND
        copy_weight = FAST_COPY_BLEND
        source_evidence_weight = FAST_SOURCE_EVIDENCE_BLEND
        preference_weight = FAST_PREFERENCE_BLEND
        source_grounded = any(value > 0.0 for value in source_evidence)
        if answer_locked:
            base_weight *= 0.005
            answer_weight *= 250.0
            associative_weight *= 0.05
            transition_weight *= 0.005
            copy_weight *= 0.005
            source_evidence_weight *= 0.05
            preference_weight *= 0.05
        elif answer_guided_start:
            base_weight *= 0.45
            answer_weight *= 3.1
            associative_weight *= 0.2
            transition_weight *= 0.35
            copy_weight *= 0.2
            source_evidence_weight *= 1.1
            preference_weight *= 0.2
        elif copy_guided_start:
            base_weight *= 0.55
            answer_weight *= 0.35
            associative_weight *= 0.4
            transition_weight *= 0.35
            copy_weight *= 4.5
            preference_weight *= 0.6
        elif generated_count > 0:
            answer_weight *= 0.32
            transition_weight *= 2.0
            copy_weight *= 0.75
            source_evidence_weight *= 0.85
            if source_grounded:
                base_weight *= 0.45
                answer_weight *= 0.35
                associative_weight *= 0.50
                transition_weight *= 0.25
                copy_weight *= 0.50
                source_evidence_weight *= 3.50
        if source_grounded:
            base_weight *= 0.60
            answer_weight *= 0.35
            associative_weight *= 0.50
            transition_weight *= 0.80
            copy_weight *= 0.20
            source_evidence_weight *= 1.80
        else:
            source_evidence_weight = 0.0
        if transition_order is None:
            answer_weight *= 1.1
            associative_weight *= 0.75
            copy_weight += 0.02
        elif transition_order <= 2:
            answer_weight *= 1.15
            associative_weight *= 0.65
            transition_weight *= 0.55
            copy_weight += 0.01
        elif transition_order >= 5:
            transition_weight *= 1.25
        sources: list[tuple[str, float, Vector]] = [("base", base_weight, base)]
        if any(value > 0.0 for value in answer):
            sources.append(("answer", answer_weight, answer))
        if any(value > 0.0 for value in associative):
            sources.append(("associative", associative_weight, associative))
        if any(value > 0.0 for value in transition):
            sources.append(("transition", transition_weight, transition))
        if any(value > 0.0 for value in copy):
            sources.append(("copy", copy_weight, copy))
        if any(value > 0.0 for value in source_evidence):
            sources.append(("source_evidence", source_evidence_weight, source_evidence))
        if any(value > 0.0 for value in preference):
            sources.append(("preference", preference_weight, preference))
        total_weight = sum(weight for _, weight, _ in sources)
        blended = [0.0 for _ in base]
        blend_weights: dict[str, float] = {}
        for name, weight, source in sources:
            normalized_weight = weight / total_weight if total_weight else 0.0
            blend_weights[name] = normalized_weight
            for index, value in enumerate(source):
                blended[index] += normalized_weight * value
        return _normalize_vector(blended), blend_weights

    def _blend_probability_arrays(
        self,
        base: object,
        answer: object,
        associative: object,
        transition: object,
        copy: object,
        source_evidence: object,
        preference: object,
        *,
        transition_order: int | None,
        generated_count: int = 0,
        answer_locked: bool = False,
        answer_guided_start: bool = False,
        copy_guided_start: bool = False,
    ) -> tuple[object, dict[str, float]]:
        assert np is not None
        base_weight = FAST_BASE_BLEND
        answer_weight = FAST_ANSWER_BLEND
        associative_weight = FAST_ASSOCIATIVE_BLEND
        transition_weight = FAST_TRANSITION_BLEND
        copy_weight = FAST_COPY_BLEND
        source_evidence_weight = FAST_SOURCE_EVIDENCE_BLEND
        preference_weight = FAST_PREFERENCE_BLEND
        source_grounded = bool(np.any(source_evidence > 0.0))
        if answer_locked:
            base_weight *= 0.005
            answer_weight *= 250.0
            associative_weight *= 0.05
            transition_weight *= 0.005
            copy_weight *= 0.005
            source_evidence_weight *= 0.05
            preference_weight *= 0.05
        elif answer_guided_start:
            base_weight *= 0.45
            answer_weight *= 3.1
            associative_weight *= 0.2
            transition_weight *= 0.35
            copy_weight *= 0.2
            source_evidence_weight *= 1.1
            preference_weight *= 0.2
        elif copy_guided_start:
            base_weight *= 0.55
            answer_weight *= 0.35
            associative_weight *= 0.4
            transition_weight *= 0.35
            copy_weight *= 4.5
            preference_weight *= 0.6
        elif generated_count > 0:
            answer_weight *= 0.32
            transition_weight *= 2.0
            copy_weight *= 0.75
            source_evidence_weight *= 0.85
            if source_grounded:
                base_weight *= 0.45
                answer_weight *= 0.35
                associative_weight *= 0.50
                transition_weight *= 0.25
                copy_weight *= 0.50
                source_evidence_weight *= 3.50
        if source_grounded:
            base_weight *= 0.60
            answer_weight *= 0.35
            associative_weight *= 0.50
            transition_weight *= 0.80
            copy_weight *= 0.20
            source_evidence_weight *= 1.80
        else:
            source_evidence_weight = 0.0
        if transition_order is None:
            answer_weight *= 1.1
            associative_weight *= 0.75
            copy_weight += 0.02
        elif transition_order <= 2:
            answer_weight *= 1.15
            associative_weight *= 0.65
            transition_weight *= 0.55
            copy_weight += 0.01
        elif transition_order >= 5:
            transition_weight *= 1.25
        sources: list[tuple[str, float, object]] = [("base", base_weight, base)]
        if np.any(answer > 0.0):
            sources.append(("answer", answer_weight, answer))
        if np.any(associative > 0.0):
            sources.append(("associative", associative_weight, associative))
        if np.any(transition > 0.0):
            sources.append(("transition", transition_weight, transition))
        if np.any(copy > 0.0):
            sources.append(("copy", copy_weight, copy))
        if np.any(source_evidence > 0.0):
            sources.append(("source_evidence", source_evidence_weight, source_evidence))
        if np.any(preference > 0.0):
            sources.append(("preference", preference_weight, preference))
        total_weight = sum(weight for _, weight, _ in sources)
        blended = np.zeros_like(base, dtype=np.float64)
        blend_weights: dict[str, float] = {}
        for name, weight, source in sources:
            normalized_weight = weight / total_weight if total_weight else 0.0
            blend_weights[name] = normalized_weight
            blended += normalized_weight * source
        total = float(blended.sum())
        if total <= 0.0:
            return base, blend_weights
        return blended / total, blend_weights
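    # Editor's illustrative sketch (hypothetical helper): after gating, the active
    # priors are combined as a convex mixture; weights are renormalized over only
    # the sources that fired, so dropping a source never leaks probability mass.
    @staticmethod
    def _sketch_blend(
        sources: list[tuple[str, float, list[float]]],
    ) -> tuple[list[float], dict[str, float]]:
        total_weight = sum(weight for _, weight, _ in sources)
        size = len(sources[0][2]) if sources else 0
        blended = [0.0] * size
        blend_weights: dict[str, float] = {}
        for name, weight, prior in sources:
            share = weight / total_weight if total_weight else 0.0
            blend_weights[name] = share
            for index, value in enumerate(prior):
                blended[index] += share * value
        return _normalize_vector(blended), blend_weights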
    def _score_associative_matches(
        self,
        state: Vector,
        *,
        limit: int = ASSOCIATIVE_TOP_K,
    ) -> list[tuple[float, int, int]]:
        if (
            self.associative_keys is None
            or self.associative_values is None
            or len(self.associative_keys) == 0
            or len(self.associative_values) == 0
        ):
            return []
        if (
            np is not None
            and self.associative_keys_array is not None
            and self.associative_key_norms_array is not None
            and self.associative_values_array is not None
            and self.associative_valid_mask_array is not None
            and limit > 0
        ):
            state_array = self._center_state_array(state).astype(
                self.associative_keys_array.dtype, copy=False
            )
            state_norm = float(np.linalg.norm(state_array))
            if state_norm == 0.0:
                return []
            numerators = self.associative_keys_array @ state_array
            denominators = self.associative_key_norms_array * state_norm
            valid_mask = self.associative_valid_mask_array & (denominators > 0.0)
            if np.any(valid_mask):
                scores = np.zeros_like(numerators, dtype=self.associative_keys_array.dtype)
                np.divide(numerators, denominators, out=scores, where=valid_mask)
                positive_positions = np.flatnonzero(valid_mask & (scores > 0.0))
                if positive_positions.size:
                    selected_positions = positive_positions
                    if positive_positions.size > limit:
                        partition = np.argpartition(scores[positive_positions], -limit)[-limit:]
                        selected_positions = positive_positions[partition]
                    ordered_positions = selected_positions[
                        np.argsort(scores[selected_positions])[::-1]
                    ]
                    return [
                        (
                            float(scores[position]),
                            int(self.associative_values_array[position]),
                            int(position),
                        )
                        for position in ordered_positions
                    ]
        if self.associative_key_norms is None or len(self.associative_key_norms) == 0:
            return []
        state = self._center_state_vector(state)
        state_norm = norm(state)
        if state_norm == 0.0:
            return []
        scored: list[tuple[float, int, int]] = []
        for example_index, (key, key_norm, token_id) in enumerate(
            zip(self.associative_keys, self.associative_key_norms, self.associative_values)
        ):
            if token_id < 0:
                continue
            denominator = state_norm * key_norm
            if denominator == 0.0:
                continue
            similarity = dot(state, key) / denominator
            if similarity > 0.0:
                scored.append((similarity, token_id, example_index))
        scored.sort(key=lambda item: item[0], reverse=True)
        return scored[:limit]

    def _associative_prior_from_matches(
        self,
        matches: list[tuple[float, int, int]],
    ) -> Vector:
        assert self.embedding_model is not None
        if not matches:
            return [0.0 for _ in self.embedding_model.id_to_token]
        prior = [0.0 for _ in self.embedding_model.id_to_token]
        for similarity, token_id, _ in matches[:ASSOCIATIVE_TOP_K]:
            prior[token_id] += similarity
        return _normalize_vector(prior)

    def _associative_prior(self, state: Vector) -> Vector:
        return self._associative_prior_from_matches(self._score_associative_matches(state))
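    # Editor's illustrative sketch (hypothetical helper): the vectorized scorer is
    # cosine similarity against every key at once, with np.argpartition pulling the
    # top-k before a final small sort, avoiding a full-array sort.
    @staticmethod
    def _sketch_cosine_topk(keys: object, query: object, limit: int) -> list[tuple[float, int]]:
        assert np is not None
        key_matrix = np.asarray(keys, dtype=np.float64)
        query_vector = np.asarray(query, dtype=np.float64)
        query_norm = float(np.linalg.norm(query_vector))
        key_norms = np.linalg.norm(key_matrix, axis=1)
        if query_norm == 0.0:
            return []
        scores = np.zeros(key_matrix.shape[0], dtype=np.float64)
        valid = key_norms > 0.0
        np.divide(key_matrix @ query_vector, key_norms * query_norm, out=scores, where=valid)
        positions = np.flatnonzero(valid & (scores > 0.0))
        if positions.size > limit:
            positions = positions[np.argpartition(scores[positions], -limit)[-limit:]]
        ordered = positions[np.argsort(scores[positions])[::-1]]
        return [(float(scores[position]), int(position)) for position in ordered]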
    def _score_answer_matches(
        self,
        answer_anchor_state: Vector | None,
        *,
        limit: int = ANSWER_TOP_K,
    ) -> list[tuple[float, int, int]]:
        return self._score_prompt_anchor_matches(
            answer_anchor_state,
            self.answer_keys,
            self.answer_key_norms,
            self.answer_values,
            self.answer_keys_array,
            self.answer_key_norms_array,
            self.answer_values_array,
            self.answer_valid_mask_array,
            self.answer_similarity_keys_array,
            self.answer_similarity_key_norms_array,
            self.answer_similarity_mask_array,
            limit=limit,
        )

    def _score_answer_start_matches(
        self,
        answer_anchor_state: Vector | None,
        *,
        limit: int = ANSWER_START_TOP_K,
    ) -> list[tuple[float, int, int]]:
        matches = self._score_prompt_anchor_matches(
            answer_anchor_state,
            self.answer_start_keys,
            self.answer_start_key_norms,
            self.answer_start_values,
            self.answer_start_keys_array,
            self.answer_start_key_norms_array,
            self.answer_start_values_array,
            self.answer_start_valid_mask_array,
            self.answer_start_similarity_keys_array,
            self.answer_start_similarity_key_norms_array,
            self.answer_similarity_mask_array,
            limit=limit,
        )
        if matches:
            return matches
        return self._score_prompt_anchor_matches(
            answer_anchor_state,
            self.answer_start_keys,
            self.answer_start_key_norms,
            self.answer_start_values,
            self.answer_start_keys_array,
            self.answer_start_key_norms_array,
            self.answer_start_values_array,
            self.answer_start_valid_mask_array,
            None,
            None,
            None,
            limit=limit,
        )

    def _score_answer_sequence_matches(
        self,
        answer_anchor_state: Vector | None,
        context_tokens: list[str],
        *,
        limit: int = ANSWER_START_TOP_K,
    ) -> list[tuple[float, int, int]]:
        if (
            answer_anchor_state is None
            or self.answer_sequence_keys is None
            or self.answer_sequence_key_norms is None
            or self.answer_sequence_tokens is None
        ):
            return []
        values = list(range(len(self.answer_sequence_tokens)))
        values_array = np.arange(len(values), dtype=np.int64) if np is not None else None
        anchor_matches = self._score_prompt_anchor_matches(
            answer_anchor_state,
            self.answer_sequence_keys,
            self.answer_sequence_key_norms,
            values,
            self.answer_sequence_keys_array,
            self.answer_sequence_key_norms_array,
            values_array,
            values_array >= 0 if values_array is not None else None,
            self.answer_sequence_similarity_keys_array,
            self.answer_sequence_similarity_key_norms_array,
            self.answer_similarity_mask_array,
            limit=max(limit * 4, limit),
        )
        overlap_scores = self._answer_sequence_prompt_overlap_scores(context_tokens)
        if overlap_scores is None:
            return anchor_matches[:limit]
        if not overlap_scores:
            return []
        best_overlap = max(overlap_scores.values()) if overlap_scores else 0.0
        overlap_floor = max(0.16, best_overlap * 0.90)
        focused_overlap_scores = {
            sequence_index: overlap
            for sequence_index, overlap in overlap_scores.items()
            if overlap >= overlap_floor
        }
        if not focused_overlap_scores:
            focused_overlap_scores = overlap_scores
        focused_indices = set(focused_overlap_scores)
        merged: dict[int, float] = {}
        for similarity, sequence_index, _ in anchor_matches:
            if sequence_index not in focused_indices:
                continue
            merged[sequence_index] = max(merged.get(sequence_index, 0.0), 0.20 * similarity)
        for sequence_index, overlap in focused_overlap_scores.items():
            merged[sequence_index] = merged.get(sequence_index, 0.0) + (0.80 * overlap)
        ranked = [
            (score, sequence_index, sequence_index)
            for sequence_index, score in merged.items()
            if score > 0.0
        ]
        ranked.sort(key=lambda item: item[0], reverse=True)
        return ranked[:limit]
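    # Editor's illustrative sketch (hypothetical helper): sequence candidates are
    # ranked by a fixed 20/80 mix of anchor-state similarity and lexical prompt
    # overlap, after keeping only rows within 90% of the best overlap.
    @staticmethod
    def _sketch_merge_rankings(
        anchor: dict[int, float],
        overlap: dict[int, float],
    ) -> list[tuple[float, int]]:
        if not overlap:
            return []
        floor = max(0.16, max(overlap.values()) * 0.90)
        focused = {index: value for index, value in overlap.items() if value >= floor} or overlap
        merged: dict[int, float] = {}
        for index, similarity in anchor.items():
            if index in focused:
                merged[index] = max(merged.get(index, 0.0), 0.20 * similarity)
        for index, value in focused.items():
            merged[index] = merged.get(index, 0.0) + (0.80 * value)
        return sorted(((score, index) for index, score in merged.items() if score > 0.0), reverse=True)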
    def _answer_sequence_prompt_overlap_scores(
        self,
        context_tokens: list[str],
    ) -> dict[int, float] | None:
        if (
            self.embedding_model is None
            or self.answer_sequence_prompt_tokens is None
            or self.trace_token_weights is None
        ):
            return None
        answer_boundary = _last_index(context_tokens, "")
        prompt_tokens = (
            context_tokens[:answer_boundary] if answer_boundary is not None else context_tokens
        )
        if (
            self.answer_sequence_prompt_specificity is None
            and not self._defer_answer_sequence_prompt_overlap_cache()
        ):
            self._refresh_answer_sequence_prompt_overlap_cache()
        specificity_map = self.answer_sequence_prompt_specificity or {}
        query_weights: dict[int, float] = {}
        query_specificity: dict[int, float] = {}
        query_segment_multipliers: dict[int, float] = {}
        query_content_weight = 0.0
        query_ids: list[int] = []
        primary_query_ids: list[int] = []
        inside_tool_evidence = False
        prompt_segment_index = 0
        for token in prompt_tokens:
            if token in {"", ""}:
                inside_tool_evidence = True
                continue
            if token == "":
                inside_tool_evidence = False
                continue
            if self.tokenizer is not None and token in self.tokenizer.special_tokens:
                continue
            if self._is_structural_punctuation_token(token):
                prompt_segment_index += 1
                continue
            if self._should_skip_prompt_overlap_token(token):
                continue
            token_id = self.embedding_model.token_to_id.get(token)
            if token_id is None:
                continue
            query_ids.append(token_id)
            specificity = specificity_map.get(token_id, 1.0)
            evidence_multiplier = 0.35 if inside_tool_evidence else 1.0
            segment_multiplier = evidence_multiplier / (1.0 + prompt_segment_index)
            weight = specificity * segment_multiplier
            query_weights[token_id] = max(query_weights.get(token_id, 0.0), weight)
            query_specificity[token_id] = max(query_specificity.get(token_id, 0.0), specificity)
            query_segment_multipliers[token_id] = max(
                query_segment_multipliers.get(token_id, 0.0),
                segment_multiplier,
            )
            if not inside_tool_evidence:
                primary_query_ids.append(token_id)
            if specificity >= 0.20:
                query_content_weight += weight
        if not query_weights:
            return None
        full_query_token_ids = set(query_ids)
        primary_query_token_ids = set(primary_query_ids)
        has_tool_evidence = any(token in {"", ""} for token in prompt_tokens)
        query_norm = sum(value * value for value in query_weights.values()) ** 0.5
        if query_norm <= 0.0:
            return None
        query_bigrams = {
            (query_ids[index], query_ids[index + 1]) for index in range(len(query_ids) - 1)
        }
        query_trigrams = {
            (query_ids[index], query_ids[index + 1], query_ids[index + 2])
            for index in range(len(query_ids) - 2)
        }
        query_numbers = self._number_strings_from_tokens(prompt_tokens)

        def ordered_ngram_score(
            query_grams: set[tuple[int, ...]],
            row_grams: set[tuple[int, ...]],
        ) -> float:
            if not query_grams or not row_grams:
                return 0.0
            overlap = len(query_grams & row_grams)
            if overlap <= 0:
                return 0.0
            return overlap / ((len(query_grams) * len(row_grams)) ** 0.5)

        def prompt_length_fit(row_token_count: int) -> float:
            query_token_count = len(full_query_token_ids)
            if query_token_count <= 0 or row_token_count <= 0:
                return 1.0
            if row_token_count <= query_token_count:
                return 1.0
            extra_fraction = (row_token_count - query_token_count) / row_token_count
            return max(0.25, 1.0 - extra_fraction)

        cached_maps = self.answer_sequence_prompt_weight_maps
        cached_norms = self.answer_sequence_prompt_weight_norms
        cached_bigrams = self.answer_sequence_prompt_bigram_sets
        cached_trigrams = self.answer_sequence_prompt_trigram_sets
        cached_numbers = self.answer_sequence_prompt_number_sets
        cached_index = self.answer_sequence_prompt_inverted_index
        if (
            cached_maps is not None
            and cached_norms is not None
            and cached_bigrams is not None
            and cached_trigrams is not None
            and cached_numbers is not None
            and len(cached_maps) == len(self.answer_sequence_prompt_tokens)
        ):
            candidate_indices: set[int] | range
            if cached_index is not None:
                candidates: set[int] = set()
                ranked_query_ids = sorted(
                    query_weights,
                    key=lambda token_id: specificity_map.get(token_id, 1.0),
                    reverse=True,
                )
                distinctive_query_ids = [
                    token_id
                    for token_id in ranked_query_ids
                    if specificity_map.get(token_id, 1.0) >= 0.75
                ] or ranked_query_ids[:4]
                for token_id in distinctive_query_ids:
                    candidates.update(cached_index.get(token_id, ()))
                candidate_indices = candidates if candidates else range(len(cached_maps))
            else:
                candidate_indices = range(len(cached_maps))
            candidate_indices = list(candidate_indices)
            if cached_index is not None and candidate_indices:
                candidate_set = set(candidate_indices)
                local_query_weights: dict[int, float] = {}
                local_query_specificity: dict[int, float] = {}
                local_query_content_weight = 0.0
                for token_id in query_weights:
                    local_frequency = len(candidate_set & set(cached_index.get(token_id, ())))
                    if local_frequency <= 0:
                        continue
                    specificity = self._prompt_overlap_token_specificity(
                        local_frequency,
                        len(candidate_indices),
                    )
                    weight = specificity * query_segment_multipliers.get(token_id, 1.0)
                    local_query_weights[token_id] = weight
                    local_query_specificity[token_id] = specificity
                    if specificity >= 0.20:
                        local_query_content_weight += weight
                local_query_norm = sum(
                    value * value for value in local_query_weights.values()
                ) ** 0.5
                if local_query_norm > 0.0:
                    query_weights = local_query_weights
                    query_specificity = local_query_specificity
                    query_norm = local_query_norm
            scores: dict[int, float] = {}
            for sequence_index in candidate_indices:
                row_weights = cached_maps[sequence_index]
                if not row_weights:
                    continue
                if query_numbers and not self._numeric_prompt_can_match(
                    query_numbers,
                    cached_numbers[sequence_index],
                ):
                    continue
                matched_content_weight = sum(
                    query_weights[token_id]
                    for token_id in query_weights.keys() & row_weights.keys()
                    if query_specificity.get(token_id, 0.0) >= 0.20
                )
                row_token_coverage = len(query_weights.keys() & row_weights.keys()) / max(
                    1, len(row_weights)
                )
                full_query_coverage = len(full_query_token_ids & row_weights.keys()) / max(
                    1, len(full_query_token_ids)
                )
                primary_query_coverage = len(primary_query_token_ids & row_weights.keys()) / max(
                    1, len(primary_query_token_ids)
                )
                if (
                    has_tool_evidence
                    and len(primary_query_token_ids) >= 3
                    and primary_query_coverage < 0.45
                    and row_token_coverage < 0.75
                ):
                    continue
                partial_query_floor = 0.60 if len(full_query_token_ids) < 8 else 0.50
                if (
                    len(full_query_token_ids) >= 5
                    and full_query_coverage <= partial_query_floor
                    and row_token_coverage < 0.75
                ):
                    continue
                if (
                    len(full_query_token_ids) >= 12
                    and full_query_coverage < 0.45
                    and row_token_coverage <= 0.75
                ):
                    continue
                if (
                    query_content_weight > 0.0
                    and matched_content_weight / query_content_weight < 0.40
                    and row_token_coverage < 0.75
                    and full_query_coverage < 0.60
                ):
                    continue
                query_coverage = (
                    matched_content_weight / query_content_weight
                    if query_content_weight > 0.0
                    else row_token_coverage
                )
                numerator = sum(
                    query_weights[token_id] * row_weights[token_id]
                    for token_id in query_weights.keys() & row_weights.keys()
                )
                if numerator <= 0.0:
                    continue
                row_norm = cached_norms[sequence_index]
                if row_norm <= 0.0:
                    continue
                token_score = numerator / (query_norm * row_norm)
                bigram_score = ordered_ngram_score(query_bigrams, cached_bigrams[sequence_index])
                trigram_score = ordered_ngram_score(query_trigrams, cached_trigrams[sequence_index])
                scores[sequence_index] = (
                    (0.35 * token_score)
                    + (0.35 * query_coverage)
                    + (0.15 * bigram_score)
                    + (0.15 * trigram_score)
                ) * prompt_length_fit(len(row_weights))
            return scores
        vector_candidate_indices: list[int] | None = None
        if cached_index is not None:
            candidate_set: set[int] = set()
            ranked_query_ids = sorted(
                query_weights,
                key=lambda token_id: specificity_map.get(token_id, 1.0),
                reverse=True,
            )
            distinctive_query_ids = [
                token_id
                for token_id in ranked_query_ids
                if specificity_map.get(token_id, 1.0) >= 0.75
            ] or ranked_query_ids[:4]
            for token_id in distinctive_query_ids:
                candidate_set.update(cached_index.get(token_id, ()))
            if not candidate_set:
                for token_id in ranked_query_ids:
                    candidate_set.update(cached_index.get(token_id, ()))
                    if candidate_set:
                        break
            if not candidate_set:
                candidate_indices = range(len(self.answer_sequence_prompt_tokens))
            else:
                candidate_indices = sorted(candidate_set)
                local_query_weights: dict[int, float] = {}
                local_query_specificity: dict[int, float] = {}
                local_query_content_weight = 0.0
                candidate_count = len(candidate_indices)
                for token_id in query_weights:
                    local_frequency = len(candidate_set & set(cached_index.get(token_id, ())))
                    if local_frequency <= 0:
                        continue
                    specificity = self._prompt_overlap_token_specificity(
                        local_frequency,
                        candidate_count,
                    )
                    local_query_weights[token_id] = specificity * query_segment_multipliers.get(
                        token_id, 1.0
                    )
                    local_query_specificity[token_id] = specificity
                    if specificity >= 0.20:
                        local_query_content_weight += local_query_weights[token_id]
                local_query_norm = sum(
                    value * value for value in local_query_weights.values()
                ) ** 0.5
                if local_query_norm > 0.0:
                    query_weights = local_query_weights
                    query_specificity = local_query_specificity
                    query_norm = local_query_norm
        elif self._defer_answer_sequence_prompt_overlap_cache():
            vector_candidate_indices = self._vector_answer_sequence_candidate_indices(
                query_weights.keys()
            )
            if vector_candidate_indices is not None:
                if not vector_candidate_indices:
                    return {}
                candidate_indices = vector_candidate_indices
                local_query_weights = {}
                local_query_specificity = {}
                local_query_content_weight = 0.0
                candidate_count = len(vector_candidate_indices)
                for token_id in query_weights:
                    local_frequency = self._vector_answer_sequence_local_frequency(
                        token_id,
                        vector_candidate_indices,
                    )
                    if local_frequency is None or local_frequency <= 0:
                        continue
                    specificity = self._prompt_overlap_token_specificity(
                        local_frequency,
                        candidate_count,
                    )
                    local_query_weights[token_id] = specificity * query_segment_multipliers.get(
                        token_id, 1.0
                    )
                    local_query_specificity[token_id] = specificity
                    if specificity >= 0.20:
                        local_query_content_weight += local_query_weights[token_id]
                local_query_norm = sum(
                    value * value for value in local_query_weights.values()
                ) ** 0.5
                if local_query_norm > 0.0:
                    query_weights = local_query_weights
                    query_specificity = local_query_specificity
                    query_norm = local_query_norm
            else:
                candidate_indices = range(len(self.answer_sequence_prompt_tokens))
        valid_token_mask = self._prompt_overlap_valid_token_mask()
        scores: dict[int, float] = {}
        for sequence_index in candidate_indices:
            row = self.answer_sequence_prompt_tokens[sequence_index]
            row_values = row.tolist() if hasattr(row, "tolist") else row
            row_weights: dict[int, float] = {}
            row_ids: list[int] = []
            raw_row_ids: list[int] = []
            for raw_token_id in row_values:
                token_id = int(raw_token_id)
                if token_id < 0 or token_id >= len(self.trace_token_weights):
                    continue
                raw_row_ids.append(token_id)
                if valid_token_mask is not None:
                    if token_id >= len(valid_token_mask) or not bool(valid_token_mask[token_id]):
                        continue
                elif self._should_skip_prompt_overlap_token(
                    self.embedding_model.id_to_token[token_id]
                ):
                    continue
                row_ids.append(token_id)
                row_weights[token_id] = max(
                    row_weights.get(token_id, 0.0),
                    specificity_map.get(token_id, 1.0),
                )
            if not row_weights:
                continue
            if query_numbers and not self._numeric_prompt_can_match(
                query_numbers,
                self._number_strings_from_token_ids(raw_row_ids),
            ):
                continue
            matched_content_weight = sum(
                query_weights[token_id]
                for token_id in query_weights.keys() & row_weights.keys()
                if query_specificity.get(token_id, 0.0) >= 0.20
            )
            row_token_coverage = len(query_weights.keys() & row_weights.keys()) / max(
                1, len(row_weights)
            )
            full_query_coverage = len(full_query_token_ids & row_weights.keys()) / max(
                1, len(full_query_token_ids)
            )
            primary_query_coverage = len(primary_query_token_ids & row_weights.keys()) / max(
                1, len(primary_query_token_ids)
            )
            if (
                has_tool_evidence
                and len(primary_query_token_ids) >= 3
                and primary_query_coverage < 0.45
                and row_token_coverage < 0.75
            ):
                continue
            partial_query_floor = 0.60 if len(full_query_token_ids) < 8 else 0.30
            if (
                len(full_query_token_ids) >= 5
                and full_query_coverage <= partial_query_floor
                and row_token_coverage < 0.75
            ):
                continue
            if (
                len(full_query_token_ids) >= 12
                and full_query_coverage < 0.25
                and row_token_coverage <= 0.75
            ):
                continue
            if (
                query_content_weight > 0.0
                and matched_content_weight / query_content_weight < 0.25
                and row_token_coverage < 0.75
                and full_query_coverage < 0.60
            ):
                continue
            query_coverage = (
                matched_content_weight / query_content_weight
                if query_content_weight > 0.0
                else row_token_coverage
            )
            numerator = sum(
                query_weights[token_id] * row_weights[token_id]
                for token_id in query_weights.keys() & row_weights.keys()
            )
            if numerator <= 0.0:
                continue
            row_norm = sum(value * value for value in row_weights.values()) ** 0.5
            if row_norm > 0.0:
                token_score = numerator / (query_norm * row_norm)
                row_bigrams = {
                    (row_ids[index], row_ids[index + 1]) for index in range(len(row_ids) - 1)
                }
                row_trigrams = {
                    (row_ids[index], row_ids[index + 1], row_ids[index + 2])
                    for index in range(len(row_ids) - 2)
                }
                bigram_score = ordered_ngram_score(query_bigrams, row_bigrams)
                trigram_score = ordered_ngram_score(query_trigrams, row_trigrams)
                scores[sequence_index] = (
                    (0.35 * token_score)
                    + (0.35 * query_coverage)
                    + (0.15 * bigram_score)
                    + (0.15 * trigram_score)
                ) * prompt_length_fit(len(row_weights))
        return scores
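    # Editor's illustrative sketch (hypothetical helper): the per-row token score
    # above is a cosine over sparse id->weight maps; it combines with coverage and
    # the order-sensitive n-gram overlaps in the fixed 0.35/0.35/0.15/0.15 mix.
    @staticmethod
    def _sketch_sparse_cosine(query: dict[int, float], row: dict[int, float]) -> float:
        shared = query.keys() & row.keys()
        numerator = sum(query[token_id] * row[token_id] for token_id in shared)
        query_norm = sum(value * value for value in query.values()) ** 0.5
        row_norm = sum(value * value for value in row.values()) ** 0.5
        if numerator <= 0.0 or query_norm <= 0.0 or row_norm <= 0.0:
            return 0.0
        return numerator / (query_norm * row_norm)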
    def _score_prompt_anchor_matches(
        self,
        answer_anchor_state: Vector | None,
        keys: object | None,
        key_norms_list: object | None,
        values: object | None,
        keys_array: object | None,
        key_norms_array: object | None,
        values_array: object | None,
        valid_mask_array: object | None,
        similarity_keys_array: object | None,
        similarity_key_norms_array: object | None,
        similarity_mask_array: object | None,
        *,
        limit: int,
    ) -> list[tuple[float, int, int]]:
        if (
            answer_anchor_state is None
            or keys is None
            or key_norms_list is None
            or values is None
        ):
            return []
        if (
            np is not None
            and keys_array is not None
            and key_norms_array is not None
            and values_array is not None
            and valid_mask_array is not None
            and limit > 0
        ):
            key_array = keys_array
            key_norms = key_norms_array
            if (
                similarity_keys_array is not None
                and similarity_key_norms_array is not None
                and similarity_mask_array is not None
            ):
                state_array = self._center_state_array(
                    self._masked_combined_state_array(answer_anchor_state)
                ).astype(keys_array.dtype, copy=False)
                state_array = state_array * similarity_mask_array
                key_array = similarity_keys_array
                key_norms = similarity_key_norms_array
            else:
                state_array = self._center_state_array(answer_anchor_state).astype(
                    keys_array.dtype,
                    copy=False,
                )
            state_norm = float(np.linalg.norm(state_array))
            if state_norm == 0.0:
                return []
            numerators = key_array @ state_array
            denominators = key_norms * state_norm
            valid_mask = valid_mask_array & (denominators > 0.0)
            if np.any(valid_mask):
                scores = np.zeros_like(numerators, dtype=key_array.dtype)
                np.divide(numerators, denominators, out=scores, where=valid_mask)
                positive_positions = np.flatnonzero(valid_mask & (scores > 0.0))
                if positive_positions.size:
                    selected_positions = positive_positions
                    if positive_positions.size > limit:
                        partition = np.argpartition(scores[positive_positions], -limit)[-limit:]
                        selected_positions = positive_positions[partition]
                    ordered_positions = selected_positions[
                        np.argsort(scores[selected_positions])[::-1]
                    ]
                    return [
                        (
                            float(scores[position]),
                            int(values_array[position]),
                            int(position),
                        )
                        for position in ordered_positions
                    ]
        if similarity_mask_array is not None:
            state = self._center_state_vector(self._masked_combined_state(answer_anchor_state))
        else:
            state = self._center_state_vector(answer_anchor_state)
        state_norm = norm(state)
        if state_norm == 0.0:
            return []
        scored: list[tuple[float, int, int]] = []
        for example_index, (key, key_norm, token_id) in enumerate(
            zip(keys, key_norms_list, values)
        ):
            if token_id < 0:
                continue
            denominator = state_norm * key_norm
            if denominator == 0.0:
                continue
            similarity = dot(state, key) / denominator
            if similarity > 0.0:
                scored.append((similarity, token_id, example_index))
        scored.sort(key=lambda item: item[0], reverse=True)
        return scored[:limit]

    def _answer_prior_from_matches(
        self,
        matches: list[tuple[float, int, int]],
        generated_tokens: list[str],
    ) -> Vector:
        assert self.embedding_model is not None
        if not matches:
            return [0.0 for _ in self.embedding_model.id_to_token]
        prior = [0.0 for _ in self.embedding_model.id_to_token]
        generated_ids = {
            self.embedding_model.token_to_id[token]
            for token in generated_tokens
            if token in self.embedding_model.token_to_id
        }
        for similarity, token_id, _ in matches[:ANSWER_TOP_K]:
            token = self.embedding_model.id_to_token[token_id]
            if not self._allowed_generation_token(token, generated_tokens):
                continue
            if token_id in generated_ids:
                prior[token_id] += similarity * 0.35
            else:
                prior[token_id] += similarity
        return _normalize_vector(prior)

    def _answer_start_matches_from_sequences(
        self,
        matches: list[tuple[float, int, int]],
    ) -> list[tuple[float, int, int]]:
        if not matches or self.answer_sequence_tokens is None:
            return []
        start_matches: list[tuple[float, int, int]] = []
        for similarity, sequence_index, example_index in matches[:ANSWER_START_TOP_K]:
            if sequence_index >= len(self.answer_sequence_tokens):
                continue
            row = self.answer_sequence_tokens[sequence_index]
            token_ids = [
                int(value)
                for value in (row.tolist() if hasattr(row, "tolist") else row)
                if int(value) >= 0
            ]
            if token_ids:
                start_matches.append((similarity, token_ids[0], example_index))
        return start_matches

    def _answer_sequence_prior_from_matches(
        self,
        matches: list[tuple[float, int, int]],
        generated_tokens: list[str],
        *,
        temperature: float = 0.0,
    ) -> Vector:
        assert self.embedding_model is not None
        if not matches or self.answer_sequence_tokens is None:
            return [0.0 for _ in self.embedding_model.id_to_token]
        generated_ids = [
            self.embedding_model.token_to_id[token]
            for token in generated_tokens
            if token in self.embedding_model.token_to_id
        ]
        prior = [0.0 for _ in self.embedding_model.id_to_token]
        best_similarity = matches[0][0]
        if best_similarity >= 0.9:
            floor_delta = 0.14 if temperature >= ANSWER_SEQUENCE_CREATIVE_TEMPERATURE else 0.02
            match_floor = best_similarity - floor_delta
        else:
            match_floor = 0.0
        for similarity, sequence_index, _ in matches[:ANSWER_START_TOP_K]:
            if similarity < ANSWER_SEQUENCE_MATCH_FLOOR:
                continue
            if similarity < match_floor:
                continue
            token_ids = self._answer_sequence_token_row(sequence_index)
            if not token_ids:
                continue
            next_token_id = self._next_sequence_token_id(token_ids, generated_ids)
            if next_token_id is None:
                continue
            token = self.embedding_model.id_to_token[next_token_id]
            if self._allowed_answer_sequence_token(token, generated_tokens):
                prior[next_token_id] += max(1e-9, similarity - match_floor)
        return _normalize_vector(prior)
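    # Editor's illustrative sketch (hypothetical helper): the sequence prior walks
    # each matched row and credits only the single token that would extend the
    # generated prefix, so probability mass stays on-path.
    @staticmethod
    def _sketch_next_on_path(row_ids: list[int], generated_ids: list[int]) -> int | None:
        if not generated_ids:
            return row_ids[0] if row_ids else None
        if len(generated_ids) >= len(row_ids):
            return None  # row fully consumed
        if row_ids[: len(generated_ids)] != generated_ids:
            return None  # prefix diverged from this row
        return row_ids[len(generated_ids)]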
    def _answer_sequence_token_row(self, sequence_index: int) -> list[int]:
        if sequence_index < 0 or self.answer_sequence_tokens is None:
            return []
        if self.answer_sequence_token_id_rows is not None:
            if sequence_index >= len(self.answer_sequence_token_id_rows):
                return []
            return self.answer_sequence_token_id_rows[sequence_index]
        if (
            np is not None
            and hasattr(self.answer_sequence_tokens, "shape")
            and len(self.answer_sequence_tokens.shape) == 2
        ):
            if sequence_index >= int(self.answer_sequence_tokens.shape[0]):
                return []
            row = np.asarray(self.answer_sequence_tokens[sequence_index])
            return [int(value) for value in row.tolist() if int(value) >= 0]
        try:
            row = self.answer_sequence_tokens[sequence_index]
        except (IndexError, TypeError):
            return []
        return self._answer_token_ids_from_row(row)

    def _filter_avoided_answer_sequence_matches(
        self,
        matches: list[tuple[float, int, int]] | None,
        avoid_token_sequences: Sequence[Sequence[str]] | None,
    ) -> list[tuple[float, int, int]]:
        if (
            not matches
            or not avoid_token_sequences
            or self.embedding_model is None
            or self.answer_sequence_tokens is None
        ):
            return list(matches or [])
        token_to_id = self.embedding_model.token_to_id
        avoided_id_sequences: set[tuple[int, ...]] = set()
        for sequence in avoid_token_sequences:
            ids: list[int] = []
            for token in sequence:
                token_id = token_to_id.get(token)
                if token_id is None:
                    ids = []
                    break
                ids.append(token_id)
            if ids:
                avoided_id_sequences.add(tuple(ids))
        if not avoided_id_sequences:
            return list(matches)
        sequence_rows = self._answer_sequence_token_rows()
        filtered: list[tuple[float, int, int]] = []
        for match in matches:
            _, sequence_index, _ = match
            if sequence_index >= len(sequence_rows):
                filtered.append(match)
                continue
            if tuple(sequence_rows[sequence_index]) in avoided_id_sequences:
                continue
            filtered.append(match)
        return filtered

    def _answer_sequence_token_rows(self) -> list[list[int]]:
        if self.answer_sequence_token_id_rows is not None:
            return self.answer_sequence_token_id_rows
        rows: list[list[int]] = []
        if (
            np is not None
            and self.answer_sequence_tokens is not None
            and hasattr(self.answer_sequence_tokens, "shape")
            and len(self.answer_sequence_tokens.shape) == 2
        ):
            token_rows = np.asarray(self.answer_sequence_tokens).tolist()
            rows = [[int(value) for value in row if int(value) >= 0] for row in token_rows]
        elif self.answer_sequence_tokens is not None:
            for row in self.answer_sequence_tokens:
                rows.append(self._answer_token_ids_from_row(row))
        self.answer_sequence_token_id_rows = rows
        return rows

    @staticmethod
    def _answer_token_ids_from_row(row: object) -> list[int]:
        values = row.tolist() if hasattr(row, "tolist") else row
        if not isinstance(values, list):
            return []
        return [int(value) for value in values if int(value) >= 0]

    @staticmethod
    def _answer_fingerprint_from_token_ids(token_ids: list[int]) -> tuple[int, ...]:
        payload = ",".join(str(token_id) for token_id in token_ids).encode("ascii")
        digest = hashlib.blake2s(
            payload,
            digest_size=ANSWER_FINGERPRINT_WORDS * 4,
        ).digest()
        return tuple(
            int.from_bytes(
                digest[index * 4 : (index + 1) * 4],
                "little",
                signed=True,
            )
            for index in range(ANSWER_FINGERPRINT_WORDS)
        )

    def _refresh_answer_fingerprint_hashes(self) -> None:
        hashes: set[tuple[int, ...]] = set()
        lengths: set[int] = set()
        sequences_by_length: dict[int, set[tuple[int, ...]]] = {}
        if self.answer_sequence_tokens is not None:
            for token_ids in self._answer_sequence_token_rows():
                if token_ids:
                    token_length = len(token_ids)
                    lengths.add(token_length)
                    sequences_by_length.setdefault(token_length, set()).add(tuple(token_ids))
                    hashes.add(self._answer_fingerprint_from_token_ids(token_ids))
        self.answer_fingerprint_hashes = hashes
        self.answer_fingerprint_token_lengths = lengths
        self.answer_fingerprint_token_sequences_by_length = sequences_by_length

    def _answer_fingerprint_tensor(self) -> list[list[int]]:
        if self.answer_fingerprint_hashes is None:
            self._refresh_answer_fingerprint_hashes()
        return [
            list(fingerprint)
            for fingerprint in sorted(self.answer_fingerprint_hashes or set())
        ]
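    # Editor's illustrative sketch (hypothetical helper): an answer fingerprint is
    # a fixed-width blake2s digest of the comma-joined token ids, unpacked into
    # signed 32-bit words so it can be stored in an integer tensor.
    @staticmethod
    def _sketch_fingerprint(token_ids: list[int], words: int = 4) -> tuple[int, ...]:
        payload = ",".join(str(token_id) for token_id in token_ids).encode("ascii")
        digest = hashlib.blake2s(payload, digest_size=words * 4).digest()
        return tuple(
            int.from_bytes(digest[index * 4 : (index + 1) * 4], "little", signed=True)
            for index in range(words)
        )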
    @staticmethod
    def _coerce_answer_fingerprint_hashes(raw_fingerprints: object) -> set[tuple[int, ...]]:
        rows = raw_fingerprints.tolist() if hasattr(raw_fingerprints, "tolist") else raw_fingerprints
        hashes: set[tuple[int, ...]] = set()
        if not isinstance(rows, list):
            return hashes
        for row in rows:
            values = row.tolist() if hasattr(row, "tolist") else row
            if not isinstance(values, list):
                continue
            fingerprint = tuple(int(value) for value in values)
            if len(fingerprint) == ANSWER_FINGERPRINT_WORDS:
                hashes.add(fingerprint)
        return hashes

    def _answer_fingerprint_lengths(self) -> set[int]:
        if self.answer_fingerprint_token_lengths is not None:
            return self.answer_fingerprint_token_lengths
        lengths: set[int] = set()
        if (
            np is not None
            and self.answer_sequence_tokens is not None
            and hasattr(self.answer_sequence_tokens, "shape")
            and len(self.answer_sequence_tokens.shape) == 2
        ):
            token_matrix = np.asarray(self.answer_sequence_tokens)
            length_values = np.sum(token_matrix >= 0, axis=1)
            lengths = {
                int(length) for length in np.unique(length_values).tolist() if int(length) > 0
            }
        elif self.answer_sequence_tokens is not None:
            for token_ids in self._answer_sequence_token_rows():
                if token_ids:
                    lengths.add(len(token_ids))
        self.answer_fingerprint_token_lengths = lengths
        return lengths

    def _use_runtime_fingerprint_blacklist(self) -> bool:
        if (
            np is None
            or self.answer_sequence_tokens is None
            or not hasattr(self.answer_sequence_tokens, "shape")
            or len(self.answer_sequence_tokens.shape) != 2
        ):
            return False
        return int(self.answer_sequence_tokens.shape[0]) > ANSWER_SEQUENCE_EAGER_OVERLAP_CACHE_LIMIT

    def _answer_fingerprint_token_sequence_sets(self) -> dict[int, set[tuple[int, ...]]]:
        if self.answer_fingerprint_token_sequences_by_length is not None:
            return self.answer_fingerprint_token_sequences_by_length
        sequences_by_length: dict[int, set[tuple[int, ...]]] = {}
        lengths: set[int] = set()
        if self.answer_sequence_tokens is not None:
            for token_ids in self._answer_sequence_token_rows():
                if token_ids:
                    token_length = len(token_ids)
                    lengths.add(token_length)
                    sequences_by_length.setdefault(token_length, set()).add(tuple(token_ids))
        self.answer_fingerprint_token_lengths = lengths
        self.answer_fingerprint_token_sequences_by_length = sequences_by_length
        return sequences_by_length

    def _token_ids_for_generated_tokens(self, generated_tokens: Sequence[str]) -> list[int] | None:
        if self.embedding_model is None:
            return None
        token_ids: list[int] = []
        for token in generated_tokens:
            token_id = self.embedding_model.token_to_id.get(token)
            if token_id is None:
                return None
            token_ids.append(token_id)
        return token_ids

    def _would_complete_blacklisted_answer(
        self,
        generated_tokens: list[str],
        candidate: str,
    ) -> bool:
        generated_token_ids = self._token_ids_for_generated_tokens(generated_tokens)
        return self._would_complete_blacklisted_answer_ids(generated_token_ids, candidate)
    def _would_complete_blacklisted_answer_ids(
        self,
        generated_token_ids: Sequence[int] | None,
        candidate: str,
    ) -> bool:
        if (
            self.embedding_model is None
            or not self.answer_fingerprint_hashes
            or candidate not in self.embedding_model.token_to_id
            or generated_token_ids is None
        ):
            return False
        candidate_id = self.embedding_model.token_to_id[candidate]
        if self._is_terminal_punctuation_text(self._render_token(candidate)):
            return False
        candidate_length = len(generated_token_ids) + 1
        if self._use_runtime_fingerprint_blacklist():
            lengths = self._answer_fingerprint_lengths()
            if lengths and candidate_length not in lengths:
                return False
            token_ids = [*generated_token_ids, candidate_id]
            if not token_ids:
                return False
            return self._answer_fingerprint_from_token_ids(token_ids) in self.answer_fingerprint_hashes
        sequence_sets = self._answer_fingerprint_token_sequence_sets()
        candidate_sequences = sequence_sets.get(candidate_length)
        if candidate_sequences is not None:
            return (*generated_token_ids, candidate_id) in candidate_sequences
        if self.answer_sequence_tokens is not None:
            return False
        lengths = self._answer_fingerprint_lengths()
        if lengths and candidate_length not in lengths:
            return False
        token_ids = [*generated_token_ids, candidate_id]
        if not token_ids:
            return False
        return self._answer_fingerprint_from_token_ids(token_ids) in self.answer_fingerprint_hashes

    def _would_follow_blacklisted_answer_prefix_ids(
        self,
        generated_token_ids: Sequence[int] | None,
        candidate: str,
        *,
        minimum_prefix_length: int = ANSWER_REPLAY_PREFIX_MIN_TOKENS,
    ) -> bool:
        if (
            self.embedding_model is None
            or self.answer_sequence_tokens is None
            or candidate not in self.embedding_model.token_to_id
            or generated_token_ids is None
        ):
            return False
        candidate_id = self.embedding_model.token_to_id[candidate]
        candidate_path = (*generated_token_ids, candidate_id)
        if len(candidate_path) < minimum_prefix_length:
            return False
        prefix_sets = self._answer_sequence_prefix_sets(minimum_prefix_length)
        return candidate_path in prefix_sets.get(len(candidate_path), set())

    def _answer_sequence_prefix_sets(
        self,
        minimum_prefix_length: int = ANSWER_REPLAY_PREFIX_MIN_TOKENS,
    ) -> dict[int, set[tuple[int, ...]]]:
        cached = self.answer_sequence_prefixes_by_length
        if cached is not None:
            return cached
        prefixes: dict[int, set[tuple[int, ...]]] = {}
        for token_ids in self._answer_sequence_token_rows():
            for length in range(minimum_prefix_length, len(token_ids) + 1):
                prefixes.setdefault(length, set()).add(tuple(token_ids[:length]))
        self.answer_sequence_prefixes_by_length = prefixes
        return prefixes

    def _avoid_text_token_sequences(
        self,
        avoid_texts: Sequence[str] | None,
    ) -> list[list[str]]:
        if not avoid_texts or self.tokenizer is None:
            return []
        sequences: list[list[str]] = []
        seen: set[tuple[str, ...]] = set()
        for text in avoid_texts:
            if not isinstance(text, str) or not text.strip():
                continue
            tokens = [
                token
                for token in self.tokenizer.encode(text)
                if token not in self.tokenizer.special_tokens
            ]
            key = tuple(tokens)
            if tokens and key not in seen:
                seen.add(key)
                sequences.append(tokens)
        return sequences

    @staticmethod
    def _runtime_generation_history_key(context: str) -> str:
        return " ".join(context.split()).casefold()

    @staticmethod
    def _runtime_history_enabled(context: str, *, temperature: float) -> bool:
        if temperature < ANSWER_REPLAY_PREFIX_TEMPERATURE:
            return False
        lowered = context.casefold()
        return "" not in lowered and "" not in lowered

    def _runtime_avoid_texts(
        self,
        context: str,
        avoid_texts: Sequence[str] | None,
        *,
        temperature: float,
    ) -> list[str]:
        combined: list[str] = []
        seen: set[str] = set()
        for text in avoid_texts or ():
            cleaned = " ".join(str(text).split())
            if cleaned and cleaned not in seen:
                combined.append(cleaned)
                seen.add(cleaned)
        if not self._runtime_history_enabled(context, temperature=temperature):
            return combined
        history = self.runtime_generation_history.get(
            self._runtime_generation_history_key(context),
            [],
        )
        for text in history:
            cleaned = " ".join(str(text).split())
            if cleaned and cleaned not in seen:
                combined.append(cleaned)
                seen.add(cleaned)
        return combined

    def _remember_runtime_generation(
        self,
        context: str,
        generated_text: str,
        *,
        temperature: float,
    ) -> None:
        if not self._runtime_history_enabled(context, temperature=temperature):
            return
        cleaned = " ".join(generated_text.split())
        if not cleaned:
            return
        key = self._runtime_generation_history_key(context)
        history = [
            existing
            for existing in self.runtime_generation_history.get(key, [])
            if existing != cleaned
        ]
        history.append(cleaned)
        self.runtime_generation_history[key] = history[-RUNTIME_GENERATION_HISTORY_LIMIT:]
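    # Editor's illustrative sketch (hypothetical helper): runtime history is a
    # bounded per-prompt dedup list -- re-adding an entry moves it to the end, and
    # only the most recent `limit` entries survive.
    @staticmethod
    def _sketch_remember(history: dict[str, list[str]], key: str, text: str, limit: int = 8) -> None:
        entries = [existing for existing in history.get(key, []) if existing != text]
        entries.append(text)
        history[key] = entries[-limit:]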
    @staticmethod
    def _would_follow_avoided_sequence(
        generated_tokens: list[str],
        candidate: str,
        avoid_token_sequences: Sequence[Sequence[str]] | None,
    ) -> bool:
        if not avoid_token_sequences:
            return False
        prefix_length = len(generated_tokens) + 1
        if prefix_length < AVOID_SEQUENCE_MIN_TOKENS:
            return False
        candidate_path = [*generated_tokens, candidate]
        for sequence in avoid_token_sequences:
            if prefix_length <= len(sequence) and list(sequence[:prefix_length]) == candidate_path:
                return True
        return False

    def _should_stop_answer_sequence(
        self,
        decode_state: DecodeState,
        generated_tokens: list[str],
    ) -> bool:
        matches = decode_state.answer_sequence_matches
        if matches is None:
            matches = self._score_answer_sequence_matches(
                decode_state.answer_anchor_state,
                decode_state.context_tokens,
            )
        return self._answer_sequence_is_complete(generated_tokens, matches)

    def _should_stop_after_answer_path_drift(
        self,
        decode_state: DecodeState,
        generated_tokens: list[str],
    ) -> bool:
        matches = decode_state.answer_sequence_matches
        if matches is None:
            matches = self._score_answer_sequence_matches(
                decode_state.answer_anchor_state,
                decode_state.context_tokens,
            )
        if not matches or matches[0][0] < ANSWER_SEQUENCE_MATCH_FLOOR:
            return False
        if self._answer_sequence_has_continuation(generated_tokens, matches):
            return False
        if self._generated_answer_ends_terminal_sentence(generated_tokens):
            return True
        return self._generated_word_count(generated_tokens) >= 14

    def _generated_answer_ends_terminal_sentence(self, generated_tokens: list[str]) -> bool:
        if not generated_tokens:
            return False
        rendered = self._render_token(generated_tokens[-1])
        if not self._is_terminal_punctuation_text(rendered):
            return False
        return self._generated_word_count(generated_tokens) > 0

    def _answer_decode_has_continuation(
        self,
        decode_state: DecodeState,
        generated_tokens: list[str],
    ) -> bool:
        matches = decode_state.answer_sequence_matches
        if matches is None:
            matches = self._score_answer_sequence_matches(
                decode_state.answer_anchor_state,
                decode_state.context_tokens,
            )
        return self._answer_sequence_has_continuation(generated_tokens, matches)

    def _answer_sequence_is_complete(
        self,
        generated_tokens: list[str],
        matches: list[tuple[float, int, int]],
    ) -> bool:
        if (
            self.embedding_model is None
            or self.answer_sequence_tokens is None
            or not generated_tokens
            or not matches
        ):
            return False
        generated_ids = [
            self.embedding_model.token_to_id[token]
            for token in generated_tokens
            if token in self.embedding_model.token_to_id
        ]
        if not generated_ids:
            return False
        for similarity, sequence_index, _ in matches[:ANSWER_START_TOP_K]:
            if similarity < ANSWER_SEQUENCE_MATCH_FLOOR or sequence_index >= len(
                self.answer_sequence_tokens
            ):
                continue
            row = self.answer_sequence_tokens[sequence_index]
            token_ids = [
                int(value)
                for value in (row.tolist() if hasattr(row, "tolist") else row)
                if int(value) >= 0
            ]
            if not token_ids:
                continue
            if len(generated_ids) >= len(token_ids) and generated_ids[: len(token_ids)] == token_ids:
                return True
            if (
                self.answer_fingerprint_hashes
                and len(generated_ids) + 1 == len(token_ids)
                and generated_ids == token_ids[: len(generated_ids)]
                and self._answer_fingerprint_from_token_ids(token_ids)
                in self.answer_fingerprint_hashes
            ):
                generated_tail = self._render_token(generated_tokens[-1])
                if self._is_structural_punctuation_text(
                    generated_tail
                ) and not self._is_terminal_punctuation_text(generated_tail):
                    continue
                final_token = self.embedding_model.id_to_token[token_ids[-1]]
                if self._is_terminal_punctuation_text(self._render_token(final_token)):
                    continue
                return True
        return False
    def _answer_sequence_has_continuation(
        self,
        generated_tokens: list[str],
        matches: list[tuple[float, int, int]],
    ) -> bool:
        if (
            self.embedding_model is None
            or self.answer_sequence_tokens is None
            or not generated_tokens
            or not matches
        ):
            return False
        generated_ids = [
            self.embedding_model.token_to_id[token]
            for token in generated_tokens
            if token in self.embedding_model.token_to_id
        ]
        if not generated_ids:
            return False
        for similarity, sequence_index, _ in matches[:ANSWER_START_TOP_K]:
            if similarity < ANSWER_SEQUENCE_MATCH_FLOOR or sequence_index >= len(
                self.answer_sequence_tokens
            ):
                continue
            row = self.answer_sequence_tokens[sequence_index]
            token_ids = [
                int(value)
                for value in (row.tolist() if hasattr(row, "tolist") else row)
                if int(value) >= 0
            ]
            if not token_ids:
                continue
            next_token_id = self._next_sequence_token_id(token_ids, generated_ids)
            if next_token_id is None:
                continue
            token = self.embedding_model.id_to_token[next_token_id]
            if self._allowed_answer_sequence_token(token, generated_tokens):
                return True
        return False

    def _next_sequence_token_id(
        self,
        token_ids: list[int],
        generated_ids: list[int],
    ) -> int | None:
        if not generated_ids:
            return token_ids[0]
        if len(generated_ids) >= len(token_ids):
            return None
        if token_ids[: len(generated_ids)] != generated_ids:
            return None
        return token_ids[len(generated_ids)]

    def _transition_prior(self, context_tokens: list[str]) -> Vector:
        prior, _ = self._transition_prior_with_order(context_tokens)
        return prior

    def _transition_prior_with_order(
        self,
        context_tokens: list[str],
    ) -> tuple[Vector, int | None]:
        assert self.embedding_model is not None
        if self.transition_id_tables:
            for order in TRANSITION_ORDERS:
                if len(context_tokens) < order:
                    continue
                key_ids: list[int] = []
                for token in context_tokens[-order:]:
                    token_id = self.embedding_model.token_to_id.get(token)
                    if token_id is None:
                        key_ids = []
                        break
                    key_ids.append(token_id)
                if not key_ids:
                    continue
                transitions = self._transition_tensor_lookup(order, key_ids)
                if transitions is None:
                    transitions = self.transition_id_tables.get(order, {}).get(tuple(key_ids))
                if not transitions:
                    continue
                next_token_ids, probabilities = transitions
                prior = [0.0 for _ in self.embedding_model.id_to_token]
                for token_id, probability in zip(next_token_ids, probabilities):
                    token_index = int(token_id)
                    if 0 <= token_index < len(prior):
                        prior[token_index] = float(probability)
                return _normalize_vector(prior), order
        if not self.transition_tables:
            return [0.0 for _ in self.embedding_model.id_to_token], None
        for order in TRANSITION_ORDERS:
            if len(context_tokens) < order:
                continue
            key = tuple(context_tokens[-order:])
            transitions = self.transition_tables.get(order, {}).get(key)
            if not transitions:
                continue
            prior = [0.0 for _ in self.embedding_model.id_to_token]
            for token, probability in transitions.items():
                token_id = self.embedding_model.token_to_id.get(token)
                if token_id is not None:
                    prior[token_id] = probability
            return _normalize_vector(prior), order
        return [0.0 for _ in self.embedding_model.id_to_token], None
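    # Editor's illustrative sketch (hypothetical helper): transition lookup is a
    # stub backoff over TRANSITION_ORDERS (highest order first); the first table
    # that has seen the exact context suffix wins.
    @staticmethod
    def _sketch_backoff_lookup(
        tables: dict[int, dict[tuple[str, ...], dict[str, float]]],
        context: list[str],
    ) -> tuple[dict[str, float] | None, int | None]:
        for order in TRANSITION_ORDERS:
            if len(context) < order:
                continue
            transitions = tables.get(order, {}).get(tuple(context[-order:]))
            if transitions:
                return transitions, order
        return None, None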
    def _transition_prior_array_with_order(
        self,
        context_tokens: list[str],
    ) -> tuple[object, int | None]:
        assert np is not None
        assert self.embedding_model is not None
        prior = np.zeros(len(self.embedding_model.id_to_token), dtype=np.float64)
        if self.transition_id_tables:
            for order in TRANSITION_ORDERS:
                if len(context_tokens) < order:
                    continue
                key_ids: list[int] = []
                for token in context_tokens[-order:]:
                    token_id = self.embedding_model.token_to_id.get(token)
                    if token_id is None:
                        key_ids = []
                        break
                    key_ids.append(token_id)
                if not key_ids:
                    continue
                transitions = self._transition_tensor_lookup(order, key_ids)
                if transitions is None:
                    transitions = self.transition_id_tables.get(order, {}).get(tuple(key_ids))
                if not transitions:
                    continue
                next_token_ids, probabilities = transitions
                token_ids_array = np.asarray(next_token_ids, dtype=np.int64)
                probabilities_array = np.asarray(probabilities, dtype=np.float64)
                valid = (
                    (token_ids_array >= 0)
                    & (token_ids_array < len(self.embedding_model.id_to_token))
                    & (probabilities_array > 0.0)
                )
                if np.any(valid):
                    prior[token_ids_array[valid]] = probabilities_array[valid]
                total = float(prior.sum())
                if total > 0.0:
                    prior /= total
                return prior, order
            return prior, None
        if not self.transition_tables:
            return prior, None
        for order in TRANSITION_ORDERS:
            if len(context_tokens) < order:
                continue
            key = tuple(context_tokens[-order:])
            transitions = self.transition_tables.get(order, {}).get(key)
            if not transitions:
                continue
            for token, probability in transitions.items():
                token_id = self.embedding_model.token_to_id.get(token)
                if token_id is not None:
                    prior[token_id] = probability
            total = float(prior.sum())
            if total > 0.0:
                prior /= total
            return prior, order
        return prior, None

    def _copy_prior(self, context_tokens: list[str]) -> Vector:
        assert self.embedding_model is not None
        assert self.tokenizer is not None
        prior = [0.0 for _ in self.embedding_model.id_to_token]
        decay = 0.82
        answer_start = None
        for index in range(len(context_tokens) - 1, -1, -1):
            if context_tokens[index] == "":
                answer_start = index + 1
                break
        source_tokens = (
            context_tokens[: max(0, answer_start - 1)]
            if answer_start is not None
            else context_tokens
        )
        if not source_tokens:
            return prior
        for distance, token in enumerate(reversed(source_tokens)):
            if token in self.tokenizer.special_tokens:
                continue
            if not self._eligible_copy_token(token):
                continue
            token_id = self.embedding_model.token_to_id.get(token)
            if token_id is None:
                continue
            prior[token_id] += (decay**distance) * self._copy_token_distinctiveness(token)
        return _normalize_vector(prior)

    def _copy_prior_array(self, context_tokens: list[str]) -> object:
        assert np is not None
        assert self.embedding_model is not None
        assert self.tokenizer is not None
        prior = np.zeros(len(self.embedding_model.id_to_token), dtype=np.float64)
        decay = 0.82
        answer_start = None
        for index in range(len(context_tokens) - 1, -1, -1):
            if context_tokens[index] == "":
                answer_start = index + 1
                break
        source_tokens = (
            context_tokens[: max(0, answer_start - 1)]
            if answer_start is not None
            else context_tokens
        )
        for distance, token in enumerate(reversed(source_tokens)):
            if token in self.tokenizer.special_tokens:
                continue
            if not self._eligible_copy_token(token):
                continue
            token_id = self.embedding_model.token_to_id.get(token)
            if token_id is None:
                continue
            prior[token_id] += (decay**distance) * self._copy_token_distinctiveness(token)
        total = float(prior.sum())
        if total > 0.0:
            prior /= total
        return prior
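    # Editor's illustrative sketch (hypothetical helper): the copy prior is an
    # exponential recency kernel over prompt tokens -- weight = 0.82 ** distance,
    # optionally scaled by distinctiveness, then normalized to a distribution.
    @staticmethod
    def _sketch_copy_prior(token_ids: list[int], vocab_size: int, decay: float = 0.82) -> list[float]:
        prior = [0.0] * vocab_size
        for distance, token_id in enumerate(reversed(token_ids)):
            if 0 <= token_id < vocab_size:
                prior[token_id] += decay ** distance
        return _normalize_vector(prior)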
    def _copy_token_distinctiveness(self, token: str) -> float:
        rendered = self._render_token(token).strip()
        if not rendered:
            return 0.0
        letters = sum(character.isalpha() for character in rendered)
        digits = sum(character.isdigit() for character in rendered)
        symbols = sum(
            not character.isalnum() and not character.isspace() for character in rendered
        )
        score = 1.0
        if any(character.isupper() for character in rendered) and letters:
            score += 0.8
        if digits:
            score += 0.9
        if symbols:
            score += 0.5
        if len(rendered) >= 4:
            score += 0.2
        return score

    def _prompt_copy_evidence_is_distinctive(self, context_tokens: list[str]) -> bool:
        answer_start = None
        for index in range(len(context_tokens) - 1, -1, -1):
            if context_tokens[index] == "":
                answer_start = index
                break
        prompt_tokens = context_tokens[:answer_start] if answer_start is not None else context_tokens
        for token in prompt_tokens:
            if self.tokenizer is not None and token in self.tokenizer.special_tokens:
                continue
            rendered = self._render_token(token).strip()
            if any(character.isdigit() for character in rendered):
                return True
            if sum(character.isupper() for character in rendered) >= 2:
                return True
        return False

    def _source_evidence_prior(
        self,
        context_tokens: list[str],
        generated_tokens: list[str] | None = None,
    ) -> Vector:
        assert self.embedding_model is not None
        prior = [0.0 for _ in self.embedding_model.id_to_token]
        for token_id, weight in self._source_evidence_token_weights(
            context_tokens,
            generated_tokens or [],
        ).items():
            if 0 <= token_id < len(prior):
                prior[token_id] += weight
        return _normalize_vector(prior)

    def _source_evidence_prior_array(
        self,
        context_tokens: list[str],
        generated_tokens: list[str] | None = None,
    ) -> object:
        assert np is not None
        assert self.embedding_model is not None
        prior = np.zeros(len(self.embedding_model.id_to_token), dtype=np.float64)
        for token_id, weight in self._source_evidence_token_weights(
            context_tokens,
            generated_tokens or [],
        ).items():
            if 0 <= token_id < prior.size:
                prior[token_id] += weight
        total = float(prior.sum())
        if total > 0.0:
            prior /= total
        return prior
    def _source_evidence_token_weights(
        self,
        context_tokens: list[str],
        generated_tokens: list[str],
    ) -> dict[int, float]:
        if self.embedding_model is None or self.tokenizer is None:
            return {}
        segments = self._source_evidence_segments(context_tokens)
        if not segments:
            return {}
        generated_ids = [
            self.embedding_model.token_to_id[token]
            for token in generated_tokens
            if token in self.embedding_model.token_to_id
        ]
        first_source_index = _first_index(context_tokens, "")
        query_tokens = (
            context_tokens[:first_source_index]
            if first_source_index is not None
            else context_tokens
        )
        query_token_ids = {
            self.embedding_model.token_to_id[token]
            for token in query_tokens
            if token in self.embedding_model.token_to_id
            and token not in self.tokenizer.special_tokens
            and self._eligible_copy_token(token)
        }
        weights: dict[int, float] = {}

        def add_token(token: str, weight: float, *, allow_piece: bool = False) -> None:
            if token in self.tokenizer.special_tokens:
                return
            if not allow_piece and not self._allowed_generation_token(token, generated_tokens):
                return
            if allow_piece:
                rendered = self._render_token(token)
                if not rendered or not rendered.strip():
                    return
            elif not self._eligible_copy_token(token):
                return
            token_id = self.embedding_model.token_to_id.get(token)
            if token_id is None:
                return
            weights[token_id] = weights.get(token_id, 0.0) + weight

        for segment_tokens, segment_weight, segment_role in segments[-6:]:
            if generated_ids and segment_role != "snippet":
                continue
            token_ids = [
                self.embedding_model.token_to_id[token]
                for token in segment_tokens
                if token in self.embedding_model.token_to_id
            ]
            aligned = False
            if generated_ids and token_ids:
                max_suffix = min(8, len(generated_ids), len(token_ids))
                for suffix_length in range(max_suffix, 0, -1):
                    suffix = generated_ids[-suffix_length:]
                    for index in range(len(token_ids) - suffix_length):
                        if token_ids[index : index + suffix_length] != suffix:
                            continue
                        next_token_id = token_ids[index + suffix_length]
                        next_token = self.embedding_model.id_to_token[next_token_id]
                        add_token(
                            next_token,
                            segment_weight * (3.0 + suffix_length),
                            allow_piece=True,
                        )
                        aligned = True
                    if aligned:
                        break
            if aligned:
                continue
            content_rank = 0
            anchor_seen = False
            segment_has_query_anchor = any(token_id in query_token_ids for token_id in token_ids)
            for token in segment_tokens:
                rendered = self._render_token(token)
                if "://" in rendered or rendered.casefold().startswith("http"):
                    continue
                if not self._eligible_copy_token(token):
                    continue
                token_id = self.embedding_model.token_to_id.get(token)
                if token_id is None:
                    continue
                if segment_has_query_anchor:
                    in_query = token_id in query_token_ids
                    if in_query:
                        weight = segment_weight * 0.42
                        anchor_seen = True
                    elif anchor_seen:
                        weight = segment_weight * 2.10
                    else:
                        weight = segment_weight * 0.32
                elif content_rank == 0:
                    weight = segment_weight * 4.0
                elif content_rank == 1:
                    weight = segment_weight * 1.35
                else:
                    weight = segment_weight * 0.65
                weight *= 0.94 ** min(content_rank, 24)
                add_token(token, weight)
                content_rank += 1
        return weights

    def _source_evidence_segments(self, context_tokens: list[str]) -> list[tuple[list[str], float, str]]:
        if self.tokenizer is None:
            return []
        answer_boundary = _last_index(context_tokens, "")
        upper_bound = answer_boundary if answer_boundary is not None else len(context_tokens)
        boundary_tokens = {"", "", "", "", ""}
        segments: list[tuple[list[str], float, str]] = []
        index = 0
        while index < upper_bound:
            if context_tokens[index] != "":
                index += 1
                continue
            start = index + 1
            end = start
            while (
                end < upper_bound
                and context_tokens[end] not in boundary_tokens
                and self._render_token(context_tokens[end]) != "\n"
            ):
                end += 1
            source_tokens = context_tokens[start:end]
            pipe_positions = [
                position
                for position, token in enumerate(source_tokens)
                if self._render_token(token).strip() == "|"
            ]
            if pipe_positions:
                snippet_tokens = source_tokens[pipe_positions[-1] + 1 :]
                if snippet_tokens:
                    segments.append((snippet_tokens, 1.0, "snippet"))
            elif source_tokens:
                segments.append((source_tokens, 0.90, "snippet"))
            index = end + 1
        return segments
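    # Editor's illustrative sketch (hypothetical helper): evidence alignment slides
    # the longest suffix of the generated ids over a snippet and, on a hit, returns
    # the snippet token that follows; longer suffixes earn larger weights above.
    @staticmethod
    def _sketch_align_suffix(generated_ids: list[int], snippet_ids: list[int], max_suffix: int = 8) -> int | None:
        longest = min(max_suffix, len(generated_ids), len(snippet_ids))
        for suffix_length in range(longest, 0, -1):
            suffix = generated_ids[-suffix_length:]
            for index in range(len(snippet_ids) - suffix_length):
                if snippet_ids[index : index + suffix_length] == suffix:
                    return snippet_ids[index + suffix_length]
        return None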
    def _source_evidence_is_complete(
        self,
        context_tokens: list[str],
        generated_tokens: list[str],
    ) -> bool:
        if (
            self.embedding_model is None
            or self.tokenizer is None
            or self._generated_word_count(generated_tokens) < 5
        ):
            return False
        generated_ids = [
            self.embedding_model.token_to_id[token]
            for token in generated_tokens
            if token in self.embedding_model.token_to_id
        ]
        if not generated_ids:
            return False
        for segment_tokens, _, segment_role in self._source_evidence_segments(context_tokens):
            if segment_role != "snippet":
                continue
            segment_ids = [
                self.embedding_model.token_to_id[token]
                for token in segment_tokens
                if token in self.embedding_model.token_to_id
            ]
            if len(generated_ids) > len(segment_ids):
                continue
            max_suffix = min(12, len(generated_ids), len(segment_ids))
            for suffix_length in range(max_suffix, 4, -1):
                suffix_ids = generated_ids[-suffix_length:]
                for start in range(len(segment_ids) - suffix_length + 1):
                    if segment_ids[start : start + suffix_length] != suffix_ids:
                        continue
                    next_index = start + suffix_length
                    if next_index >= len(segment_ids):
                        return True
                    next_token = self.embedding_model.id_to_token[segment_ids[next_index]]
                    if self._source_punctuation_continues_numeric_span(
                        segment_ids,
                        next_index,
                    ):
                        return False
                    if self._is_terminal_punctuation_text(self._render_token(next_token)):
                        return True
        return False

    def _source_evidence_has_continuation(
        self,
        context_tokens: list[str],
        generated_tokens: list[str],
    ) -> bool:
        if self.embedding_model is None or not generated_tokens:
            return False
        generated_ids = [
            self.embedding_model.token_to_id[token]
            for token in generated_tokens
            if token in self.embedding_model.token_to_id
        ]
        if not generated_ids:
            return False
        for segment_tokens, _, segment_role in self._source_evidence_segments(context_tokens):
            if segment_role != "snippet":
                continue
            segment_ids = [
                self.embedding_model.token_to_id[token]
                for token in segment_tokens
                if token in self.embedding_model.token_to_id
            ]
            max_suffix = min(12, len(generated_ids), len(segment_ids))
            for suffix_length in range(max_suffix, 0, -1):
                suffix_ids = generated_ids[-suffix_length:]
                for start in range(len(segment_ids) - suffix_length + 1):
                    if segment_ids[start : start + suffix_length] != suffix_ids:
                        continue
                    next_index = start + suffix_length
                    if next_index >= len(segment_ids):
                        return False
                    if self._source_punctuation_continues_numeric_span(
                        segment_ids,
                        next_index,
                    ) or self._source_punctuation_continues_numeric_span(
                        segment_ids,
                        next_index - 1,
                    ):
                        return True
                    next_token = self.embedding_model.id_to_token[segment_ids[next_index]]
                    return not self._is_terminal_punctuation_text(
                        self._render_token(next_token)
                    )
        return False

    def _source_evidence_next_token(
        self,
        context_tokens: list[str],
        generated_tokens: list[str],
    ) -> str | None:
        if self.embedding_model is None:
            return None
        for segment_tokens, _, segment_role in self._source_evidence_segments(context_tokens):
            if segment_role != "snippet" or not segment_tokens:
                continue
            if not generated_tokens:
                return segment_tokens[0]
            segment_ids = [
                self.embedding_model.token_to_id[token]
                for token in segment_tokens
                if token in self.embedding_model.token_to_id
            ]
            generated_ids = [
                self.embedding_model.token_to_id[token]
                for token in generated_tokens
                if token in self.embedding_model.token_to_id
            ]
            if not segment_ids or not generated_ids:
                continue
            max_suffix = min(12, len(generated_ids), len(segment_ids))
            for suffix_length in range(max_suffix, 0, -1):
                suffix_ids = generated_ids[-suffix_length:]
                for start in range(len(segment_ids) - suffix_length + 1):
                    if segment_ids[start : start + suffix_length] != suffix_ids:
                        continue
                    next_index = start + suffix_length
                    if next_index < len(segment_ids):
                        return self.embedding_model.id_to_token[segment_ids[next_index]]
        return None

    def _source_punctuation_continues_numeric_span(
        self,
        segment_ids: list[int],
        punctuation_index: int,
    ) -> bool:
        if self.embedding_model is None:
            return False
        if punctuation_index <= 0 or punctuation_index + 1 >= len(segment_ids):
            return False
        punctuation_text = self._render_token(
            self.embedding_model.id_to_token[segment_ids[punctuation_index]]
        ).strip()
        if not self._is_structural_punctuation_text(punctuation_text):
            return False
        previous_text = self._render_token(
            self.embedding_model.id_to_token[segment_ids[punctuation_index - 1]]
        )
        next_text = self._render_token(
            self.embedding_model.id_to_token[segment_ids[punctuation_index + 1]]
        )
        return any(character.isdigit() for character in previous_text) and any(
            character.isdigit() for character in next_text
        )
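    # Minimal sketch (illustrative; not part of the original API): the core
    # suffix-alignment step shared by the _source_evidence_* helpers above,
    # extracted on its own. Longest suffixes are tried first, so the most
    # specific match in the snippet wins.
    @staticmethod
    def _sketch_longest_suffix_continuation(
        segment_ids: list[int],
        generated_ids: list[int],
        max_suffix: int = 12,
    ) -> int | None:
        """Return the id following the longest suffix of generated_ids found in segment_ids."""
        limit = min(max_suffix, len(generated_ids), len(segment_ids))
        for suffix_length in range(limit, 0, -1):
            suffix = generated_ids[-suffix_length:]
            for start in range(len(segment_ids) - suffix_length + 1):
                if segment_ids[start : start + suffix_length] == suffix:
                    next_index = start + suffix_length
                    # None means the match ended exactly at the snippet's end.
                    return segment_ids[next_index] if next_index < len(segment_ids) else None
        return None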
    def _preference_prior(self) -> Vector:
        assert self.embedding_model is not None
        if not self.preference_bias or not any(value != 0.0 for value in self.preference_bias):
            return [0.0 for _ in self.embedding_model.id_to_token]
        eligible_indices = [
            index
            for index, token in enumerate(self.embedding_model.id_to_token)
            if self.preference_bias[index] > 0.0 and self._eligible_preference_token(token)
        ]
        if not eligible_indices:
            return [0.0 for _ in self.embedding_model.id_to_token]
        eligible_probabilities = self._calibrated_softmax(
            [self.preference_bias[index] for index in eligible_indices]
        )
        prior = [0.0 for _ in self.embedding_model.id_to_token]
        for index, probability in zip(eligible_indices, eligible_probabilities):
            prior[index] = probability
        return prior

    def _preference_prior_array(self) -> object:
        assert np is not None
        assert self.embedding_model is not None
        if self.preference_bias_array is None or not np.any(self.preference_bias_array != 0.0):
            return np.zeros(len(self.embedding_model.id_to_token), dtype=np.float64)
        if self.preference_valid_mask_array is None or not np.any(self.preference_valid_mask_array):
            return np.zeros(len(self.embedding_model.id_to_token), dtype=np.float64)
        positive_mask = self.preference_bias_array > 0.0
        active_mask = self.preference_valid_mask_array & positive_mask
        if not np.any(active_mask):
            return np.zeros(len(self.embedding_model.id_to_token), dtype=np.float64)
        prior = np.zeros(len(self.embedding_model.id_to_token), dtype=np.float64)
        prior[active_mask] = self._calibrated_softmax_array(
            self.preference_bias_array[active_mask]
        )
        return prior

    def _eligible_preference_token(self, token: str) -> bool:
        assert self.tokenizer is not None
        if token == self.tokenizer.unk_token or token in self.tokenizer.special_tokens:
            return False
        if not self._starts_new_word(token):
            return False
        rendered = self._render_token(token)
        if not rendered.strip() or self._is_punctuation_piece(rendered):
            return False
        alphanumeric = "".join(character for character in rendered if character.isalnum())
        return len(alphanumeric) >= 1

    def _build_transition_tables(
        self,
        tokens: list[str],
    ) -> dict[int, dict[tuple[str, ...], dict[str, float]]]:
        counts: dict[int, dict[tuple[str, ...], dict[str, int]]] = {
            order: {} for order in sorted(TRANSITION_ORDERS)
        }
        for order in sorted(TRANSITION_ORDERS):
            for index in range(order - 1, len(tokens) - 1):
                key = tuple(tokens[index - order + 1 : index + 1])
                nxt = tokens[index + 1]
                bucket = counts[order].setdefault(key, {})
                bucket[nxt] = bucket.get(nxt, 0) + 1
        probabilities: dict[int, dict[tuple[str, ...], dict[str, float]]] = {
            order: {} for order in sorted(TRANSITION_ORDERS)
        }
        for order, mapping in counts.items():
            items = list(mapping.items())
            items.sort(key=lambda item: (-sum(item[1].values()), item[0]))
            if (
                self.config.max_transition_contexts_per_order is not None
                and self.config.max_transition_contexts_per_order >= 0
            ):
                items = items[: self.config.max_transition_contexts_per_order]
            for key, bucket in items:
                next_items = sorted(bucket.items(), key=lambda item: (-item[1], item[0]))
                if self.config.max_transition_next_tokens > 0:
                    next_items = next_items[: self.config.max_transition_next_tokens]
                total = sum(value for _, value in next_items)
                if total <= 0:
                    continue
                probabilities[order][key] = {
                    token: value / total for token, value in next_items
                }
        return probabilities

    def _transition_table_tensors(self) -> dict[str, object]:
        assert self.embedding_model is not None
        if self.transition_tensor_cache is not None:
            return {
                "transition_orders": self.transition_tensor_cache["orders"],
                "transition_key_offsets": self.transition_tensor_cache["key_offsets"],
                "transition_key_token_ids": self.transition_tensor_cache["key_token_ids"],
                "transition_next_offsets": self.transition_tensor_cache["next_offsets"],
                "transition_next_token_ids": self.transition_tensor_cache["next_token_ids"],
                "transition_next_probabilities": self.transition_tensor_cache["next_probabilities"],
            }
        if not self.transition_tables:
            return {
                "transition_orders": [],
                "transition_key_offsets": [0],
                "transition_key_token_ids": [],
                "transition_next_offsets": [0],
                "transition_next_token_ids": [],
                "transition_next_probabilities": [],
            }
        token_to_id = self.embedding_model.token_to_id
        orders: list[int] = []
        key_offsets: list[int] = [0]
        key_token_ids: list[int] = []
        next_offsets: list[int] = [0]
        next_token_ids: list[int] = []
        next_probabilities: list[float] = []
        for order in sorted(self.transition_tables):
            mapping = self.transition_tables.get(order, {})
            for key, transitions in mapping.items():
                key_ids = [token_to_id.get(token, -1) for token in key]
                if len(key_ids) != order or any(token_id < 0 for token_id in key_ids):
                    continue
                next_items = [
                    (token_to_id[token], float(probability))
                    for token, probability in transitions.items()
                    if token in token_to_id and probability > 0.0
                ]
                if not next_items:
                    continue
                orders.append(order)
                key_token_ids.extend(key_ids)
                key_offsets.append(len(key_token_ids))
                for token_id, probability in next_items:
                    next_token_ids.append(token_id)
                    next_probabilities.append(probability)
                next_offsets.append(len(next_token_ids))
        return {
            "transition_orders": orders,
            "transition_key_offsets": key_offsets,
            "transition_key_token_ids": key_token_ids,
            "transition_next_offsets": next_offsets,
            "transition_next_token_ids": next_token_ids,
            "transition_next_probabilities": next_probabilities,
        }

    def _deserialize_transition_id_tables_from_tensors(
        self,
        tensors: dict[str, object],
    ) -> dict[int, dict[tuple[int, ...], tuple[object, object]]] | None:
        required = (
            "transition_orders",
            "transition_key_offsets",
            "transition_key_token_ids",
            "transition_next_offsets",
            "transition_next_token_ids",
            "transition_next_probabilities",
        )
        if any(name not in tensors for name in required):
            return None

        def _as_sequence(name: str) -> object:
            value = tensors.get(name, [])
            return value if hasattr(value, "shape") else list(value)

        orders = _as_sequence("transition_orders")
        key_offsets = _as_sequence("transition_key_offsets")
        key_token_ids = _as_sequence("transition_key_token_ids")
        next_offsets = _as_sequence("transition_next_offsets")
        next_token_ids = _as_sequence("transition_next_token_ids")
        next_probabilities = _as_sequence("transition_next_probabilities")
        row_count = len(orders)
        if row_count == 0:
            return {order: {} for order in sorted(TRANSITION_ORDERS)}
        if len(key_offsets) != row_count + 1 or len(next_offsets) != row_count + 1:
            return None
        if np is not None and hasattr(orders, "shape"):
            self.transition_tensor_cache = {
                "orders": orders,
                "key_offsets": key_offsets,
                "key_token_ids": key_token_ids,
                "next_offsets": next_offsets,
                "next_token_ids": next_token_ids,
                "next_probabilities": next_probabilities,
                "order_spans": {},
            }
            self.transition_built_orders = set()
            return {order: {} for order in sorted(TRANSITION_ORDERS)}
        tables: dict[int, dict[tuple[int, ...], tuple[object, object]]] = {
            order: {} for order in sorted(TRANSITION_ORDERS)
        }
        for index in range(row_count):
            order = int(orders[index])
            key_start = int(key_offsets[index])
            key_end = int(key_offsets[index + 1])
            next_start = int(next_offsets[index])
            next_end = int(next_offsets[index + 1])
            key = tuple(int(token_id) for token_id in key_token_ids[key_start:key_end])
            if len(key) != order or next_end <= next_start:
                continue
            tables.setdefault(order, {})[key] = (
                next_token_ids[next_start:next_end],
                next_probabilities[next_start:next_end],
            )
        return tables
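    # Layout example (illustrative): two order-2 contexts stored in the flat
    # tensors above. With keys ("a", "b") -> {"c": 0.75, "d": 0.25} and
    # ("b", "c") -> {"a": 1.0}:
    #   transition_orders             = [2, 2]
    #   transition_key_offsets        = [0, 2, 4]
    #   transition_key_token_ids      = [id_a, id_b, id_b, id_c]
    #   transition_next_offsets       = [0, 2, 3]
    #   transition_next_token_ids     = [id_c, id_d, id_a]
    #   transition_next_probabilities = [0.75, 0.25, 1.0]
    # Row i's key spans key_token_ids[key_offsets[i]:key_offsets[i + 1]] and
    # its continuations span the matching next_* slices, CSR style.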
    def _serialize_transition_tables(self) -> dict[str, dict[str, dict[str, float]]]:
        assert self.transition_tables is not None
        return {
            str(order): {
                _encode_ngram_key(key): value for key, value in mapping.items()
            }
            for order, mapping in self.transition_tables.items()
        }

    def _deserialize_transition_tables(
        self,
        payload: dict[str, dict[str, dict[str, float]]],
    ) -> dict[int, dict[tuple[str, ...], dict[str, float]]]:
        tables: dict[int, dict[tuple[str, ...], dict[str, float]]] = {
            order: {} for order in sorted(TRANSITION_ORDERS)
        }
        for order_text, mapping in payload.items():
            order = int(order_text)
            tables[order] = {
                _decode_ngram_key(key): {
                    str(token): float(probability)
                    for token, probability in value.items()
                }
                for key, value in mapping.items()
            }
        return tables

    def _transition_tensor_order_span(self, order: int) -> tuple[int, int] | None:
        if np is None or self.transition_tensor_cache is None:
            return None
        spans = self.transition_tensor_cache.get("order_spans")
        if isinstance(spans, dict) and order in spans:
            return spans[order]
        orders = self.transition_tensor_cache["orders"]
        positions = np.flatnonzero(orders == order)
        span = (
            (int(positions[0]), int(positions[-1]) + 1) if positions.size else None
        )
        if isinstance(spans, dict):
            spans[order] = span
        return span

    def _transition_tensor_lookup(
        self,
        order: int,
        key_ids: list[int],
    ) -> tuple[object, object] | None:
        if (
            np is None
            or self.transition_tensor_cache is None
            or len(key_ids) != order
        ):
            return None
        span = self._transition_tensor_order_span(order)
        if span is None:
            return None
        row_start, row_end = span
        key_offsets = self.transition_tensor_cache["key_offsets"]
        key_token_ids = self.transition_tensor_cache["key_token_ids"]
        next_offsets = self.transition_tensor_cache["next_offsets"]
        next_token_ids = self.transition_tensor_cache["next_token_ids"]
        next_probabilities = self.transition_tensor_cache["next_probabilities"]
        key_start = int(key_offsets[row_start])
        key_end = int(key_offsets[row_end])
        key_block = np.asarray(key_token_ids[key_start:key_end], dtype=np.int64)
        row_count = row_end - row_start
        if row_count <= 0 or key_block.size != row_count * order:
            return None
        keys = key_block.reshape(row_count, order)
        query = np.asarray(key_ids, dtype=np.int64)
        matches = np.flatnonzero(np.all(keys == query[None, :], axis=1))
        if not matches.size:
            return None
        row = row_start + int(matches[0])
        next_start = int(next_offsets[row])
        next_end = int(next_offsets[row + 1])
        if next_end <= next_start:
            return None
        return (
            next_token_ids[next_start:next_end],
            next_probabilities[next_start:next_end],
        )

    def _eligible_copy_token(self, token: str) -> bool:
        rendered = self._render_token(token)
        if not rendered.strip():
            return False
        if self._is_punctuation_piece(rendered):
            return False
        if not self._starts_new_word(token):
            return False
        alphanumeric = "".join(character for character in rendered if character.isalnum())
        return len(alphanumeric) >= 2

    def _allowed_generation_token(
        self,
        token: str,
        generated_tokens: list[str],
        context_tokens: list[str] | None = None,
    ) -> bool:
        return self._allowed_generation_token_with_meta(
            token,
            self._generation_token_meta(token),
            generated_tokens,
            context_tokens,
        )

    def _allowed_generation_token_with_meta(
        self,
        token: str,
        meta: GenerationTokenMeta,
        generated_tokens: list[str],
        context_tokens: list[str] | None = None,
    ) -> bool:
        assert self.embedding_model is not None
        assert self.tokenizer is not None
        if token == self.tokenizer.unk_token:
            return False
        if token in self.tokenizer.special_tokens:
            return self._allowed_tool_protocol_token(
                token,
                generated_tokens=generated_tokens,
                context_tokens=context_tokens or [],
            )
        if len(self.embedding_model.id_to_token) < 1024:
            return True
        if meta.rendered == "\n":
            return bool(generated_tokens)
        if not meta.stripped:
            return False
        if meta.word_joiner:
            return (
                self._can_attach_word_joiner(generated_tokens)
                or self._can_start_line_with_word_joiner(token, generated_tokens)
            )
        if meta.structural_punctuation:
            return bool(generated_tokens) or self._can_start_answer_with_structural_punctuation(token)
        if meta.structural_symbol:
            return bool(generated_tokens) or meta.starts_new_word
        if not meta.starts_new_word:
            if not generated_tokens:
                return False
            previous_rendered = self._render_token(generated_tokens[-1])
            return (
                bool(previous_rendered)
                and any(character.isalnum() for character in previous_rendered)
                and bool(meta.alphanumeric)
            )
        return len(meta.alphanumeric) >= 1 or not meta.punctuation_piece

    @staticmethod
    def _allowed_tool_protocol_token(
        token: str,
        *,
        generated_tokens: list[str],
        context_tokens: list[str],
    ) -> bool:
        if token not in TOOL_PROTOCOL_TOKENS:
            return False
        # NOTE: several special-token literals in this method were elided in
        # the source. "<tool_call>", "<tool_result>", and "<final>" in the
        # first branch are assumed reconstructions inferred from nearby
        # identifiers; the literals in the two branches below could not be
        # recovered and are left empty as in the source.
        if token == "<tool_call>":
            return (
                ReframrModel._context_requests_tool_call(context_tokens)
                and "<tool_call>" not in generated_tokens
                and "<tool_result>" not in generated_tokens
                and "<final>" not in generated_tokens
            )
        if token in {"", ""}:
            return False
        if token == "":
            return (
                "" in context_tokens
                or "" in context_tokens
                or "" in context_tokens
            )
        return True

    @staticmethod
    def _context_requests_tool_call(context_tokens: list[str]) -> bool:
        rendered_terms: list[str] = []
        for token in context_tokens:
            if token in TOOL_PROTOCOL_TOKENS or token.startswith("<"):
                continue
            normalized = token.replace("▁", " ").strip().casefold()
            if not normalized:
                continue
            rendered_terms.append(normalized)
            pieces = {
                "".join(
                    character
                    for character in piece
                    if character.isalnum() or character in {"-", "."}
                )
                for piece in normalized.split()
            }
            if pieces & TOOL_CALL_CONTEXT_TERMS:
                return True
        joined = " ".join(rendered_terms)
        compact = "".join(rendered_terms)
        return any(
            term in joined or term.replace("-", "") in compact
            for term in TOOL_CALL_CONTEXT_TERMS
        )

    def _would_repeat_recent_pattern(
        self,
        candidate: str,
        generated_tokens: list[str],
        recent_rendered_words: list[str] | None = None,
    ) -> bool:
        if len(generated_tokens) >= 2 and generated_tokens[-1] == candidate and generated_tokens[-2] == candidate:
            return True
        if len(generated_tokens) >= 2:
            trigram = tuple(generated_tokens[-2:] + [candidate])
            recent_tokens = generated_tokens[-12:]
            for index in range(max(0, len(recent_tokens) - 4)):
                if tuple(recent_tokens[index : index + 3]) == trigram:
                    return True
        rendered_words = recent_rendered_words
        if rendered_words is None:
            rendered_words = self._recent_rendered_words(generated_tokens)
        candidate_meta = self._generation_token_meta(candidate)
        candidate_word = candidate_meta.rendered.casefold()
        if (
            rendered_words
            and candidate_meta.starts_new_word
            and any(character.isalnum() for character in candidate_word)
        ):
            candidate_bigram = (rendered_words[-1], candidate_word)
            recent_window = rendered_words[-10:]
            recent_bigrams = {
                (recent_window[index], recent_window[index + 1])
                for index in range(len(recent_window) - 1)
            }
            if candidate_bigram in recent_bigrams:
                return True
            if (
                len(candidate_word) > 2
                and rendered_words[-10:].count(candidate_word) >= 2
                and not candidate_meta.common_connector
            ):
                return True
        return False

    @staticmethod
    def _is_inside_tool_protocol_continuation(generated_tokens: list[str]) -> bool:
        return any(token in TOOL_PROTOCOL_TOKENS for token in generated_tokens[-6:])
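    # Worked example (schematic token strings for illustration): with
    # generated_tokens ending ..., "the", "cat", "sat", "the", "cat" and
    # candidate "sat", the trigram ("the", "cat", "sat") already occurs in
    # the last 12 tokens, so _would_repeat_recent_pattern returns True and
    # the candidate is dropped before scoring.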
    def _would_repeat_recent_phrase(
        self,
        candidate: str,
        generated_tokens: list[str],
        *,
        recent_rendered_words: list[str] | None = None,
    ) -> bool:
        if not self._starts_new_word(candidate):
            return False
        rendered_words = list(
            recent_rendered_words
            if recent_rendered_words is not None
            else self._recent_rendered_words(generated_tokens)
        )
        candidate_word = self._render_token(candidate).casefold()
        if not any(character.isalnum() for character in candidate_word):
            return False
        rendered_words.append(candidate_word)
        recent_window = rendered_words[-48:]
        for span in range(4, min(8, len(recent_window)) + 1):
            suffix = tuple(recent_window[-span:])
            earlier = recent_window[:-span]
            for index in range(len(earlier) - span + 1):
                if tuple(earlier[index : index + span]) == suffix:
                    return True
        return False

    def _recent_phrase_repeat_candidate_words(
        self,
        recent_rendered_words: list[str],
    ) -> set[str]:
        repeat_candidates: set[str] = set()
        base_window = recent_rendered_words[-47:]
        max_span = min(8, len(base_window) + 1)
        if max_span < 4:
            return repeat_candidates
        for span in range(4, max_span + 1):
            prefix_length = span - 1
            suffix_prefix = tuple(base_window[-prefix_length:])
            earlier_length = len(base_window) - prefix_length
            if earlier_length < span:
                continue
            for index in range(earlier_length - span + 1):
                earlier_segment = base_window[index : index + span]
                if tuple(earlier_segment[:-1]) == suffix_prefix:
                    candidate_word = earlier_segment[-1]
                    if any(character.isalnum() for character in candidate_word):
                        repeat_candidates.add(candidate_word)
        return repeat_candidates

    def _recent_rendered_words(self, generated_tokens: list[str]) -> list[str]:
        rendered_words: list[str] = []
        for token in generated_tokens:
            if not self._starts_new_word(token):
                continue
            rendered = self._render_token(token).casefold()
            if any(character.isalnum() for character in rendered):
                rendered_words.append(rendered)
        return rendered_words

    def _select_generation_token(
        self,
        distribution: dict[str, float],
        *,
        context_tokens: list[str] | None = None,
        generated_tokens: list[str] | None = None,
        temperature: float = DEFAULT_GENERATION_TEMPERATURE,
        top_k: int = DEFAULT_GENERATION_TOP_K,
        top_p: float = DEFAULT_GENERATION_TOP_P,
        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
        preserve_dominant_candidates: bool = False,
        avoid_token_sequences: Sequence[Sequence[str]] | None = None,
    ) -> str:
        assert self.tokenizer is not None
        generated_tokens = generated_tokens or []
        candidates = self._prepare_generation_candidates(
            distribution,
            context_tokens=context_tokens or [],
            generated_tokens=generated_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            preserve_dominant_candidates=preserve_dominant_candidates,
            avoid_token_sequences=avoid_token_sequences,
        )
        if candidates:
            return self._sample_generation_candidate(
                candidates,
                context_tokens=context_tokens or [],
                generated_tokens=generated_tokens,
                stochastic=temperature > 0.0,
                preserve_dominant_candidates=preserve_dominant_candidates,
            )
        for token, _ in sorted(distribution.items(), key=lambda item: item[1], reverse=True):
            if token in self.tokenizer.special_tokens and token not in TOOL_PROTOCOL_TOKENS:
                continue
            if token == self.tokenizer.unk_token:
                continue
            if not self._allowed_generation_token(token, generated_tokens, context_tokens or []):
                continue
            if self._would_complete_blacklisted_answer(generated_tokens, token):
                continue
            return token
        return ""
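    # Worked example (illustrative) for _would_repeat_recent_phrase above: the
    # candidate word is appended and the last 48 rendered words are scanned
    # for any 4-to-8 word suffix that already occurred earlier in the window.
    # Finishing "... as a result of the storm ... as a result of the" with
    # "storm" recreates the earlier 6-word phrase, so the candidate is
    # rejected as a phrase loop.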
    def _select_generation_token_from_array(
        self,
        probabilities: object,
        *,
        context_tokens: list[str],
        generated_tokens: list[str],
        temperature: float = DEFAULT_GENERATION_TEMPERATURE,
        top_k: int = DEFAULT_GENERATION_TOP_K,
        top_p: float = DEFAULT_GENERATION_TOP_P,
        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
        preserve_dominant_candidates: bool = False,
        avoid_token_sequences: Sequence[Sequence[str]] | None = None,
    ) -> str:
        assert np is not None
        assert self.tokenizer is not None
        assert self.embedding_model is not None
        values = np.asarray(probabilities, dtype=np.float64)
        if values.size == 0:
            return ""
        first_pool_size = min(values.size, max(top_k, 64))
        if first_pool_size <= 0:
            first_pool_size = min(values.size, 64)
        expanded_pool_size = min(values.size, max(top_k * 4, 64))
        pool_sizes: list[int] = []
        for pool_size in (first_pool_size, expanded_pool_size, values.size):
            if pool_size > 0 and pool_size not in pool_sizes:
                pool_sizes.append(pool_size)
        for pool_size in pool_sizes:
            if pool_size < values.size:
                candidate_indices = np.argpartition(values, -pool_size)[-pool_size:]
                candidate_indices = candidate_indices[np.argsort(values[candidate_indices])[::-1]]
            else:
                candidate_indices = np.argsort(values)[::-1]
            distribution: dict[str, float] = {}
            for raw_index in candidate_indices:
                index = int(raw_index)
                score = float(values[index])
                if score <= 0.0:
                    continue
                token = self.embedding_model.id_to_token[index]
                if token == self.tokenizer.unk_token or (
                    token in self.tokenizer.special_tokens
                    and token not in TOOL_PROTOCOL_TOKENS
                ):
                    continue
                distribution[token] = score
            selected = self._select_generation_token(
                distribution,
                context_tokens=context_tokens,
                generated_tokens=generated_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                preserve_dominant_candidates=preserve_dominant_candidates,
                avoid_token_sequences=avoid_token_sequences,
            )
            if selected:
                return selected
        return ""

    def _prepare_generation_candidates(
        self,
        distribution: dict[str, float],
        *,
        context_tokens: list[str] | None = None,
        generated_tokens: list[str],
        temperature: float,
        top_k: int,
        top_p: float,
        repetition_penalty: float,
        preserve_dominant_candidates: bool = False,
        avoid_token_sequences: Sequence[Sequence[str]] | None = None,
    ) -> list[tuple[str, float]]:
        assert self.tokenizer is not None
        assert self.embedding_model is not None
        context_tokens = context_tokens or []
        generated_word_count = self._generated_word_count(generated_tokens)
        clause_words = self._words_since_clause_break(generated_tokens)
        recent_rendered_words = self._recent_rendered_words(generated_tokens)
        generated_token_ids = self._token_ids_for_generated_tokens(generated_tokens)
        inside_tool_protocol = self._is_inside_tool_protocol_continuation(generated_tokens)
        phrase_repeat_candidate_words = (
            self._recent_phrase_repeat_candidate_words(recent_rendered_words)
            if generated_word_count >= MIN_COMPLETE_ANSWER_WORDS and not inside_tool_protocol
            else set()
        )
        prompt_content_tokens = [
            token
            for token in context_tokens
            if token not in self.tokenizer.special_tokens
            and self._generation_token_meta(token).starts_new_word
            and self._generation_token_meta(token).alphanumeric
            and not self._generation_token_meta(token).punctuation_piece
        ]
        initial_prompt_content_token = (
            prompt_content_tokens[0] if len(prompt_content_tokens) > 1 else None
        )
        best_probability = max(distribution.values(), default=0.0)
        has_uppercase_start_candidate = any(
            probability > 0.0
            and self._generation_token_meta(token).starts_new_word
            and self._generation_token_meta(token).rendered[:1].isupper()
            for token, probability in distribution.items()
        )
        adjusted: list[tuple[str, float]] = []
        for token, probability in sorted(distribution.items(), key=lambda item: item[1], reverse=True):
            if token in self.tokenizer.special_tokens and token not in TOOL_PROTOCOL_TOKENS:
                continue
            if token == self.tokenizer.unk_token or probability <= 0.0:
                continue
            meta = self._generation_token_meta(token)
            allowed_by_general_filter = self._allowed_generation_token_with_meta(
                token,
                meta,
                generated_tokens,
                context_tokens,
            )
            if not allowed_by_general_filter:
                dominant_learned_continuation = (
                    preserve_dominant_candidates
                    and best_probability > 0.0
                    and probability >= best_probability * 0.99
                    and self._allowed_answer_sequence_token(token, generated_tokens)
                )
                if not dominant_learned_continuation:
                    continue
            if self._would_complete_blacklisted_answer_ids(generated_token_ids, token):
                continue
            repeats_recent_pattern = self._would_repeat_recent_pattern(
                token,
                generated_tokens,
                recent_rendered_words=recent_rendered_words,
            )
            hard_phrase_loop = (
                generated_word_count >= MIN_COMPLETE_ANSWER_WORDS
                and not inside_tool_protocol
                and meta.starts_new_word
                and meta.rendered.casefold() in phrase_repeat_candidate_words
            )
            if hard_phrase_loop:
                continue
            if repeats_recent_pattern:
                dominant_candidate_allowed = (
                    preserve_dominant_candidates
                    and best_probability > 0.0
                    and probability >= best_probability * 0.80
                )
                if not dominant_candidate_allowed:
                    continue
            score = probability
            if (
                temperature >= ANSWER_REPLAY_PREFIX_TEMPERATURE
                and not inside_tool_protocol
                and self._would_follow_blacklisted_answer_prefix_ids(
                    generated_token_ids,
                    token,
                )
            ):
                score *= ANSWER_REPLAY_PREFIX_PENALTY
            if (
                temperature > 0.0
                and self._would_follow_avoided_sequence(
                    generated_tokens,
                    token,
                    avoid_token_sequences,
                )
            ):
                score *= 0.12
            rendered = meta.rendered
            punctuation_token = meta.structural_punctuation
            starts_new_word = meta.starts_new_word
            alphanumeric = meta.alphanumeric
            if (
                not generated_tokens
                and initial_prompt_content_token is not None
                and token == initial_prompt_content_token
            ):
                dominant_answer_candidate = (
                    preserve_dominant_candidates
                    and best_probability > 0.0
                    and probability >= best_probability * 0.80
                )
                if not dominant_answer_candidate:
                    continue
            if (
                not generated_tokens
                and temperature > 0.0
                and has_uppercase_start_candidate
                and starts_new_word
                and rendered[:1].islower()
                and best_probability > 0.0
                and probability < best_probability * 0.85
            ):
                continue
            if generated_tokens and starts_new_word and alphanumeric:
                previous_alphanumeric = self._generation_token_meta(
                    generated_tokens[-1]
                ).alphanumeric
                if previous_alphanumeric.casefold() == alphanumeric.casefold():
                    continue
            common_connector = meta.common_connector
            if (
                starts_new_word
                and len(alphanumeric) == 1
                and not common_connector
            ):
                score *= 0.08
            recent_count = generated_tokens[-12:].count(token)
            if recent_count > 0 and not common_connector:
                score /= repetition_penalty ** (2 * recent_count)
            if generated_tokens and token == generated_tokens[-1]:
                score /= repetition_penalty**3
            if generated_tokens and token in generated_tokens[-4:] and not common_connector:
                score *= 0.35
            if generated_tokens and not starts_new_word and self._starts_new_word(generated_tokens[-1]):
                score *= 0.08
            if not generated_tokens and punctuation_token:
                if best_probability <= 0.0 or probability < best_probability * 0.80:
                    score *= 0.01
            elif not generated_tokens and not starts_new_word:
                score *= 0.02
            if (
                not generated_tokens
                and temperature > 0.0
                and has_uppercase_start_candidate
                and starts_new_word
                and rendered[:1].islower()
            ):
                score *= 0.03
            if punctuation_token:
                if generated_tokens and self._is_structural_punctuation_token(generated_tokens[-1]):
                    score *= 0.05
                if clause_words >= 6:
                    score *= 1.0 + min(1.4, 0.18 * (clause_words - 5))
                elif generated_word_count >= 12:
                    score *= 1.1
            if score > 0.0:
                adjusted.append((token, score))
        if not adjusted:
            return []
        adjusted.sort(key=lambda item: item[1], reverse=True)
        if preserve_dominant_candidates:
            top_score = adjusted[0][1]
            second_score = adjusted[1][1] if len(adjusted) > 1 else 0.0
            if top_score >= 0.5 and (
                second_score <= 0.0
                or top_score >= second_score * 1.2
                or top_score - second_score >= 0.08
            ):
                return [(adjusted[0][0], 1.0)]
        effective_top_k = top_k
        if (
            temperature >= CREATIVE_EARLY_POOL_TEMPERATURE
            and generated_word_count < CREATIVE_EARLY_POOL_WORD_LIMIT
            and not inside_tool_protocol
            and top_k > CREATIVE_EARLY_POOL_MAX
        ):
            effective_top_k = CREATIVE_EARLY_POOL_MAX
        if effective_top_k > 0:
            adjusted = adjusted[:effective_top_k]
        if 0.0 < top_p < 1.0:
            kept: list[tuple[str, float]] = []
            cumulative = 0.0
            total = sum(score for _, score in adjusted)
            for token, score in adjusted:
                normalized = score / total if total else 0.0
                kept.append((token, score))
                cumulative += normalized
                if cumulative >= top_p:
                    break
            adjusted = kept
        if temperature <= 0.0:
            return [(adjusted[0][0], 1.0)]
        exponent = 1.0 / temperature
        tempered = [
            (token, score**exponent)
            for token, score in adjusted
            if score > 0.0
        ]
        total = sum(score for _, score in tempered)
        if total <= 0.0:
            return []
        return [(token, score / total) for token, score in tempered]

    def _sample_generation_candidate(
        self,
        candidates: list[tuple[str, float]],
        *,
        context_tokens: list[str],
        generated_tokens: list[str],
        stochastic: bool = False,
        preserve_dominant_candidates: bool = False,
    ) -> str:
        if not candidates:
            return ""
        if len(candidates) == 1:
            return candidates[0][0]
        top_probability = candidates[0][1]
        second_probability = candidates[1][1]
        top_has_clear_half_majority = top_probability >= 0.5 and (
            second_probability <= 0.0 or top_probability - second_probability >= 0.02
        )
        if preserve_dominant_candidates and top_has_clear_half_majority:
            return candidates[0][0]
        decisive_stochastic_winner = stochastic and (
            top_probability >= 0.985
            or (
                top_probability >= 0.96
                and second_probability > 0.0
                and top_probability >= second_probability * 20.0
            )
            or (
                top_probability >= 0.90
                and second_probability > 0.0
                and top_probability >= second_probability * 40.0
            )
            or (
                top_probability >= 0.90
                and top_probability - second_probability >= 0.75
            )
        )
        decisive_deterministic_winner = not stochastic and (
            top_has_clear_half_majority
            or (second_probability > 0.0 and top_probability >= second_probability * 2.5)
            or (
                top_probability >= 0.08
                and second_probability > 0.0
                and top_probability >= second_probability * 1.35
            )
        )
        if decisive_stochastic_winner or decisive_deterministic_winner:
            return candidates[0][0]
        if stochastic:
            threshold = random.random()
        else:
            seed_payload = "\u0002".join([*context_tokens, "", *generated_tokens, str(len(candidates))])
            seed = int.from_bytes(hashlib.sha256(seed_payload.encode("utf-8")).digest()[:8], "big")
            threshold = random.Random(seed).random()
        cumulative = 0.0
        for token, probability in candidates:
            cumulative += probability
            if threshold <= cumulative:
                return token
        return candidates[-1][0]

    def _top_entries_from_vector(
        self,
        values: Vector,
        limit: int,
    ) -> list[dict[str, object]]:
        if limit <= 0:
            return []
        ranked = sorted(
            enumerate(values),
            key=lambda item: item[1],
            reverse=True,
        )
        return [
            self._token_entry(index, probability)
            for index, probability in ranked[:limit]
            if probability > 0.0
        ]
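    # Minimal sketch (illustrative; not part of the original API): the
    # temperature / top-k / top-p shaping at the tail of
    # _prepare_generation_candidates, reduced to its arithmetic core. The
    # defaults mirror DEFAULT_GENERATION_TEMPERATURE / _TOP_K / _TOP_P.
    @staticmethod
    def _sketch_temper_and_truncate(
        scores: list[tuple[str, float]],
        temperature: float = 0.82,
        top_k: int = 24,
        top_p: float = 0.92,
    ) -> list[tuple[str, float]]:
        # Rank and keep the top-k candidates.
        ranked = sorted(scores, key=lambda item: item[1], reverse=True)[:top_k]
        total = sum(score for _, score in ranked)
        # Nucleus cut: keep candidates until their normalized mass reaches top_p.
        kept: list[tuple[str, float]] = []
        cumulative = 0.0
        for token, score in ranked:
            kept.append((token, score))
            cumulative += score / total if total else 0.0
            if cumulative >= top_p:
                break
        # Temperature: raise to 1/T and renormalize (T < 1 sharpens, T > 1 flattens).
        tempered = [(token, score ** (1.0 / temperature)) for token, score in kept if score > 0.0]
        norm = sum(score for _, score in tempered)
        return [(token, score / norm) for token, score in tempered] if norm > 0.0 else []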
    def _token_entry(
        self,
        index: int,
        probability: float,
    ) -> dict[str, object]:
        assert self.embedding_model is not None
        token = self.embedding_model.id_to_token[index]
        return {
            "token": token,
            "text": self._render_token(token),
            "probability": probability,
        }

    def _build_reasoning_summary(
        self,
        transition_order: int | None,
        blend_weights: dict[str, float],
    ) -> str:
        dominant_source = max(blend_weights.items(), key=lambda item: item[1])[0] if blend_weights else "base"
        if transition_order is not None:
            transition_message = f" Transition prior is using order-{transition_order} context."
        else:
            transition_message = " Transition prior found no matching n-gram."
        return (
            "Generation is running on analytical state, recurrent traces, and corpus-derived token transitions."
            f"{transition_message}"
            f" Dominant blend source: {dominant_source}."
        )

    def _generated_word_count(self, tokens: list[str]) -> int:
        count = 0
        for token in tokens:
            rendered = self._render_token(token)
            if not any(character.isalnum() for character in rendered):
                continue
            if self._starts_new_word(token) or count == 0:
                count += 1
        return count

    def _is_structural_punctuation_text(self, text: str) -> bool:
        if len(text) != 1:
            return False
        if self._is_word_joiner_text(text):
            return False
        category = unicodedata.category(text)
        return category.startswith("P")

    def _is_structural_punctuation_token(self, token: str) -> bool:
        return self._is_structural_punctuation_text(self._render_token(token))

    def _is_structural_symbol_token(self, token: str) -> bool:
        rendered = self._render_token(token)
        return len(rendered) == 1 and unicodedata.category(rendered).startswith("S")

    def _is_word_joiner_token(self, token: str) -> bool:
        return self._is_word_joiner_text(self._render_token(token))

    def _is_word_joiner_text(self, text: str) -> bool:
        if len(text) != 1:
            return False
        category = unicodedata.category(text)
        if category in ("Pc", "Pd", "Lm"):
            return True
        name = unicodedata.name(text, "")
        return "APOSTROPHE" in name or (
            "SINGLE" in name and "QUOTATION MARK" in name
        )

    def _can_start_line_with_word_joiner(self, token: str, generated_tokens: list[str]) -> bool:
        rendered = self._render_token(token)
        if len(rendered) != 1 or unicodedata.category(rendered) != "Pd":
            return False
        if not self._starts_new_word(token):
            return False
        return not generated_tokens or self._render_token(generated_tokens[-1]) == "\n"

    def _can_start_answer_with_structural_punctuation(self, token: str) -> bool:
        rendered = self._render_token(token)
        if len(rendered) != 1 or not self._starts_new_word(token):
            return False
        return unicodedata.category(rendered) in ("Ps", "Pi")

    def _is_common_connector_token(self, token: str) -> bool:
        rendered = self._render_token(token)
        return rendered.isalpha() and len(rendered) == 1 and rendered.islower()

    def _can_attach_word_joiner(self, generated_tokens: list[str]) -> bool:
        if not generated_tokens:
            return False
        rendered = self._render_token(generated_tokens[-1])
        if not rendered:
            return False
        if any(character.isalnum() for character in rendered):
            return True
        if len(rendered) != 1:
            return False
        return unicodedata.category(rendered) in ("Ps", "Pi")

    def _words_since_clause_break(self, tokens: list[str]) -> int:
        assert self.tokenizer is not None
        words = 0
        for token in reversed(tokens):
            if token in self.tokenizer.special_tokens:
                continue
            rendered = self._render_token(token)
            if self._is_structural_punctuation_text(rendered):
                break
            if self._starts_new_word(token) and not self._is_punctuation_piece(rendered):
                words += 1
        return words

    def _should_stop_generation(self, generated_tokens: list[str]) -> bool:
        if not generated_tokens:
            return False
        if not self._is_terminal_punctuation_text(self._render_token(generated_tokens[-1])):
            return False
        word_count = self._generated_word_count(generated_tokens)
        if word_count >= MIN_COMPLETE_ANSWER_WORDS:
            return True
        return (
            word_count >= MIN_COMPLETE_MULTI_SENTENCE_WORDS
            and self._terminal_sentence_count(generated_tokens) >= 2
        )

    def _terminal_sentence_count(self, tokens: list[str]) -> int:
        return sum(
            1
            for token in tokens
            if self._is_terminal_punctuation_text(self._render_token(token))
        )

    def _is_terminal_punctuation_text(self, text: str) -> bool:
        stripped = text.strip()
        if not stripped:
            return False
        terminal_character = stripped[-1]
        if not self._is_structural_punctuation_text(terminal_character):
            return False
        return not self._is_word_joiner_text(terminal_character)

    def _should_skip_prompt_overlap_token(self, token: str) -> bool:
        rendered = self._render_token(token)
        if not rendered.strip():
            return True
        if (
            self.embedding_model is not None
            and len(self.embedding_model.id_to_token) >= 1024
            and not self._starts_new_word(token)
        ):
            return True
        if self._is_structural_punctuation_text(rendered):
            return True
        return rendered.strip().casefold() in PROMPT_ENVELOPE_TERMS

    def _starts_new_word(self, token: str) -> bool:
        assert self.tokenizer is not None
        if token in self.tokenizer.special_tokens:
            return True
        if token.startswith(self.tokenizer.word_prefix):
            return True
        return len(token) == 1 and not token.isalnum() and not self._is_word_joiner_token(token)

    def _generation_token_meta(self, token: str) -> GenerationTokenMeta:
        cache = self.generation_token_meta_cache
        if cache is None:
            cache = {}
            self.generation_token_meta_cache = cache
        cached = cache.get(token)
        if cached is not None:
            return cached
        rendered = self._render_token(token)
        meta = GenerationTokenMeta(
            rendered=rendered,
            stripped=rendered.strip(),
            starts_new_word=self._starts_new_word(token),
            punctuation_piece=self._is_punctuation_piece(rendered),
            structural_punctuation=self._is_structural_punctuation_token(token),
            structural_symbol=self._is_structural_symbol_token(token),
            word_joiner=self._is_word_joiner_token(token),
            alphanumeric="".join(character for character in rendered if character.isalnum()),
            common_connector=self._is_common_connector_token(token),
        )
        cache[token] = meta
        return meta

    def _decode_tokens(self, tokens: list[str]) -> str:
        assert self.tokenizer is not None
        return self.tokenizer.decode(
            tokens,
            preserve_special_tokens=TOOL_PROTOCOL_TOKENS,
        )
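    # Unicode category examples (illustrative) for the helpers above:
    # unicodedata.category("-") == "Pd" and unicodedata.category("_") == "Pc",
    # so both count as word joiners; "'" is "Po" but its name contains
    # APOSTROPHE, so it joins too; "(" is "Ps", an opening bracket that may
    # begin an answer; "." and "," are "Po", i.e. structural punctuation; and
    # unicodedata.bidirectional(",") == unicodedata.bidirectional(":") == "CS",
    # which is what _is_separator_punctuation below keys on.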
    @staticmethod
    def _normalize_generated_tool_protocol_text(text: str, *, context: str | None = None) -> str:
        # NOTE: the marker and boundary literals below were elided in the
        # source. "<tool_call>", "</tool_call>", "<tool_result>", and
        # "<final>" are assumed reconstructions inferred from the
        # surrounding identifiers and the TOOL_PROTOCOL_TOKENS usage.
        marker = "<tool_call>"
        call_index = text.find(marker)
        if call_index < 0:
            return text
        cleaned = text[:]
        for boundary in ("</tool_call>", "<tool_result>", "<final>"):
            boundary_index = cleaned.find(boundary, call_index + len(marker))
            if boundary_index >= 0:
                cleaned = cleaned[:boundary_index].rstrip()
        second_call_index = cleaned.find(marker, call_index + len(marker))
        if second_call_index >= 0:
            cleaned = cleaned[:second_call_index].rstrip()
        brace_start = cleaned.find("{", call_index)
        if brace_start < 0:
            return cleaned.strip()
        depth = 0
        in_string = False
        escaped = False
        last_top_level_comma: int | None = None
        for index in range(brace_start, len(cleaned)):
            character = cleaned[index]
            if escaped:
                escaped = False
                continue
            if in_string and character == "\\":
                escaped = True
                continue
            if character == '"':
                in_string = not in_string
                continue
            if in_string:
                continue
            if character == "{":
                depth += 1
                continue
            if character == "}":
                depth -= 1
                if depth <= 0:
                    candidate = cleaned[: index + 1].strip()
                    return ReframrModel._repair_tool_call_payload_if_needed(
                        candidate,
                        context=context,
                    )
                continue
            if character == "," and depth == 1:
                last_top_level_comma = index
        if depth > 0:
            if last_top_level_comma is not None:
                candidate = cleaned[:last_top_level_comma].rstrip() + "}"
                return ReframrModel._repair_tool_call_payload_if_needed(
                    candidate,
                    context=context,
                )
            candidate = cleaned.rstrip() + "}"
            return ReframrModel._repair_tool_call_payload_if_needed(
                candidate,
                context=context,
            )
        return ReframrModel._repair_tool_call_payload_if_needed(
            cleaned.strip(),
            context=context,
        )

    @staticmethod
    def _repair_tool_call_payload_if_needed(text: str, *, context: str | None = None) -> str:
        marker = "<tool_call>"  # assumed literal; elided in the source
        if not text.startswith(marker):
            return text
        brace_start = text.find("{", len(marker))
        if brace_start < 0:
            return text
        tool_name = text[len(marker) : brace_start].strip()
        payload_text = text[brace_start:].strip()
        try:
            payload = json.loads(payload_text)
            if isinstance(payload, dict) and tool_name == "web.search":
                repaired_query = ReframrModel._repair_search_query_from_context_if_weak(
                    str(payload.get("query", "")),
                    context,
                )
                if repaired_query is not None:
                    payload["query"] = repaired_query
                return f"{marker} {tool_name} {json.dumps(payload, ensure_ascii=False)}"
            return text
        except (TypeError, json.JSONDecodeError):
            pass
        body = payload_text.strip()
        if body.startswith("{"):
            body = body[1:]
        if body.endswith("}"):
            body = body[:-1]
        body = " ".join(body.replace('"', "").split())
        if not tool_name or not body:
            return text
        if tool_name == "web.search":
            payload = {
                "query": ReframrModel._repair_search_query_from_context_if_weak(
                    body,
                    context,
                )
                or body
            }
        else:
            payload = {"input": body}
        return f"{marker} {tool_name} {json.dumps(payload, ensure_ascii=False)}"

    @staticmethod
    def _repair_search_query_from_context_if_weak(
        query: str,
        context: str | None,
    ) -> str | None:
        cleaned_query = " ".join(query.replace("{", " ").replace("}", " ").split())
        normalized_words = [
            word.strip(" \t\r\n:,.;!?\"'()[]{}").casefold()
            for word in cleaned_query.split()
            if word.strip(" \t\r\n:,.;!?\"'()[]{}")
        ]
        unique_content_words = {
            word
            for word in normalized_words
            if word not in {"query", "web.search", "tool_call"}
        }
        lowered_query = cleaned_query.casefold()
        weak = (
            len(unique_content_words) < 3
            or lowered_query.startswith("query:")
            or "web.search" in lowered_query
            or any(
                marker in lowered_query
                # the first two literals are assumed; elided in the source
                for marker in ("<tool_call>", "<tool_result>", "according to")
            )
        )
        if not weak:
            return None
        context_query = ReframrModel._search_query_from_context(context or "")
        return context_query or None

    @staticmethod
    def _search_query_from_context(context: str) -> str:
        if not context:
            return ""
        # assumed literals; the split markers were elided in the source but
        # are implied by the variable names below
        before_tool_result = context.split("<tool_result>", 1)[0]
        before_final = before_tool_result.split("<final>", 1)[0]
        lines = [line.strip() for line in before_final.splitlines() if line.strip()]
        if not lines:
            lines = [before_final.strip()]
        latest_user = ""
        for line in lines:
            lowered = line.casefold()
            if lowered.startswith("user:"):
                latest_user = line.split(":", 1)[1].strip()
            elif lowered.startswith("question:"):
                latest_user = line.split(":", 1)[1].strip()
        if not latest_user:
            latest_user = lines[-1]
        for prefix in ("User:", "Question:", "Prompt:", "Context:"):
            if latest_user.casefold().startswith(prefix.casefold()):
                latest_user = latest_user[len(prefix) :].strip()
        cleaned = " ".join(latest_user.split())
        return cleaned.strip(" \t\r\n\"'")
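    # Worked example (illustrative, assuming the elided call marker is
    # "<tool_call>" as reconstructed above): a truncated generation such as
    #   '<tool_call> web.search {"query": "fed rate decision", "fresh'
    # never closes its outer brace, so the scanner exits with depth > 0,
    # keeps the text up to the last top-level comma, and reseals it as
    #   '<tool_call> web.search {"query": "fed rate decision"}'
    # which then parses cleanly in _repair_tool_call_payload_if_needed.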
    @staticmethod
    def _finalize_generated_text(text: str) -> str:
        stripped = text.rstrip()
        if not stripped:
            return stripped
        if stripped.startswith("<tool_call>"):  # assumed literal; elided in the source
            return stripped
        stripped = ReframrModel._remove_separator_punctuation_before_boundary(stripped)
        if stripped and ReframrModel._is_separator_punctuation(stripped[-1:]):
            stripped = stripped[:-1].rstrip()
        if not stripped:
            return stripped
        if (
            ReframrModel._is_surface_punctuation(stripped[:1])
            or ReframrModel._is_surface_punctuation(stripped[-1:])
        ):
            return stripped
        if any(character.isalnum() for character in stripped[-8:]):
            return f"{stripped}."
        return stripped

    @staticmethod
    def _remove_separator_punctuation_before_boundary(text: str) -> str:
        cleaned: list[str] = []
        for character in text:
            if (
                ReframrModel._is_separator_punctuation(character)
                and cleaned
                and ReframrModel._is_separator_punctuation(cleaned[-1])
            ):
                cleaned.pop()
            cleaned.append(character)
        return "".join(cleaned)

    @staticmethod
    def _is_surface_punctuation(character: str) -> bool:
        return len(character) == 1 and unicodedata.category(character).startswith("P")

    @staticmethod
    def _is_separator_punctuation(character: str) -> bool:
        return (
            ReframrModel._is_surface_punctuation(character)
            and unicodedata.bidirectional(character) == "CS"
        )

    def _render_token(self, token: str) -> str:
        assert self.tokenizer is not None
        if token.startswith(self.tokenizer.word_prefix):
            return token[len(self.tokenizer.word_prefix) :]
        return token

    def _require_fit(self) -> None:
        if (
            self.tokenizer is None
            or self.embedding_model is None
            or self.memory_units is None
            or self.readout_weights is None
            or self.ternary_mask is None
            or self.associative_keys is None
            or (
                self.associative_key_norms is None
                and self.associative_key_norms_array is None
            )
            or self.associative_values is None
            or self.transition_tables is None
        ):
            raise RuntimeError("Call fit() before using the REFRAMR model.")

    def _ensure_numeric_caches(self) -> None:
        if np is None:
            return
        if self.readout_weights_array is None:
            self._refresh_numeric_caches()