"""Online adaptive bias layer for per-document learning.

Applies a learned bias vector in log-probability space on top of the
LLM's output. The bias is updated via gradient descent after each
observed token, allowing the model to adapt to the specific document
being compressed.

Both compressor and decompressor start from the same initial state
(zero bias) and apply identical updates, maintaining lossless symmetry.

Uses float64 numpy throughout to ensure bit-exact reproducibility
across compression and decompression.
"""

import numpy as np


class AdaptiveHead:
    """Thin adaptive bias layer on top of LLM probabilities.

    Instead of fine-tuning the full model, this learns a per-token
    bias correction:

        adjusted[i] = softmax(log(probs[i]) + bias[i])

    This is equivalent to multiplicatively rescaling each probability:

        adjusted[i] = probs[i] * exp(bias[i]) / Z

    The bias starts at zero (identity transform) and is updated after
    each observed token using the gradient of cross-entropy loss. Over
    time, it learns to boost tokens that the LLM under-predicts for
    this specific document and suppress over-predicted ones.

    Typical per-token cycle: ``adjusted = head.adjust(probs)``, code the
    token using ``adjusted``, then ``head.update(token, adjusted)``.
    """

    def __init__(self, vocab_size: int = 49152, lr: float = 0.001) -> None:
        """Initialize the adaptive head.

        Args:
            vocab_size: Size of the token vocabulary.
            lr: Learning rate for bias updates. Small values (0.001)
                give gentle adaptation; larger values risk oscillation.
        """
        self.vocab_size = vocab_size
        self.lr = lr

        # Bias in log-probability space. float64 for precision.
        self.bias = np.zeros(vocab_size, dtype=np.float64)

        # Pre-allocated scratch buffers, reused on every call so the
        # per-token hot path does not allocate vocab-sized arrays.
        self._log_buf = np.zeros(vocab_size, dtype=np.float64)
        self._grad_buf = np.zeros(vocab_size, dtype=np.float64)

    def reset(self) -> None:
        """Reset bias to zero. Call when starting a new sequence."""
        self.bias.fill(0.0)

    def adjust(self, probs: np.ndarray) -> np.ndarray:
        """Apply adaptive bias to LLM probabilities.

        Args:
            probs: numpy array of shape (vocab_size,) from the LLM,
                   summing to ~1.

        Returns:
            Adjusted probabilities, float64 numpy array, summing to ~1.
            The returned array is this object's internal scratch buffer:
            it is overwritten by the next call to adjust(), so copy it
            if the values must outlive that call.
        """
        log_buf = self._log_buf

        # The small epsilon keeps log() finite for zero probabilities.
        # NOTE(review): when probs is float32, the add and the log appear
        # to be evaluated in float32 before being stored into the float64
        # buffer -- confirm whether upcasting first is wanted (doing so
        # would change the numeric output).
        np.log(probs + 1e-10, out=log_buf)
        log_buf += self.bias

        # Numerically stable softmax: shift by the max so exp() cannot
        # overflow, then normalize in place.
        log_buf -= log_buf.max()
        np.exp(log_buf, out=log_buf)
        log_buf /= log_buf.sum()

        return log_buf

    def update(self, actual_token: int, adjusted_probs: np.ndarray) -> None:
        """Update bias after observing a token.

        Performs one step of gradient descent on cross-entropy loss:
            L = -log(adjusted_probs[actual_token])
            dL/d(bias[i]) = adjusted_probs[i] - 1_{i == actual_token}

        Must be called identically during compression and decompression.

        Args:
            actual_token: The token that was actually observed. Must be
                          in the range [0, vocab_size).
            adjusted_probs: The probabilities returned by adjust() for
                            this token (before mixing with other models).
                            Not modified by this call.

        Raises:
            IndexError: If actual_token is outside [0, vocab_size). The
                        bias is left unchanged in that case.
        """
        grad = self._grad_buf

        # Reject negative ids explicitly: numpy would otherwise wrap them
        # around and silently adapt the bias of the wrong token.
        if not 0 <= actual_token < grad.shape[0]:
            raise IndexError(
                f"actual_token {actual_token} is out of range for "
                f"vocabulary of size {grad.shape[0]}"
            )

        # Work on a private copy so the caller's array (usually the
        # buffer returned by adjust()) is left intact.
        np.copyto(grad, adjusted_probs)
        grad[actual_token] -= 1.0

        # Scale in place rather than building `lr * grad` as a temporary;
        # the products and the subtraction are the same, so the resulting
        # bias is identical.
        grad *= self.lr
        self.bias -= grad