# Nacrith-GPU — adaptive_head.py
"""Online adaptive bias layer for per-document learning.
Applies a learned bias vector in log-probability space on top of the
LLM's output. The bias is updated via gradient descent after each
observed token, allowing the model to adapt to the specific document
being compressed.
Both compressor and decompressor start from the same initial state
(zero bias) and apply identical updates, maintaining lossless symmetry.
Uses float64 numpy throughout to ensure bit-exact reproducibility
across compression and decompression.
"""
import numpy as np
class AdaptiveHead:
    """Lightweight per-document bias adapter over LLM output probabilities.

    Rather than fine-tuning model weights, a single learned vector is
    added in log-probability space and renormalized:

        adjusted = softmax(log(probs) + bias)

    which multiplicatively rescales each token's probability by
    ``exp(bias[i])``. The bias begins at zero (identity) and is nudged
    after every observed token by one gradient step on cross-entropy
    loss, so tokens the base model under-predicts for the current
    document get boosted and over-predicted ones get suppressed.

    Compressor and decompressor both start from zero bias and apply the
    same deterministic float64 updates, so their probability streams
    stay bit-identical — the property that makes the codec lossless.
    """

    def __init__(self, vocab_size: int = 49152, lr: float = 0.001):
        """Create an identity (zero-bias) adaptive head.

        Args:
            vocab_size: Number of entries in the token vocabulary.
            lr: Step size for the per-token bias update. Keep it small
                (default 0.001) for gentle adaptation; large values can
                oscillate.
        """
        self.vocab_size = vocab_size
        self.lr = lr
        # Learned correction in log-probability space; float64 keeps
        # compression and decompression bit-exact.
        self.bias = np.zeros(vocab_size, dtype=np.float64)
        # Scratch arrays reused every token so the hot path allocates
        # nothing.
        self._log_buf = np.zeros(vocab_size, dtype=np.float64)
        self._grad_buf = np.zeros(vocab_size, dtype=np.float64)

    def reset(self):
        """Zero the bias (identity transform); call per new sequence."""
        self.bias.fill(0.0)

    def adjust(self, probs: np.ndarray) -> np.ndarray:
        """Return bias-corrected probabilities for one token position.

        Args:
            probs: float array of shape (vocab_size,) from the LLM,
                summing to ~1.

        Returns:
            float64 array summing to ~1. NOTE: this is a view of an
            internal scratch buffer — it is overwritten by the next
            call to ``adjust()``; copy it if you need to keep it.
        """
        scores = self._log_buf
        # Move to log space (epsilon guards log(0)) and add the bias.
        np.log(probs + 1e-10, out=scores)
        scores += self.bias
        # Max-shifted softmax for numerical stability.
        scores -= scores.max()
        np.exp(scores, out=scores)
        scores /= scores.sum()
        return scores

    def update(self, actual_token: int, adjusted_probs: np.ndarray):
        """Take one gradient step on the bias for an observed token.

        The loss is cross-entropy, L = -log(adjusted_probs[actual]),
        whose gradient w.r.t. the bias is simply

            dL/d(bias[i]) = adjusted_probs[i] - [i == actual_token]

        Compression and decompression must call this with identical
        arguments in identical order to stay in sync.

        Args:
            actual_token: Index of the token that actually occurred.
            adjusted_probs: The array ``adjust()`` produced for this
                position (before any mixing with other models).
        """
        np.copyto(self._grad_buf, adjusted_probs)
        self._grad_buf[actual_token] -= 1.0
        self.bias -= self.lr * self._grad_buf