# Source: Hugging Face Space (ZeroGPU runtime) — page chrome removed.
"""Online adaptive bias layer for per-document learning.

A learned bias vector is applied in log-probability space on top of
the LLM's output distribution. After every observed token the bias is
nudged by one gradient-descent step, so the model adapts to the
particular document being compressed.

Compressor and decompressor both start from the identical initial
state (zero bias) and perform identical updates, which keeps the
scheme losslessly symmetric.

All arithmetic is float64 numpy so that compression and decompression
reproduce results bit-for-bit.
"""

import numpy as np
class AdaptiveHead:
    """Thin adaptive bias layer on top of LLM probabilities.

    Instead of fine-tuning the full model, this learns a per-token
    bias correction:

        adjusted[i] = softmax(log(probs[i]) + bias[i])

    which is equivalent to multiplicatively rescaling each probability:

        adjusted[i] = probs[i] * exp(bias[i]) / Z

    The bias starts at zero (identity transform) and is updated after
    each observed token using the gradient of cross-entropy loss. Over
    time, it learns to boost tokens that the LLM under-predicts for
    this specific document and suppress over-predicted ones.
    """

    def __init__(self, vocab_size: int = 49152, lr: float = 0.001):
        """Initialize the adaptive head.

        Args:
            vocab_size: Size of the token vocabulary.
            lr: Learning rate for bias updates. Small values (0.001)
                give gentle adaptation; larger values risk oscillation.
        """
        self.vocab_size = vocab_size
        self.lr = lr
        # Bias in log-probability space. float64 so compressor and
        # decompressor stay bit-exact.
        self.bias = np.zeros(vocab_size, dtype=np.float64)
        # Pre-allocated scratch buffers so the per-token hot path
        # (adjust + update) performs no heap allocation.
        self._log_buf = np.zeros(vocab_size, dtype=np.float64)
        self._grad_buf = np.zeros(vocab_size, dtype=np.float64)

    def reset(self):
        """Reset bias to zero (identity). Call when starting a new sequence."""
        self.bias[:] = 0

    def adjust(self, probs: np.ndarray) -> np.ndarray:
        """Apply adaptive bias to LLM probabilities.

        Args:
            probs: numpy array of shape (vocab_size,) from the LLM,
                summing to ~1.

        Returns:
            Adjusted probabilities, float64 numpy array, summing to ~1.
            NOTE: this is the internal scratch buffer, overwritten by
            the next adjust() call — copy it if you need to retain it.
        """
        log_buf = self._log_buf
        # 1e-10 floor keeps log() finite for zero-probability tokens.
        np.log(probs + 1e-10, out=log_buf)
        log_buf += self.bias
        # Numerically stable softmax: shift by the max before exp().
        log_buf -= log_buf.max()
        np.exp(log_buf, out=log_buf)
        log_buf /= log_buf.sum()
        return log_buf

    def update(self, actual_token: int, adjusted_probs: np.ndarray):
        """Update bias after observing a token.

        Performs one step of gradient descent on cross-entropy loss:

            L = -log(adjusted_probs[actual_token])
            dL/d(bias[i]) = adjusted_probs[i] - 1_{i == actual_token}

        Must be called identically during compression and decompression.

        Args:
            actual_token: The token that was actually observed.
            adjusted_probs: The probabilities returned by adjust() for
                this token (before mixing with other models).
        """
        grad = self._grad_buf
        np.copyto(grad, adjusted_probs)
        grad[actual_token] -= 1.0
        # Scale in place instead of `self.bias -= self.lr * grad`: the
        # original spelling allocated a fresh temporary on every token,
        # defeating the pre-allocated buffers. Elementwise float64 math
        # is unchanged, so results remain bit-identical.
        grad *= self.lr
        self.bias -= grad