# Nacrith-GPU — adaptive_head.py
"""Online adaptive bias layer for per-document learning.
Applies a learned bias vector in log-probability space on top of the
LLM's output. The bias is updated via gradient descent after each
observed token, allowing the model to adapt to the specific document
being compressed.
Both compressor and decompressor start from the same initial state
(zero bias) and apply identical updates, maintaining lossless symmetry.
Uses float64 numpy throughout to ensure bit-exact reproducibility
across compression and decompression.
"""
import numpy as np
class AdaptiveHead:
    """Lightweight per-document bias adapter over LLM output probabilities.

    Rather than fine-tuning model weights, a single learned vector is
    added in log-probability space and renormalized:

        adjusted = softmax(log(probs) + bias)

    which multiplicatively rescales each token's probability by
    ``exp(bias[i])``. The bias begins at zero (identity) and is nudged
    after every observed token by one gradient step on cross-entropy
    loss, so tokens the base model under-predicts for the current
    document get boosted and over-predicted ones get suppressed.

    Compressor and decompressor both start from zero bias and apply the
    same deterministic float64 updates, so their probability streams
    stay bit-identical — the property that makes the codec lossless.
    """

    def __init__(self, vocab_size: int = 49152, lr: float = 0.001):
        """Create an identity (zero-bias) adaptive head.

        Args:
            vocab_size: Number of entries in the token vocabulary.
            lr: Step size for the per-token bias update. Keep it small
                (default 0.001) for gentle adaptation; large values can
                oscillate.
        """
        self.vocab_size = vocab_size
        self.lr = lr
        # Learned correction in log-probability space; float64 keeps
        # compression and decompression bit-exact.
        self.bias = np.zeros(vocab_size, dtype=np.float64)
        # Scratch arrays reused every token so the hot path allocates
        # nothing.
        self._log_buf = np.zeros(vocab_size, dtype=np.float64)
        self._grad_buf = np.zeros(vocab_size, dtype=np.float64)

    def reset(self):
        """Zero the bias (identity transform); call per new sequence."""
        self.bias.fill(0.0)

    def adjust(self, probs: np.ndarray) -> np.ndarray:
        """Return bias-corrected probabilities for one token position.

        Args:
            probs: float array of shape (vocab_size,) from the LLM,
                summing to ~1.

        Returns:
            float64 array summing to ~1. NOTE: this is a view of an
            internal scratch buffer — it is overwritten by the next
            call to ``adjust()``; copy it if you need to keep it.
        """
        scores = self._log_buf
        # Move to log space (epsilon guards log(0)) and add the bias.
        np.log(probs + 1e-10, out=scores)
        scores += self.bias
        # Max-shifted softmax for numerical stability.
        scores -= scores.max()
        np.exp(scores, out=scores)
        scores /= scores.sum()
        return scores

    def update(self, actual_token: int, adjusted_probs: np.ndarray):
        """Take one gradient step on the bias for an observed token.

        The loss is cross-entropy, L = -log(adjusted_probs[actual]),
        whose gradient w.r.t. the bias is simply

            dL/d(bias[i]) = adjusted_probs[i] - [i == actual_token]

        Compression and decompression must call this with identical
        arguments in identical order to stay in sync.

        Args:
            actual_token: Index of the token that actually occurred.
            adjusted_probs: The array ``adjust()`` produced for this
                position (before any mixing with other models).
        """
        np.copyto(self._grad_buf, adjusted_probs)
        self._grad_buf[actual_token] -= 1.0
        self.bias -= self.lr * self._grad_buf