| """
|
| RotorQuant: Reimagining TurboQuant with Clifford Algebra
|
|
|
| Instead of TurboQuant's random orthogonal matrix Π (via QR decomposition),
|
| RotorQuant uses Clifford rotors R = exp(B/2) for decorrelation.
|
|
|
| Why this is better for geometric data:
|
| 1. Rotor sandwich R x R̃ preserves the FULL algebraic structure
|
| (inner products, outer products, grades) — not just norms
|
| 2. Rotors compose naturally: R₂(R₁ x R̃₁)R̃₂ = (R₂R₁) x (R₂R₁)~
|
| 3. Grade-aware quantization: different grades can use different bit budgets
|
| 4. The bivector structure of rotors means we only need 3 parameters
|
| (not d² for a full rotation matrix) — massive parameter savings
|
|
|
| Algorithm:
|
| Stage 1 (MSE): Embed vectors as Cl(3,0) multivectors → rotor sandwich →
|
| grade-aware Lloyd-Max quantization per component
|
| Stage 2 (QJL): 1-bit sign quantization on residual for unbiased inner products
|
| """

import math
from typing import Dict, Optional, Tuple

import torch
import torch.nn as nn

try:
    from .clifford import (
        MV_DIM, geometric_product, reverse, make_random_rotor,
        rotor_sandwich, embed_vectors_as_multivectors,
        extract_vectors_from_multivectors, multivector_norm_sq,
    )
    from .lloyd_max import LloydMaxCodebook
except ImportError:
    from clifford import (
        MV_DIM, geometric_product, reverse, make_random_rotor,
        rotor_sandwich, embed_vectors_as_multivectors,
        extract_vectors_from_multivectors, multivector_norm_sq,
    )
    from lloyd_max import LloydMaxCodebook


class RotorQuantMSE(nn.Module):
    """
    Stage 1: MSE-optimal quantizer using Clifford rotors.

    Instead of Π @ x (a matrix multiply), we apply R x R̃ (a rotor sandwich),
    then per-component Lloyd-Max quantization on the rotated multivector.
    """

    def __init__(self, d: int, bits: int, seed: int = 42,
                 grade_bits: Optional[Dict[str, int]] = None,
                 device: str = "cpu"):
        """
        Args:
            d: original vector dimension
            bits: default bits per component
            seed: random seed
            grade_bits: optional per-grade bit override, e.g.
                {'scalar': 2, 'vector': 3, 'bivector': 2, 'trivector': 1}
            device: torch device
        """
        super().__init__()
        self.d = d
        self.bits = bits
        self.device = device

        # Vectors are split into groups of 3 components; each group is
        # embedded as one Cl(3,0) multivector with MV_DIM components.
        self.n_groups = (d + 2) // 3
        self.mv_dim = self.n_groups * MV_DIM

        # By default only the grade-1 (vector) part carries signal after
        # embedding, so it is the only grade quantized.
        if grade_bits is None:
            grade_bits = {'vector': bits}
        self.grade_bits = grade_bits

        # One Lloyd-Max codebook per grade; only the centroids are kept,
        # registered as buffers so they move with the module across devices.
        d_eff_vector = d
        for grade_name, gb in grade_bits.items():
            cb = LloydMaxCodebook(d_eff_vector, gb)
            self.register_buffer(f'centroids_{grade_name}',
                                 cb.centroids.to(device))

        # Multivector component indices for each grade. Components 1..3 of a
        # Cl(3,0) multivector are the e1, e2, e3 coefficients. Grades listed
        # in grade_bits but absent here are ignored during quantization.
        self.grade_map = {
            'vector': [1, 2, 3],
        }

        # One independent random rotor per 3-component group.
        rotors = []
        for i in range(self.n_groups):
            r = make_random_rotor((), device=device, seed=seed + i)
            rotors.append(r)
        self.register_buffer('rotors', torch.stack(rotors))
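
        # Storage note (illustrative arithmetic, assuming MV_DIM = 8 for
        # Cl(3,0)): for d = 64 this stores n_groups = 22 rotors of 8
        # components each (each generated from only 3 bivector parameters),
        # i.e. 176 floats, versus 64 * 64 = 4096 for a dense rotation matrix.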

    def _apply_rotors(self, mv: torch.Tensor) -> torch.Tensor:
        """Apply the per-group rotor sandwich: R_i x_i R̃_i."""
        return rotor_sandwich(self.rotors, mv)

    def _unapply_rotors(self, mv: torch.Tensor) -> torch.Tensor:
        """Inverse rotor sandwich: R̃_i x_i R_i (for unit rotors the
        reverse is the inverse)."""
        rotor_rev = reverse(self.rotors)
        return rotor_sandwich(rotor_rev, mv)
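
    # Round-trip sanity sketch (assuming rotor_sandwich and reverse follow
    # their documented semantics): for unit rotors R̃R = RR̃ = 1, so
    #   _unapply_rotors(_apply_rotors(mv)) ≈ mv
    # up to floating-point error.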

    def _quantize_grade(self, x: torch.Tensor, grade_name: str) -> Tuple[torch.Tensor, torch.Tensor]:
        """Quantize components of a specific grade by nearest-centroid lookup."""
        centroids = getattr(self, f'centroids_{grade_name}')
        diffs = x.unsqueeze(-1) - centroids       # (..., n_centroids)
        indices = diffs.abs().argmin(dim=-1)      # nearest centroid per component
        x_q = centroids[indices]
        return x_q, indices

    def quantize(self, x: torch.Tensor) -> Tuple[torch.Tensor, dict]:
        """
        Quantize vectors via rotor sandwich + grade-aware Lloyd-Max.

        x: (..., d) input vectors
        Returns: (mv_q, indices_dict)
        """
        # Normalize; the norms are stored and re-applied at dequantization.
        norms = torch.norm(x, dim=-1, keepdim=True).clamp(min=1e-8)
        x_unit = x / norms

        # Embed into Cl(3,0): (..., n_groups, MV_DIM).
        mv = embed_vectors_as_multivectors(x_unit)

        # Decorrelate with the rotor sandwich.
        mv_rot = self._apply_rotors(mv)

        # Quantize each grade with its own codebook (the lookup is
        # elementwise, so no flattening is needed).
        mv_q = torch.zeros_like(mv_rot)
        all_indices = {}
        for grade_name, component_indices in self.grade_map.items():
            grade_data = mv_rot[..., component_indices]
            q_data, idx = self._quantize_grade(grade_data, grade_name)
            mv_q[..., component_indices] = q_data
            all_indices[grade_name] = idx

        all_indices['_norms'] = norms.squeeze(-1)

        return mv_q, all_indices

    def dequantize(self, indices: dict) -> torch.Tensor:
        """Reconstruct vectors from quantized indices.

        Assumes a single flat batch dimension, matching how quantize is
        called through the KV cache.
        """
        sample_centroids = self.centroids_vector
        vector_idx = indices['vector']
        flat_batch = vector_idx.shape[0] if vector_idx.dim() >= 1 else 1

        mv_q = torch.zeros(flat_batch, self.n_groups, MV_DIM,
                           dtype=sample_centroids.dtype,
                           device=sample_centroids.device)

        # Look up centroid values for each quantized grade.
        for grade_name, component_indices in self.grade_map.items():
            centroids = getattr(self, f'centroids_{grade_name}')
            idx = indices[grade_name]
            values = centroids[idx]
            n_components = len(component_indices)
            values = values.reshape(flat_batch, self.n_groups, n_components)
            mv_q[..., component_indices] = values

        # Undo the rotor sandwich, then project back to R^d.
        mv_recon = self._unapply_rotors(mv_q)
        x_hat = extract_vectors_from_multivectors(mv_recon, self.d)

        # Re-apply the stored norms.
        if '_norms' in indices:
            norms = indices['_norms']
            if norms.dim() < x_hat.dim():
                norms = norms.unsqueeze(-1)
            x_hat = x_hat * norms

        return x_hat

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, dict]:
        """Full quantize-dequantize cycle."""
        mv_q, indices = self.quantize(x)
        x_hat = self.dequantize(indices)
        return x_hat, indices
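
# Minimal usage sketch for RotorQuantMSE (illustrative shapes, not from the
# source):
#
#   q = RotorQuantMSE(d=64, bits=3)
#   x = torch.randn(128, 64)
#   x_hat, indices = q(x)            # quantize-dequantize round trip
#   mse = (x - x_hat).pow(2).mean()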


class RotorQuantProd(nn.Module):
    """
    Stage 1 + Stage 2: unbiased inner-product quantizer.

    Uses a (b-1)-bit rotor MSE quantizer plus 1-bit QJL on the residuals.
    The QJL operates in the original vector space (not multivector space),
    since inner products are computed there.
    """

    def __init__(self, d: int, bits: int, qjl_dim: Optional[int] = None,
                 seed: int = 42, device: str = "cpu"):
        super().__init__()
        self.d = d
        self.bits = bits
        self.mse_bits = max(bits - 1, 1)
        self.qjl_dim = qjl_dim or d
        self.device = device

        # Stage 1: rotor-based MSE quantizer on (bits - 1) bits.
        self.mse = RotorQuantMSE(d, self.mse_bits, seed=seed, device=device)

        # Stage 2: fixed Gaussian projection for the 1-bit QJL sign sketch.
        gen = torch.Generator(device='cpu')
        gen.manual_seed(seed + 1)
        S = torch.randn(self.qjl_dim, d, generator=gen)
        self.register_buffer("S", S.to(device))

    def quantize(self, x: torch.Tensor) -> dict:
        """Full RotorQuant quantization."""
        # Stage 1: quantize-dequantize to get the coarse reconstruction.
        x_hat, mse_indices = self.mse(x)

        # Stage 2: 1-bit QJL sketch of the residual.
        residual = x - x_hat
        residual_norm = torch.norm(residual, dim=-1, keepdim=True)

        projected = residual @ self.S.T
        qjl_signs = torch.sign(projected)
        qjl_signs[qjl_signs == 0] = 1.0  # break ties deterministically

        return {
            'mse_indices': mse_indices,
            'qjl_signs': qjl_signs,
            'residual_norm': residual_norm.squeeze(-1),
        }

    def dequantize(self, compressed: dict) -> torch.Tensor:
        """Reconstruct from the MSE component only."""
        return self.mse.dequantize(compressed['mse_indices'])

    def inner_product(self, y: torch.Tensor, compressed: dict) -> torch.Tensor:
        """
        Unbiased inner-product estimate: <y, x>.

        The QJL correction is the same as in TurboQuant; the math doesn't
        change because QJL operates in vector space.
        """
        # Exact inner product against the coarse reconstruction.
        x_mse = self.mse.dequantize(compressed['mse_indices'])
        term1 = (y * x_mse).sum(dim=-1)

        # Sign-sketch estimate of <y, residual>. For g ~ N(0, I),
        # E[sign(<g, r>) <g, y>] = sqrt(2/pi) <r, y> / ||r||, so scaling the
        # sign-sketch sum by ||r|| * sqrt(pi/2) / m makes it unbiased.
        y_projected = y @ self.S.T
        qjl_ip = (y_projected * compressed['qjl_signs']).sum(dim=-1)

        m = self.qjl_dim
        correction_scale = math.sqrt(math.pi / 2) / m
        term2 = compressed['residual_norm'] * correction_scale * qjl_ip

        return term1 + term2

    def forward(self, x: torch.Tensor) -> dict:
        return self.quantize(x)
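
# Sketch of the unbiased inner-product path (illustrative shapes, not from
# the source):
#
#   pq = RotorQuantProd(d=64, bits=4)
#   keys = torch.randn(256, 64)
#   compressed = pq.quantize(keys)
#   q_vec = torch.randn(64)
#   est = pq.inner_product(q_vec, compressed)  # ≈ keys @ q_vec, unbiased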


class RotorQuantKVCache:
    """
    KV cache using RotorQuant compression.
    Drop-in replacement for TurboQuantKVCache.
    """

    def __init__(self, d_key: int, d_value: int, bits: int = 3,
                 seed: int = 42, device: str = "cpu"):
        self.d_key = d_key
        self.d_value = d_value
        self.bits = bits
        self.device = device

        # Keys need unbiased inner products (attention scores); values only
        # need a good MSE reconstruction.
        self.key_quantizer = RotorQuantProd(d_key, bits, seed=seed, device=device)
        self.value_quantizer = RotorQuantMSE(d_value, bits, seed=seed + 100, device=device)

        self.key_cache = []
        self.value_cache = []

    def append(self, keys: torch.Tensor, values: torch.Tensor):
        """Compress and store a block of keys and values."""
        flat_keys = keys.reshape(-1, self.d_key)
        flat_values = values.reshape(-1, self.d_value)

        compressed_keys = self.key_quantizer.quantize(flat_keys)
        _, value_indices = self.value_quantizer(flat_values)

        self.key_cache.append(compressed_keys)
        self.value_cache.append(value_indices)

    def attention_scores(self, queries: torch.Tensor) -> torch.Tensor:
        """Estimated <query, key> scores against every cached key block.

        `queries` must broadcast against each cached block, e.g. a single
        query of shape (d_key,).
        """
        scores = []
        for cached in self.key_cache:
            s = self.key_quantizer.inner_product(queries, cached)
            scores.append(s)
        return torch.cat(scores, dim=-1) if scores else torch.tensor([])

    def get_values(self) -> torch.Tensor:
        """Dequantize and concatenate all cached value blocks."""
        values = []
        for indices in self.value_cache:
            v = self.value_quantizer.dequantize(indices)
            values.append(v)
        return torch.cat(values, dim=0) if values else torch.tensor([])

    def __len__(self):
        return sum(
            c['qjl_signs'].shape[0] for c in self.key_cache
        ) if self.key_cache else 0
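

if __name__ == "__main__":
    # Smoke-test sketch: exercises the full pipeline end to end, assuming the
    # clifford/lloyd_max helpers behave as documented above. Shapes and bit
    # widths are illustrative, not tuned.
    torch.manual_seed(0)

    cache = RotorQuantKVCache(d_key=64, d_value=64, bits=3)
    keys = torch.randn(32, 64)
    values = torch.randn(32, 64)
    cache.append(keys, values)

    query = torch.randn(64)
    approx = cache.attention_scores(query)  # estimated <query, key> scores
    exact = keys @ query
    recon_values = cache.get_values()       # (32, 64) value reconstruction

    print(f"cached tokens: {len(cache)}")
    print(f"score MSE: {(approx - exact).pow(2).mean().item():.6f}")
    print(f"value MSE: {(recon_values - values).pow(2).mean().item():.6f}")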