Upload fVLM-135M: Foveated Vision-Language Model (Stage 3 DPO)

6d320d6 verified 8 days ago

37 kB

	"""
	Foveated Vision-Language Model (release implementation).

	Architecture: DINOv2 encoder + foveated cross-attention + SmolLM2 LLM.
	Each video frame is compressed to ONE visual token via query-guided attention.
	The LLM controls WHERE to look by generating the query for the next frame.

	Forward modes:
	1. forward_coarse_fine -- Training (two parallel passes)
	2. forward_coarse_only -- Fast eval (single static-query pass)
	3. forward_autoregressive -- True inference (sequential, KV-cached)
	4. forward_dpo -- Preference training (coarse+fine on chosen and rejected text)

	Loss: text cross-entropy only (no reconstruction, no VAE).
	"""

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from transformers import AutoModelForCausalLM, AutoConfig
	from typing import Dict, Optional


	class FoveatedVLM(nn.Module):
	"""
	Foveated Vision-Language Model.

	Parameters
	----------
	llm_name : str
	HuggingFace model id for SmolLM2 (e.g. "HuggingFaceTB/SmolLM2-135M-Instruct").
	dino_name : str
	HuggingFace model id for DINOv2 (e.g. "facebook/dinov2-small").
	query_dim : int
	Dimension of the foveated query vectors (matches DINO dim by default).
	visual_scale : float
	Multiplicative factor applied to projected visual tokens so their
	magnitude matches the LLM embedding std (~0.14 for SmolLM2).
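	lambda_coarse : float
	Weight for the optional auxiliary coarse-pass CE loss during training.
	Set to 0 to disable.
	deep_query : bool
	If True, queries are attended through the encoder's per-layer K/V cache
	(``encoder.query_attend``); if False, a single
	``encoder.shallow_query_attend`` call on the final patch features is used.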
	"""

	def __init__(
	    self,
	    llm_name: str = "HuggingFaceTB/SmolLM2-135M-Instruct",
	    dino_name: str = "facebook/dinov2-small",
	    query_dim: int = 384,
	    visual_scale: float = 0.14,
	    lambda_coarse: float = 0.0,
	    deep_query: bool = True,
	):
	    """
	    Build the vision encoder, the language model, the two connector
	    projections and the two learnable queries.

	    Parameters
	    ----------
	    llm_name : str
	        Model id passed to ``AutoModelForCausalLM.from_pretrained``.
	    dino_name : str
	        Model id passed to ``FoveatedEncoder`` as ``dino_model_name``.
	    query_dim : int
	        Width of the query vectors (``q_static``, ``q_init`` and the
	        output of ``llm_to_query``).
	    visual_scale : float
	        Factor applied to projected visual tokens in ``_project_visual``.
	    lambda_coarse : float
	        Weight of the auxiliary coarse loss in ``forward_coarse_fine``.
	    deep_query : bool
	        Selects ``query_attend`` (True) or ``shallow_query_attend``
	        (False) in ``_batched_query_attend``.
	    """
	    super().__init__()

	    # Plain Python hyperparameters -- deliberately not registered as buffers.
	    self.query_dim = query_dim
	    self.visual_scale = visual_scale
	    self.lambda_coarse = lambda_coarse
	    self.deep_query = deep_query

	    # Imported lazily so encoder.py can live next to this file.
	    from release.model.encoder import FoveatedEncoder

	    # Vision side: DINOv2 plus query cross-attention. output_dim=None leaves
	    # the output width to the encoder's own default (its DINO width,
	    # according to the original note on this call).
	    vision = FoveatedEncoder(
	        dino_model_name=dino_name,
	        query_dim=query_dim,
	        output_dim=None,
	    )
	    self.encoder = vision
	    self.dino_dim = vision.dino_dim  # exposed for external code

	    # Language side, loaded in float32 with SDPA attention.
	    language_model = AutoModelForCausalLM.from_pretrained(
	        llm_name, attn_implementation="sdpa", torch_dtype=torch.float32,
	    )
	    # KV caching is off by default (training); individual forward
	    # methods switch it on when they need it.
	    language_model.config.use_cache = False
	    self.llm = language_model
	    self.llm_dim = language_model.config.hidden_size  # exposed for external code

	    # Connector: DINO features -> LLM space, LLM hidden state -> next query.
	    self.dino_to_llm = nn.Linear(self.dino_dim, self.llm_dim)
	    self.llm_to_query = nn.Linear(self.llm_dim, query_dim)

	    # Learnable queries. BUG-001 FIX: unit-variance init (torch.randn) so
	    # the queries dominate the projection bias and yield non-uniform
	    # attention patterns.
	    self.q_static = nn.Parameter(torch.randn(1, query_dim))
	    self.q_init = nn.Parameter(torch.randn(1, query_dim))

	# ------------------------------------------------------------------
	# helpers
	# ------------------------------------------------------------------

	def _get_pad_token_id(self) -> int:
	"""Return pad_token_id from the LLM config (never hardcoded)."""
	pid = getattr(self.llm.config, "pad_token_id", None)
	if pid is None:
	pid = getattr(self.llm.config, "eos_token_id", 0)
	return pid

	def _llm_dtype(self) -> torch.dtype:
	"""Return the dtype of the LLM parameters (e.g. bfloat16)."""
	return next(self.llm.parameters()).dtype

	def _embed_text(self, input_ids: torch.Tensor) -> torch.Tensor:
	"""[B, S] -> [B, S, llm_dim] via LLM embedding table."""
	return self.llm.get_input_embeddings()(input_ids)

	def _project_visual(self, z: torch.Tensor) -> torch.Tensor:
	"""
	Project DINO features to LLM space and rescale.

	z : [B, T, dino_dim] or [B, dino_dim]
	Returns same shape with last dim = llm_dim.
	"""
	h = self.dino_to_llm(z) # -> llm_dim
	h = h * self.visual_scale # match LLM embedding magnitude
	return h

	# Maximum number of frames sent through a single encoder call
	# (encode_patches / query_attend / shallow_query_attend). Larger inputs are
	# processed in consecutive chunks of this size and concatenated, to prevent
	# OOM on large batches.
	_MAX_ENCODE_CHUNK = 200

	def _encode_all_frames(self, frames: torch.Tensor, frame_mask=None):
	"""
	Run DINO patch encoding for every frame in the batch.

	frames : [B, T, 3, 224, 224]
	frame_mask : [B, T] bool — True for real frames, False for padding.

	Returns (kv_cache, patch_features, mask_flat):
	kv_cache : list of (K, V) per layer, each [n_real, N+1, D]
	(compact — only real frames, no padding waste).
	patch_features : [n_real, N+1, D] final DINO embeddings (for shallow mode).
	mask_flat : [B*T] bool tensor or None. Used to scatter results back.
	"""
	B, T, C, H, W = frames.shape
	BT = B * T
	frames_flat = frames.reshape(BT, C, H, W)

	if frame_mask is not None:
	mask_flat = frame_mask.reshape(BT)
	n_real = mask_flat.sum().item()
	else:
	mask_flat = None
	n_real = BT

	if mask_flat is not None and n_real < BT:
	real_frames = frames_flat[mask_flat] # [n_real, C, H, W]
	else:
	real_frames = frames_flat

	# Chunked encoding to prevent OOM on batches with many real frames
	if real_frames.shape[0] <= self._MAX_ENCODE_CHUNK:
	patch_features, kv_cache = self.encoder.encode_patches(real_frames)
	else:
	pf_chunks, kv_chunks = [], []
	for start in range(0, real_frames.shape[0], self._MAX_ENCODE_CHUNK):
	pf_chunk, kv_chunk = self.encoder.encode_patches(
	real_frames[start:start + self._MAX_ENCODE_CHUNK]
	)
	pf_chunks.append(pf_chunk)
	kv_chunks.append(kv_chunk)
	patch_features = torch.cat(pf_chunks, dim=0)
	kv_cache = [
	(torch.cat([c[li][0] for c in kv_chunks], dim=0),
	torch.cat([c[li][1] for c in kv_chunks], dim=0))
	for li in range(len(kv_chunks[0]))
	]

	return kv_cache, patch_features, mask_flat

	def _batched_query_attend(self, queries: torch.Tensor, kv_cache: list,
	patch_features: torch.Tensor = None) -> torch.Tensor:
	"""Chunked query_attend (deep) or shallow_query_attend to prevent OOM."""
	n = queries.shape[0]
	if not self.deep_query:
	# Shallow mode: single cross-attention on final features
	if n <= self._MAX_ENCODE_CHUNK:
	return self.encoder.shallow_query_attend(queries, patch_features)
	chunks = []
	for start in range(0, n, self._MAX_ENCODE_CHUNK):
	end = min(start + self._MAX_ENCODE_CHUNK, n)
	chunks.append(self.encoder.shallow_query_attend(
	queries[start:end], patch_features[start:end]))
	return torch.cat(chunks, dim=0)
	# Deep mode: propagate through all DINO layers
	if n <= self._MAX_ENCODE_CHUNK:
	return self.encoder.query_attend(queries, kv_cache)
	chunks = []
	for start in range(0, n, self._MAX_ENCODE_CHUNK):
	end = min(start + self._MAX_ENCODE_CHUNK, n)
	kv_slice = [(K[start:end], V[start:end]) for K, V in kv_cache]
	chunks.append(self.encoder.query_attend(queries[start:end], kv_slice))
	return torch.cat(chunks, dim=0)

	def _query_all_frames(
	self, query: torch.Tensor, kv_cache: list,
	B: int, T: int, mask_flat=None, patch_features=None,
	) -> torch.Tensor:
	"""
	Apply a single query to every frame in ONE batched query_attend call.

	query : [B, query_dim]
	kv_cache : list of (K, V) per layer, each [n_real, N+1, D]
	B, T : batch and temporal dimensions
	mask_flat : [B*T] bool or None
	patch_features : [n_real, N+1, D] (needed for shallow mode)
	Returns : [B, T, dino_dim]
	"""
	BT = B * T
	dd = self.encoder.dino_dim

	# Expand: same query for all T frames → [B*T, qd]
	query_exp = query.unsqueeze(1).expand(B, T, -1).reshape(BT, -1)

	if mask_flat is not None:
	n_real = mask_flat.sum().item()
	if n_real == 0:
	return torch.zeros(B, T, dd, device=query.device, dtype=query.dtype)
	query_real = query_exp[mask_flat] # [n_real, qd]
	z_real = self._batched_query_attend(query_real, kv_cache, patch_features)
	z_flat = torch.zeros(BT, dd, device=query.device, dtype=z_real.dtype)
	z_flat[mask_flat] = z_real
	else:
	z_flat = self._batched_query_attend(query_exp, kv_cache, patch_features)

	return z_flat.reshape(B, T, dd)

	def _query_all_frames_batched(
	self, queries: torch.Tensor, kv_cache: list,
	B: int, T: int, mask_flat=None, patch_features=None,
	) -> torch.Tensor:
	"""
	Apply per-frame queries in ONE batched query_attend call.

	queries : [B, T, query_dim]
	kv_cache : list of (K, V) per layer, each [n_real, N+1, D]
	B, T : batch and temporal dimensions
	mask_flat : [B*T] bool or None
	patch_features : [n_real, N+1, D] (needed for shallow mode)
	Returns : [B, T, dino_dim]
	"""
	BT = B * T
	dd = self.encoder.dino_dim
	queries_flat = queries.reshape(BT, -1)

	if mask_flat is not None:
	n_real = mask_flat.sum().item()
	if n_real == 0:
	return torch.zeros(B, T, dd, device=queries.device, dtype=queries.dtype)
	query_real = queries_flat[mask_flat] # [n_real, qd]
	z_real = self._batched_query_attend(query_real, kv_cache, patch_features)
	z_flat = torch.zeros(BT, dd, device=queries.device, dtype=z_real.dtype)
	z_flat[mask_flat] = z_real
	else:
	z_flat = self._batched_query_attend(queries_flat, kv_cache, patch_features)

	return z_flat.reshape(B, T, dd)

	def _extract_frame_kv(self, kv_cache: list, mask_flat, B: int, T: int, frame_idx: int):
	"""
	Extract single-frame KV cache from flat format (for autoregressive/eval).

	Returns list of (K, V) per layer, each [B, N+1, D].
	"""
	if mask_flat is not None:
	# Scatter compact caches to full [B*T] then extract frame
	N1 = kv_cache[0][0].shape[1]
	D = kv_cache[0][0].shape[2]
	frame_kv = []
	for K_real, V_real in kv_cache:
	K_full = torch.zeros(B * T, N1, D, dtype=K_real.dtype, device=K_real.device)
	V_full = torch.zeros(B * T, N1, D, dtype=V_real.dtype, device=V_real.device)
	K_full[mask_flat] = K_real
	V_full[mask_flat] = V_real
	K_t = K_full.reshape(B, T, N1, D)[:, frame_idx] # [B, N+1, D]
	V_t = V_full.reshape(B, T, N1, D)[:, frame_idx]
	frame_kv.append((K_t, V_t))
	return frame_kv
	else:
	N1 = kv_cache[0][0].shape[1]
	D = kv_cache[0][0].shape[2]
	frame_kv = []
	for K_all, V_all in kv_cache:
	K_t = K_all.reshape(B, T, N1, D)[:, frame_idx]
	V_t = V_all.reshape(B, T, N1, D)[:, frame_idx]
	frame_kv.append((K_t, V_t))
	return frame_kv

	def _build_causal_mask(self, seq_len: int, device: torch.device) -> torch.Tensor:
	"""
	Standard causal attention mask [1, 1, S, S] for the LLM.
	True = masked (cannot attend), False = allowed.
	"""
	mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device=device).triu(1)
	return mask.unsqueeze(0).unsqueeze(0) # [1, 1, S, S]

	def _ce_loss(
	self,
	logits: torch.Tensor,
	labels: torch.Tensor,
	loss_mask: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	"""
	Standard autoregressive CE loss with shift-by-1.

	logits : [B, S, V] (full sequence logits)
	labels : [B, S] (token ids; positions without loss use pad)
	loss_mask : [B, S] (1 = compute loss, 0 = ignore). Applied BEFORE
	the shift so that loss_mask[i] guards label[i].

	Returns scalar loss.
	"""
	# Shift: predict position i+1 from position i
	shift_logits = logits[:, :-1, :].contiguous() # [B, S-1, V]
	shift_labels = labels[:, 1:].contiguous() # [B, S-1]

	if loss_mask is not None:
	shift_mask = loss_mask[:, 1:].contiguous() # [B, S-1]
	# Replace masked positions with ignore_index so CE ignores them
	pad_id = self._get_pad_token_id()
	shift_labels = shift_labels.clone()
	shift_labels[shift_mask == 0] = pad_id

	V = shift_logits.shape[-1]
	loss = F.cross_entropy(
	shift_logits.reshape(-1, V),
	shift_labels.reshape(-1),
	ignore_index=self._get_pad_token_id(),
	reduction="mean",
	)
	return loss

	# ------------------------------------------------------------------
	# Forward mode 1: Coarse+Fine (TRAINING)
	# ------------------------------------------------------------------

	def forward_coarse_fine(
	self,
	frames: torch.Tensor,
	input_ids: torch.Tensor,
	attention_mask: torch.Tensor,
	loss_mask: Optional[torch.Tensor] = None,
	frame_mask: Optional[torch.Tensor] = None,
	) -> Dict[str, torch.Tensor]:
	"""
	Two-pass parallel training forward.

	Pass 1 (coarse): q_static -> all frames -> z_coarse -> LLM -> dynamic queries
	Pass 2 (fine): shifted queries -> all frames -> z_fine -> LLM + text -> loss

	Parameters
	----------
	frames : [B, T, 3, 224, 224]
	input_ids : [B, S] tokenized text (prompt + answer)
	attention_mask : [B, S] text attention mask
	loss_mask : [B, S] which tokens contribute to loss (1=yes, 0=no).
	If None, all non-pad tokens have loss.

	Returns
	-------
	dict with keys: loss, logits, coarse_loss (optional), fine_loss
	"""
	B, T = frames.shape[:2]
	S = input_ids.shape[1]

	# ---- Step 0: Encode all frames (DINO, shared across both passes) ----
	kv_cache, patch_features, mask_flat = self._encode_all_frames(frames, frame_mask)

	# ---- Pass 1: Coarse ----
	q_static = self.q_static.expand(B, -1) # [B, qd]
	z_coarse = self._query_all_frames(q_static, kv_cache, B, T, mask_flat, patch_features) # [B,T,dd]
	z_coarse_llm = self._project_visual(z_coarse) # [B,T,ld]

	# Build coarse sequence: [visual_coarse, text]
	text_embeds = self._embed_text(input_ids) # [B,S,ld]
	seq_coarse = torch.cat([z_coarse_llm, text_embeds], dim=1) # [B,T+S,ld]
	# dtype handled by autocast on GPU; float32 on CPU

	# LLM forward (backbone only, no lm_head yet)
	out_coarse = self.llm.model(inputs_embeds=seq_coarse)
	h_coarse = out_coarse.last_hidden_state # [B,T+S,ld]

	# Extract dynamic queries from visual positions
	# h_coarse[:, 0..T-1] are the hidden states at visual token positions
	# Each one generates a query for the corresponding frame
	h_visual_coarse = h_coarse[:, :T, :] # [B,T,ld]
	queries = self.llm_to_query(h_visual_coarse) # [B,T,qd]

	# Shift queries: frame t gets query from frame t-1; frame 0 gets q_init
	q_init = self.q_init.expand(B, 1, -1) # [B,1,qd]
	shifted_queries = torch.cat([q_init, queries[:, :-1]], dim=1) # [B,T,qd]

	# ---- Pass 2: Fine ----
	z_fine = self._query_all_frames_batched(shifted_queries, kv_cache, B, T, mask_flat, patch_features) # [B,T,dd]
	z_fine_llm = self._project_visual(z_fine) # [B,T,ld]

	# Build fine sequence: [visual_fine, text]
	seq_fine = torch.cat([z_fine_llm, text_embeds], dim=1) # [B,T+S,ld]
	# dtype handled by autocast on GPU; float32 on CPU

	out_fine = self.llm.model(inputs_embeds=seq_fine)
	h_fine = out_fine.last_hidden_state # [B,T+S,ld]

	# Get logits over the FULL sequence (visual + text positions)
	logits_full = self.llm.lm_head(h_fine) # [B,T+S,V]

	# ---- Loss on text portion only ----
	# The text tokens start at position T in the sequence.
	# We need labels aligned with the full sequence: visual positions get pad.
	pad_id = self._get_pad_token_id()
	visual_pad = torch.full(
	(B, T), pad_id, dtype=input_ids.dtype, device=input_ids.device,
	)
	full_labels = torch.cat([visual_pad, input_ids], dim=1) # [B, T+S]

	# Build full loss mask: 0 for visual positions, then the provided loss_mask
	if loss_mask is not None:
	visual_no_loss = torch.zeros(
	B, T, dtype=loss_mask.dtype, device=loss_mask.device,
	)
	full_loss_mask = torch.cat([visual_no_loss, loss_mask], dim=1) # [B,T+S]
	else:
	# Default: compute loss on all text positions that are not padding
	visual_no_loss = torch.zeros(B, T, dtype=attention_mask.dtype, device=attention_mask.device)
	text_loss_mask = attention_mask # non-pad text positions
	full_loss_mask = torch.cat([visual_no_loss, text_loss_mask], dim=1)

	fine_loss = self._ce_loss(logits_full, full_labels, full_loss_mask)

	# ---- Optional auxiliary coarse loss ----
	coarse_loss = torch.tensor(0.0, device=frames.device)
	if self.lambda_coarse > 0:
	logits_coarse = self.llm.lm_head(h_coarse)
	coarse_loss = self._ce_loss(logits_coarse, full_labels, full_loss_mask)

	# ---- Combined loss ----
	loss = fine_loss + self.lambda_coarse * coarse_loss

	return {
	"loss": loss,
	"fine_loss": fine_loss,
	"coarse_loss": coarse_loss,
	"logits": logits_full,
	}

	# ------------------------------------------------------------------
	# Forward mode: DPO (preference training)
	# ------------------------------------------------------------------

	def forward_dpo(
	self,
	frames: torch.Tensor,
	chosen_input_ids: torch.Tensor,
	chosen_attention_mask: torch.Tensor,
	chosen_loss_mask: torch.Tensor,
	rejected_input_ids: torch.Tensor,
	rejected_attention_mask: torch.Tensor,
	rejected_loss_mask: torch.Tensor,
	frame_mask: Optional[torch.Tensor] = None,
	) -> Dict[str, torch.Tensor]:
	"""
	DPO forward pass: run coarse+fine on both chosen and rejected sequences.

	Shares DINO encoding across chosen and rejected (same visual input).
	Returns per-sample sum of log-probabilities for both chosen and rejected,
	masked by loss_mask (answer-only tokens).

	Parameters
	----------
	frames : [B, T, 3, 224, 224]
	chosen_input_ids : [B, S_c]
	chosen_attention_mask : [B, S_c]
	chosen_loss_mask : [B, S_c] (1 = answer token, 0 = prompt/pad)
	rejected_input_ids : [B, S_r]
	rejected_attention_mask : [B, S_r]
	rejected_loss_mask : [B, S_r]
	frame_mask : [B, T] bool (optional)

	Returns
	-------
	dict with keys:
	chosen_logps : [B] per-sample sum of log-probs on chosen answer tokens
	rejected_logps : [B] per-sample sum of log-probs on rejected answer tokens
	chosen_logits : [B, T+S_c, V] full logits for chosen
	rejected_logits : [B, T+S_r, V] full logits for rejected
	"""
	B, T = frames.shape[:2]

	# ---- Step 0: Encode all frames (DINO, shared across chosen & rejected) ----
	kv_cache, patch_features, mask_flat = self._encode_all_frames(frames, frame_mask)

	# ---- Coarse pass (shared, used for dynamic query generation) ----
	q_static = self.q_static.expand(B, -1) # [B, qd]
	z_coarse = self._query_all_frames(q_static, kv_cache, B, T, mask_flat, patch_features)
	z_coarse_llm = self._project_visual(z_coarse) # [B, T, ld]

	# Run coarse LLM to get dynamic queries (use chosen text for query generation)
	text_embeds_chosen = self._embed_text(chosen_input_ids) # [B, S_c, ld]
	seq_coarse = torch.cat([z_coarse_llm, text_embeds_chosen], dim=1)
	out_coarse = self.llm.model(inputs_embeds=seq_coarse)
	h_coarse = out_coarse.last_hidden_state

	# Extract dynamic queries from visual positions
	h_visual_coarse = h_coarse[:, :T, :] # [B, T, ld]
	queries = self.llm_to_query(h_visual_coarse) # [B, T, qd]

	q_init = self.q_init.expand(B, 1, -1)
	shifted_queries = torch.cat([q_init, queries[:, :-1]], dim=1) # [B, T, qd]

	# ---- Fine pass: shared visual features ----
	z_fine = self._query_all_frames_batched(shifted_queries, kv_cache, B, T, mask_flat, patch_features)
	z_fine_llm = self._project_visual(z_fine) # [B, T, ld]

	# ---- Forward on CHOSEN ----
	seq_chosen = torch.cat([z_fine_llm, text_embeds_chosen], dim=1) # [B, T+S_c, ld]
	out_chosen = self.llm.model(inputs_embeds=seq_chosen)
	chosen_logits = self.llm.lm_head(out_chosen.last_hidden_state) # [B, T+S_c, V]

	# ---- Forward on REJECTED ----
	text_embeds_rejected = self._embed_text(rejected_input_ids) # [B, S_r, ld]
	seq_rejected = torch.cat([z_fine_llm, text_embeds_rejected], dim=1)
	out_rejected = self.llm.model(inputs_embeds=seq_rejected)
	rejected_logits = self.llm.lm_head(out_rejected.last_hidden_state)

	# ---- Compute per-token log-probs ----
	chosen_logps = self._sequence_logprobs(
	chosen_logits, chosen_input_ids, chosen_loss_mask, T,
	)
	rejected_logps = self._sequence_logprobs(
	rejected_logits, rejected_input_ids, rejected_loss_mask, T,
	)

	return {
	"chosen_logps": chosen_logps, # [B]
	"rejected_logps": rejected_logps, # [B]
	"chosen_logits": chosen_logits, # [B, T+S_c, V]
	"rejected_logits": rejected_logits, # [B, T+S_r, V]
	}

	def _sequence_logprobs(
	self,
	logits: torch.Tensor,
	input_ids: torch.Tensor,
	loss_mask: torch.Tensor,
	T: int,
	) -> torch.Tensor:
	"""
	Compute per-sample sum of log-probabilities on answer tokens.

	logits : [B, T+S, V] full sequence logits (visual + text)
	input_ids : [B, S] text token ids
	loss_mask : [B, S] 1.0 for answer tokens, 0.0 otherwise
	T : int number of visual token positions

	Returns : [B] sum of log-probs per sample
	"""
	B, S = input_ids.shape

	# Extract text logits and shift for autoregressive prediction
	text_logits = logits[:, T:, :] # [B, S, V]
	shift_logits = text_logits[:, :-1, :] # [B, S-1, V]
	shift_labels = input_ids[:, 1:] # [B, S-1]
	shift_mask = loss_mask[:, 1:] # [B, S-1]

	# Per-token log-probs: log_softmax then gather the label's prob
	log_probs = F.log_softmax(shift_logits, dim=-1) # [B, S-1, V]
	per_token_logps = log_probs.gather(
	dim=-1, index=shift_labels.unsqueeze(-1),
	).squeeze(-1) # [B, S-1]

	# Mask and sum per sample
	per_token_logps = per_token_logps * shift_mask # zero out non-answer tokens
	return per_token_logps.sum(dim=-1) # [B]

	# ------------------------------------------------------------------
	# Forward mode 2: Coarse only (FAST EVAL)
	# ------------------------------------------------------------------

	def forward_coarse_only(
	self,
	frames: torch.Tensor,
	input_ids: Optional[torch.Tensor] = None,
	attention_mask: Optional[torch.Tensor] = None,
	loss_mask: Optional[torch.Tensor] = None,
	frame_mask: Optional[torch.Tensor] = None,
	) -> Dict[str, torch.Tensor]:
	"""
	Single-pass coarse forward (q_static only, no fine queries).

	Used for:
	- Training A6 ablation (coarse-only training)
	- Fast eval (wrap in torch.no_grad() externally)

	q_static -> all frames -> z_coarse -> LLM -> logits.

	Parameters
	----------
	frames : [B, T, 3, 224, 224]
	input_ids : [B, S] (optional, for loss computation)
	attention_mask : [B, S] (optional)
	loss_mask : [B, S] (optional)

	Returns
	-------
	dict with keys: logits, and optionally loss
	"""
	B, T = frames.shape[:2]

	kv_cache, patch_features, mask_flat = self._encode_all_frames(frames, frame_mask)

	q_static = self.q_static.expand(B, -1)
	z_coarse = self._query_all_frames(q_static, kv_cache, B, T, mask_flat, patch_features)
	z_coarse_llm = self._project_visual(z_coarse)

	if input_ids is not None:
	text_embeds = self._embed_text(input_ids)
	seq = torch.cat([z_coarse_llm, text_embeds], dim=1)
	else:
	seq = z_coarse_llm
	# dtype handled by autocast on GPU; float32 on CPU

	out = self.llm.model(inputs_embeds=seq)
	logits = self.llm.lm_head(out.last_hidden_state)

	result: Dict[str, torch.Tensor] = {"logits": logits}

	if input_ids is not None:
	S = input_ids.shape[1]
	pad_id = self._get_pad_token_id()
	visual_pad = torch.full(
	(B, T), pad_id, dtype=input_ids.dtype, device=input_ids.device,
	)
	full_labels = torch.cat([visual_pad, input_ids], dim=1)

	if loss_mask is not None:
	visual_no_loss = torch.zeros(
	B, T, dtype=loss_mask.dtype, device=loss_mask.device,
	)
	full_loss_mask = torch.cat([visual_no_loss, loss_mask], dim=1)
	elif attention_mask is not None:
	visual_no_loss = torch.zeros(
	B, T, dtype=attention_mask.dtype, device=attention_mask.device,
	)
	full_loss_mask = torch.cat([visual_no_loss, attention_mask], dim=1)
	else:
	full_loss_mask = None

	loss = self._ce_loss(logits, full_labels, full_loss_mask)
	result["loss"] = loss
	result["coarse_loss"] = loss
	result["fine_loss"] = torch.tensor(0.0, device=frames.device)

	return result

	# ------------------------------------------------------------------
	# Forward mode 3: Autoregressive (TRUE INFERENCE)
	# ------------------------------------------------------------------

	@torch.no_grad()
	def forward_autoregressive(
	self,
	frames: torch.Tensor,
	input_ids: Optional[torch.Tensor] = None,
	attention_mask: Optional[torch.Tensor] = None,
	loss_mask: Optional[torch.Tensor] = None,
	frame_mask: Optional[torch.Tensor] = None,
	) -> Dict[str, torch.Tensor]:
	"""
	True autoregressive inference: sequential frame-by-frame with KV cache.

	q_init -> frame_1 -> z_1 -> LLM -> q_1 -> frame_2 -> z_2 -> ...

	No coarse pass. Each query is derived from the LLM hidden state after
	processing the previous fine visual token -- exactly what happens at
	real inference time.

	Parameters
	----------
	frames : [B, T, 3, 224, 224]
	input_ids : [B, S] (optional, for loss computation)
	attention_mask : [B, S] (optional)
	loss_mask : [B, S] (optional)

	Returns
	-------
	dict with keys: logits, and optionally loss
	"""
	B, T = frames.shape[:2]
	device = frames.device

	# Encode all frames with DINO up front (this is OK -- DINO encoding
	# does not depend on the query, only query_attend does).
	kv_cache, patch_features, mask_flat = self._encode_all_frames(frames, frame_mask)

	# Enable KV cache on the LLM for incremental decoding
	orig_use_cache = self.llm.config.use_cache
	self.llm.config.use_cache = True

	query = self.q_init.expand(B, -1) # [B, qd]
	llm_past_kv = None

	for t in range(T):
	# Foveated extraction with current query
	frame_kv = self._extract_frame_kv(kv_cache, mask_flat, B, T, t)
	z_t = self.encoder.query_attend(query, frame_kv) # [B, dd]
	z_t_llm = self._project_visual(z_t.unsqueeze(1)) # [B,1,ld]
	# dtype handled by autocast on GPU; float32 on CPU

	# Incremental LLM forward (one visual token at a time)
	out = self.llm.model(
	inputs_embeds=z_t_llm,
	past_key_values=llm_past_kv,
	use_cache=True,
	)
	llm_past_kv = out.past_key_values

	# Derive query for the NEXT frame from the current hidden state
	if t < T - 1:
	h_t = out.last_hidden_state[:, -1, :] # [B, ld]
	query = self.llm_to_query(h_t) # [B, qd]

	# ---- Now process text (if provided) using the accumulated KV cache ----
	if input_ids is not None:
	text_embeds = self._embed_text(input_ids) # [B, S, ld]

	out_text = self.llm.model(
	inputs_embeds=text_embeds,
	past_key_values=llm_past_kv,
	use_cache=False,
	)
	# Combine visual hidden states (already in KV cache) with text states
	# for logit computation. We only need logits over the text portion
	# (plus the last visual token which predicts the first text token).
	#
	# The KV cache holds T visual positions; out_text.last_hidden_state
	# holds S text positions. We reconstruct the full logits as
	# [visual_logits, text_logits] but only compute loss on text.
	h_text = out_text.last_hidden_state # [B, S, ld]
	logits_text = self.llm.lm_head(h_text) # [B, S, V]

	# For the loss we also need the logit at the last visual position
	# (it predicts the first text token). Re-derive it:
	h_last_visual = out.last_hidden_state[:, -1:, :] # [B,1,ld]
	logits_last_v = self.llm.lm_head(h_last_visual) # [B,1,V]

	# Full logits over [last_visual, text] = [B, 1+S, V]
	logits = torch.cat([logits_last_v, logits_text], dim=1)

	# Labels: [pad_for_last_visual, input_ids]
	pad_id = self._get_pad_token_id()
	lv_pad = torch.full(
	(B, 1), pad_id, dtype=input_ids.dtype, device=device,
	)
	full_labels = torch.cat([lv_pad, input_ids], dim=1)

	# Loss mask
	if loss_mask is not None:
	lv_no_loss = torch.zeros(
	B, 1, dtype=loss_mask.dtype, device=device,
	)
	full_loss_mask = torch.cat([lv_no_loss, loss_mask], dim=1)
	elif attention_mask is not None:
	lv_no_loss = torch.zeros(
	B, 1, dtype=attention_mask.dtype, device=device,
	)
	full_loss_mask = torch.cat([lv_no_loss, attention_mask], dim=1)
	else:
	full_loss_mask = None

	loss = self._ce_loss(logits, full_labels, full_loss_mask)

	self.llm.config.use_cache = orig_use_cache
	return {"loss": loss, "logits": logits}

	else:
	# No text -- just return logits at the last visual position
	h_last = out.last_hidden_state # [B, 1, ld]
	logits = self.llm.lm_head(h_last)
	self.llm.config.use_cache = orig_use_cache
	return {"logits": logits}

	# ------------------------------------------------------------------
	# Convenience: unified forward dispatching by name
	# ------------------------------------------------------------------

	def forward(
	self,
	frames: torch.Tensor,
	input_ids: torch.Tensor,
	attention_mask: torch.Tensor,
	loss_mask: Optional[torch.Tensor] = None,
	frame_mask: Optional[torch.Tensor] = None,
	mode: str = "coarse_fine",
	) -> Dict[str, torch.Tensor]:
	"""
	Unified forward entry point.

	mode : "coarse_fine" \| "coarse_only" \| "autoregressive"
	frame_mask : [B, T] bool — True for real frames, False for padding.
	"""
	if mode == "coarse_fine":
	return self.forward_coarse_fine(frames, input_ids, attention_mask, loss_mask, frame_mask)
	elif mode == "coarse_only":
	return self.forward_coarse_only(frames, input_ids, attention_mask, loss_mask, frame_mask)
	elif mode == "autoregressive":
	return self.forward_autoregressive(frames, input_ids, attention_mask, loss_mask, frame_mask)
	else:
	raise ValueError(
	f"Unknown forward mode '{mode}'. "
	"Expected one of: coarse_fine, coarse_only, autoregressive"
	)

	# ------------------------------------------------------------------
	# Utility methods for external callers (train.py, eval.py)
	# ------------------------------------------------------------------

	def enable_gradient_checkpointing(self) -> None:
	"""Turn on activation checkpointing for LLM and DINO."""
	self.llm.gradient_checkpointing_enable()
	if hasattr(self.encoder.dino, 'gradient_checkpointing_enable'):
	self.encoder.dino.gradient_checkpointing_enable()

	def get_param_groups(
	self,
	lr_backbone: float = 1e-5,
	lr_connector: float = 1e-4,
	) -> list:
	"""
	Return parameter groups with differential learning rates.

	Groups:
	1. Connector (dino_to_llm, llm_to_query, q_static, q_init) -- highest LR
	2. DINO encoder -- backbone LR
	3. LLM -- backbone LR

	This is a suggestion; train.py may override.
	"""
	connector_params = set()
	for name, param in self.named_parameters():
	if any(k in name for k in [
	"dino_to_llm", "llm_to_query", "q_static", "q_init",
	"query_input_proj", "query_output_proj",
	]):
	connector_params.add(id(param))

	encoder_params = set()
	for name, param in self.encoder.named_parameters():
	if id(param) not in connector_params:
	encoder_params.add(id(param))

	groups = [
	{
	"params": [p for p in self.parameters()
	if id(p) in connector_params and p.requires_grad],
	"lr": lr_connector,
	"name": "connector",
	},
	{
	"params": [p for n, p in self.encoder.named_parameters()
	if id(p) in encoder_params and p.requires_grad],
	"lr": lr_backbone,
	"name": "dino",
	},
	{
	"params": [p for p in self.llm.parameters() if p.requires_grad],
	"lr": lr_backbone,
	"name": "llm",
	},
	]
	return [g for g in groups if len(g["params"]) > 0]