sad / src /models /sad_model.py

Add files using upload-large-folder tool

922bb4b verified 26 days ago

21.5 kB

	"""
	SAD backbone model – Block-AR variant with vectorized training.

	Architecture:
	- Sequence divided into blocks of `block_size` tokens
	- Within a block: bidirectional attention
	- Across blocks: autoregressive (each block sees only earlier blocks' clean tokens)
	- No timestep conditioning; single learnable cond_bias vector
	- Input: continuous embeddings (leaf embs / prototype embs / mask emb)

	Vectorized training (forward_vectorized):
	Concatenate x_full = [x_noisy (0..L-1) \| x_clean (L..2L-1)], shape [B, 2L, d].
	Apply block-diff attention mask (see build_block_diff_mask), run one forward pass,
	return logits for the noisy half only [B, L, V].

	Block-diff mask rules (positions in x_full = [noisy \| clean]):
	noisy[i] → noisy[j]: allowed iff same block
	noisy[i] → clean[j]: allowed iff block(j) < block(i) (strictly earlier clean block)
	clean[i] → clean[j]: allowed iff block(j) <= block(i) (same or earlier clean block)
	clean[i] → noisy[j]: never

	Flex attention is used when available for the sparse block-diff mask.
	"""

	import math
	from typing import Optional, Tuple

	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	from .dit_components import (
	DDiTBlockWithMask,
	DDitFinalLayer,
	EmbeddingLayer,
	Rotary,
	)

	# Use FlexAttention via create_block_mask + mask_mod for true block-sparse
	# attention. The compiled kernel skips entirely-masked kv-blocks.
	try:
	from torch.nn.attention.flex_attention import create_block_mask
	_has_flex_attention = True
	except ImportError:
	create_block_mask = None
	_has_flex_attention = False


	# ──────────────────────────────────────────────────────────────────────────────
	# Block-diff attention mask
	# ──────────────────────────────────────────────────────────────────────────────

	def build_block_diff_mask(seq_len: int, block_size: int, device) -> torch.Tensor:
	"""
	Build [2L, 2L] bool attention mask for vectorized block-diff training.

	x_full layout: [x_noisy (0..L-1) \| x_clean (L..2L-1)]

	Rules:
	noisy->noisy : same block only (block_q == block_k)
	noisy->clean : strictly earlier block (block_k < block_q)
	clean->clean : same or earlier block (block_k <= block_q)
	clean->noisy : never

	Returns: mask [2L, 2L], True = allowed to attend
	"""
	L = seq_len
	N = 2 * L
	idx = torch.arange(N, device=device)
	q = idx[:, None]
	k = idx[None, :]

	is_clean_q = q >= L
	is_clean_k = k >= L

	block_q = torch.where(is_clean_q, (q - L) // block_size, q // block_size)
	block_k = torch.where(is_clean_k, (k - L) // block_size, k // block_size)

	noisy_to_noisy = (~is_clean_q) & (~is_clean_k) & (block_q == block_k)
	noisy_to_clean = (~is_clean_q) & is_clean_k & (block_k < block_q)
	clean_to_clean = is_clean_q & is_clean_k & (block_k <= block_q)

	return noisy_to_noisy \| noisy_to_clean \| clean_to_clean


	def _make_block_diff_mask_mod(seq_len: int, block_size: int):
	"""
	Return a mask_mod function for flex_attention's create_block_mask implementing
	the block-diff mask. Positions 0..L-1 are noisy; L..2L-1 are clean.

	Unlike score_mod (which runs per-score), mask_mod is used by create_block_mask
	to build a structured BlockMask that the compiled kernel uses to skip
	entire kv-blocks — this is where the real sparsity speedup comes from.
	"""
	L = seq_len

	def mask_mod(b, h, q_idx, k_idx):
	is_clean_q = q_idx >= L
	is_clean_k = k_idx >= L
	block_q = torch.where(is_clean_q, (q_idx - L) // block_size, q_idx // block_size)
	block_k = torch.where(is_clean_k, (k_idx - L) // block_size, k_idx // block_size)

	n2n = (~is_clean_q) & (~is_clean_k) & (block_q == block_k)
	n2c = (~is_clean_q) & is_clean_k & (block_k < block_q)
	c2c = is_clean_q & is_clean_k & (block_k <= block_q)
	return n2n \| n2c \| c2c

	return mask_mod


	# ──────────────────────────────────────────────────────────────────────────────
	# SADModel
	# ──────────────────────────────────────────────────────────────────────────────

	class SADModel(nn.Module):
	"""
	SAD backbone: Block-AR diffusion, no timestep conditioning.

	Args:
	vocab_size: V – leaf vocabulary size
	hidden_size: d – transformer hidden dimension
	n_blocks: number of transformer blocks
	n_heads: number of attention heads
	cond_dim: conditioning dimension for adaLN
	max_seq_len: total sequence length (must be divisible by block_size)
	block_size: tokens per block
	dropout: dropout rate
	num_levels: total hierarchy levels (leaf + intermediate, not counting mask)
	level_sizes: [V, K1, K2, ...]
	"""

	def __init__(
	self,
	vocab_size: int,
	hidden_size: int = 768,
	n_blocks: int = 12,
	n_heads: int = 12,
	cond_dim: int = 128,
	max_seq_len: int = 512,
	block_size: int = 8,
	dropout: float = 0.0,
	num_levels: int = 2,
	level_sizes: Optional[list] = None,
	tie_weights: bool = True,
	# kept for API compat, ignored:
	use_aux_head: bool = False,
	embed_input_mode: str = "embeddings",
	extended_vocab_size: int = 0,
	):
	super().__init__()
	assert max_seq_len % block_size == 0, \
	f"max_seq_len ({max_seq_len}) must be divisible by block_size ({block_size})"

	self.vocab_size = vocab_size
	self.hidden_size = hidden_size
	self.n_blocks = n_blocks
	self.n_heads = n_heads
	self.cond_dim = cond_dim
	self.max_seq_len = max_seq_len
	self.block_size = block_size
	self.num_total_blocks = max_seq_len // block_size
	self.num_levels = num_levels
	self.level_sizes = level_sizes
	self.aux_head = None

	# ---- Leaf token embedding table ----
	raw_size = vocab_size
	self.rounded_vocab_size = raw_size + (128 - raw_size % 128) % 128
	self.vocab_embed = EmbeddingLayer(hidden_size, self.rounded_vocab_size)

	# ---- Input projection (hidden_size -> hidden_size) ----
	self.input_proj = nn.Linear(hidden_size, hidden_size, bias=False)

	# ---- Conditioning: single learnable bias, no t ----
	self.cond_bias = nn.Parameter(torch.zeros(cond_dim))
	nn.init.normal_(self.cond_bias, std=0.02)

	# ---- Block-index embedding (AR position across blocks) ----
	# Used for both halves; block index is the same for noisy and clean halves.
	self.block_idx_embed = nn.Embedding(self.num_total_blocks, hidden_size)
	nn.init.normal_(self.block_idx_embed.weight, std=0.02)

	# ---- Intra-block position embedding ----
	self.intra_pos_embed = nn.Embedding(block_size, hidden_size)
	nn.init.normal_(self.intra_pos_embed.weight, std=0.02)

	# ---- Noisy/clean segment embedding ----
	# Tells the model whether a position is in the noisy half or clean half.
	self.segment_embed = nn.Embedding(2, hidden_size) # 0=noisy, 1=clean
	nn.init.normal_(self.segment_embed.weight, std=0.02)

	# ---- Rotary (intra-block positions only, repeated) ----
	self.rotary_emb = Rotary(
	hidden_size // n_heads,
	max_seq_len=max_seq_len * 2, # 2L for the full vectorized sequence
	)

	# ---- Transformer blocks ----
	self.blocks = nn.ModuleList([
	DDiTBlockWithMask(hidden_size, n_heads, cond_dim, dropout=dropout)
	for _ in range(n_blocks)
	])

	# ---- Output head ----
	rounded_leaf = vocab_size + (128 - vocab_size % 128) % 128
	self.output_layer = DDitFinalLayer(hidden_size, rounded_leaf, cond_dim)
	if tie_weights:
	# Weight tying: output projection shares the input embedding matrix,
	# so logits = h @ vocab_embed.embedding.T (same parameter, same gradients).
	self.output_layer.linear.weight = self.vocab_embed.embedding
	else:
	# Decoupled: independent output projection.
	# DDitFinalLayer zero-initialises its linear layer; re-init for trainability.
	nn.init.normal_(self.output_layer.linear.weight, std=0.02)

	# Cache block-diff masks (keyed by (seq_len, block_size))
	self._mask_cache: dict = {}

	def get_leaf_embeddings(self) -> torch.Tensor:
	"""[V, hidden_size] embedding matrix for leaf tokens."""
	return self.vocab_embed.embedding[:self.vocab_size]

	def _get_block_diff_mask(self, seq_len: int, device) -> torch.Tensor:
	key = (seq_len, self.block_size, str(device))
	if key not in self._mask_cache:
	self._mask_cache[key] = build_block_diff_mask(seq_len, self.block_size, device)
	return self._mask_cache[key]

	def _get_flex_block_mask(self, seq_len: int, device):
	"""Cached flex BlockMask for (L, block_size, device)."""
	key = ("flex", seq_len, self.block_size, str(device))
	if key not in self._mask_cache:
	mask_mod = _make_block_diff_mask_mod(seq_len, self.block_size)
	self._mask_cache[key] = create_block_mask(
	mask_mod, B=None, H=None,
	Q_LEN=2 * seq_len, KV_LEN=2 * seq_len,
	device=device,
	)
	return self._mask_cache[key]

	def _build_position_embeddings(
	self, seq_len: int, is_clean_half: bool, device, dtype
	) -> torch.Tensor:
	"""
	Build [1, seq_len, hidden_size] position + segment embeddings.

	seq_len = L (one half of the vectorized sequence).
	"""
	num_blocks = seq_len // self.block_size

	# Block indices: [0,0,...,0, 1,1,...,1, ..., nb-1,...,nb-1]
	block_idx = torch.arange(num_blocks, device=device).repeat_interleave(self.block_size)
	# Intra-block positions: [0,1,...,bs-1, 0,1,...,bs-1, ...]
	intra_pos = torch.arange(self.block_size, device=device).repeat(num_blocks)

	seg_id = torch.ones(seq_len, dtype=torch.long, device=device) * int(is_clean_half)

	pos_emb = (
	self.block_idx_embed(block_idx) # [L, d]
	+ self.intra_pos_embed(intra_pos) # [L, d]
	+ self.segment_embed(seg_id) # [L, d]
	).unsqueeze(0).to(dtype) # [1, L, d]
	return pos_emb

	def forward_vectorized(
	self,
	noisy_embs: torch.Tensor,
	clean_embs: torch.Tensor,
	attention_mask: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	"""
	Single-forward vectorized block-diff training pass.

	Concatenates [noisy_embs \| clean_embs] → x_full [B, 2L, d],
	applies block-diff sparse attention mask, returns logits for noisy half.

	Args:
	noisy_embs: [B, L, d] corrupted token embeddings
	clean_embs: [B, L, d] original clean token embeddings
	attention_mask: [B, L] float/bool padding mask (optional)

	Returns:
	leaf_logits: [B, L, V]
	"""
	B, L, d = noisy_embs.shape
	device = noisy_embs.device
	dtype = noisy_embs.dtype

	# ---- Project embeddings to hidden size ----
	x_noisy = self.input_proj(noisy_embs) # [B, L, d]
	x_clean = self.input_proj(clean_embs) # [B, L, d]

	# ---- Add position + segment embeddings ----
	x_noisy = x_noisy + self._build_position_embeddings(L, False, device, dtype)
	x_clean = x_clean + self._build_position_embeddings(L, True, device, dtype)

	# ---- Concatenate ----
	x = torch.cat([x_noisy, x_clean], dim=1) # [B, 2L, d]

	# ---- Conditioning ----
	c = self.cond_bias.unsqueeze(0).expand(B, -1).to(dtype) # [B, cond_dim]

	# ---- Attention mask ----
	# Prefer flex (block-sparse, compiled) when available. Padding at attention
	# level is ignored — pad positions carry the eos token embedding (a valid
	# token) and loss already masks pad positions, so attending to them is
	# harmless and lets us keep a static BlockMask for compile.
	# Precondition: pad_token_id == eos_token_id (checked in train script).
	# TODO: to support multi-doc packing, extend the mask_mod in
	# _make_block_diff_mask_mod to AND with a same-document check
	# (doc_ids[q_idx] == doc_ids[k_idx]) — this requires threading doc_ids
	# into forward_vectorized and rebuilding the flex BlockMask per batch.
	if _has_flex_attention:
	flex_block_mask = self._get_flex_block_mask(L, device)
	attn_mask = None
	else:
	flex_block_mask = None
	attn_mask = self._get_block_diff_mask(L, device) # [2L, 2L] bool
	if attention_mask is not None:
	pad_mask = torch.cat([attention_mask, attention_mask], dim=1).bool()
	block_diff_float = torch.zeros_like(attn_mask, dtype=dtype).masked_fill_(~attn_mask, -1e9)
	attn_mask = block_diff_float[None, None, :, :].expand(B, 1, 2 * L, 2 * L).contiguous()
	attn_mask = attn_mask.masked_fill(~pad_mask[:, None, None, :], -1e9)

	# ---- Rotary ----
	# Align RoPE positions: noisy half and clean half both use 0..L-1
	# so that cross-attention between corresponding tokens sees the same
	# absolute position (zero relative offset from RoPE).
	position_ids = torch.cat([
	torch.arange(L, device=device),
	torch.arange(L, device=device),
	]) # [2L]
	rotary_cos_sin = self.rotary_emb(x, position_ids=position_ids)

	# ---- Transformer ----
	for block in self.blocks:
	x = block(
	x, rotary_cos_sin, c,
	attention_mask=attn_mask,
	flex_block_mask=flex_block_mask,
	)

	# ---- Output: noisy half only (slice BEFORE output projection to avoid
	# running the big V×d matmul on the clean half we would discard anyway).
	x_noisy_out = x[:, :L] # [B, L, d]
	leaf_logits = self.output_layer(x_noisy_out, c)[..., :self.vocab_size] # [B, L, V]
	return leaf_logits

	def _get_block_causal_mask(self, seq_len: int, device) -> torch.Tensor:
	"""
	Block-wise causal attention mask for inference.
	Token i attends to token j iff block(j) <= block(i).
	Within a block: bidirectional. Across blocks: autoregressive.
	Returns [seq_len, seq_len] bool mask (True = can attend).
	"""
	key = ("causal", seq_len, self.block_size, str(device))
	if key not in self._mask_cache:
	idx = torch.arange(seq_len, device=device)
	block_i = idx[:, None] // self.block_size # [S, 1]
	block_j = idx[None, :] // self.block_size # [1, S]
	self._mask_cache[key] = (block_j <= block_i) # [S, S]
	return self._mask_cache[key]

	def forward_causal(
	self,
	input_embeddings: torch.Tensor,
	attention_mask: Optional[torch.Tensor] = None,
	) -> Tuple[torch.Tensor, torch.Tensor]:
	"""
	Block-wise causal forward pass for Block-AR inference.

	Identical to forward() but uses a block-wise causal attention mask so
	that each block only sees itself (bidirectionally) and earlier blocks.
	Used block-by-block during generation so that previously resolved blocks
	provide clean causal context for the current block.

	Args:
	input_embeddings: [B, S, d] (resolved leaf embs + current block state)
	attention_mask: [B, S] padding mask (optional)

	Returns:
	leaf_logits: [B, S, V]
	h: [B, S, d]
	"""
	B, S, d = input_embeddings.shape
	device = input_embeddings.device
	dtype = self.vocab_embed.embedding.dtype

	x = self.input_proj(input_embeddings.to(dtype))

	# Position embeddings (same as forward(), noisy segment id=0)
	num_blocks = S // self.block_size
	block_idx = torch.arange(num_blocks, device=device).repeat_interleave(self.block_size)
	intra_pos = torch.arange(self.block_size, device=device).repeat(num_blocks)
	seg_id = torch.zeros(S, dtype=torch.long, device=device)
	pos_emb = (
	self.block_idx_embed(block_idx)
	+ self.intra_pos_embed(intra_pos)
	+ self.segment_embed(seg_id)
	).unsqueeze(0).to(dtype)
	x = x + pos_emb

	c = self.cond_bias.unsqueeze(0).expand(B, -1).to(dtype)
	rotary_cos_sin = self.rotary_emb(x)

	# Block-wise causal attention mask: [S, S] → broadcast to [B, S, S]
	causal_mask = self._get_block_causal_mask(S, device) # [S, S] bool
	dtype_mask = causal_mask.to(dtype).masked_fill(~causal_mask, -1e9) # [S, S]
	# Expand to [B, S, S] if needed by transformer blocks
	attn_mask = dtype_mask.unsqueeze(0).expand(B, -1, -1) # [B, S, S]

	# Merge padding mask if provided
	if attention_mask is not None:
	pad_bias = torch.zeros(B, S, S, device=device, dtype=dtype)
	pad_bias.masked_fill_(~attention_mask.bool().unsqueeze(1), -1e9)
	attn_mask = attn_mask + pad_bias

	for block in self.blocks:
	x = block(x, rotary_cos_sin, c, attention_mask=attn_mask)

	h = x
	logits_full = self.output_layer(h, c)
	leaf_logits = logits_full[..., :self.vocab_size]
	return leaf_logits, h

	def forward(
	self,
	input_embeddings: Optional[torch.Tensor] = None,
	attention_mask: Optional[torch.Tensor] = None,
	# Block-AR training path dispatch (must go through .forward() for DDP grad sync):
	noisy_embs: Optional[torch.Tensor] = None,
	clean_embs: Optional[torch.Tensor] = None,
	# API compat, ignored:
	input_ids=None, t=None, level_ids=None,
	):
	"""
	Standard forward (full sequence, all-visible attention).
	Used for validation / evaluation.

	If `clean_embs` is provided (Block-AR training), dispatches to
	`forward_vectorized(noisy_embs, clean_embs, attention_mask)` and
	returns leaf_logits only — this routing is required for DDP to
	register the forward call and synchronize gradients.

	Args:
	input_embeddings: [B, S, d] continuous token embeddings
	(leaf emb / prototype emb / mask emb, same as training)
	attention_mask: [B, S]

	Returns:
	(leaf_logits, h) in standard mode
	leaf_logits in block-AR dispatch mode
	"""
	if clean_embs is not None:
	# Block-AR training dispatch — routes through DDP.
	noisy = noisy_embs if noisy_embs is not None else input_embeddings
	assert noisy is not None, "forward_vectorized dispatch requires noisy_embs"
	return self.forward_vectorized(noisy, clean_embs, attention_mask=attention_mask)

	B, S, d = input_embeddings.shape
	device = input_embeddings.device
	dtype = self.vocab_embed.embedding.dtype

	x = self.input_proj(input_embeddings.to(dtype))

	# Position embeddings (treat as single noisy sequence)
	num_blocks = S // self.block_size
	block_idx = torch.arange(num_blocks, device=device).repeat_interleave(self.block_size)
	intra_pos = torch.arange(self.block_size, device=device).repeat(num_blocks)
	seg_id = torch.zeros(S, dtype=torch.long, device=device)
	pos_emb = (
	self.block_idx_embed(block_idx)
	+ self.intra_pos_embed(intra_pos)
	+ self.segment_embed(seg_id)
	).unsqueeze(0).to(dtype)
	x = x + pos_emb

	c = self.cond_bias.unsqueeze(0).expand(B, -1).to(dtype)
	rotary_cos_sin = self.rotary_emb(x)

	# Pre-build a [B, 1, 1, S] additive float bias for the [B, S] padding
	# mask. Downstream DDiT blocks' 2D branch is for [S, S] block-diff masks
	# and can't disambiguate a [B, S] padding mask — we'd get a [1,1,B,S]
	# broken mask. Passing a float bias routes through the is_floating_point
	# branch instead, which broadcasts correctly.
	attn_bias = attention_mask
	if attn_bias is not None and not attn_bias.is_floating_point() and attn_bias.dim() == 2:
	attn_bias = (~attn_bias.bool())[:, None, None, :].to(dtype) * -1e9 # [B, 1, 1, S]

	for block in self.blocks:
	x = block(x, rotary_cos_sin, c, attention_mask=attn_bias)

	h = x # [B, S, d] — hidden state before output layer
	logits_full = self.output_layer(h, c)
	leaf_logits = logits_full[..., :self.vocab_size]
	return leaf_logits, h