Upload DINAC-AE export package

1b703d5 3 days ago

21.3 kB

	from __future__ import annotations

	import math
	from collections.abc import Callable

	import torch
	from torch import nn


	class Rope1D(nn.Module):
	"""
	Rotary Position Embedding (RoPE) 1D.

	Based on the reference LLaMA implementation (Hugging Face
	`modeling_llama.py`), adapted to this codebase without behavior changes.

	- dim: per-head dimension
	- max_position_embeddings: length used to precompute cached cos/sin (not required
	by forward)
	- base: RoPE base theta

	Forward expects:
	- x: (B, H, T, D)
	- position_ids: (B, T) integer positions
	Returns:
	- cos, sin: (B, T, D)
	"""

	inv_freq: torch.Tensor
	_cos_cached: torch.Tensor
	_sin_cached: torch.Tensor

	def __init__(
	self,
	dim: int,
	max_position_embeddings: int = 2048,
	base: float = 10000.0,
	device: torch.device \| None = None,
	scaling_factor: float = 1.0,
	) -> None:
	super().__init__()
	if dim % 2 != 0:
	raise AssertionError("head_dim must be even for RoPE")
	self.scaling_factor: float = float(scaling_factor)
	self.dim: int = int(dim)
	self.max_position_embeddings: int = int(max_position_embeddings)
	self.base: float = float(base)
	inv_freq = self._build_inv_freq(device=device)
	self.register_buffer("inv_freq", inv_freq, persistent=False)

	# Cached cos/sin (not used in application, but kept for parity with reference)
	self.max_seq_len_cached: int = self.max_position_embeddings
	cos_cached, sin_cached = self._build_cached_trig(device=device)
	self.register_buffer("_cos_cached", cos_cached, persistent=False)
	self.register_buffer("_sin_cached", sin_cached, persistent=False)

	def _build_inv_freq(self, *, device: torch.device \| None) -> torch.Tensor:
	"""Return the RoPE inverse-frequency vector in float32."""

	return 1.0 / (
	self.base
	** (
	torch.arange(0, self.dim, 2, device=device, dtype=torch.float32)
	/ float(self.dim)
	)
	)

	def _build_cached_trig(
	self, *, device: torch.device \| None
	) -> tuple[torch.Tensor, torch.Tensor]:
	"""Return cached RoPE trig tensors in float32."""

	inv_freq = self._build_inv_freq(device=device)
	t = torch.arange(
	self.max_seq_len_cached,
	device=device,
	dtype=torch.float32,
	)
	t = t / self.scaling_factor
	freqs = torch.outer(t, inv_freq)
	emb = torch.cat((freqs, freqs), dim=-1)
	return emb.cos(), emb.sin()

	def _apply(
	self,
	fn: Callable[[torch.Tensor], torch.Tensor],
	recurse: bool = True,
	) -> Rope1D:
	"""Apply module moves/casts while preserving fp32 RoPE buffers."""

	out = super()._apply(fn, recurse=recurse)
	with torch.no_grad():
	device = self.inv_freq.device
	self.inv_freq.data = self._build_inv_freq(device=device)
	cos_cached, sin_cached = self._build_cached_trig(device=device)
	self._cos_cached.data = cos_cached
	self._sin_cached.data = sin_cached
	return out

	@torch.no_grad()
	def forward(
	self, x: torch.Tensor, position_ids: torch.Tensor
	) -> tuple[torch.Tensor, torch.Tensor]:
	inv_freq_tensor = self._build_inv_freq(device=x.device)
	inv_freq_expanded = (
	inv_freq_tensor[None, :, None].float().expand(position_ids.shape[0], -1, 1)
	)
	position_ids_expanded = position_ids[:, None, :].float() / self.scaling_factor
	device_type = x.device.type
	device_type = (
	device_type
	if isinstance(device_type, str) and device_type != "mps"
	else "cpu"
	)
	with torch.autocast(device_type=device_type, enabled=False):
	freqs = (
	inv_freq_expanded.float() @ position_ids_expanded.float()
	).transpose(1, 2)
	emb = torch.cat((freqs, freqs), dim=-1)
	cos = emb.cos()
	sin = emb.sin()
	return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


	def rotate_half(x: torch.Tensor) -> torch.Tensor:
	    """Swap the two halves of the last dimension, negating the second half.

	    With the last dimension viewed as ``[first, second]``, the result is
	    ``[-second, first]``.
	    """
	    midpoint = x.shape[-1] // 2
	    first, second = x[..., :midpoint], x[..., midpoint:]
	    return torch.cat((second.neg(), first), dim=-1)


	def rotate_half_adjacent(x: torch.Tensor) -> torch.Tensor:
	    """Rotate each consecutive pair of the last dimension.

	    The last dimension is read as pairs ``(x0, x1), (x2, x3), ...`` (the
	    common EVA-02 / SpeedrunDiT RoPE convention) and every pair ``(a, b)``
	    is mapped to ``(-b, a)``.

	    Raises:
	        ValueError: If the last dimension has odd length.
	    """
	    width = x.shape[-1]
	    if width % 2:
	        raise ValueError("rotate_half_adjacent requires an even last dimension")
	    # View the last axis as (num_pairs, 2) and split the two pair members.
	    paired = x.reshape(*x.shape[:-1], width // 2, 2)
	    first, second = paired.unbind(dim=-1)
	    rotated = torch.stack((-second, first), dim=-1)
	    return rotated.reshape_as(x)


	def apply_rotary_pos_emb(
	    q: torch.Tensor,
	    k: torch.Tensor,
	    cos: torch.Tensor,
	    sin: torch.Tensor,
	    *,
	    unsqueeze_dim: int = 1,
	) -> tuple[torch.Tensor, torch.Tensor]:
	    """Apply rotary position embedding to a query/key pair.

	    ``cos`` and ``sin`` get a singleton axis inserted at ``unsqueeze_dim`` so
	    they broadcast against ``q`` and ``k``; each tensor ``t`` is then mapped
	    to ``t * cos + rotate_half(t) * sin``.

	    Returns:
	        The rotated ``(q, k)`` pair.
	    """
	    cos_b = cos.unsqueeze(unsqueeze_dim)
	    sin_b = sin.unsqueeze(unsqueeze_dim)

	    def _rotate(t: torch.Tensor) -> torch.Tensor:
	        return (t * cos_b) + (rotate_half(t) * sin_b)

	    return _rotate(q), _rotate(k)


	class LearnableRoPE2D(nn.Module):
	r"""
	Learnable mixed 2D RoPE with axial RoPE2D-compatible initialization.

	- Learnable frequency banks for X and Y.
	- Frequencies can be shared across groups of attention heads (see
	``rope_param_dim``).
	- Angle per pair: theta = x * fx[g, i] + y * fy[g, i]
	- Initialization matches the axial RoPE2D parameterization used by DiTTrunk
	for ``ROPE_2D_AXIAL_FREQ_AWARE`` (AxialRoPE2DConfig(base=100, dim_layout=HALF_SPLIT)):
	- Angle multiplier ``2π``.
	- Period base ``100`` (DINOv3-style), applied per-axis.
	Each head group starts identically (deterministic init) so the learnable
	variant is functionally identical to axial RoPE2D at step 0.
	- Rotation is implemented with real-valued sin/cos to avoid complex tensors
	(torch.compile/inductor cannot codegen complex dtypes).

	Shapes:
	- Expects q,k of shape (B, H, T, D) with D % 4 == 0.
	- Positions xy: (T, 2) or (B, T, 2), any real dtype (cast to float32).
	- Parameter `freqs`: (2, G, D//2) in float32; index 0 = x, 1 = y.

	Head grouping / parameter budget
	-------------------------------
	``rope_param_dim`` controls the total number of learned RoPE frequency
	parameters (scalars) for this module.

	Let:
	- ``head_dim = D`` (per-head width)
	- ``num_heads = H``
	- ``rope_param_dim = P``

	Then the module uses:
	- ``num_groups = G = P // D``
	- ``heads_per_group = H // G``

	This is fail-fast: ``P`` must be divisible by ``D`` and ``H`` must be
	divisible by ``G``. When ``rope_param_dim`` is None (default), the module
	uses the classic per-head parameterization with ``P = H * D``.
	"""

	def __init__(
	self,
	head_dim: int,
	*,
	num_heads: int,
	rope_param_dim: int \| None = None,
	rope_base: float = 100.0,
	angle_multiplier: float = 2.0 * float(math.pi),
	learnable: bool = True,
	persist_buffers: bool = True,
	) -> None:
	super().__init__()
	if head_dim % 4 != 0:
	raise AssertionError("head_dim must be divisible by 4 for mixed 2D RoPE")
	self.head_dim: int = int(head_dim)
	# Avoid naming collisions with nn.Module.half() (dtype casting helper).
	self.half_dim: int = self.head_dim // 2
	self.num_heads: int = int(num_heads)
	effective_param_dim = (
	int(rope_param_dim)
	if rope_param_dim is not None
	else self.num_heads * self.head_dim
	)
	if effective_param_dim <= 0:
	raise ValueError("rope_param_dim must be positive for LearnableRoPE2D")
	self.rope_param_dim: int = int(effective_param_dim)
	self._learnable: bool = bool(learnable)
	theta = float(rope_base)
	mult = float(angle_multiplier)
	if not math.isfinite(theta) or theta <= 0.0:
	raise ValueError("rope_base must be finite and > 0 for LearnableRoPE2D")
	if not math.isfinite(mult) or mult <= 0.0:
	raise ValueError(
	"angle_multiplier must be finite and > 0 for LearnableRoPE2D"
	)

	if self.rope_param_dim % self.head_dim != 0:
	raise ValueError(
	"rope_param_dim must be divisible by head_dim for LearnableRoPE2D "
	f"(got rope_param_dim={self.rope_param_dim}, head_dim={self.head_dim})"
	)
	self.num_groups: int = self.rope_param_dim // self.head_dim
	if self.num_groups <= 0:
	raise RuntimeError("num_groups must be positive for LearnableRoPE2D")
	if self.num_heads % self.num_groups != 0:
	raise ValueError(
	"num_heads must be divisible by (rope_param_dim / head_dim) for LearnableRoPE2D "
	f"(got num_heads={self.num_heads}, num_groups={self.num_groups}, "
	f"rope_param_dim={self.rope_param_dim}, head_dim={self.head_dim})"
	)
	self.heads_per_group: int = self.num_heads // self.num_groups
	if self.heads_per_group <= 0:
	raise RuntimeError("heads_per_group must be positive for LearnableRoPE2D")

	# Axial-compatible deterministic init:
	# - periods match AxialRoPE2DConfig(base=100, dim_layout=HALF_SPLIT)
	# - angle = 2π * coord / period
	qtr = self.head_dim // 4
	exponents = (
	2.0
	* torch.arange(int(qtr), dtype=torch.float32)
	/ float(self.head_dim // 2)
	)
	periods = torch.tensor(theta, dtype=torch.float32) ** exponents # [qtr]
	axis_freqs = (mult / periods).to(dtype=torch.float32) # [qtr]

	zeros = torch.zeros_like(axis_freqs)
	# Match AxialRoPE2D(HALF_SPLIT) flatten order: [y-axis, x-axis].
	# Our xy columns are (x, y), so:
	# - x contributes to the second quarter (x-axis part)
	# - y contributes to the first quarter (y-axis part)
	fx_half = torch.cat((zeros, axis_freqs), dim=0) # [half_dim]
	fy_half = torch.cat((axis_freqs, zeros), dim=0) # [half_dim]

	freqs_x = fx_half.expand(int(self.num_groups), -1).clone()
	freqs_y = fy_half.expand(int(self.num_groups), -1).clone()
	freqs = torch.stack([freqs_x, freqs_y], dim=0) # (2, G, half)
	if self._learnable:
	self.freqs = nn.Parameter(freqs, requires_grad=True)
	else:
	self.register_buffer("freqs", freqs, persistent=persist_buffers)

	def _apply(
	self,
	fn: Callable[[torch.Tensor], torch.Tensor],
	recurse: bool = True,
	) -> LearnableRoPE2D:
	"""Apply module moves/casts while preserving fp32 frequency tensors."""

	out = super()._apply(fn, recurse=recurse)
	with torch.no_grad():
	self.freqs.data = self.freqs.data.to(dtype=torch.float32)
	return out

	def _apply_rotary_from_trig(
	self,
	x: torch.Tensor,
	*,
	sin: torch.Tensor,
	cos: torch.Tensor,
	) -> torch.Tensor:
	"""Rotate Q/K using precomputed grouped sin/cos buffers (HALF_SPLIT layout).

	This matches AxialRoPE2DConfig(dim_layout=HALF_SPLIT) rotation and keeps
	the learnable variant identical at initialization when combined with
	axial-compatible frequency init.

	Args:
	x: Tensor shaped ``(B, H, T, D)``.
	sin: Sin tensor shaped ``(G, T, D//2)`` or ``(B, G, T, D//2)``.
	cos: Cos tensor shaped ``(G, T, D//2)`` or ``(B, G, T, D//2)``.

	Returns:
	Tensor with the same shape/dtype/device as ``x``.
	"""
	if x.dim() != 4:
	raise ValueError("x must be shaped (B, H, T, D)")
	B, H, T, D = x.shape
	if self.num_heads != int(H):
	raise ValueError("num_heads mismatch for LearnableRoPE2D")
	if self.head_dim != int(D):
	raise ValueError("head_dim mismatch for LearnableRoPE2D")

	if sin.dim() == 3 and cos.dim() == 3:
	sin = sin.unsqueeze(0)
	cos = cos.unsqueeze(0)
	if sin.dim() != 4 or cos.dim() != 4:
	raise RuntimeError("Unexpected sin/cos rank for LearnableRoPE2D")
	if int(D) % 2 != 0:
	raise RuntimeError("LearnableRoPE2D requires even head_dim for HALF_SPLIT")
	half = int(D) // 2
	if int(sin.shape[-1]) != half or int(cos.shape[-1]) != half:
	raise RuntimeError(
	"LearnableRoPE2D expected sin/cos last dim == head_dim//2 "
	f"(got sin={tuple(sin.shape)}, cos={tuple(cos.shape)}, head_dim={int(D)})"
	)

	sin = sin[:, :, None, :, :] # [B, G, 1, T, half]
	cos = cos[:, :, None, :, :] # [B, G, 1, T, half]

	grouped = x.reshape(
	int(B),
	int(self.num_groups),
	int(self.heads_per_group),
	int(T),
	int(D),
	)
	x1 = grouped[..., :half]
	x2 = grouped[..., half:]
	out1 = x1 * cos - x2 * sin
	out2 = x2 * cos + x1 * sin
	out = torch.cat((out1, out2), dim=-1).reshape(int(B), int(H), int(T), int(D))
	return out.to(dtype=x.dtype)

	def _compute_mixed_cis(self, xy: torch.Tensor) -> torch.Tensor:
	# Returns complex cis angles with shape (G, T, half) or (B, G, T, half)
	if xy.dim() == 2:
	# (T, 2) -> (G, T, half)
	t_x = xy[:, 0].to(dtype=torch.float32)
	t_y = xy[:, 1].to(dtype=torch.float32)
	with torch.autocast(device_type=t_x.device.type, enabled=False):
	# Memory notes:
	# - Avoid materializing both fx and fy; accumulate in-place into angles.
	# - Avoid torch.ones_like(angles) (full-size allocation); a scalar
	# magnitude broadcasts in torch.polar.
	angles = t_x.unsqueeze(-1).unsqueeze(-1) * self.freqs[0].unsqueeze(
	0
	) # (T, G, half)
	angles.add_(
	t_y.unsqueeze(-1).unsqueeze(-1) * self.freqs[1].unsqueeze(0)
	)
	angles = angles.permute(1, 0, 2) # (G, T, half)
	cis = torch.polar(
	torch.ones((), device=angles.device, dtype=angles.dtype), angles
	)
	return cis
	elif xy.dim() == 3:
	# (B, T, 2) -> (B, G, T, half)
	t_x = xy[..., 0].to(dtype=torch.float32)
	t_y = xy[..., 1].to(dtype=torch.float32)
	with torch.autocast(device_type=t_x.device.type, enabled=False):
	angles = t_x.unsqueeze(-1).unsqueeze(-1) * self.freqs[0].unsqueeze(
	0
	).unsqueeze(0)
	angles.add_(
	t_y.unsqueeze(-1).unsqueeze(-1)
	* self.freqs[1].unsqueeze(0).unsqueeze(0)
	)
	angles = angles.permute(0, 2, 1, 3) # (B, G, T, half)
	cis = torch.polar(
	torch.ones((), device=angles.device, dtype=angles.dtype), angles
	)
	return cis
	else:
	raise ValueError("xy must have shape (T,2) or (B,T,2)")

	def _compute_mixed_angles(self, xy: torch.Tensor) -> torch.Tensor:
	"""Return mixed RoPE2D angles without applying cis/polar.

	Args:
	xy: XY positions shaped ``(T, 2)`` or ``(B, T, 2)``.

	Returns:
	Float tensor of angles shaped ``(G, T, half)`` or ``(B, G, T, half)``.
	"""
	if xy.dim() == 2:
	t_x = xy[:, 0].to(dtype=torch.float32)
	t_y = xy[:, 1].to(dtype=torch.float32)
	with torch.autocast(device_type=t_x.device.type, enabled=False):
	angles = t_x.unsqueeze(-1).unsqueeze(-1) * self.freqs[0].unsqueeze(0)
	angles.add_(
	t_y.unsqueeze(-1).unsqueeze(-1) * self.freqs[1].unsqueeze(0)
	)
	return angles.permute(1, 0, 2)
	if xy.dim() == 3:
	t_x = xy[..., 0].to(dtype=torch.float32)
	t_y = xy[..., 1].to(dtype=torch.float32)
	with torch.autocast(device_type=t_x.device.type, enabled=False):
	angles = t_x.unsqueeze(-1).unsqueeze(-1) * self.freqs[0].unsqueeze(
	0
	).unsqueeze(0)
	angles.add_(
	t_y.unsqueeze(-1).unsqueeze(-1)
	* self.freqs[1].unsqueeze(0).unsqueeze(0)
	)
	return angles.permute(0, 2, 1, 3)
	raise ValueError("xy must have shape (T,2) or (B,T,2)")

	def _cos_sin_half_from_xy(
	self,
	xy: torch.Tensor,
	*,
	device: torch.device \| None = None,
	out_dtype: torch.dtype \| None = None,
	) -> tuple[torch.Tensor, torch.Tensor]:
	# Helper used in tests to build real-valued cos/sin tensors.
	cis = self._compute_mixed_cis(xy.to(device=device) if device else xy)
	# Convert complex cis to cos/sin (real/imag) with matching shapes
	if cis.is_complex():
	cos_h = cis.real
	sin_h = cis.imag
	else:
	# Should not happen; torch.polar returns complex64/128
	raise RuntimeError("Expected complex cis tensor from polar")
	if out_dtype is not None:
	cos_h = cos_h.to(dtype=out_dtype)
	sin_h = sin_h.to(dtype=out_dtype)
	return cos_h, sin_h

	def _cos_sin_from_xy(
	self,
	xy: torch.Tensor,
	*,
	device: torch.device \| None = None,
	out_dtype: torch.dtype \| None = None,
	) -> tuple[torch.Tensor, torch.Tensor]:
	cos_h, sin_h = self._cos_sin_half_from_xy(
	xy, device=device, out_dtype=out_dtype
	)
	emb_cos = torch.cat((cos_h, cos_h), dim=-1)
	emb_sin = torch.cat((sin_h, sin_h), dim=-1)
	return emb_cos, emb_sin

	def rotate_qk(
	self,
	q: torch.Tensor,
	k: torch.Tensor,
	xy: torch.Tensor,
	) -> tuple[torch.Tensor, torch.Tensor]:
	if q.dim() != 4 or k.dim() != 4:
	raise ValueError("q,k must be shaped (B,H,T,D)")
	_, H, _, D = q.shape
	if self.num_heads != H:
	raise ValueError("num_heads mismatch for LearnableRoPE2D")
	if self.head_dim != D:
	raise ValueError("head_dim mismatch for LearnableRoPE2D")
	if D % 4 != 0:
	raise AssertionError("head_dim must be divisible by 4 for mixed 2D RoPE")

	# Use real-valued sin/cos rotation to keep torch.compile/inductor on the
	# fast path (inductor cannot codegen complex tensors).
	angles = self._compute_mixed_angles(xy.to(device=q.device))
	sin = torch.sin(angles)
	cos = torch.cos(angles)
	q_out = self._apply_rotary_from_trig(q, sin=sin, cos=cos)
	k_out = self._apply_rotary_from_trig(k, sin=sin, cos=cos)
	return q_out, k_out

	def rotate_qk_with_dilation(
	self,
	q: torch.Tensor,
	k: torch.Tensor,
	*,
	xy: torch.Tensor,
	scales: torch.Tensor,
	) -> tuple[torch.Tensor, torch.Tensor]:
	"""Rotate Q/K using mixed 2D RoPE with per-sample isotropic dilation.

	This implements dilation by scaling the RoPE angle, i.e.
	``theta_dilated = scale * theta_base`` where ``theta_base`` comes from the
	undilated XY coordinates.

	Args:
	q: Query tensor shaped ``(B, H, T, D)``.
	k: Key tensor shaped ``(B, H, T, D)``.
	xy: Base XY coordinates shaped ``(T, 2)`` or ``(B, T, 2)``.
	scales: Per-sample dilation scales shaped ``(B,)``.

	Raises:
	ValueError: If shapes are inconsistent or scales are not 1D.
	"""
	if q.dim() != 4 or k.dim() != 4:
	raise ValueError("q,k must be shaped (B,H,T,D)")
	B, H, T, D = q.shape
	if self.num_heads != H:
	raise ValueError("num_heads mismatch for LearnableRoPE2D")
	if self.head_dim != D:
	raise ValueError("head_dim mismatch for LearnableRoPE2D")
	if scales.dim() != 1 or scales.shape[0] != B:
	raise ValueError("scales must have shape (B,) matching q batch size")
	if xy.dim() == 2 and xy.shape[0] != T:
	raise ValueError("xy length must match q sequence length")
	if xy.dim() == 3 and (xy.shape[0] != B or xy.shape[1] != T):
	raise ValueError("xy must have shape (B,T,2) matching q batch/sequence")
	if xy.shape[-1] != 2:
	raise ValueError("xy must have last dimension 2")

	angles = self._compute_mixed_angles(xy.to(device=q.device))
	angles = angles * scales.to(device=q.device, dtype=torch.float32).view(
	B, 1, 1, 1
	)
	sin = torch.sin(angles)
	cos = torch.cos(angles)
	q_out = self._apply_rotary_from_trig(q, sin=sin, cos=cos)
	k_out = self._apply_rotary_from_trig(k, sin=sin, cos=cos)
	return q_out, k_out