dinac_ae / dit /axial_rope2d.py

Upload DINAC-AE export package

1b703d5 19 days ago

69.5 kB

	from __future__ import annotations

	import math
	from dataclasses import dataclass, replace
	from enum import Enum
	from typing import Final, cast

	import torch
	from torch import Tensor, nn

	# Public API of this module: the names exported by ``from ... import *``.
	__all__ = [
	    "AxialRoPE2D",
	    "AxialRoPE2DAlphaWarpConfig",
	    "AxialRoPE2DBetaWarpConfig",
	    "AxialRoPE2DConfig",
	    "AxialRoPE2DCoordMode",
	    "AxialRoPE2DDimLayout",
	    "AxialRoPE2DDyPE",
	    "AxialRoPE2DDyPEConfig",
	    "AxialRoPE2DFrequencyAwareConfig",
	    "AxialRoPE2DNormalizeCoords",
	    "DyPERoPEMethod",
	    "build_axial_rope2d_dype",
	    "build_axial_rope2d_inference_warp_with_strength",
	    "build_axial_rope2d_with_lumina_frequency_warp",
	    "lumina_frequency_aware_periods_for_axis",
	    "set_axial_rope2d_dype_noise_time",
	]


	class AxialRoPE2DNormalizeCoords(Enum):
	    """Denominator choice used when normalizing DINOv3-style patch coordinates.

	    - ``MIN``: both axes are divided by ``min(H, W)``.
	    - ``MAX``: both axes are divided by ``max(H, W)``.
	    - ``SEPARATE``: rows are divided by ``H`` and columns by ``W``.
	    """

	    MIN = "min"
	    MAX = "max"
	    SEPARATE = "separate"


	class AxialRoPE2DCoordMode(Enum):
	    """Which coordinate grid the axial 2D RoPE angles are computed from.

	    - ``DINOV3_NORMALIZED``: patch-centre coordinates in the DINOv3 style,
	      lying in ``[-1, 1]`` once normalized.
	    - ``PATCH_INDICES``: plain, unnormalized patch-grid coordinates measured
	      in patch units (e.g. ``x in [0, W-1]``, ``y in [0, H-1]``).
	    """

	    DINOV3_NORMALIZED = "dinov3_normalized"
	    PATCH_INDICES = "patch_indices"


	class AxialRoPE2DDimLayout(Enum):
	    """How the rotation angles are arranged along the head dimension.

	    The chosen layout has to agree with the rotation convention that is used
	    when RoPE is applied to Q/K.

	    - ``HALF_SPLIT``: LLaMA-style; the last dimension is split into two halves,
	      compatible with ``common.rope.rotate_half``.
	    - ``PAIR_INTERLEAVED``: EVA-02 / SpeedrunDiT-style; consecutive dimensions
	      are paired, compatible with an adjacent-pair rotate_half.

	    TODO(refactor): Standardize on ``PAIR_INTERLEAVED`` throughout DiT to reduce
	    complexity and avoid layout mismatches, then delete ``HALF_SPLIT`` and any
	    related branching once the migration is complete.
	    """

	    HALF_SPLIT = "half_split"
	    PAIR_INTERLEAVED = "pair_interleaved"


	class DyPERoPEMethod(Enum):
	    """Selects the dynamic position-extrapolation rule used for inference RoPE."""

	    VISION_YARN = "vision_yarn"
	    DY_YARN = "dy_yarn"
	    DY_NTK = "dy_ntk"


	@dataclass(frozen=True)
	class AxialRoPE2DDyPEConfig:
	    """Immutable DyPE settings for axial RoPE; used at inference time only.

	    Args:
	        method: Which dynamic extrapolation rule to apply.
	        ref_h_tokens: Token height of the training/reference grid.
	        ref_w_tokens: Token width of the training/reference grid.
	        lambda_s: Magnitude of the dynamic extrapolation.
	        lambda_t: Noise-time exponent of the dynamic extrapolation.
	        yarn_beta_0: High rotation threshold of the first YaRN ramp.
	        yarn_beta_1: Low rotation threshold of the first YaRN ramp.
	        yarn_gamma_0: High rotation threshold of the YaRN base blend.
	        yarn_gamma_1: Low rotation threshold of the YaRN base blend.
	        yarn_attention_scale: Whether YaRN's static attention magnitude
	            correction is applied.

	    Raises:
	        TypeError: If ``method`` is not a ``DyPERoPEMethod`` or
	            ``yarn_attention_scale`` is not a ``bool``.
	        ValueError: If a reference length is not positive, or any of the
	            float knobs is non-finite or not strictly positive.
	    """

	    method: DyPERoPEMethod
	    ref_h_tokens: int
	    ref_w_tokens: int
	    lambda_s: float = 2.0
	    lambda_t: float = 2.0
	    yarn_beta_0: float = 1.25
	    yarn_beta_1: float = 0.75
	    yarn_gamma_0: float = 16.0
	    yarn_gamma_1: float = 2.0
	    yarn_attention_scale: bool = True

	    def __post_init__(self) -> None:
	        """Validate field types and ranges right after construction."""
	        if not isinstance(self.method, DyPERoPEMethod):
	            raise TypeError("method must be a DyPERoPEMethod")
	        if not (int(self.ref_h_tokens) > 0 and int(self.ref_w_tokens) > 0):
	            raise ValueError("ref_h_tokens and ref_w_tokens must be positive")
	        # Every float knob shares the same constraint: finite and > 0.
	        positive_knobs = (
	            "lambda_s",
	            "lambda_t",
	            "yarn_beta_0",
	            "yarn_beta_1",
	            "yarn_gamma_0",
	            "yarn_gamma_1",
	        )
	        for name in positive_knobs:
	            number = float(getattr(self, name))
	            if not (math.isfinite(number) and number > 0.0):
	                raise ValueError(f"{name} must be finite and > 0")
	        if not isinstance(self.yarn_attention_scale, bool):
	            raise TypeError("yarn_attention_scale must be a bool")


	@dataclass(frozen=True)
	class AxialRoPE2DFrequencyAwareConfig:
	    """Settings for Lumina/Next-DiT-style frequency-aware RoPE warping.

	    The warp acts per axis and per band, and is driven by the ratio between the
	    runtime axis length ``L`` and a reference length ``L_ref``:

	    1. Axis scale::

	           s = L / L_ref

	    2. RoPE is parameterized by periods ``period[d]`` (wavelengths in tokens).
	       In this module's axial parameterization with patch-index coordinates,
	       the angle of band ``d`` at coordinate ``p`` is::

	           angle(p, d) = 2π * p / period[d]

	       so band ``d`` has a wavelength of exactly ``period[d]`` tokens.

	    3. The boundary wavelength (in tokens) is a trainable multiple of the
	       reference length::

	           L_boundary = L_ref * exp(boundary_log_multiplier)

	       ``boundary_log_multiplier`` is one scalar shared by the H and W axes and
	       is initialized from this config.

	    4. The (possibly fractional) boundary band index ``d*`` is the band whose
	       wavelength equals the boundary::

	           period(d*) = L_boundary

	       It is obtained by linear interpolation in log-period space (periods are
	       geometric for both supported period parametrizations).

	    5. The Lumina/Next-DiT implicit exponent ramp::

	           alpha[d] = clamp(d / d*, 0, 1)

	       - small ``d`` (high frequency): alpha ≈ 0, extrapolation-like;
	       - large ``d`` (low frequency): alpha → 1, interpolation-like;
	       - the cap at 1 guarantees no band is compressed more than plain
	         position interpolation would compress it.

	    6. Per-axis period warp::

	           period'[d] = period[d] * s ** alpha[d]

	       or, in terms of angular frequency::

	           omega'[d] = omega[d] / s ** alpha[d]

	    Notes
	    -----
	    - The warp only makes sense for patch-index coordinates
	      (``AxialRoPE2DCoordMode.PATCH_INDICES``). Combining it with normalized
	      coordinates would create an implicit "gauge switch"; we fail fast.
	    - The boundary multiplier is trainable by construction (it is stored as an
	      nn.Parameter inside AxialRoPE2D when this config is present).

	    Raises:
	        ValueError: If a reference length is not positive or the initial
	            boundary log-multiplier is not finite.
	    """

	    ref_h_tokens: int
	    ref_w_tokens: int
	    boundary_log_multiplier_init: float

	    def __post_init__(self) -> None:
	        """Reject non-positive reference lengths and a non-finite initial value."""
	        if not (int(self.ref_h_tokens) > 0 and int(self.ref_w_tokens) > 0):
	            raise ValueError("ref_h_tokens and ref_w_tokens must be positive")
	        if not math.isfinite(float(self.boundary_log_multiplier_init)):
	            raise ValueError("boundary_log_multiplier_init must be finite")


	@dataclass(frozen=True)
	class AxialRoPE2DBetaWarpConfig:
	    """Settings for the trainable beta-curve warp of axial 2D RoPE periods.

	    For each axis the periods are warped according to the runtime axis length
	    ``L`` relative to a reference length ``L_ref``::

	        s = L / L_ref
	        period'[d] = period[d] * s ** beta[d]

	    The exponent curve ``beta(d)`` is controlled by three trainable u-space
	    scalars that are shared across the H and W axes::

	        beta_hi   = beta_max * tanh(beta_hi_u)    (high-frequency endpoint, d=0)
	        beta_lo   = beta_max * tanh(beta_lo_u)    (low-frequency endpoint, d=qtr-1)
	        beta_bend = beta_max * tanh(beta_bend_u)  (mid-band bump amplitude)

	    and evaluated per band as::

	        t = d / (qtr - 1)   in [0, 1]
	        beta(t) = lerp(beta_hi, beta_lo, t) + beta_bend * 4t(1-t)

	    Interpretation
	    --------------
	    - ``beta(d) == 0``: identity / "extrapolation-like"; the period of that
	      band does not change with the axis length.
	    - ``beta(d) == 1``: position-interpolation-like for that band
	      (``period'[d] = period[d] * s``, hence ``omega'[d] = omega[d] / s``).

	    The curve gives strong yet smooth control over the effective scaling of
	    every frequency band. It also permits beta<0 (raising frequencies when
	    s>1), which can matter for unnormalized RoPE bases (e.g. base=10_000)
	    where some very low-frequency bands barely rotate on practical token
	    grids.

	    Notes:
	        - This warp requires patch-index coordinates (coord_mode=PATCH_INDICES).
	        - The u parameters are stored as nn.Parameter inside AxialRoPE2D when
	          this config is present.

	    Raises:
	        ValueError: If a reference length is not positive, ``beta_max`` is not
	            finite and > 0, or any ``*_u_init`` value is not finite.
	    """

	    ref_h_tokens: int
	    ref_w_tokens: int
	    beta_max: float
	    beta_hi_u_init: float
	    beta_lo_u_init: float
	    beta_bend_u_init: float

	    def __post_init__(self) -> None:
	        """Validate reference lengths, ``beta_max`` and the three u-space inits."""
	        if not (int(self.ref_h_tokens) > 0 and int(self.ref_w_tokens) > 0):
	            raise ValueError("ref_h_tokens and ref_w_tokens must be positive")
	        ceiling = float(self.beta_max)
	        if not (math.isfinite(ceiling) and ceiling > 0.0):
	            raise ValueError("beta_max must be finite and > 0")
	        # The u-space inits are unconstrained in sign; they only need to be finite.
	        for name in ("beta_hi_u_init", "beta_lo_u_init", "beta_bend_u_init"):
	            if not math.isfinite(float(getattr(self, name))):
	                raise ValueError(f"{name} must be finite")


	@dataclass(frozen=True)
	class AxialRoPE2DAlphaWarpConfig:
	    """Settings for the per-band power-law warp of axial 2D RoPE frequencies.

	    Frequencies are warped band by band with a learned exponent vector
	    ``alpha[d]`` that is shared across the H and W axes::

	        f'[d] = f[d] * s ** alpha[d]    where s = L / L_ref

	    This module expresses angles through periods ``period[d]`` with
	    ``f[d] ∝ 1 / period[d]``, so the equivalent period warp implemented in
	    AxialRoPE2D is::

	        period'[d] = period[d] / s ** alpha[d]

	    Notes:
	        - This warp requires patch-index coordinates (coord_mode=PATCH_INDICES).
	        - ``alpha`` is stored as an unconstrained nn.Parameter vector of length
	          Q (bands per axis), initialized to ``alpha_init`` for all bands.

	    Raises:
	        ValueError: If a reference length is not positive or ``alpha_init`` is
	            not finite.
	    """

	    ref_h_tokens: int
	    ref_w_tokens: int
	    alpha_init: float

	    def __post_init__(self) -> None:
	        """Reject non-positive reference lengths and a non-finite ``alpha_init``."""
	        if not (int(self.ref_h_tokens) > 0 and int(self.ref_w_tokens) > 0):
	            raise ValueError("ref_h_tokens and ref_w_tokens must be positive")
	        if not math.isfinite(float(self.alpha_init)):
	            raise ValueError("alpha_init must be finite")


	@dataclass(frozen=True)
	class AxialRoPE2DConfig:
	    """Configuration for axial 2D RoPE sin/cos generation.

	    Two coordinate conventions are available through ``coord_mode``:

	    - ``DINOV3_NORMALIZED``: DINOv3-style normalized patch-centre coordinates in
	      ``[-1, 1]`` (after normalization).
	    - ``PATCH_INDICES``: standard unnormalized patch-grid coordinates in patch
	      units (e.g., ``x in [0, W-1]``).

	    Period parametrization
	    ----------------------
	    The periods parametrization matches DINOv3:

	    - Provide either `base` (and leave `min_period/max_period` unset), or
	    - Provide both `min_period` and `max_period` (and set `base=None`).

	    Warps
	    -----
	    At most one of ``frequency_aware``, ``beta_warp`` and ``alpha_warp`` may be
	    set, and each of them requires ``coord_mode=PATCH_INDICES``.

	    Raises:
	        ValueError: If the period parametrization is inconsistent, a numeric
	            field is non-finite or out of range, more than one warp is
	            configured, or a warp is combined with a coordinate mode other
	            than ``PATCH_INDICES``.
	        TypeError: If an enum or warp field has the wrong type.
	    """

	    base: float | None = 100.0
	    min_period: float | None = None
	    max_period: float | None = None
	    coord_mode: AxialRoPE2DCoordMode = AxialRoPE2DCoordMode.DINOV3_NORMALIZED
	    normalize_coords: AxialRoPE2DNormalizeCoords = AxialRoPE2DNormalizeCoords.MAX
	    dim_layout: AxialRoPE2DDimLayout = AxialRoPE2DDimLayout.HALF_SPLIT
	    angle_multiplier: float = 2.0 * float(math.pi)
	    coord_offset: float = 0.5
	    frequency_aware: AxialRoPE2DFrequencyAwareConfig | None = None
	    beta_warp: AxialRoPE2DBetaWarpConfig | None = None
	    alpha_warp: AxialRoPE2DAlphaWarpConfig | None = None

	    def __post_init__(self) -> None:
	        """Validate the whole configuration right after construction."""
	        # Exactly one period parametrization: `base` XOR (min_period AND max_period).
	        both_periods = self.min_period is not None and self.max_period is not None
	        if (self.base is None) != both_periods:
	            raise ValueError(
	                "AxialRoPE2DConfig requires either base!=None, or both min_period and max_period."
	            )

	        # NaN compares False against every bound, so a plain `<= 0.0` test lets
	        # NaN/inf through; check finiteness explicitly, as the warp configs do.
	        for name, value in (
	            ("base", self.base),
	            ("min_period", self.min_period),
	            ("max_period", self.max_period),
	        ):
	            if value is None:
	                continue
	            number = float(value)
	            if not math.isfinite(number):
	                raise ValueError(
	                    f"AxialRoPE2DConfig.{name} must be finite when provided"
	                )
	            if number <= 0.0:
	                raise ValueError(
	                    f"AxialRoPE2DConfig.{name} must be positive when provided"
	                )
	        if self.min_period is not None and self.max_period is not None:
	            if float(self.max_period) <= float(self.min_period):
	                raise ValueError("AxialRoPE2DConfig.max_period must be > min_period")

	        # Enum-typed fields must be real enum members (not their string values).
	        for name, value, expected in (
	            ("coord_mode", self.coord_mode, AxialRoPE2DCoordMode),
	            ("normalize_coords", self.normalize_coords, AxialRoPE2DNormalizeCoords),
	            ("dim_layout", self.dim_layout, AxialRoPE2DDimLayout),
	        ):
	            if not isinstance(value, expected):
	                raise TypeError(
	                    f"AxialRoPE2DConfig.{name} must be an {expected.__name__}"
	                )

	        mult = float(self.angle_multiplier)
	        if not math.isfinite(mult) or mult <= 0.0:
	            raise ValueError(
	                "AxialRoPE2DConfig.angle_multiplier must be finite and > 0"
	            )
	        if not math.isfinite(float(self.coord_offset)):
	            raise ValueError("AxialRoPE2DConfig.coord_offset must be finite")

	        # Optional warp configs: type-check each one that is present.
	        warps = (
	            ("frequency_aware", self.frequency_aware, AxialRoPE2DFrequencyAwareConfig),
	            ("beta_warp", self.beta_warp, AxialRoPE2DBetaWarpConfig),
	            ("alpha_warp", self.alpha_warp, AxialRoPE2DAlphaWarpConfig),
	        )
	        for name, value, expected in warps:
	            if value is not None and not isinstance(value, expected):
	                raise TypeError(
	                    f"AxialRoPE2DConfig.{name} must be an {expected.__name__}"
	                )
	        if sum(value is not None for _, value, _ in warps) > 1:
	            raise ValueError(
	                "AxialRoPE2DConfig requires at most one of frequency_aware, beta_warp, or alpha_warp"
	            )

	        # Every warp is defined in token units, hence patch-index coordinates only.
	        if self.coord_mode is not AxialRoPE2DCoordMode.PATCH_INDICES:
	            if self.frequency_aware is not None:
	                raise ValueError(
	                    "AxialRoPE2D frequency-aware warping requires coord_mode=PATCH_INDICES"
	                )
	            if self.beta_warp is not None:
	                raise ValueError(
	                    "AxialRoPE2D beta warp requires coord_mode=PATCH_INDICES"
	                )
	            if self.alpha_warp is not None:
	                raise ValueError(
	                    "AxialRoPE2D alpha warp requires coord_mode=PATCH_INDICES"
	                )


	# Module-level memo of coordinate grids built by the coordinate helpers below.
	# Key layout: (H, W, device, coord mode, normalization mode, coord offset);
	# value: the flattened coordinate tensor computed for that key.
	_AXIAL_COORDS_CACHE: dict[
	    tuple[
	        int, int, torch.device, AxialRoPE2DCoordMode, AxialRoPE2DNormalizeCoords, float
	    ],
	    Tensor,
	] = {}


	def _get_dinov3_normalized_coords(
	H: int,
	W: int,
	*,
	device: torch.device,
	normalize: AxialRoPE2DNormalizeCoords,
	offset: float,
	) -> Tensor:
	"""Return DINOv3-style flattened coords in [-1, 1] with shape [HW, 2]."""
	if H <= 0 or W <= 0:
	raise ValueError("H and W must be positive for axial RoPE coords")
	key = (
	int(H),
	int(W),
	device,
	AxialRoPE2DCoordMode.DINOV3_NORMALIZED,
	normalize,
	float(offset),
	)
	cached = _AXIAL_COORDS_CACHE.get(key)
	if cached is not None:
	return cached
	start = float(offset)
	end_h = start + float(int(H))
	end_w = start + float(int(W))
	match normalize:
	case AxialRoPE2DNormalizeCoords.MAX:
	denom = float(max(int(H), int(W)))
	coords_h = (
	torch.arange(start, end_h, device=device, dtype=torch.float32) / denom
	)
	coords_w = (
	torch.arange(start, end_w, device=device, dtype=torch.float32) / denom
	)
	case AxialRoPE2DNormalizeCoords.MIN:
	denom = float(min(int(H), int(W)))
	coords_h = (
	torch.arange(start, end_h, device=device, dtype=torch.float32) / denom
	)
	coords_w = (
	torch.arange(start, end_w, device=device, dtype=torch.float32) / denom
	)
	case AxialRoPE2DNormalizeCoords.SEPARATE:
	coords_h = torch.arange(
	start, end_h, device=device, dtype=torch.float32
	) / float(int(H))
	coords_w = torch.arange(
	start, end_w, device=device, dtype=torch.float32
	) / float(int(W))
	case _ as unreachable: # pragma: no cover - defensive
	raise RuntimeError(f"Unsupported normalize_coords: {unreachable}")
	coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)
	coords = coords.flatten(0, 1)
	coords = 2.0 * coords - 1.0
	# torch.compile cannot trace `torch.is_inference_mode_enabled()` and should
	# not record Python-side cache mutations in the graph.
	if torch.compiler.is_compiling():
	return coords
	if torch.is_inference_mode_enabled():
	return coords
	_AXIAL_COORDS_CACHE[key] = coords
	return coords


	def _get_patch_index_coords(
	    H: int,
	    W: int,
	    *,
	    device: torch.device,
	    offset: float,
	) -> Tensor:
	    """Return unnormalized patch-grid coords with shape ``[H*W, 2]``.

	    Columns are ordered ``(y, x)``: column 0 is ``row_index + offset`` and
	    column 1 is ``col_index + offset``, both in patch units.

	    Results are memoized in ``_AXIAL_COORDS_CACHE`` unless the call happens
	    under ``torch.compile`` tracing or inference mode.

	    Args:
	        H: Number of token rows (> 0).
	        W: Number of token columns (> 0).
	        device: Device on which the coordinates are created.
	        offset: Value added to every integer index.

	    Returns:
	        Float32 tensor of shape ``[H*W, 2]``.

	    Raises:
	        ValueError: If ``H`` or ``W`` is not positive.
	    """
	    if H <= 0 or W <= 0:
	        raise ValueError("H and W must be positive for axial RoPE coords")
	    rows, cols = int(H), int(W)
	    shift = float(offset)
	    # The normalization slot of the shared cache key is unused for patch
	    # indices; a fixed member keeps the key shape uniform.
	    key = (
	        rows,
	        cols,
	        device,
	        AxialRoPE2DCoordMode.PATCH_INDICES,
	        AxialRoPE2DNormalizeCoords.MAX,
	        shift,
	    )
	    cached = _AXIAL_COORDS_CACHE.get(key)
	    if cached is not None:
	        return cached
	    # Build the index range with an integer length and add the offset afterwards.
	    # ``torch.arange(offset, offset + n)`` derives its length from a float
	    # difference, which rounding can push to n + 1 for some offsets
	    # (e.g. offset=1.9, n=3).
	    coords_h = torch.arange(rows, device=device, dtype=torch.float32) + shift
	    coords_w = torch.arange(cols, device=device, dtype=torch.float32) + shift
	    coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)
	    coords = coords.flatten(0, 1)
	    # Skip the Python-side cache write while tracing or in inference mode; the
	    # compile check must come first.
	    if torch.compiler.is_compiling():
	        return coords
	    if torch.is_inference_mode_enabled():
	        return coords
	    _AXIAL_COORDS_CACHE[key] = coords
	    return coords


	def _lumina_boundary_band_index(
	*,
	periods: Tensor,
	boundary_wavelength: Tensor,
	) -> Tensor:
	"""Return the fractional boundary band index d* for a given boundary wavelength.

	This implements the Lumina/Next-DiT definition:

	period(d*) = boundary_wavelength

	We compute d* by linear interpolation in log-period space. For the supported
	period parameterizations, periods are geometric and log(period) is linear in
	band index.

	Args:
	periods: 1D float tensor of length Q containing monotonically increasing
	periods in tokens.
	boundary_wavelength: Scalar positive float tensor giving the desired
	boundary wavelength in tokens.

	Returns:
	Scalar float32 tensor giving the (possibly fractional) boundary index d*.

	Raises:
	ValueError: If periods are invalid or the boundary is outside valid range
	for a well-defined positive d*.
	"""
	if periods.dim() != 1:
	raise ValueError("periods must be 1D for boundary band index")
	if int(periods.numel()) < 2:
	raise ValueError("periods must have length >= 2 for boundary band index")
	if boundary_wavelength.dim() != 0:
	raise ValueError("boundary_wavelength must be a scalar tensor")
	if not torch.isfinite(boundary_wavelength).item():
	raise ValueError("boundary_wavelength must be finite")
	if float(boundary_wavelength.item()) <= 0.0:
	raise ValueError("boundary_wavelength must be > 0")

	periods_f = periods.to(dtype=torch.float32)
	if not torch.isfinite(periods_f).all().item():
	raise ValueError("periods must be finite for boundary band index")
	if float(periods_f[0].item()) <= 0.0:
	raise ValueError("periods must be positive for boundary band index")
	if not (periods_f[1:] > periods_f[:-1]).all().item():
	raise ValueError("periods must be strictly increasing for boundary band index")

	log_p0 = torch.log(periods_f[0])
	log_p1 = torch.log(periods_f[-1])
	denom = log_p1 - log_p0
	if float(denom.item()) <= 0.0:
	raise ValueError("Invalid periods range for boundary band index")
	log_boundary = torch.log(boundary_wavelength.to(dtype=torch.float32))
	q = int(periods_f.numel())
	d_star = (float(q - 1) * (log_boundary - log_p0)) / denom
	if not torch.isfinite(d_star).item():
	raise ValueError("Computed non-finite boundary band index d*")
	if float(d_star.item()) <= 0.0:
	raise ValueError(
	"Boundary wavelength implies d* <= 0; increase the boundary wavelength "
	"(or its multiplier) to be >= the wavelength of the first non-zero band."
	)
	return d_star


	def _lumina_alpha_ramp(
	*,
	qtr: int,
	d_star: Tensor,
	device: torch.device,
	) -> Tensor:
	"""Return alpha[d] = clamp(d / d*, 0, 1) for d in [0, qtr).

	Args:
	qtr: Number of RoPE bands per axis (Q).
	d_star: Scalar positive float tensor boundary index d*.
	device: Device for the returned alpha tensor.

	Returns:
	Float32 tensor of shape [Q] with values in [0, 1].
	"""
	if int(qtr) <= 0:
	raise ValueError("qtr must be positive for alpha ramp")
	if d_star.dim() != 0:
	raise ValueError("d_star must be a scalar tensor for alpha ramp")
	if float(d_star.item()) <= 0.0:
	raise ValueError("d_star must be > 0 for alpha ramp")
	d = torch.arange(int(qtr), device=device, dtype=torch.float32)
	alpha = d / d_star.to(device=device, dtype=torch.float32)
	return torch.clamp(alpha, min=0.0, max=1.0)


	def lumina_frequency_aware_periods_for_axis(
	    *,
	    periods: Tensor,
	    axis_len: int,
	    ref_axis_len: int,
	    boundary_log_multiplier: Tensor,
	    angle_multiplier: float,
	) -> Tensor:
	    """Warp the periods of one axis with the Lumina/Next-DiT frequency-aware rule.

	    Steps::

	        s           = axis_len / ref_axis_len
	        L_boundary  = ref_axis_len * exp(boundary_log_multiplier)
	        d*          = band index with period(d*) = L_boundary
	        alpha[d]    = clamp(d / d*, 0, 1)
	        period'[d]  = period[d] * s**alpha[d]

	    Notes on ``angle_multiplier``
	    -----------------------------
	    Angles in this module are::

	        angle(p, d) = angle_multiplier * p / period[d]

	    The wavelength (in tokens) is the change in ``p`` that advances the angle
	    by ``2π``::

	        wavelength[d] = 2π * period[d] / angle_multiplier

	    Lumina/Next-DiT place the boundary by matching wavelength to the reference
	    axis length, so the boundary wavelength ``L_boundary`` is first converted
	    into a boundary period::

	        period_boundary = (angle_multiplier / 2π) * L_boundary

	    With ``angle_multiplier == 2π`` (the DINOv3-style parameterization) this
	    reduces to ``period_boundary == L_boundary``.

	    Args:
	        periods: Base periods ``[Q]`` in tokens (wavelengths).
	        axis_len: Input axis length ``L`` in tokens.
	        ref_axis_len: Reference axis length ``L_ref`` in tokens.
	        boundary_log_multiplier: Scalar tensor; shared trainable log-multiplier.
	        angle_multiplier: RoPE angle multiplier used to convert between
	            periods and physical wavelengths in tokens.

	    Returns:
	        Warped periods ``[Q]`` as float32.

	    Raises:
	        ValueError: If inputs are malformed or imply an invalid boundary index.
	    """
	    if int(axis_len) <= 0:
	        raise ValueError("axis_len must be positive for frequency-aware periods")
	    if int(ref_axis_len) <= 0:
	        raise ValueError("ref_axis_len must be positive for frequency-aware periods")
	    if boundary_log_multiplier.dim() != 0:
	        raise ValueError("boundary_log_multiplier must be a scalar tensor")
	    if not torch.isfinite(boundary_log_multiplier).item():
	        raise ValueError("boundary_log_multiplier must be finite")
	    mult = float(angle_multiplier)
	    if not (math.isfinite(mult) and mult > 0.0):
	        raise ValueError("angle_multiplier must be finite and > 0")

	    target_device = periods.device
	    ref_len = float(int(ref_axis_len))
	    axis_scale = float(int(axis_len)) / ref_len
	    if not (math.isfinite(axis_scale) and axis_scale > 0.0):
	        raise ValueError("axis_len/ref_axis_len must be finite and > 0")

	    # Boundary wavelength in tokens, then converted to this module's period units.
	    log_mult = boundary_log_multiplier.to(device=target_device, dtype=torch.float32)
	    boundary_wavelength = ref_len * torch.exp(log_mult)
	    boundary_period = (mult / (2.0 * float(math.pi))) * boundary_wavelength

	    d_star = _lumina_boundary_band_index(
	        periods=periods, boundary_wavelength=boundary_period
	    )
	    alpha = _lumina_alpha_ramp(
	        qtr=int(periods.numel()), d_star=d_star, device=target_device
	    )
	    scale_base = torch.tensor(axis_scale, device=target_device, dtype=torch.float32)
	    per_band_scale = torch.pow(scale_base, alpha)
	    return periods.to(device=target_device, dtype=torch.float32) * per_band_scale


	def build_axial_rope2d_with_lumina_frequency_warp(
	    base: AxialRoPE2D,
	    *,
	    ref_h_tokens: int,
	    ref_w_tokens: int,
	    boundary_log_multiplier: float | None,
	    boundary_band_multiplier: float | None,
	) -> AxialRoPE2D:
	    """Return an AxialRoPE2D module that applies Lumina-style frequency warping.

	    This helper is intended for inference-time experimentation on checkpoints
	    that were trained without frequency-aware warping (e.g.
	    ``position_encoding=ROPE_2D_AXIAL_UNNORMALIZED``). It constructs a new
	    AxialRoPE2D instance that:

	    - Keeps the base RoPE periods and layout identical to ``base``.
	    - Applies Lumina/Next-DiT per-axis warping based on the runtime token
	      lengths ``H`` and ``W`` relative to reference lengths.
	    - Uses a fixed (non-trainable) scalar boundary multiplier for inference.

	    Args:
	        base: Existing AxialRoPE2D instance from a loaded model.
	        ref_h_tokens: Reference H token length (L_ref,h); must be positive.
	        ref_w_tokens: Reference W token length (L_ref,w); must be positive.
	        boundary_log_multiplier: Optional log multiplier applied to reference
	            lengths to define the boundary wavelength. Use 0.0 for "boundary at
	            L_ref". Mutually exclusive with boundary_band_multiplier.
	        boundary_band_multiplier: Optional multiplier that directly selects the
	            boundary band index d* relative to the lowest-frequency band index
	            (qtr-1). Concretely, with qtr bands per axis:

	                d* = boundary_band_multiplier * (qtr - 1)

	            This lets you move the transition point in frequency space:
	            - smaller values => more bands become PI-like (more interpolation)
	            - larger values => fewer bands become PI-like (more extrapolation)

	            When provided, we compute the implied boundary wavelength and store
	            it as boundary_log_multiplier for the module.

	    Returns:
	        New AxialRoPE2D instance on the same device as ``base``.

	    Raises:
	        TypeError: If base is not an AxialRoPE2D.
	        ValueError: If base uses incompatible coordinates for Lumina warping,
	            if not exactly one boundary argument is given, if a reference
	            length is not positive, or if the band-multiplier inputs are
	            invalid.
	    """
	    if not isinstance(base, AxialRoPE2D):
	        raise TypeError("base must be an AxialRoPE2D")
	    if base.cfg.coord_mode is not AxialRoPE2DCoordMode.PATCH_INDICES:
	        raise ValueError(
	            "Lumina frequency-aware warping requires coord_mode=PATCH_INDICES"
	        )
	    if (boundary_log_multiplier is None) == (boundary_band_multiplier is None):
	        raise ValueError(
	            "Provide exactly one of boundary_log_multiplier or boundary_band_multiplier"
	        )
	    # Validate the reference lengths before they are used as a divisor below;
	    # otherwise the band-multiplier path fails with ZeroDivisionError (0) or an
	    # opaque "math domain error" (negative) instead of a clear ValueError.
	    if int(ref_h_tokens) <= 0 or int(ref_w_tokens) <= 0:
	        raise ValueError("ref_h_tokens and ref_w_tokens must be positive")

	    resolved_log_multiplier: float
	    if boundary_band_multiplier is not None:
	        if int(ref_h_tokens) != int(ref_w_tokens):
	            raise ValueError(
	                "boundary_band_multiplier requires ref_h_tokens == ref_w_tokens when using a shared scalar boundary"
	            )
	        mult = float(boundary_band_multiplier)
	        if not math.isfinite(mult) or mult <= 0.0:
	            raise ValueError("boundary_band_multiplier must be finite and > 0")
	        qtr = int(base.periods.numel())
	        if qtr < 2:
	            raise ValueError(
	                "AxialRoPE2D periods length must be >= 2 for boundary band selection"
	            )
	        # Solve for the boundary wavelength implied by choosing d* directly.
	        #
	        # We use geometric interpolation in log-period space:
	        #   log(period(d)) = log(period0) + (d/(qtr-1)) * (log(period_max) - log(period0))
	        # with:
	        #   d* = boundary_band_multiplier * (qtr-1)
	        #
	        # This allows d* outside the trained band range (multiplier > 1), which
	        # corresponds to pushing the transition beyond the lowest-frequency band.
	        with torch.no_grad():
	            periods_f = base.periods.to(dtype=torch.float32, device=torch.device("cpu"))
	            if not (periods_f[1:] > periods_f[:-1]).all().item():
	                raise ValueError(
	                    "base.periods must be strictly increasing for boundary band selection"
	                )
	            log_p0 = float(torch.log(periods_f[0]).item())
	            log_p1 = float(torch.log(periods_f[-1]).item())
	        d_star = mult * float(qtr - 1)
	        log_boundary_period = log_p0 + (d_star / float(qtr - 1)) * (log_p1 - log_p0)
	        boundary_period = math.exp(log_boundary_period)
	        angle_mult = float(base.cfg.angle_multiplier)
	        if not math.isfinite(angle_mult) or angle_mult <= 0.0:
	            raise ValueError("base.cfg.angle_multiplier must be finite and > 0")
	        # Convert the boundary from period units to a wavelength in tokens, then
	        # express it as a log-multiple of the (shared) reference length.
	        boundary_wavelength = (2.0 * float(math.pi) / angle_mult) * boundary_period
	        resolved_log_multiplier = math.log(
	            boundary_wavelength / float(int(ref_h_tokens))
	        )
	    else:
	        if boundary_log_multiplier is None:  # pragma: no cover - validated above
	            raise RuntimeError("boundary_log_multiplier missing despite validation")
	        resolved_log_multiplier = float(boundary_log_multiplier)

	    freq_cfg = AxialRoPE2DFrequencyAwareConfig(
	        ref_h_tokens=int(ref_h_tokens),
	        ref_w_tokens=int(ref_w_tokens),
	        boundary_log_multiplier_init=resolved_log_multiplier,
	    )
	    # Only one warp may be active per config, so clear the other two.
	    cfg = replace(base.cfg, frequency_aware=freq_cfg, beta_warp=None, alpha_warp=None)
	    device = base.periods.device
	    warped = AxialRoPE2D(head_dim=int(base.head_dim), cfg=cfg).to(device=device)
	    with torch.no_grad():
	        # Carry over the base periods verbatim so only the warp differs.
	        warped.periods.copy_(base.periods.to(device=device, dtype=torch.float32))
	        if warped.boundary_log_multiplier is None:  # pragma: no cover - defensive
	            raise RuntimeError("Expected boundary_log_multiplier to be initialized")
	        warped.boundary_log_multiplier.copy_(
	            torch.tensor(resolved_log_multiplier, device=device, dtype=torch.float32)
	        )
	    # Inference-only helper: freeze the boundary multiplier.
	    warped.boundary_log_multiplier.requires_grad_(False)
	    return warped


	def build_axial_rope2d_inference_warp_with_strength(
	base: AxialRoPE2D,
	*,
	ref_h_tokens: int,
	ref_w_tokens: int,
	beta_hi_u: float,
	beta_lo_u: float,
	beta_bend_u: float,
	beta_max: float,
	) -> AxialRoPE2D:
	"""Build an inference-only RoPE warp parameterized by a 3-knob beta(t) curve.

	This helper is meant for notebook experimentation on checkpoints trained
	with patch-index axial RoPE (e.g. ``position_encoding=ROPE_2D_AXIAL_UNNORMALIZED``).

	We warp per-axis RoPE periods (wavelengths, in tokens) as:

	period'[d] = period[d] * s ** beta[d] where s = L / L_ref

	with a smooth exponent curve beta(d) over bands. Unlike a strict
	interpolation-only exponent (0..1), beta is allowed to be negative or > 1,
	which is important for unnormalized RoPE (e.g. base=10_000) where some very
	low-frequency bands are effectively "dead" on practical token grids unless
	their frequencies can be increased (beta < 0).

	Knobs (bounded via u-space)
	---------------------------
	We use three unconstrained parameters (u-space) which map to bounded beta
	values via tanh:

	beta_hi = beta_max * tanh(beta_hi_u) (high-frequency endpoint, d=0)
	beta_lo = beta_max * tanh(beta_lo_u) (low-frequency endpoint, d=qtr-1)
	beta_bend = beta_max * tanh(beta_bend_u) ("bump" amplitude in the middle)

	Then define the per-band curve over t in [0,1] (high -> low frequency):

	t = d / (qtr - 1)
	beta(t) = lerp(beta_hi, beta_lo, t) + beta_bend * 4t(1-t)

	The bump term is 0 at the endpoints and peaks at 1 at t=0.5.

	Notes
	-----
	- This wrapper is inference-only: it is not saved in checkpoints.
	- It requires patch-index coordinates (no normalized "gauge").
	- It preserves the base module's periods and layout exactly.

	Args:
	base: Existing AxialRoPE2D instance from a loaded model.
	ref_h_tokens: Reference H token length (L_ref,h).
	ref_w_tokens: Reference W token length (L_ref,w).
	beta_hi_u: Unconstrained u for beta_hi.
	beta_lo_u: Unconstrained u for beta_lo.
	beta_bend_u: Unconstrained u for beta_bend (mid-band bump).
	beta_max: Maximum absolute beta value (> 0). Higher increases control.

	Returns:
	An AxialRoPE2D instance whose forward applies the inference-only warp.
	"""
	if not isinstance(base, AxialRoPE2D):
	raise TypeError("base must be an AxialRoPE2D")
	if base.cfg.coord_mode is not AxialRoPE2DCoordMode.PATCH_INDICES:
	raise ValueError(
	"Inference freq-warp requires base.cfg.coord_mode=PATCH_INDICES"
	)
	if int(ref_h_tokens) <= 0 or int(ref_w_tokens) <= 0:
	raise ValueError("ref_h_tokens and ref_w_tokens must be positive")
	hi_u = float(beta_hi_u)
	lo_u = float(beta_lo_u)
	bend_u = float(beta_bend_u)
	if not math.isfinite(hi_u):
	raise ValueError("beta_hi_u must be finite")
	if not math.isfinite(lo_u):
	raise ValueError("beta_lo_u must be finite")
	if not math.isfinite(bend_u):
	raise ValueError("beta_bend_u must be finite")
	bmax = float(beta_max)
	if not math.isfinite(bmax) or bmax <= 0.0:
	raise ValueError("beta_max must be finite and > 0")

	class _AxialRoPE2DInferenceWarp(AxialRoPE2D):
	"""Inference-only axial RoPE variant with beta-curve knobs."""

	def __init__(self, *, device: torch.device) -> None:
	super().__init__(head_dim=int(base.head_dim), cfg=base.cfg)
	self.ref_h_tokens: Final[int] = int(ref_h_tokens)
	self.ref_w_tokens: Final[int] = int(ref_w_tokens)
	# Store as buffers so the notebook can mutate by replacing the module.
	self.register_buffer(
	"beta_hi_u",
	torch.tensor(float(hi_u), dtype=torch.float32),
	persistent=False,
	)
	self.register_buffer(
	"beta_lo_u",
	torch.tensor(float(lo_u), dtype=torch.float32),
	persistent=False,
	)
	self.register_buffer(
	"beta_bend_u",
	torch.tensor(float(bend_u), dtype=torch.float32),
	persistent=False,
	)
	self.register_buffer(
	"beta_max",
	torch.tensor(float(bmax), dtype=torch.float32),
	persistent=False,
	)
	self.to(device=device)
	with torch.no_grad():
	self.periods.copy_(
	base.periods.detach().to(device=device, dtype=torch.float32)
	)

	def forward(
	self,
	*,
	H: int,
	W: int,
	scales: Tensor \| None,
	) -> tuple[Tensor, Tensor]:
	if scales is not None:
	raise ValueError("Inference freq-warp does not support dilation scales")
	if int(H) <= 0 or int(W) <= 0:
	raise ValueError("H and W must be positive for axial RoPE")
	device = self.periods.device
	offset = float(self.cfg.coord_offset)
	coords = _get_patch_index_coords(
	int(H), int(W), device=device, offset=offset
	)
	if coords.dim() != 2 or coords.shape[1] != 2:
	raise RuntimeError("Axial RoPE coords must have shape [HW, 2]")

	qtr = int(self.periods.numel())
	if qtr <= 0:
	raise RuntimeError("Axial RoPE periods length must be positive")

	beta_max_t = cast("Tensor", self.beta_max).to(
	device=device, dtype=torch.float32
	)
	beta_hi = beta_max_t * torch.tanh(
	cast("Tensor", self.beta_hi_u).to(device=device, dtype=torch.float32)
	)
	beta_lo = beta_max_t * torch.tanh(
	cast("Tensor", self.beta_lo_u).to(device=device, dtype=torch.float32)
	)
	beta_bend = beta_max_t * torch.tanh(
	cast("Tensor", self.beta_bend_u).to(device=device, dtype=torch.float32)
	)

	if qtr == 1:
	beta = beta_hi[None]
	else:
	t = torch.arange(int(qtr), device=device, dtype=torch.float32) / float(
	qtr - 1
	)
	bump = 4.0 * t * (1.0 - t)
	beta = (1.0 - t) * beta_hi + t * beta_lo + beta_bend * bump

	s_h = float(int(H)) / float(int(self.ref_h_tokens))
	s_w = float(int(W)) / float(int(self.ref_w_tokens))
	if (
	not math.isfinite(s_h)
	or s_h <= 0.0
	or not math.isfinite(s_w)
	or s_w <= 0.0
	):
	raise ValueError(
	"H/ref_h_tokens and W/ref_w_tokens must be finite and > 0"
	)

	periods_h = self.periods * torch.pow(
	torch.tensor(s_h, device=device, dtype=torch.float32), beta
	)
	periods_w = self.periods * torch.pow(
	torch.tensor(s_w, device=device, dtype=torch.float32), beta
	)
	axis_periods = torch.stack([periods_h, periods_w], dim=0) # [2, Q]

	angles = (
	float(self.cfg.angle_multiplier)
	* coords[:, :, None].to(dtype=torch.float32)
	/ axis_periods[None, :, :].to(dtype=torch.float32)
	)
	match self.cfg.dim_layout:
	case AxialRoPE2DDimLayout.HALF_SPLIT:
	angles = angles.flatten(1, 2).repeat(1, 2)
	case AxialRoPE2DDimLayout.PAIR_INTERLEAVED:
	angles = angles.repeat_interleave(2, dim=-1).flatten(1, 2)
	case _ as unreachable: # pragma: no cover - defensive
	raise RuntimeError(f"Unsupported dim_layout: {unreachable}")
	if angles.shape != (int(H) * int(W), int(self.head_dim)):
	raise RuntimeError(
	"Unexpected angles shape in inference freq-warp: "
	f"{tuple(angles.shape)} for H={int(H)} W={int(W)}"
	)
	return torch.sin(angles), torch.cos(angles)

	return _AxialRoPE2DInferenceWarp(device=base.periods.device)


	class AxialRoPE2D(nn.Module):
	"""DINOv3-style axial 2D RoPE sin/cos generator.

	The base periods are fixed by ``AxialRoPE2DConfig``. Optionally, this module
	can include learnable scalar parameters when using:
	- ``frequency_aware`` (boundary_log_multiplier), or
	- ``beta_warp`` (beta_hi_u/beta_lo_u/beta_bend_u), or
	- ``alpha_warp`` (alpha per-band exponents).
	"""

	periods: Tensor

	def __init__(self, *, head_dim: int, cfg: AxialRoPE2DConfig) -> None:
	super().__init__()
	if int(head_dim) <= 0:
	raise ValueError("head_dim must be positive for AxialRoPE2D")
	if int(head_dim) % 4 != 0:
	raise ValueError(
	"AxialRoPE2D requires head_dim % 4 == 0 (DINOv3 constraint); "
	f"got head_dim={int(head_dim)}"
	)
	if not isinstance(cfg, AxialRoPE2DConfig):
	raise TypeError("cfg must be an AxialRoPE2DConfig for AxialRoPE2D")
	self.head_dim: Final[int] = int(head_dim)
	self.cfg: Final[AxialRoPE2DConfig] = cfg
	self._d_head: Final[int] = self.head_dim
	self.register_buffer(
	"periods",
	torch.empty(self._d_head // 4, dtype=torch.float32),
	persistent=True,
	)
	if cfg.frequency_aware is None:
	self.register_parameter("boundary_log_multiplier", None)
	else:
	init = float(cfg.frequency_aware.boundary_log_multiplier_init)
	self.boundary_log_multiplier = nn.Parameter(
	torch.tensor(init, dtype=torch.float32),
	requires_grad=True,
	)
	if cfg.beta_warp is None:
	self.register_parameter("beta_hi_u", None)
	self.register_parameter("beta_lo_u", None)
	self.register_parameter("beta_bend_u", None)
	else:
	beta = cfg.beta_warp
	self.beta_hi_u = nn.Parameter(
	torch.tensor(float(beta.beta_hi_u_init), dtype=torch.float32),
	requires_grad=True,
	)
	self.beta_lo_u = nn.Parameter(
	torch.tensor(float(beta.beta_lo_u_init), dtype=torch.float32),
	requires_grad=True,
	)
	self.beta_bend_u = nn.Parameter(
	torch.tensor(float(beta.beta_bend_u_init), dtype=torch.float32),
	requires_grad=True,
	)
	if cfg.alpha_warp is None:
	self.register_parameter("alpha", None)
	else:
	qtr = int(self._d_head) // 4
	if qtr <= 0: # pragma: no cover - defensive
	raise RuntimeError("AxialRoPE2D periods length must be positive")
	init = float(cfg.alpha_warp.alpha_init)
	if not math.isfinite(init):
	raise RuntimeError("alpha_init must be finite for alpha-warp RoPE")
	self.alpha = nn.Parameter(
	torch.full((int(qtr),), init, dtype=torch.float32),
	requires_grad=True,
	)
	self._init_periods()

	def _apply(self, fn): # type: ignore[override]
	out = super()._apply(fn)
	with torch.no_grad():
	self.periods.data = self.periods.data.to(dtype=torch.float32)
	if self.boundary_log_multiplier is not None:
	self.boundary_log_multiplier.data = (
	self.boundary_log_multiplier.data.to(dtype=torch.float32)
	)
	if self.beta_hi_u is not None:
	self.beta_hi_u.data = self.beta_hi_u.data.to(dtype=torch.float32)
	if self.beta_lo_u is not None:
	self.beta_lo_u.data = self.beta_lo_u.data.to(dtype=torch.float32)
	if self.beta_bend_u is not None:
	self.beta_bend_u.data = self.beta_bend_u.data.to(dtype=torch.float32)
	if self.alpha is not None:
	self.alpha.data = self.alpha.data.to(dtype=torch.float32)
	return out

	def _init_periods(self) -> None:
	"""Initialize per-dimension periods using DINOv3 formulas."""
	device: torch.device = self.periods.device
	dtype: torch.dtype = self.periods.dtype
	d_head = int(self._d_head)
	qtr = d_head // 4
	if qtr <= 0:
	raise RuntimeError("AxialRoPE2D periods length must be positive")
	if self.cfg.base is not None:
	base = float(self.cfg.base)
	exponents = (
	2.0
	* torch.arange(int(qtr), device=device, dtype=dtype)
	/ float(d_head // 2)
	)
	periods = torch.tensor(base, device=device, dtype=dtype) ** exponents
	else:
	if self.cfg.min_period is None or self.cfg.max_period is None:
	raise RuntimeError(
	"AxialRoPE2DConfig must provide min_period and max_period when base is None"
	)
	min_p = float(self.cfg.min_period)
	max_p = float(self.cfg.max_period)
	base = max_p / min_p
	exponents = torch.linspace(0.0, 1.0, int(qtr), device=device, dtype=dtype)
	periods = torch.tensor(base, device=device, dtype=dtype) ** exponents
	periods = periods / torch.tensor(base, device=device, dtype=dtype)
	periods = periods * torch.tensor(max_p, device=device, dtype=dtype)
	self.periods.data = periods

	def forward(
	self,
	*,
	H: int,
	W: int,
	scales: Tensor \| None,
	) -> tuple[Tensor, Tensor]:
	"""Return (sin, cos) buffers for axial 2D RoPE.

	Args:
	H: Patch-grid height.
	W: Patch-grid width.
	scales: Optional per-batch dilation scale (scalar tensor). When
	None, returns shared sin/cos shaped ``[HW, head_dim]``. When
	provided, applies the scalar dilation and still returns shared
	sin/cos shaped ``[HW, head_dim]``.
	"""
	if int(H) <= 0 or int(W) <= 0:
	raise ValueError("H and W must be positive for AxialRoPE2D forward")
	device = self.periods.device
	offset = float(self.cfg.coord_offset)
	coords: Tensor
	match self.cfg.coord_mode:
	case AxialRoPE2DCoordMode.DINOV3_NORMALIZED:
	coords = _get_dinov3_normalized_coords(
	int(H),
	int(W),
	device=device,
	normalize=self.cfg.normalize_coords,
	offset=offset,
	)
	case AxialRoPE2DCoordMode.PATCH_INDICES:
	coords = _get_patch_index_coords(
	int(H), int(W), device=device, offset=offset
	)
	case _ as unreachable: # pragma: no cover - defensive
	raise RuntimeError(f"Unsupported coord_mode: {unreachable}")
	if coords.dim() != 2 or coords.shape[1] != 2:
	raise RuntimeError("AxialRoPE2D coords must have shape [HW, 2]")
	if self.cfg.frequency_aware is not None:
	if scales is not None:
	raise ValueError(
	"frequency-aware axial RoPE does not support dilation scales"
	)
	if self.boundary_log_multiplier is None:
	raise RuntimeError(
	"boundary_log_multiplier parameter missing for frequency-aware RoPE"
	)
	ref_h = int(self.cfg.frequency_aware.ref_h_tokens)
	ref_w = int(self.cfg.frequency_aware.ref_w_tokens)
	periods_h = lumina_frequency_aware_periods_for_axis(
	periods=self.periods,
	axis_len=int(H),
	ref_axis_len=ref_h,
	boundary_log_multiplier=self.boundary_log_multiplier,
	angle_multiplier=float(self.cfg.angle_multiplier),
	)
	periods_w = lumina_frequency_aware_periods_for_axis(
	periods=self.periods,
	axis_len=int(W),
	ref_axis_len=ref_w,
	boundary_log_multiplier=self.boundary_log_multiplier,
	angle_multiplier=float(self.cfg.angle_multiplier),
	)
	axis_periods = torch.stack([periods_h, periods_w], dim=0) # [2, Q]
	elif self.cfg.beta_warp is not None:
	if scales is not None:
	raise ValueError(
	"beta-warp axial RoPE does not support dilation scales"
	)
	if (
	self.beta_hi_u is None
	or self.beta_lo_u is None
	or self.beta_bend_u is None
	):
	raise RuntimeError("beta warp parameters missing for beta-warp RoPE")
	beta_cfg = self.cfg.beta_warp
	ref_h = int(beta_cfg.ref_h_tokens)
	ref_w = int(beta_cfg.ref_w_tokens)
	qtr = int(self.periods.numel())
	if qtr <= 0: # pragma: no cover - defensive (checked elsewhere)
	raise RuntimeError("AxialRoPE2D periods length must be positive")
	beta_max = float(beta_cfg.beta_max)
	if not math.isfinite(beta_max) or beta_max <= 0.0:
	raise RuntimeError("beta_max must be finite and > 0")
	beta_max_t = torch.tensor(beta_max, device=device, dtype=torch.float32)
	beta_hi = beta_max_t * torch.tanh(self.beta_hi_u.to(dtype=torch.float32))
	beta_lo = beta_max_t * torch.tanh(self.beta_lo_u.to(dtype=torch.float32))
	beta_bend = beta_max_t * torch.tanh(
	self.beta_bend_u.to(dtype=torch.float32)
	)
	if qtr == 1:
	beta = beta_hi[None]
	else:
	t = torch.arange(int(qtr), device=device, dtype=torch.float32) / float(
	qtr - 1
	)
	bump = 4.0 * t * (1.0 - t)
	beta = (1.0 - t) * beta_hi + t * beta_lo + beta_bend * bump

	s_h = float(int(H)) / float(ref_h)
	s_w = float(int(W)) / float(ref_w)
	if (
	not math.isfinite(s_h)
	or s_h <= 0.0
	or not math.isfinite(s_w)
	or s_w <= 0.0
	):
	raise RuntimeError(
	"Computed invalid axis scale factors for beta-warp RoPE"
	)
	periods_h = self.periods.to(dtype=torch.float32) * torch.pow(
	torch.tensor(s_h, device=device, dtype=torch.float32), beta
	)
	periods_w = self.periods.to(dtype=torch.float32) * torch.pow(
	torch.tensor(s_w, device=device, dtype=torch.float32), beta
	)
	axis_periods = torch.stack([periods_h, periods_w], dim=0) # [2, Q]
	elif self.cfg.alpha_warp is not None:
	if scales is not None:
	raise ValueError(
	"alpha-warp axial RoPE does not support dilation scales"
	)
	if self.alpha is None:
	raise RuntimeError("alpha parameter missing for alpha-warp RoPE")
	alpha_cfg = self.cfg.alpha_warp
	ref_h = int(alpha_cfg.ref_h_tokens)
	ref_w = int(alpha_cfg.ref_w_tokens)
	qtr = int(self.periods.numel())
	if int(self.alpha.numel()) != qtr:
	raise RuntimeError(
	"alpha length must match RoPE periods length for alpha-warp RoPE"
	)
	s_h = float(int(H)) / float(ref_h)
	s_w = float(int(W)) / float(ref_w)
	if (
	not math.isfinite(s_h)
	or s_h <= 0.0
	or not math.isfinite(s_w)
	or s_w <= 0.0
	):
	raise RuntimeError(
	"Computed invalid axis scale factors for alpha-warp RoPE"
	)
	alpha = self.alpha.to(device=device, dtype=torch.float32)
	scale_h = torch.pow(
	torch.tensor(s_h, device=device, dtype=torch.float32), alpha
	)
	scale_w = torch.pow(
	torch.tensor(s_w, device=device, dtype=torch.float32), alpha
	)
	periods_h = self.periods.to(dtype=torch.float32) / scale_h
	periods_w = self.periods.to(dtype=torch.float32) / scale_w
	axis_periods = torch.stack([periods_h, periods_w], dim=0) # [2, Q]
	else:
	axis_periods = self.periods[None, :].expand(2, -1).to(dtype=torch.float32)

	# Angles: angle_multiplier * coords / periods, flattened and tiled.
	angles = (
	float(self.cfg.angle_multiplier)
	* coords[:, :, None].to(dtype=torch.float32)
	/ axis_periods[None, :, :].to(dtype=torch.float32)
	)
	match self.cfg.dim_layout:
	case AxialRoPE2DDimLayout.HALF_SPLIT:
	angles = angles.flatten(1, 2).repeat(1, 2)
	case AxialRoPE2DDimLayout.PAIR_INTERLEAVED:
	angles = angles.repeat_interleave(2, dim=-1).flatten(1, 2)
	case _ as unreachable: # pragma: no cover - defensive
	raise RuntimeError(f"Unsupported dim_layout: {unreachable}")
	if angles.shape != (int(H) * int(W), int(self._d_head)):
	raise RuntimeError(
	"Unexpected angles shape in AxialRoPE2D: "
	f"{tuple(angles.shape)} for H={int(H)} W={int(W)}"
	)
	if scales is not None:
	if scales.dim() != 0:
	raise ValueError(
	"AxialRoPE2D scales must be a scalar tensor for per-batch dilation; "
	"per-sample dilation is not supported"
	)
	angles = angles * scales.to(device=device, dtype=torch.float32)
	cos = torch.cos(angles)
	sin = torch.sin(angles)
	return sin, cos


	def _dy_ntk_periods_for_axis(
	    *,
	    periods: Tensor,
	    axis_len: int,
	    ref_axis_len: int,
	    noise_time: Tensor,
	    lambda_s: float,
	    lambda_t: float,
	) -> Tensor:
	    """Compute Dy-NTK periods for a single spatial axis.

	    The axis scale is ``axis_len / ref_axis_len``; the actual warp is
	    delegated to ``_dy_ntk_periods_for_scale``.

	    Raises:
	        ValueError: If a token length is not positive, ``periods`` is empty,
	            or the resulting scale is not finite and > 0.
	    """

	    n_tokens = int(axis_len)
	    if n_tokens <= 0 or int(ref_axis_len) <= 0:
	        raise ValueError("axis_len and ref_axis_len must be positive for Dy-NTK")
	    if int(periods.numel()) <= 0:
	        raise ValueError("periods must be non-empty for Dy-NTK")
	    ratio = float(n_tokens) / float(int(ref_axis_len))
	    if not (math.isfinite(ratio) and ratio > 0.0):
	        raise ValueError("Dy-NTK axis scale must be finite and > 0")
	    warp_kwargs = {
	        "periods": periods,
	        "scale": ratio,
	        "noise_time": noise_time,
	        "lambda_s": float(lambda_s),
	        "lambda_t": float(lambda_t),
	    }
	    return _dy_ntk_periods_for_scale(**warp_kwargs)


	def _dy_ntk_periods_for_scale(
	    *,
	    periods: Tensor,
	    scale: float,
	    noise_time: Tensor,
	    lambda_s: float,
	    lambda_t: float,
	) -> Tensor:
	    """Compute Dy-NTK periods from an already-known axis scale.

	    For ``scale <= 1`` the float32 periods are returned unchanged. Otherwise
	    band ``i`` of ``Q`` is multiplied by ``scale ** (kappa * i / (Q - 1))``
	    with ``kappa = lambda_s * noise_time ** lambda_t`` (a single band is left
	    unscaled).

	    Raises:
	        ValueError: If ``scale`` is not finite and > 0, or ``periods`` is empty.
	    """

	    axis_scale = float(scale)
	    if not (math.isfinite(axis_scale) and axis_scale > 0.0):
	        raise ValueError("Dy-NTK scale must be finite and > 0")
	    n_bands = int(periods.numel())
	    if n_bands <= 0:
	        raise ValueError("periods must be non-empty for Dy-NTK")

	    periods_f32 = periods.to(dtype=torch.float32)
	    if scale <= 1.0:
	        # No extrapolation needed at or below the reference resolution.
	        return periods_f32

	    dev = periods.device
	    # Normalised band position in [0, 1]; a lone band sits at 0.
	    if n_bands == 1:
	        band_pos = torch.zeros((1,), device=dev, dtype=torch.float32)
	    else:
	        band_pos = torch.arange(n_bands, device=dev, dtype=torch.float32) / (
	            float(n_bands - 1)
	        )
	    noise = noise_time.to(device=dev, dtype=torch.float32)
	    kappa = float(lambda_s) * torch.pow(noise, float(lambda_t))
	    scale_t = torch.tensor(axis_scale, device=dev, dtype=torch.float32)
	    return periods_f32 * torch.pow(scale_t, kappa * band_pos)


	def _dype_dynamic_exponent(
	    *, noise_time: float, lambda_s: float, lambda_t: float
	) -> float:
	    """Compute the DyPE dynamic magnitude ``lambda_s * t ** lambda_t``.

	    ``t`` is ``noise_time`` clamped into ``[0, 1]``.

	    Raises:
	        ValueError: If ``noise_time`` is not finite, or if ``lambda_s`` or
	            ``lambda_t`` is not finite and > 0.
	    """

	    t = float(noise_time)
	    if not math.isfinite(t):
	        raise ValueError("DyPE noise_time must be finite")
	    magnitude = float(lambda_s)
	    power = float(lambda_t)
	    if not (math.isfinite(magnitude) and magnitude > 0.0):
	        raise ValueError("DyPE lambda_s must be finite and > 0")
	    if not (math.isfinite(power) and power > 0.0):
	        raise ValueError("DyPE lambda_t must be finite and > 0")
	    t_clamped = max(0.0, min(1.0, t))
	    return magnitude * (t_clamped**power)


	def _dype_correction_factor(
	    *,
	    periods: Tensor,
	    rotations: float,
	    ref_axis_len: int,
	    angle_multiplier: float,
	) -> float:
	    """Locate the (fractional) band that completes ``rotations`` turns.

	    The target period is
	    ``angle_multiplier / (2 * pi) * ref_axis_len / rotations``. Its position is
	    interpolated in log-space between the first and last entry of ``periods``
	    and scaled to the index range ``[0, Q - 1]``; the result is not clamped.
	    With fewer than two bands the function returns ``0.0``.

	    Raises:
	        ValueError: If ``ref_axis_len`` is not positive, ``rotations`` or
	            ``angle_multiplier`` is not finite and > 0, or the first/last
	            periods are not positive and strictly increasing.
	    """

	    if int(ref_axis_len) <= 0:
	        raise ValueError("ref_axis_len must be positive for DyPE correction")
	    turns = float(rotations)
	    if not (math.isfinite(turns) and turns > 0.0):
	        raise ValueError("rotations must be finite and > 0")
	    multiplier = float(angle_multiplier)
	    if not (math.isfinite(multiplier) and multiplier > 0.0):
	        raise ValueError("angle_multiplier must be finite and > 0")
	    n_bands = int(periods.numel())
	    if n_bands < 2:
	        return 0.0

	    # Read the two end-point periods on the host as Python floats.
	    host = periods.detach().to(device=torch.device("cpu"), dtype=torch.float32)
	    first = float(host[0].item())
	    last = float(host[-1].item())
	    if first <= 0.0 or last <= first:
	        raise ValueError("periods must be positive and strictly increasing for DyPE")

	    boundary_wavelength = float(int(ref_axis_len)) / turns
	    boundary_period = (multiplier / (2.0 * float(math.pi))) * boundary_wavelength
	    log_first = math.log(first)
	    log_last = math.log(last)
	    fraction = (math.log(boundary_period) - log_first) / (log_last - log_first)
	    return float(periods.numel() - 1) * (fraction)


	def _dype_ramp_mask(
	    *,
	    periods: Tensor,
	    threshold_high_rotations: float,
	    threshold_low_rotations: float,
	    ref_axis_len: int,
	    angle_multiplier: float,
	) -> Tensor:
	    """Build the YaRN-style per-band mask for one pair of rotation thresholds.

	    The mask is ``1`` at and below the band index derived from
	    ``threshold_high_rotations``, ``0`` at and above the index derived from
	    ``threshold_low_rotations``, and linear in between. A single band always
	    yields a mask of ones.

	    Raises:
	        ValueError: If ``periods`` is empty (or the correction-factor helper
	            rejects its inputs).
	    """

	    n_bands = int(periods.numel())
	    if n_bands <= 0:
	        raise ValueError("periods must be non-empty for DyPE ramp mask")
	    dev = periods.device
	    if n_bands == 1:
	        return torch.ones((1,), device=dev, dtype=torch.float32)

	    def _band_index(rotations: float) -> float:
	        # Fractional band index for the given number of rotations.
	        return _dype_correction_factor(
	            periods=periods,
	            rotations=float(rotations),
	            ref_axis_len=int(ref_axis_len),
	            angle_multiplier=float(angle_multiplier),
	        )

	    start = math.floor(_band_index(threshold_high_rotations))
	    stop = math.ceil(_band_index(threshold_low_rotations))
	    # Clamp into the valid index range and keep the ramp at least one band wide.
	    start = max(0, min(n_bands - 1, int(start)))
	    stop = max(0, min(n_bands, int(stop)))
	    if start == stop:
	        stop = min(n_bands, start + 1)
	    band_ids = torch.arange(n_bands, device=dev, dtype=torch.float32)
	    progress = (band_ids - float(start)) / float(stop - start)
	    return 1.0 - torch.clamp(progress, min=0.0, max=1.0)


	def _dy_yarn_periods_for_axis(
	    *,
	    periods: Tensor,
	    linear_scale: float,
	    ntk_scale: float,
	    ref_axis_len: int,
	    noise_time: float,
	    lambda_s: float,
	    cfg: AxialRoPE2DDyPEConfig,
	    angle_multiplier: float,
	) -> Tensor:
	    """Compute Dy-YaRN periods for a single spatial axis.

	    Frequencies are blended in two stages: first between the linearly scaled
	    and the NTK-scaled frequencies (beta thresholds), then between that result
	    and the unscaled base frequencies (gamma thresholds). Both threshold pairs
	    are raised to the dynamic exponent ``kappa``. The float32 input periods are
	    returned unchanged when neither scale exceeds 1 or ``kappa <= 1e-6``.

	    Raises:
	        ValueError: If ``ref_axis_len`` is not positive or either scale is not
	            finite and > 0 (helpers may raise for their own inputs).
	    """

	    if int(ref_axis_len) <= 0:
	        raise ValueError("ref_axis_len must be positive for Dy-YaRN")
	    linear_s = float(linear_scale)
	    ntk_s = float(ntk_scale)
	    scales_ok = (
	        math.isfinite(linear_s)
	        and linear_s > 0.0
	        and math.isfinite(ntk_s)
	        and ntk_s > 0.0
	    )
	    if not scales_ok:
	        raise ValueError("Dy-YaRN axis scales must be finite and > 0")

	    base_periods = periods.to(dtype=torch.float32)
	    if max(linear_s, ntk_s) <= 1.0:
	        return base_periods
	    kappa = _dype_dynamic_exponent(
	        noise_time=float(noise_time),
	        lambda_s=float(lambda_s),
	        lambda_t=float(cfg.lambda_t),
	    )
	    if kappa <= 1e-6:
	        return base_periods

	    def _mask(high_rotations: float, low_rotations: float) -> Tensor:
	        # Ramp mask for one (high, low) rotation-threshold pair.
	        return _dype_ramp_mask(
	            periods=base_periods,
	            threshold_high_rotations=high_rotations,
	            threshold_low_rotations=low_rotations,
	            ref_axis_len=int(ref_axis_len),
	            angle_multiplier=float(angle_multiplier),
	        )

	    freq_base = float(angle_multiplier) / base_periods
	    freq_linear = float(angle_multiplier) / (base_periods * max(1.0, linear_s))
	    # Full-strength NTK warp: noise_time = lambda_s = lambda_t = 1.
	    ntk_periods = _dy_ntk_periods_for_scale(
	        periods=base_periods,
	        scale=max(1.0, ntk_s),
	        noise_time=torch.ones((), device=periods.device, dtype=torch.float32),
	        lambda_s=1.0,
	        lambda_t=1.0,
	    )
	    freq_ntk = float(angle_multiplier) / ntk_periods

	    beta_mask = _mask(
	        float(cfg.yarn_beta_0) ** kappa,
	        float(cfg.yarn_beta_1) ** kappa,
	    )
	    blended = freq_linear * (1.0 - beta_mask) + freq_ntk * beta_mask

	    gamma_mask = _mask(
	        float(cfg.yarn_gamma_0) ** kappa,
	        float(cfg.yarn_gamma_1) ** kappa,
	    )
	    blended = blended * (1.0 - gamma_mask) + freq_base * gamma_mask
	    return float(angle_multiplier) / blended


	class AxialRoPE2DDyPE(AxialRoPE2D):
	    """Axial RoPE variant whose periods depend on the diffusion noise time.

	    The instance copies the ``periods`` of an existing ``AxialRoPE2D`` and
	    rescales them per call according to ``dype_cfg.method``. The current noise
	    time is stored twice: as a non-persistent tensor buffer (used by the
	    Dy-NTK path) and as a Python float (used by the YaRN paths).
	    """

	    dype_cfg: AxialRoPE2DDyPEConfig
	    dype_noise_time: Tensor
	    dype_noise_time_values: list[float]

	    def __init__(self, *, base: AxialRoPE2D, cfg: AxialRoPE2DDyPEConfig) -> None:
	        """Wrap ``base`` with DyPE settings ``cfg``.

	        Raises:
	            TypeError: If ``base`` or ``cfg`` has the wrong type.
	            ValueError: If ``base`` does not use patch-index coordinates.
	        """
	        if not isinstance(base, AxialRoPE2D):
	            raise TypeError("base must be an AxialRoPE2D")
	        if not isinstance(cfg, AxialRoPE2DDyPEConfig):
	            raise TypeError("cfg must be an AxialRoPE2DDyPEConfig")
	        if base.cfg.coord_mode is not AxialRoPE2DCoordMode.PATCH_INDICES:
	            raise ValueError("DyPE requires patch-index axial RoPE coordinates")
	        super().__init__(head_dim=int(base.head_dim), cfg=base.cfg)
	        self.dype_cfg = cfg  # ty: ignore[unresolved-attribute]
	        # Noise time starts at 1.0 in both the tensor and the float mirror.
	        initial_time = torch.tensor(1.0, dtype=torch.float32)
	        self.register_buffer("dype_noise_time", initial_time, persistent=False)
	        self.dype_noise_time_values: list[float] = [1.0]
	        # Reuse the base module's periods instead of the freshly initialised ones.
	        source_periods = base.periods.detach().to(dtype=torch.float32)
	        with torch.no_grad():
	            self.periods.copy_(source_periods)

	    def set_dype_noise_time(self, noise_time: float) -> None:
	        """Store the normalized diffusion noise time; must lie in ``[0, 1]``.

	        Raises:
	            ValueError: If ``noise_time`` is not finite or outside ``[0, 1]``.
	        """

	        value = float(noise_time)
	        if not (math.isfinite(value) and 0.0 <= value <= 1.0):
	            raise ValueError("DyPE noise_time must be finite and within [0, 1]")
	        self.dype_noise_time.fill_(value)
	        self.dype_noise_time_values[0] = value

	    def _dype_axis_periods(
	        self,
	        *,
	        axis_len: int,
	        ref_axis_len: int,
	        global_scale: float,
	        lambda_s: float,
	    ) -> Tensor:
	        """Compute the periods of one axis for the configured DyPE method.

	        ``DY_NTK`` and ``DY_YARN`` use only the shared ``global_scale``;
	        ``VISION_YARN`` uses the per-axis scale for its linear component and
	        ``global_scale`` for its NTK component.

	        Raises:
	            ValueError: If the axis or global scale is not finite and > 0.
	            RuntimeError: If the configured method is not supported.
	        """

	        dype = self.dype_cfg
	        per_axis_scale = float(int(axis_len)) / float(int(ref_axis_len))
	        shared = float(global_scale)
	        scales_ok = (
	            math.isfinite(per_axis_scale)
	            and per_axis_scale > 0.0
	            and math.isfinite(shared)
	            and shared > 0.0
	        )
	        if not scales_ok:
	            raise ValueError("DyPE axis and global scales must be finite and > 0")

	        method = dype.method
	        if method == DyPERoPEMethod.DY_NTK:
	            return _dy_ntk_periods_for_scale(
	                periods=self.periods,
	                scale=shared,
	                noise_time=self.dype_noise_time,
	                lambda_s=float(lambda_s),
	                lambda_t=float(dype.lambda_t),
	            )
	        # The two YaRN flavours differ only in the linear scale they use.
	        if method == DyPERoPEMethod.VISION_YARN:
	            linear = per_axis_scale
	        elif method == DyPERoPEMethod.DY_YARN:
	            linear = shared
	        else:
	            raise RuntimeError(f"Unsupported DyPE method: {method}")
	        return _dy_yarn_periods_for_axis(
	            periods=self.periods,
	            linear_scale=linear,
	            ntk_scale=shared,
	            ref_axis_len=int(ref_axis_len),
	            noise_time=float(self.dype_noise_time_values[0]),
	            lambda_s=float(lambda_s),
	            cfg=dype,
	            angle_multiplier=float(self.cfg.angle_multiplier),
	        )

	    def forward(
	        self,
	        *,
	        H: int,
	        W: int,
	        scales: Tensor | None,
	    ) -> tuple[Tensor, Tensor]:
	        """Return ``(sin, cos)`` for the current noise time.

	        Both tensors are shaped ``[H * W, head_dim]``. For the YaRN methods
	        with ``yarn_attention_scale`` enabled and a global scale above 1,
	        both are multiplied by the attention scale when it exceeds 1.

	        Raises:
	            ValueError: If ``scales`` is given or ``H``/``W`` are not positive.
	            RuntimeError: On an unsupported layout/method or an unexpected
	                angles shape.
	        """

	        if scales is not None:
	            raise ValueError("DyPE axial RoPE does not support dilation scales")
	        if int(H) <= 0 or int(W) <= 0:
	            raise ValueError("H and W must be positive for DyPE axial RoPE")
	        dype = self.dype_cfg
	        device = self.periods.device
	        coords = _get_patch_index_coords(
	            int(H), int(W), device=device, offset=float(self.cfg.coord_offset)
	        )
	        scale_h = float(int(H)) / float(int(dype.ref_h_tokens))
	        scale_w = float(int(W)) / float(int(dype.ref_w_tokens))
	        # The larger of the two axis scales is shared by both axes.
	        global_scale = max(scale_h, scale_w)

	        per_axis = [
	            self._dype_axis_periods(
	                axis_len=int(axis_len),
	                ref_axis_len=int(ref_len),
	                global_scale=global_scale,
	                lambda_s=float(dype.lambda_s),
	            )
	            for axis_len, ref_len in (
	                (H, dype.ref_h_tokens),
	                (W, dype.ref_w_tokens),
	            )
	        ]
	        axis_periods = torch.stack(per_axis, dim=0)

	        angles = (
	            float(self.cfg.angle_multiplier)
	            * coords[:, :, None].to(dtype=torch.float32)
	            / axis_periods[None, :, :].to(dtype=torch.float32)
	        )
	        layout = self.cfg.dim_layout
	        if layout == AxialRoPE2DDimLayout.HALF_SPLIT:
	            angles = angles.flatten(1, 2).repeat(1, 2)
	        elif layout == AxialRoPE2DDimLayout.PAIR_INTERLEAVED:
	            angles = angles.repeat_interleave(2, dim=-1).flatten(1, 2)
	        else:
	            raise RuntimeError(f"Unsupported dim_layout: {layout}")

	        expected_shape = (int(H) * int(W), int(self.head_dim))
	        if angles.shape != expected_shape:
	            raise RuntimeError(
	                "Unexpected angles shape in DyPE axial RoPE: "
	                f"{tuple(angles.shape)} for expected {expected_shape}"
	            )
	        sin = torch.sin(angles)
	        cos = torch.cos(angles)

	        yarn_methods = (DyPERoPEMethod.VISION_YARN, DyPERoPEMethod.DY_YARN)
	        if (
	            dype.method in yarn_methods
	            and bool(dype.yarn_attention_scale)
	            and global_scale > 1.0
	        ):
	            method = dype.method
	            if method == DyPERoPEMethod.VISION_YARN:
	                # Interpolate from 1 towards the full scale by the dynamic exponent.
	                mscale_start = 0.1 * math.log(global_scale) + 1.0
	                kappa = _dype_dynamic_exponent(
	                    noise_time=float(self.dype_noise_time_values[0]),
	                    lambda_s=1.0,
	                    lambda_t=float(dype.lambda_t),
	                )
	                mscale = 1.0 + (mscale_start - 1.0) * kappa
	            elif method == DyPERoPEMethod.DY_YARN:
	                mscale = 1.0 + 0.1 * math.log(global_scale) / math.sqrt(
	                    global_scale
	                )
	            else:  # pragma: no cover - guarded above
	                raise RuntimeError(
	                    f"Unsupported YaRN attention scale: {method}"
	                )
	            if mscale > 1.0:
	                sin = sin * float(mscale)
	                cos = cos * float(mscale)
	        return sin, cos


	def build_axial_rope2d_dype(
	    *, base: AxialRoPE2D, cfg: AxialRoPE2DDyPEConfig
	) -> AxialRoPE2DDyPE:
	    """Create a DyPE axial RoPE from ``base`` and place it on ``base``'s device.

	    The target device is taken from ``base.periods``.
	    """

	    wrapper = AxialRoPE2DDyPE(base=base, cfg=cfg)
	    target_device = base.periods.device
	    return wrapper.to(device=target_device)


	def set_axial_rope2d_dype_noise_time(module: nn.Module, *, noise_time: float) -> bool:
	    """Propagate ``noise_time`` to every ``AxialRoPE2DDyPE`` within ``module``.

	    ``module`` itself and all of its descendants are visited.

	    Returns:
	        ``True`` if at least one DyPE module was updated, otherwise ``False``.
	    """

	    found_any = False
	    for submodule in module.modules():
	        if not isinstance(submodule, AxialRoPE2DDyPE):
	            continue
	        submodule.set_dype_noise_time(float(noise_time))
	        found_any = True
	    return found_any