WJAD / src /wjad /modules /pos_encoding.py

Sync WJAD codebase

0cfefd2 verified 7 days ago

8.1 kB

	"""3D RoPE（仅作用于视觉 token）。

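	The 12 attention heads are split 4+4+4 into three groups:
	- Heads 0-3: ray RoPE, encoding the unit ray direction ``(dx, dy, dz)`` in the
	  ego-vehicle frame.
	- Heads 4-7: H/W/T RoPE, encoding the normalized space-time index
	  ``(h_norm, w_norm, t_norm)``.
	- Heads 8-11: zero-frequency RoPE, cos=1 / sin=0, so the rotation matrix is
	  always the identity.

	To avoid branching, all 12 heads go through the same RoPE operator (no
	if/else); the zero-frequency heads simply become an identity mapping.

	``head_dim=64`` is cut into 32 (cos, sin) rotation pairs (adjacent channels are
	rotated two by two). Inside the ray and H/W/T groups the 32 pairs are divided
	between the 3 components (dx, dy, dz or h, w, t): each component gets
	32 // 3 = 10 pairs and the 2 leftover pairs are added to the last component,
	i.e. the split is 10 / 10 / 12 (the leftover pairs are NOT zero-frequency).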
	"""

	from __future__ import annotations

	import torch
	import torch.nn as nn


	def _split_head_dim_for_components(half: int, num_components: int) -> list[int]:
	    """Distribute ``half`` rotation pairs across ``num_components`` components.

	    Each component receives ``half // num_components`` pairs and the whole
	    remainder is added to the last component, so the counts always sum to
	    ``half`` (e.g. ``half=32, num_components=3`` gives ``[10, 10, 12]``).

	    ``num_components == 0`` denotes a zero-frequency head group; in that case
	    the two-element list ``[0, half]`` is returned, whose last entry is the
	    number of pairs that are meant to keep frequency 0.
	    """
	    if num_components == 0:
	        return [0, half]
	    base, remainder = divmod(half, num_components)
	    splits = [base] * num_components
	    splits[-1] += remainder  # the whole remainder goes to the last component
	    return splits


	def build_rope_freqs(
	    rays: torch.Tensor,
	    hwt_grid: torch.Tensor,
	    num_heads: int = 12,
	    head_dim: int = 64,
	    rope_theta: float = 10000.0,
	    device: torch.device | None = None,
	    dtype: torch.dtype = torch.float32,
	) -> tuple[torch.Tensor, torch.Tensor]:
	    """Build the cos / sin tables of the 3D RoPE.

	    The heads are split into three equal groups: the first group rotates by
	    angles derived from ``rays``, the second by angles derived from
	    ``hwt_grid``, and the third keeps angle 0 (cos=1, sin=0, i.e. identity).
	    Within a group every head shares the same angles. The ``head_dim // 2``
	    rotation pairs of a head are divided between the 3 coordinate components
	    by ``_split_head_dim_for_components``; for each component the pair ``i``
	    uses the angle ``value * rope_theta ** (-2 * i / head_dim)``, with ``i``
	    restarting at 0 for every component.

	    Parameters
	    ----------
	    rays : Tensor, shape ``[B, N_v, 3]``
	        Per visual token ray direction ``(dx, dy, dz)``.
	    hwt_grid : Tensor, shape ``[B, N_v, 3]``
	        Space-time coordinates ``(h_norm, w_norm, t_norm)``.
	    num_heads : int
	        Total number of heads; must be divisible by 3.
	    head_dim : int
	        Per-head dimension; must be even.
	    rope_theta : float
	        Base of the geometric frequency progression.
	    device : torch.device or None
	        Device of the returned tables; defaults to ``rays.device``. Both
	        inputs are moved to this device before use.
	    dtype : torch.dtype
	        dtype of the returned tables. For dtypes other than float32/float64
	        the angles are computed in float32 and only the final cos / sin are
	        cast, to avoid losing precision in fp16 / bf16.

	    Returns
	    -------
	    cos, sin : Tensor, shape ``[B, N_v, num_heads, head_dim // 2]``
	        cos / sin of every rotation pair, laid out as ``apply_rope`` expects.
	    """
	    # Runtime messages are kept verbatim: "head_dim must be even" and
	    # "num_heads must be divisible by 3 for the 4+4+4 grouping".
	    assert head_dim % 2 == 0, "head_dim 必须为偶数"
	    assert num_heads % 3 == 0, "num_heads 需被 3 整除以便 4+4+4 分组"

	    half = head_dim // 2
	    heads_per_group = num_heads // 3
	    bsz, n_v, _ = rays.shape
	    if device is None:
	        device = rays.device

	    # Trigonometry in half precision is needlessly lossy; compute in at least
	    # float32 and cast the final tables back to the requested dtype.
	    compute_dtype = dtype if dtype in (torch.float32, torch.float64) else torch.float32

	    # Both coordinate groups (rays and hwt) have 3 components and therefore
	    # share the same split of the rotation pairs.
	    splits = _split_head_dim_for_components(half, 3)

	    # Classic RoPE frequencies theta_i = base ** (-2i / d). Every component
	    # uses a prefix of this vector, so it is computed only once.
	    idx = torch.arange(max(splits), device=device, dtype=compute_dtype)
	    freqs = rope_theta ** (-2.0 * idx / head_dim)

	    # Zero-initialised: the third head group is never written and thus keeps
	    # angle 0, which yields cos=1 / sin=0 (identity rotation).
	    angles = torch.zeros(bsz, n_v, num_heads, half, device=device, dtype=compute_dtype)

	    for group, coords in enumerate((rays, hwt_grid)):
	        # Honour the requested device / compute dtype for the inputs as well.
	        coords = coords.to(device=device, dtype=compute_dtype)
	        head_start = group * heads_per_group
	        head_end = head_start + heads_per_group
	        cursor = 0
	        for c, n_pairs in enumerate(splits):
	            if n_pairs > 0:
	                # [B, N_v, 1] * [n_pairs] -> [B, N_v, n_pairs]
	                ang = coords[..., c : c + 1] * freqs[:n_pairs]
	                # Broadcast the same angles to every head of the group.
	                angles[:, :, head_start:head_end, cursor : cursor + n_pairs] = ang.unsqueeze(2)
	            cursor += n_pairs

	    cos = torch.cos(angles).to(dtype)
	    sin = torch.sin(angles).to(dtype)
	    return cos, sin


	def apply_rope(
	    q: torch.Tensor,
	    k: torch.Tensor,
	    cos: torch.Tensor,
	    sin: torch.Tensor,
	) -> tuple[torch.Tensor, torch.Tensor]:
	    """Rotate ``q`` and ``k`` with the given RoPE cos / sin tables.

	    Adjacent channels ``(2i, 2i + 1)`` of the last dimension form one rotation
	    pair and are rotated by the angle whose cosine / sine is stored at index
	    ``i`` of the tables. Every head runs through the same code; a head whose
	    table holds cos=1 / sin=0 is left unchanged.

	    Parameters
	    ----------
	    q, k : Tensor, shape ``[B, H, N_v, head_dim]``
	    cos, sin : Tensor, shape ``[B, N_v, H, head_dim // 2]``

	    Returns
	    -------
	    The rotated ``q`` and ``k``, with the same shape and dtype as the inputs.
	    """
	    # Bring the tables to the [B, H, N_v, half] layout of q / k.
	    cos_t = cos.permute(0, 2, 1, 3)
	    sin_t = sin.permute(0, 2, 1, 3)

	    def _rotate(x: torch.Tensor) -> torch.Tensor:
	        # Split the channel axis into the two members of every rotation pair.
	        first = x[..., 0::2]
	        second = x[..., 1::2]
	        rotated = torch.empty_like(x)
	        # Standard 2D rotation, written back into the interleaved layout.
	        rotated[..., 0::2] = first * cos_t - second * sin_t
	        rotated[..., 1::2] = first * sin_t + second * cos_t
	        return rotated

	    return _rotate(q), _rotate(k)


	class RoPE3D(nn.Module):
	    """3D RoPE helper: caches the H/W/T grid and computes tables from rays.

	    The normalized ``(h, w, t)`` coordinate of every visual token is built
	    once in the constructor and stored as a non-persistent buffer; only the
	    ray directions have to be supplied per call.

	    Usage:
	        rope = RoPE3D(num_heads=12, head_dim=64,
	                      time_size=4, height_size=12, width_size=32)
	        cos, sin = rope.compute_freqs(rays)  # rays: [B, N_v, 3]
	        q, k = apply_rope(q_visual_only, k_visual_only, cos, sin)
	    """

	    def __init__(
	        self,
	        num_heads: int = 12,
	        head_dim: int = 64,
	        time_size: int = 4,
	        height_size: int = 12,
	        width_size: int = 32,
	        rope_theta: float = 10000.0,
	    ) -> None:
	        super().__init__()
	        self.num_heads = num_heads
	        self.head_dim = head_dim
	        self.rope_theta = rope_theta
	        self.T = time_size
	        self.H = height_size
	        self.W = width_size

	        def _axis(steps: int) -> torch.Tensor:
	            # Evenly spaced coordinates in [-1, 1]; an axis that does not
	            # have more than one step collapses to the single value 0.
	            if steps > 1:
	                return torch.linspace(-1.0, 1.0, steps=steps)
	            return torch.zeros(1)

	        # Tokens are enumerated t -> h -> w (w varies fastest). The original
	        # author notes this matches the flattening order of the Conv3D
	        # output -- confirm against the patch embedding if that changes.
	        grid_t, grid_h, grid_w = torch.meshgrid(
	            _axis(time_size), _axis(height_size), _axis(width_size), indexing="ij"
	        )
	        # Components are stored in (h, w, t) order; flattened to [N_v, 3].
	        coords = torch.stack((grid_h, grid_w, grid_t), dim=-1)
	        self.register_buffer("hwt_grid", coords.reshape(-1, 3), persistent=False)

	    @property
	    def num_visual_tokens(self) -> int:
	        """Number of visual tokens, ``T * H * W``."""
	        return self.T * self.H * self.W

	    def compute_freqs(self, rays: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
	        """Compute the cos / sin tables for the given per-token ray directions.

	        ``rays`` has shape ``[B, N_v, 3]``. The cached grid is broadcast over
	        the batch dimension and the tables are requested in ``rays.dtype``.
	        """
	        batch_size = rays.shape[0]
	        # expand() creates a broadcast view, no copy: [N_v, 3] -> [B, N_v, 3]
	        batched_grid = self.hwt_grid[None].expand(batch_size, -1, -1)
	        return build_rope_freqs(
	            rays=rays,
	            hwt_grid=batched_grid,
	            num_heads=self.num_heads,
	            head_dim=self.head_dim,
	            rope_theta=self.rope_theta,
	            dtype=rays.dtype,
	        )