bounty

final working rotary fix and removed image cache waste

505474b 7 months ago

1.9 kB

	import torch
	import torch.nn as nn

	# Module-level default device: CUDA when torch reports it available,
	# otherwise the CPU.
	if torch.cuda.is_available():
	    device = torch.device('cuda')
	else:
	    device = torch.device('cpu')

	class RotaryEmbedding(nn.Module):
	    """Partial rotary position embedding backed by precomputed cos/sin tables.

	    Only the first ``head_dim // 2`` channels of the last dimension are
	    rotated; every channel after that is left untouched. The tables are
	    float32, have shape ``[max_seq_len, rot_dim // 2]`` and are registered as
	    non-persistent buffers (they follow ``.to()`` but are not saved in the
	    ``state_dict``).

	    Args:
	        head_dim: Size of the last dimension of the tensors to rotate.
	        max_seq_len: Number of positions to precompute (valid position ids
	            are ``0 .. max_seq_len - 1``).
	        theta: Base of the geometric frequency progression.

	    Raises:
	        ValueError: If ``head_dim // 2`` is odd. The rotated channels are
	            consumed as (real, imaginary) pairs, so an odd count can never
	            be applied.
	    """

	    def __init__(self, head_dim: int, max_seq_len: int, theta: float = 10000.0):
	        super().__init__()
	        # Only half of head_dim is rotated.
	        self.rot_dim = head_dim // 2
	        if self.rot_dim % 2:
	            # Fail fast: with an odd rot_dim the tables get one more column
	            # than `apply` slices out of x, which otherwise only surfaces
	            # later as an opaque shape error.
	            raise ValueError(
	                f"head_dim // 2 must be even to form rotation pairs, "
	                f"got head_dim={head_dim} (rot_dim={self.rot_dim})"
	            )

	        # One inverse frequency per rotated pair: theta ** (-2k / rot_dim).
	        exponents = torch.arange(0, self.rot_dim, 2).float() / self.rot_dim
	        inv_freq = 1.0 / (theta ** exponents)
	        positions = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
	        angles = positions * inv_freq.unsqueeze(0)  # [max_seq_len, rot_dim // 2]

	        # Kept as a complex exponential (rather than torch.cos / torch.sin) so
	        # the table values stay identical to the previous implementation.
	        freqs_cis = torch.exp(1j * angles)

	        # .real / .imag are strided views into the same complex storage;
	        # store compact, independent copies instead.
	        self.register_buffer('cos_cache', freqs_cis.real.contiguous(), persistent=False)
	        self.register_buffer('sin_cache', freqs_cis.imag.contiguous(), persistent=False)

	    def apply(self, x: torch.Tensor, position_ids: "torch.Tensor | None" = None) -> "torch.Tensor | RotaryEmbedding":
	        """Rotate the first ``rot_dim`` channels of ``x`` in place and return ``x``.

	        Works with ``[..., heads, seq_len, head_dim]`` - no permutation needed.

	        The rotated slice is read as two halves (real parts first, imaginary
	        parts second) and written back interleaved as
	        ``[r0, i0, r1, i1, ...]``. NOTE(review): this asymmetric layout is
	        preserved from the original code, which says it matches a reference
	        implementation - confirm before changing it.

	        This method's name shadows ``nn.Module.apply(fn)``. To keep recursive
	        calls such as ``parent.apply(init_fn)`` working, a call with a single
	        callable argument is forwarded to ``nn.Module.apply``.

	        Args:
	            x: Tensor whose last dimension holds at least ``rot_dim`` channels.
	                It is modified in place.
	            position_ids: Integer indices into the precomputed tables; the
	                broadcast below expects a 1-D ``[seq_len]`` tensor.

	        Returns:
	            ``x`` itself after the in-place rotation (or ``self`` when
	            forwarding to ``nn.Module.apply``).

	        Raises:
	            TypeError: If ``position_ids`` is omitted and ``x`` is not callable.
	        """
	        if position_ids is None:
	            if callable(x):
	                # nn.Module.apply(fn) semantics: run fn on this module.
	                return super().apply(x)
	            raise TypeError("apply() missing required argument: 'position_ids'")

	        half = self.rot_dim // 2

	        # Gather per-position tables ([seq_len, half]) and add two leading
	        # singleton dims so they broadcast over the batch/head dims of x.
	        cos = self.cos_cache[position_ids].unsqueeze(0).unsqueeze(0)
	        sin = self.sin_cache[position_ids].unsqueeze(0).unsqueeze(0)

	        # Split the rotated part into its real and imaginary halves.
	        x_re = x[..., :half]
	        x_im = x[..., half:half * 2]

	        # Complex multiply (a + bi)(cos + i sin). Both results are fresh
	        # tensors, so the in-place write below cannot corrupt its own inputs.
	        out_re = x_re * cos - x_im * sin
	        out_im = x_re * sin + x_im * cos

	        # Interleave (re, im) pairs and write them over the rotated channels.
	        x[..., :self.rot_dim] = torch.stack([out_re, out_im], dim=-1).view(*x.shape[:-1], self.rot_dim)

	        return x