| import torch |
|
|
|
|
class LlamaRotaryEmbedding(torch.nn.Module):
    """Rotary position embedding (RoPE) with a lazily grown cos/sin cache.

    Precomputes cos/sin tables for positions up to ``max_position_embeddings``
    and transparently extends the cache whenever a longer sequence is seen.
    """

    def __init__(self, dim, max_position_embeddings, base=10000, device=None):
        """
        Args:
            dim: rotary dimension (head dimension being rotated).
            max_position_embeddings: initial number of positions to cache.
            base: frequency base of the geometric progression.
            device: device on which to build the inverse-frequency table.
        """
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # theta_i = base^(-2i/dim) for i in [0, dim/2)
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build the initial cache eagerly so typical forwards never rebuild it.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cached cos/sin tables for ``seq_len`` positions."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        # Outer product: freqs[p, i] = p * inv_freq[i]
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Each frequency appears twice; this permutation differs from the paper
        # but yields the same rotation when paired with rotate_half.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return ``(cos, sin)`` tables of shape [1, 1, seq_len, dim] in x's dtype.

        Args:
            x: tensor whose device/dtype the tables should match; assumed to be
               [batch, heads, seq, head_dim] when ``seq_len`` is omitted —
               TODO confirm against callers.
            seq_len: number of positions needed; defaults to ``x.shape[-2]``.
        """
        # Fix: the original raised TypeError (`None > int`) when called with
        # the declared default seq_len=None; infer it from x instead.
        if seq_len is None:
            seq_len = x.shape[-2]
        # Grow the cache when a longer sequence than ever seen arrives.
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )
|
|
| |
class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, dim, max_position_embeddings, base=10000, device=None, scaling_factor=1.0):
        # Stash the factor first: the parent __init__ builds the cache and
        # will invoke our overridden _set_cos_sin_cache, which reads it.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the cos/sin cache with positions compressed by scaling_factor."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        positions = positions / self.scaling_factor

        angles = torch.outer(positions, self.inv_freq)
        # Duplicate the half-dim frequencies so each rotated channel pair
        # shares an angle (permutation differs from the paper, same result).
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos()[None, None, :, :].to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin()[None, None, :, :].to(dtype), persistent=False)
|
|
|
|
class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, dim, max_position_embeddings, base=10000, device=None, scaling_factor=1.0):
        # Must be set before super().__init__, which triggers _set_cos_sin_cache.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the cache, rescaling the frequency base (NTK-aware) when
        ``seq_len`` exceeds the trained context length."""
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            # NTK-aware interpolation: stretch the base so low frequencies
            # cover the longer context while high frequencies (local
            # positions) change only slightly.
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
        else:
            # Fix: restore the unscaled base. The original only updated
            # inv_freq inside the `if`, so a previously enlarged base stuck
            # around if the cache was ever rebuilt at a short length.
            base = self.base
        inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        # Outer product of positions and inverse frequencies.
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Each frequency appears twice; permutation differs from the paper but
        # yields the same rotation when paired with rotate_half.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
| |
| |
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    # (x1, x2) -> (-x2, x1): a 90-degree rotation of each channel pair.
    return torch.cat((-back, front), dim=-1)
|
|
|
|
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """Apply rotary position embedding to query and key tensors.

    ``cos``/``sin`` are the cached tables of shape [1, 1, seq_len, dim];
    ``position_ids`` selects the per-token rows, and the gathered tables
    broadcast over the heads axis of ``q`` and ``k``.
    """
    # [1, 1, seq, dim] -> [seq, dim] -> gather rows -> [bs, 1, seq, dim]
    cos = cos.squeeze(1).squeeze(0)[position_ids].unsqueeze(1)
    sin = sin.squeeze(1).squeeze(0)[position_ids].unsqueeze(1)

    def _rotate(t):
        # Standard RoPE formula: t*cos + rotate_half(t)*sin.
        return (t * cos) + (rotate_half(t) * sin)

    return _rotate(q), _rotate(k)