BucketOfFish committed
Commit 3c52426 · 1 Parent(s): 5e8c4af

Simplified rotary embedding

Files changed (1):
  1. modeling_phi.py +88 -206
modeling_phi.py CHANGED
@@ -72,211 +72,106 @@ class Embedding(nn.Module):
          return hidden_states
 
 
- def _apply_rotary_emb(
-     x: torch.FloatTensor,
-     cos: torch.FloatTensor,
-     sin: torch.FloatTensor,
- ) -> torch.FloatTensor:
-     _, seqlen, _, _ = x.shape
-     _, rotary_dim = cos.shape
-     rotary_dim *= 2
-
-     x_rot = x[:, :, :, :rotary_dim]
-     x_pass = x[:, :, :, rotary_dim:]
-
-     x1, x2 = x_rot.chunk(2, dim=-1)
-     c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
-     x1, x2, c, s = [t.to(dtype=torch.float32) for t in [x1, x2, c, s]]
-
-     x_rot = torch.cat([x1 * c - x2 * s, x1 * s + x2 * c], axis=-1).to(x.dtype)
-
-     return torch.cat([x_rot, x_pass], axis=-1)
-
-
- def _apply_rotary_emb_kv(
-     kv: torch.FloatTensor,
-     cos: torch.FloatTensor,
-     sin: torch.FloatTensor,
-     cos_k: Optional[torch.FloatTensor] = None,
-     sin_k: Optional[torch.FloatTensor] = None,
- ) -> torch.FloatTensor:
-     _, seqlen, _, _, _ = kv.shape
-     _, rotary_dim = cos.shape
-     rotary_dim *= 2
-
-     k_rot = kv[:, :, 0, :, :rotary_dim]
-     k_pass = kv[:, :, 0, :, rotary_dim:]
-
-     k1, k2 = k_rot.chunk(2, dim=-1)
-     c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
-     k1, k2, c, s = [t.to(dtype=torch.float32) for t in [k1, k2, c, s]]
-
-     k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(kv.dtype)
-
-     return torch.cat(
-         [
-             torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2),
-             kv[:, :, 1:2, :, :],
-         ],
-         axis=2,
-     )
-
-
- def _apply_rotary_emb_qkv(
-     qkv: torch.FloatTensor,
-     cos: torch.FloatTensor,
-     sin: torch.FloatTensor,
-     cos_k: Optional[torch.FloatTensor] = None,
-     sin_k: Optional[torch.FloatTensor] = None,
- ) -> torch.FloatTensor:
-     _, seqlen, _, _, _ = qkv.shape
-     _, rotary_dim = cos.shape
-     rotary_dim *= 2
-
-     q_rot = qkv[:, :, 0, :, :rotary_dim]
-     q_pass = qkv[:, :, 0, :, rotary_dim:]
-
-     k_rot = qkv[:, :, 1, :, :rotary_dim]
-     k_pass = qkv[:, :, 1, :, rotary_dim:]
-
-     q1, q2 = q_rot.chunk(2, dim=-1)
-     k1, k2 = k_rot.chunk(2, dim=-1)
-     c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
-     q1, q2, k1, k2, c, s = [t.to(dtype=torch.float32) for t in [q1, q2, k1, k2, c, s]]
-
-     q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], axis=-1).to(qkv.dtype)
-     k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(qkv.dtype)
-
-     return torch.cat(
-         [
-             torch.cat([q_rot, q_pass], axis=-1).unsqueeze(2),
-             torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2),
-             qkv[:, :, 2:3, :, :],
-         ],
-         axis=2,
-     )
-
-
  class RotaryEmbedding(nn.Module):
-     """Rotary positional embedding (RoPE).
-
-     Reference:
-         RoFormer: Enhanced Transformer with Rotary Position Embedding.
-         https://arxiv.org/pdf/2104.09864.pdf.
-
+     """Rotary positional embedding (RoPE) from Phi2.
+     See https://www.youtube.com/watch?v=C6rV8BsrrCc
      """
 
      def __init__(
          self,
-         dim: int,
-         base: int = 10000,
-         scale_base: Optional[float] = None,
-         pos_idx_in_fp32: bool = True,
-         max_position_embeddings: int = 2048,
-         device: Optional[str] = None,
-         **kwargs,
+         d_rotary: int,
+         rotary_base: float = 10000.0,
+         initial_cos_sin_cache_len: int = 2048,
+         device: torch.device | None = None,
      ) -> None:
          super().__init__()
-
-         if scale_base is not None:
-             raise NotImplementedError
-
-         self.dim = dim
-         self.base = float(base)
-         self.scale_base = scale_base
-         self.pos_idx_in_fp32 = pos_idx_in_fp32
-         self.max_position_embeddings = max_position_embeddings
+         self.d_rotary = d_rotary
+         self.rotary_base = rotary_base
          self.device = device
-
-         # Generate and save the inverse frequency buffer (non-trainable)
-         inv_freq = self._compute_inv_freq(device)
-         self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-         # Generate and save the scale buffer (non-trainable)
-         scale = (
-             (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
-             if scale_base is not None
-             else None
+         self.dtype = torch.float32
+         self._update_cos_sin_cache(seqlen=initial_cos_sin_cache_len)
+
+     def _update_cos_sin_cache(self, seqlen: int) -> None:
+         # only call this function when seqlen is larger than _max_seqlen
+         self._max_seqlen = seqlen
+
+         # m * theta_i = m * base^(-2i/d) = m * (1 / base^(2i/d)), where i in [1, d/2]
+         m = torch.arange(
+             seqlen,
+             device=self.device,
+             dtype=self.dtype,
          )
-         self.register_buffer("scale", scale, persistent=False)
-
-         # Initialize cached attributes since ONNX can't rely on dynamic initialization
-         self._update_cos_sin_cache(max_position_embeddings, device=device, dtype=torch.float32)
-
-     def _compute_inv_freq(self, device: Optional[str] = None) -> torch.FloatTensor:
-         return 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
-
-     def _update_cos_sin_cache(
-         self,
-         seqlen: int,
-         device: Optional[str] = None,
-         dtype: Optional[torch.dtype] = None,
-     ) -> None:
-         self._seq_len_cached = seqlen
-
-         # fp32 is preferred since the output of `torch.arange` can be quite large
-         # and bf16 would lose a lot of precision
-         if self.pos_idx_in_fp32:
-             t = torch.arange(seqlen, device=device, dtype=torch.float32)
-             if self.inv_freq.dtype != torch.float32:
-                 inv_freq = self._compute_inv_freq(device=device)
-             else:
-                 inv_freq = self.inv_freq
-         else:
-             t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-             inv_freq = self.inv_freq
-
-         # `torch.outer` is preferred since `torch.einsum` converts from fp32 to fp16 if used with AMP
-         freqs = torch.outer(t, inv_freq)
-         if self.scale is None:
-             self._cos_cached = torch.cos(freqs).to(dtype)
-             self._sin_cached = torch.sin(freqs).to(dtype)
-         else:
+         theta_i = 1.0 / (
+             self.rotary_base ** (
+                 torch.arange(
+                     start=0,
+                     end=self.d_rotary,
+                     step=2,
+                     device=self.device,
+                     dtype=self.dtype,
+                 ) / self.d_rotary
+             )
+         )
+         # torch.outer, since torch.einsum converts from fp32 to fp16 if used with torch.amp
+         # TODO: does this matter if I'm disabling torch.autocast?
+         m_theta_i = torch.outer(m, theta_i)
+         self._cos_cached = torch.cos(m_theta_i).to(self.dtype)
+         self._sin_cached = torch.sin(m_theta_i).to(self.dtype)
+
+         # TODO: scale_base caching is labelled as not yet done in Phi2
+         """
+         if scale_base is not None:
+             scale = (
+                 torch.arange(
+                     start=0,
+                     end=self.d_rotary,
+                     step=2,
+                     device=self.device,
+                     dtype=torch.float32,
+                 ) + 0.4 * self.d_rotary
+             ) / (1.4 * self.d_rotary)
              power = (
-                 torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
-             ) / self.scale_base
-             scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
-
-             # Force the scale multiplication to happen in fp32
-             self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
-             self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
-             self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
-             self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
+                 torch.arange(seqlen, dtype=scale.dtype, device=scale.device) - seqlen // 2
+             ) / scale_base
+             scale = scale.to(device=power.device) ** rearrange(power, "s -> s 1")
+             self._cos_cached = (torch.cos(m_theta_i) * scale).to(dtype)
+             self._sin_cached = (torch.sin(m_theta_i) * scale).to(dtype)
+         """
+
+     def _apply_rotary_emb_qkv(
+         self,
+         x: torch.FloatTensor,  # dim: (batch_size, seqlen, Optional[n_qkv], n_heads, d_head)
+         cos: torch.FloatTensor,  # dim: (_max_seqlen, d_rotary)
+         sin: torch.FloatTensor,  # dim: (_max_seqlen, d_rotary)
+     ) -> torch.FloatTensor:
+         seqlen = x.shape[1]
+         x1, x2 = x.chunk(2, dim=-1)  # dim: (batch_size, seqlen, Optional[n_qkv], n_heads, d_head/2)
+         broadcast_rearrange = "s d -> s 1 d" if x1.ndim == 4 else "s d -> s 1 1 d"
+         c, s = rearrange(cos[:seqlen], broadcast_rearrange), rearrange(sin[:seqlen], broadcast_rearrange)
+         x1, x2, c, s = [t.to(dtype=torch.float32) for t in [x1, x2, c, s]]  # make sure rotary embedding is in float32
+         return cast(
+             torch.FloatTensor,
+             torch.cat([x1 * c - x2 * s, x1 * s + x2 * c], dim=-1).to(x.dtype)
+         )
 
      def forward(
          self,
-         qkv: torch.Tensor,
-         kv: Optional[torch.Tensor] = None,
-         seqlen_offset: int = 0,
-         **kwargs,
-     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         x: torch.FloatTensor,  # dim: (batch_size, seqlen, Optional[n_qkv], n_heads, d_head)
+         seqlen_offset: int = 0,  # each sequence is shifted by this amount - used in inference with KV cache
+     ) -> torch.FloatTensor:
          if (
-             self._seq_len_cached < qkv.shape[1] + seqlen_offset
-             or self._cos_cached.device != qkv.device
-             or self._cos_cached.dtype != qkv.dtype
+             not self._max_seqlen
+             or self._max_seqlen < x.shape[1] + seqlen_offset
+             or self._cos_cached.device != x.device
+             or self._cos_cached.dtype != x.dtype
              or (self.training and self._cos_cached.is_inference())
          ):
-             self._update_cos_sin_cache(qkv.shape[1] + seqlen_offset, device=qkv.device, dtype=qkv.dtype)
-
-         if kv is None:
-             return _apply_rotary_emb_qkv(
-                 qkv,
-                 self._cos_cached[seqlen_offset:],
-                 self._sin_cached[seqlen_offset:],
-             )
-         else:
-             q = _apply_rotary_emb(
-                 qkv,
-                 self._cos_cached[seqlen_offset:],
-                 self._sin_cached[seqlen_offset:],
-             )
-             kv = _apply_rotary_emb_kv(
-                 kv,
-                 self._cos_cached[seqlen_offset:],
-                 self._sin_cached[seqlen_offset:],
-             )
-
-             return q, kv
+             self._update_cos_sin_cache(seqlen=x.shape[1] + seqlen_offset)
+         return self._apply_rotary_emb_qkv(
+             x,
+             cast(torch.FloatTensor, self._cos_cached[seqlen_offset:]),
+             cast(torch.FloatTensor, self._sin_cached[seqlen_offset:]),
+         )
 
 
  class MLP(nn.Module):
@@ -519,23 +414,10 @@ class MHA(nn.Module):
          super().__init__()
 
          # Rotary embedding
-         self.rotary_dim = rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0)
-         if self.rotary_dim > 0:
-             rotary_cls = RotaryEmbedding
-             if rotary_cls is None:
-                 rotary_cls = RotaryEmbedding
-
-             rotary_kwargs = {}
-             if rotary_cls is RotaryEmbedding:
-                 rotary_kwargs["max_position_embeddings"] = config.n_positions
-
-             self.rotary_emb = rotary_cls(
-                 self.rotary_dim,
-                 base=rotary_base,
-                 scale_base=rotary_scale_base,
-                 device=device,
-                 **rotary_kwargs,
-             )
+         self.rotary_emb = RotaryEmbedding(
+             d_rotary=math.ceil((rotary_dim // n_head) / 2),  # d_rotary is half of d_head
+             initial_cos_sin_cache_len=config.n_positions,
+         )
 
          # MLP
          self.n_head, self.n_head_kv, self.head_dim = _find_mha_dims(
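For reference, the rotation that the new `_apply_rotary_emb_qkv` performs is the standard half-split RoPE: position m rotates each pair (x1_i, x2_i) by angle m * theta_i, computed as `[x1 * c - x2 * s, x1 * s + x2 * c]`. Below is a minimal, self-contained sketch of that math (illustrative names, not part of the commit; it assumes the cos/sin tables cover the full head dimension), cross-checked against the equivalent complex-number rotation (x1 + i*x2) * e^(i*m*theta):

import torch

def rope_cache(seqlen: int, d_head: int, base: float = 10000.0):
    # theta_i = base^(-2i/d_head); one angle per rotated pair, so d_head/2 columns
    theta_i = 1.0 / (base ** (torch.arange(0, d_head, 2, dtype=torch.float32) / d_head))
    m = torch.arange(seqlen, dtype=torch.float32)  # position indices
    m_theta_i = torch.outer(m, theta_i)            # (seqlen, d_head/2)
    return torch.cos(m_theta_i), torch.sin(m_theta_i), m_theta_i

def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: (batch, seqlen, n_heads, d_head), split into halves as in the commit
    x1, x2 = x.chunk(2, dim=-1)
    c = cos[: x.shape[1]].unsqueeze(1)  # (seqlen, 1, d_head/2), broadcasts over heads
    s = sin[: x.shape[1]].unsqueeze(1)
    return torch.cat([x1 * c - x2 * s, x1 * s + x2 * c], dim=-1)

batch, seqlen, n_heads, d_head = 2, 16, 4, 8
x = torch.randn(batch, seqlen, n_heads, d_head)
cos, sin, m_theta_i = rope_cache(seqlen, d_head)
out = apply_rope(x, cos, sin)

# Same rotation written as (x1 + i*x2) * e^(i*m*theta_i)
x1, x2 = x.chunk(2, dim=-1)
z = torch.complex(x1, x2) * torch.polar(torch.ones_like(m_theta_i), m_theta_i).unsqueeze(1)
assert torch.allclose(out, torch.cat([z.real, z.imag], dim=-1), atol=1e-5)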
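Continuing the sketch above: the `seqlen_offset` argument in the new `forward` exists so that a token decoded incrementally against a KV cache is still rotated at its absolute position, which is what slicing the cached tables with `self._cos_cached[seqlen_offset:]` achieves. A quick check with the hypothetical helpers from the previous sketch:

# One decode step: the token at absolute position seqlen-1, rotated via an offset slice,
# matches the same token rotated inside the full sequence
prefix_len = seqlen - 1
new_tok = x[:, prefix_len:]  # (batch, 1, n_heads, d_head)
stepwise = apply_rope(new_tok, cos[prefix_len:], sin[prefix_len:])
assert torch.allclose(out[:, prefix_len:], stepwise, atol=1e-6)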