AbstractPhil
/

tiny-flux

@@ -78,23 +78,26 @@ class RotaryEmbedding(nn.Module):
         self.axes_dims = axes_dims  # (temporal, height, width)
         self.theta = theta
-    def forward(self, ids: torch.Tensor) -> torch.Tensor:
         """
         ids: (B, N, 3) - temporal, height, width indices
         Returns: (B, N, dim) rotary embeddings
         """
         B, N, _ = ids.shape
         device = ids.device
-        dtype = torch.float32
         embeddings = []
         dim_offset = 0
         for axis_idx, axis_dim in enumerate(self.axes_dims):
             # Compute frequencies for this axis
-            freqs = 1.0 / (self.theta ** (torch.arange(0, axis_dim, 2, device=device, dtype=dtype) / axis_dim))
             # Get positions for this axis
-            positions = ids[:, :, axis_idx].float()  # (B, N)
             # Outer product: (B, N) x (axis_dim/2) -> (B, N, axis_dim/2)
             angles = positions.unsqueeze(-1) * freqs.unsqueeze(0).unsqueeze(0)
             # Interleave sin/cos
@@ -103,7 +106,8 @@ class RotaryEmbedding(nn.Module):
             embeddings.append(emb)
             dim_offset += axis_dim
-        return torch.cat(embeddings, dim=-1)  # (B, N, dim)
 def apply_rope(x: torch.Tensor, rope: torch.Tensor) -> torch.Tensor:
@@ -111,7 +115,9 @@ def apply_rope(x: torch.Tensor, rope: torch.Tensor) -> torch.Tensor:
     # x: (B, heads, N, head_dim)
     # rope: (B, N, head_dim)
     B, H, N, D = x.shape
-    rope = rope.unsqueeze(1)  # (B, 1, N, D)
     # Split into pairs
     x_pairs = x.reshape(B, H, N, D // 2, 2)
@@ -221,6 +227,11 @@ class Attention(nn.Module):
         mask: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
         B, N, _ = x.shape
         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
         q, k, v = qkv.permute(2, 0, 3, 1, 4)  # 3 x (B, heads, N, head_dim)
@@ -263,6 +274,12 @@ class JointAttention(nn.Module):
         B, L, _ = txt.shape
         _, N, _ = img.shape
         # Compute Q, K, V for both streams
         txt_qkv = self.txt_qkv(txt).reshape(B, L, 3, self.num_heads, self.head_dim)
         img_qkv = self.img_qkv(img).reshape(B, N, 3, self.num_heads, self.head_dim)
@@ -506,8 +523,8 @@ class TinyFlux(nn.Module):
         if self.config.guidance_embeds and guidance is not None:
             vec = vec + self.guidance_in(guidance)
-        # RoPE for image positions
-        img_rope = self.rope(img_ids)
         # Double-stream blocks
         for block in self.double_blocks:

         self.axes_dims = axes_dims  # (temporal, height, width)
         self.theta = theta
+    def forward(self, ids: torch.Tensor, dtype: torch.dtype = None) -> torch.Tensor:
         """
         ids: (B, N, 3) - temporal, height, width indices
+        dtype: output dtype (defaults to ids.dtype; pass the model's floating dtype, e.g. bf16 -- with integer ids the default cast would truncate the sin/cos values)
         Returns: (B, N, dim) rotary embeddings
         """
         B, N, _ = ids.shape
         device = ids.device
+        # Compute in float32 for precision, cast at the end
+        compute_dtype = torch.float32
+        output_dtype = dtype if dtype is not None else ids.dtype
         embeddings = []
         dim_offset = 0
         for axis_idx, axis_dim in enumerate(self.axes_dims):
             # Compute frequencies for this axis
+            freqs = 1.0 / (self.theta ** (torch.arange(0, axis_dim, 2, device=device, dtype=compute_dtype) / axis_dim))
             # Get positions for this axis
+            positions = ids[:, :, axis_idx].to(compute_dtype)  # (B, N)
             # Outer product: (B, N) x (axis_dim/2) -> (B, N, axis_dim/2)
             angles = positions.unsqueeze(-1) * freqs.unsqueeze(0).unsqueeze(0)
             # Interleave sin/cos
             embeddings.append(emb)
             dim_offset += axis_dim
+        result = torch.cat(embeddings, dim=-1)  # (B, N, dim)
+        return result.to(output_dtype)
 def apply_rope(x: torch.Tensor, rope: torch.Tensor) -> torch.Tensor:
     # x: (B, heads, N, head_dim)
     # rope: (B, N, head_dim)
     B, H, N, D = x.shape
+    # Ensure rope matches x dtype
+    rope = rope.to(x.dtype).unsqueeze(1)  # (B, 1, N, D)
     # Split into pairs
     x_pairs = x.reshape(B, H, N, D // 2, 2)
         mask: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
         B, N, _ = x.shape
+        dtype = x.dtype
+        # Ensure RoPE matches input dtype
+        if rope is not None:
+            rope = rope.to(dtype)
         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
         q, k, v = qkv.permute(2, 0, 3, 1, 4)  # 3 x (B, heads, N, head_dim)
         B, L, _ = txt.shape
         _, N, _ = img.shape
+        # Ensure consistent dtype (use img dtype as reference)
+        dtype = img.dtype
+        txt = txt.to(dtype)
+        if rope is not None:
+            rope = rope.to(dtype)
         # Compute Q, K, V for both streams
         txt_qkv = self.txt_qkv(txt).reshape(B, L, 3, self.num_heads, self.head_dim)
         img_qkv = self.img_qkv(img).reshape(B, N, 3, self.num_heads, self.head_dim)
         if self.config.guidance_embeds and guidance is not None:
             vec = vec + self.guidance_in(guidance)
+        # RoPE for image positions (match model dtype)
+        img_rope = self.rope(img_ids, dtype=img.dtype)
         # Double-stream blocks
         for block in self.double_blocks: