AbstractPhil
/

tiny-flux-deep

@@ -724,17 +724,20 @@ class Attention(nn.Module):
             q = q * mod.unsqueeze(-1)  # [B, heads, N, head_dim]
             k = k * mod.unsqueeze(-1)
-        # Compute attention scores
-        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale  # [B, heads, N, N]
-        # === Sol Temperature Scaling ===
         if sol_temperature is not None:
-            # temperature: [B, num_heads] → [B, heads, 1, 1]
-            temp = sol_temperature.unsqueeze(-1).unsqueeze(-1).clamp(min=0.1)
-            scores = scores / temp
-        attn = F.softmax(scores, dim=-1)
-        out = torch.matmul(attn, v)
         out = out.transpose(1, 2).reshape(B, N, -1)
         return self.out_proj(out)
@@ -817,19 +820,19 @@ class JointAttention(nn.Module):
         k = torch.cat([txt_k, img_k], dim=2)
         v = torch.cat([txt_v, img_v], dim=2)
-        # Text attention (NO Sol temperature - text is not spatial)
-        txt_scores = torch.matmul(txt_q, k.transpose(-2, -1)) * self.scale
-        txt_attn = F.softmax(txt_scores, dim=-1)
-        txt_out = torch.matmul(txt_attn, v)
         txt_out = txt_out.transpose(1, 2).reshape(B, L, -1)
-        # Image attention (Sol temperature applies here only)
-        img_scores = torch.matmul(img_q, k.transpose(-2, -1)) * self.scale
         if sol_temperature is not None:
-            temp = sol_temperature.unsqueeze(-1).unsqueeze(-1).clamp(min=0.1)
-            img_scores = img_scores / temp
-        img_attn = F.softmax(img_scores, dim=-1)
-        img_out = torch.matmul(img_attn, v)
         img_out = img_out.transpose(1, 2).reshape(B, N, -1)
         return self.txt_out(txt_out), self.img_out(img_out)
@@ -1562,4 +1565,4 @@ def test_model():
 #if __name__ == "__main__":
-#    test_model()

             q = q * mod.unsqueeze(-1)  # [B, heads, N, head_dim]
             k = k * mod.unsqueeze(-1)
+        # === Compute attention with SDPA (can dispatch to Flash Attention kernels) ===
+        # Sol temperature is applied via scale modification
         if sol_temperature is not None:
+            # Average temperature across heads for SDPA scale
+            # temperature: [B, num_heads] → one value per sample (SDPA's `scale` is a single float, so per-head values are averaged)
+            temp = sol_temperature.mean(dim=1, keepdim=True).clamp(min=0.1)  # [B, 1]
+            effective_scale = self.scale / temp.unsqueeze(-1).unsqueeze(-1)  # [B, 1, 1, 1]
+            # Pre-scale Q and K by sqrt(effective_scale) each instead of post-scaling scores (mathematically equivalent)
+            q = q * (effective_scale.sqrt())
+            k = k * (effective_scale.sqrt())
+            out = F.scaled_dot_product_attention(q, k, v, scale=1.0)
+        else:
+            out = F.scaled_dot_product_attention(q, k, v, scale=self.scale)
         out = out.transpose(1, 2).reshape(B, N, -1)
         return self.out_proj(out)
         k = torch.cat([txt_k, img_k], dim=2)
         v = torch.cat([txt_v, img_v], dim=2)
+        # Text attention with SDPA (no Sol modulation)
+        txt_out = F.scaled_dot_product_attention(txt_q, k, v, scale=self.scale)
         txt_out = txt_out.transpose(1, 2).reshape(B, L, -1)
+        # Image attention with SDPA (Sol temperature via scale modification)
         if sol_temperature is not None:
+            temp = sol_temperature.mean(dim=1, keepdim=True).clamp(min=0.1)
+            effective_scale = self.scale / temp.unsqueeze(-1).unsqueeze(-1)
+            img_q_scaled = img_q * (effective_scale.sqrt())
+            k_scaled = k * (effective_scale.sqrt())
+            img_out = F.scaled_dot_product_attention(img_q_scaled, k_scaled, v, scale=1.0)
+        else:
+            img_out = F.scaled_dot_product_attention(img_q, k, v, scale=self.scale)
         img_out = img_out.transpose(1, 2).reshape(B, N, -1)
         return self.txt_out(txt_out), self.img_out(img_out)
 #if __name__ == "__main__":
+#   test_model()