Update modeling_auristream.py
Browse files — modeling_auristream.py (+1 −14)
modeling_auristream.py
CHANGED
|
@@ -448,13 +448,6 @@ class CausalSelfAttention(nn.Module):
|
|
| 448 |
if hasattr(config, 'use_rope') and not config.use_rope:
|
| 449 |
self.rotary = None
|
| 450 |
|
| 451 |
-
# Check if we are running on TPU
|
| 452 |
-
try:
|
| 453 |
-
import torch_xla.core.xla_model as xm
|
| 454 |
-
self.tpu = True
|
| 455 |
-
except ImportError:
|
| 456 |
-
self.tpu = False
|
| 457 |
-
|
| 458 |
def forward(self, x, return_kv=False, return_attn_maps=False):
|
| 459 |
|
| 460 |
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
|
|
@@ -470,13 +463,7 @@ class CausalSelfAttention(nn.Module):
|
|
| 470 |
q = apply_rotary_emb(q, cos, sin)
|
| 471 |
k = apply_rotary_emb(k, cos, sin)
|
| 472 |
|
| 473 |
-
if self.tpu:
|
| 474 |
-
from torch_xla.experimental.custom_kernel import flash_attention
|
| 475 |
-
q_norm = q / math.sqrt(k.size(-1))
|
| 476 |
-
y = flash_attention(
|
| 477 |
-
q_norm.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
|
| 478 |
-
causal=True, partition_spec=('fsdp', None, None, None))
|
| 479 |
-
elif not return_kv and not return_attn_maps:
|
| 480 |
y = F.scaled_dot_product_attention(
|
| 481 |
q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
|
| 482 |
is_causal=True)
|
|
|
|
| 448 |
if hasattr(config, 'use_rope') and not config.use_rope:
|
| 449 |
self.rotary = None
|
| 450 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
def forward(self, x, return_kv=False, return_attn_maps=False):
|
| 452 |
|
| 453 |
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
|
|
|
|
| 463 |
q = apply_rotary_emb(q, cos, sin)
|
| 464 |
k = apply_rotary_emb(k, cos, sin)
|
| 465 |
|
| 466 |
+
if not return_kv and not return_attn_maps:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
y = F.scaled_dot_product_attention(
|
| 468 |
q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
|
| 469 |
is_causal=True)
|