SpiceeChat
/

Base-mini

@@ -13,20 +13,29 @@ _FLASH3_KERNEL = None
 def _get_flash2_kernel():
     global _FLASH2_KERNEL
     if _FLASH2_KERNEL is None:
-        kernels = importlib.import_module("kernels")
-        _FLASH2_KERNEL = kernels.get_kernel("kernels-community/flash-attn2", version=1)
     return _FLASH2_KERNEL
 def _get_flash3_kernel():
     global _FLASH3_KERNEL
     if _FLASH3_KERNEL is None:
-        kernels = importlib.import_module("kernels")
-        _FLASH3_KERNEL = kernels.get_kernel("kernels-community/flash-attn3", version=1)
     return _FLASH3_KERNEL
 def _get_sageattn():
-    module = importlib.import_module("sageattention")
-    return module.sageattn
 class CausalSelfAttention(nn.Module):
     def __init__(self, config: TinyGPTConfig):
@@ -36,49 +45,37 @@ class CausalSelfAttention(nn.Module):
         self.n_head = int(config.n_head)
         self.head_dim = int(config.n_embd // config.n_head)
         self.attention_backend = str(getattr(config, "attention_backend", "torch"))
-        self.torch_fallback = bool(getattr(config, "torch_fallback", False))
-        self.dropout_p = float(config.dropout)
         if self.attention_backend not in ("sage", "torch", "flash2", "flash3"):
-            raise ValueError("attention_backend must be sage, torch, flash2 or flash3")
         if self.attention_backend == "sage" and self.head_dim not in (64, 96, 128):
-            raise ValueError(f"SageAttention requires head_dim in [64, 96, 128], got {self.head_dim}")
         if self.attention_backend == "sage" and self.dropout_p != 0.0:
-            raise ValueError("SageAttention requires dropout=0.0")
         if self.attention_backend == "flash3" and self.dropout_p != 0.0:
-            raise ValueError("FlashAttention3 requires dropout=0.0")
         if self.attention_backend in ("flash2", "flash3") and self.head_dim % 8 != 0:
-            raise ValueError(f"FlashAttention requires head_dim multiple of 8, got {self.head_dim}")
         self.qkv = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
         self.proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
-        self.dropout = nn.Dropout(config.dropout)
         mask = torch.tril(torch.ones(config.ctx_len, config.ctx_len, dtype=torch.bool))
         self.register_buffer("mask", mask.view(1, 1, config.ctx_len, config.ctx_len), persistent=False)
         self.sageattn = None
         self.flash_kernel = None
         if self.attention_backend == "sage":
-            try:
-                self.sageattn = _get_sageattn()
-            except Exception:
-                if self.torch_fallback:
-                    self.attention_backend = "torch"
-                else:
-                    raise
         if self.attention_backend == "flash2":
-            try:
-                self.flash_kernel = _get_flash2_kernel()
-            except Exception:
-                if self.torch_fallback:
-                    self.attention_backend = "torch"
-                else:
-                    raise
         if self.attention_backend == "flash3":
-            try:
-                self.flash_kernel = _get_flash3_kernel()
-            except Exception:
-                if self.torch_fallback:
-                    self.attention_backend = "torch"
-                else:
-                    raise
     def _torch_attention(self, q, k, v, t):
         scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
@@ -88,34 +85,43 @@ class CausalSelfAttention(nn.Module):
         return att @ v
     def _sage_attention(self, q, k, v):
-        if self.sageattn is None or not q.is_cuda:
-            if self.torch_fallback:
-                return None
-            raise RuntimeError("SageAttention requires CUDA + sageattention")
-        return self.sageattn(q.contiguous(), k.contiguous(), v.contiguous(), tensor_layout="HND", is_causal=True)
     def _flash2_attention(self, q, k, v):
-        if self.flash_kernel is None or not q.is_cuda:
-            if self.torch_fallback:
-                return None
-            raise RuntimeError("FlashAttention2 requires CUDA + kernels")
-        q = q.transpose(1, 2).contiguous()
-        k = k.transpose(1, 2).contiguous()
-        v = v.transpose(1, 2).contiguous()
-        dropout_p = self.dropout_p if self.training else 0.0
-        y = self.flash_kernel.flash_attn_func(q, k, v, dropout_p=dropout_p, causal=True)
-        return y.transpose(1, 2).contiguous()
     def _flash3_attention(self, q, k, v):
-        if self.flash_kernel is None or not q.is_cuda:
-            if self.torch_fallback:
-                return None
-            raise RuntimeError("FlashAttention3 requires CUDA + kernels")
-        q = q.transpose(1, 2).contiguous()
-        k = k.transpose(1, 2).contiguous()
-        v = v.transpose(1, 2).contiguous()
-        y = self.flash_kernel.flash_attn_func(q, k, v, causal=True)
-        return y.transpose(1, 2).contiguous()
     def forward(self, x):
         b, t, c = x.shape
@@ -193,14 +199,13 @@ class TinyGPTModel(TinyGPTPreTrainedModel):
         self.ln_f = nn.LayerNorm(config.n_embd)
         self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.post_init()
     def get_input_embeddings(self):
         return self.tok_emb
     def set_input_embeddings(self, value):
         self.tok_emb = value
-        self.tie_weights()
     def get_output_embeddings(self):
         return self.head
@@ -246,7 +251,7 @@ class TinyGPTForCausalLM(TinyGPTPreTrainedModel, GenerationMixin):
     def set_input_embeddings(self, value):
         self.tiny_gpt.tok_emb = value
-        self.tie_weights()
     def get_output_embeddings(self):
         return self.tiny_gpt.head
@@ -280,4 +285,4 @@ class TinyGPTForCausalLM(TinyGPTPreTrainedModel, GenerationMixin):
             past_key_values=None,
             hidden_states=None,
             attentions=None,
-        )

 def _get_flash2_kernel():
     """Return the cached FlashAttention-2 kernel handle, loading it on first use.

     Returns:
         The object returned by ``kernels.get_kernel`` (cached in the
         module-level ``_FLASH2_KERNEL``), or ``None`` when the kernel cannot
         be loaded. A failed load is not cached, so the next call retries.
     """
     global _FLASH2_KERNEL
     if _FLASH2_KERNEL is not None:
         return _FLASH2_KERNEL
     try:
         kernels = importlib.import_module("kernels")
     except ImportError:
         # Optional dependency is not installed: report "unavailable".
         return None
     try:
         _FLASH2_KERNEL = kernels.get_kernel("kernels-community/flash-attn2", version=1)
     except Exception as err:
         # NOTE(review): get_kernel can presumably fail for reasons other than
         # a missing package -- confirm which exception types it raises.
         # Callers treat None as "unavailable", so surface the cause as a
         # warning instead of crashing or hiding it.
         import warnings

         warnings.warn(
             f"FlashAttention2 kernel could not be loaded: {err!r}",
             RuntimeWarning,
             stacklevel=2,
         )
         return None
     return _FLASH2_KERNEL
 def _get_flash3_kernel():
     """Return the cached FlashAttention-3 kernel handle, loading it on first use.

     Returns:
         The object returned by ``kernels.get_kernel`` (cached in the
         module-level ``_FLASH3_KERNEL``), or ``None`` when the kernel cannot
         be loaded. A failed load is not cached, so the next call retries.
     """
     global _FLASH3_KERNEL
     if _FLASH3_KERNEL is not None:
         return _FLASH3_KERNEL
     try:
         kernels = importlib.import_module("kernels")
     except ImportError:
         # Optional dependency is not installed: report "unavailable".
         return None
     try:
         _FLASH3_KERNEL = kernels.get_kernel("kernels-community/flash-attn3", version=1)
     except Exception as err:
         # NOTE(review): get_kernel can presumably fail for reasons other than
         # a missing package -- confirm which exception types it raises.
         # Callers treat None as "unavailable", so surface the cause as a
         # warning instead of crashing or hiding it.
         import warnings

         warnings.warn(
             f"FlashAttention3 kernel could not be loaded: {err!r}",
             RuntimeWarning,
             stacklevel=2,
         )
         return None
     return _FLASH3_KERNEL
 def _get_sageattn():
+    try:
+        module = importlib.import_module("sageattention")
+        return module.sageattn
+    except ImportError:
+        return None
 class CausalSelfAttention(nn.Module):
     def __init__(self, config: TinyGPTConfig):
         self.n_head = int(config.n_head)
         self.head_dim = int(config.n_embd // config.n_head)
         self.attention_backend = str(getattr(config, "attention_backend", "torch"))
+        self.torch_fallback = bool(getattr(config, "torch_fallback", True))
+        self.dropout_p = float(config.dropout) if hasattr(config, "dropout") else 0.0
         if self.attention_backend not in ("sage", "torch", "flash2", "flash3"):
+            self.attention_backend = "torch"
         if self.attention_backend == "sage" and self.head_dim not in (64, 96, 128):
+            self.attention_backend = "torch"
         if self.attention_backend == "sage" and self.dropout_p != 0.0:
+            self.attention_backend = "torch"
         if self.attention_backend == "flash3" and self.dropout_p != 0.0:
+            self.attention_backend = "torch"
         if self.attention_backend in ("flash2", "flash3") and self.head_dim % 8 != 0:
+            self.attention_backend = "torch"
         self.qkv = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
         self.proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
+        self.dropout = nn.Dropout(self.dropout_p)
         mask = torch.tril(torch.ones(config.ctx_len, config.ctx_len, dtype=torch.bool))
         self.register_buffer("mask", mask.view(1, 1, config.ctx_len, config.ctx_len), persistent=False)
         self.sageattn = None
         self.flash_kernel = None
         if self.attention_backend == "sage":
+            self.sageattn = _get_sageattn()
+            if self.sageattn is None and not self.torch_fallback:
+                raise RuntimeError("SageAttention requested but not available")
         if self.attention_backend == "flash2":
+            self.flash_kernel = _get_flash2_kernel()
+            if self.flash_kernel is None and not self.torch_fallback:
+                raise RuntimeError("FlashAttention2 requested but not available")
         if self.attention_backend == "flash3":
+            self.flash_kernel = _get_flash3_kernel()
+            if self.flash_kernel is None and not self.torch_fallback:
+                raise RuntimeError("FlashAttention3 requested but not available")
     def _torch_attention(self, q, k, v, t):
         scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
         return att @ v
     def _sage_attention(self, q, k, v):
+        if self.sageattn is None:
+            return None
+        if not q.is_cuda:
+            return None
+        try:
+            return self.sageattn(q.contiguous(), k.contiguous(), v.contiguous(), tensor_layout="HND", is_causal=True)
+        except Exception:
+            return None
     def _flash2_attention(self, q, k, v):
+        if self.flash_kernel is None:
+            return None
+        if not q.is_cuda:
+            return None
+        try:
+            q = q.transpose(1, 2).contiguous()
+            k = k.transpose(1, 2).contiguous()
+            v = v.transpose(1, 2).contiguous()
+            dropout_p = self.dropout_p if self.training else 0.0
+            y = self.flash_kernel.flash_attn_func(q, k, v, dropout_p=dropout_p, causal=True)
+            return y.transpose(1, 2).contiguous()
+        except Exception:
+            return None
     def _flash3_attention(self, q, k, v):
+        if self.flash_kernel is None:
+            return None
+        if not q.is_cuda:
+            return None
+        try:
+            q = q.transpose(1, 2).contiguous()
+            k = k.transpose(1, 2).contiguous()
+            v = v.transpose(1, 2).contiguous()
+            y = self.flash_kernel.flash_attn_func(q, k, v, causal=True)
+            return y.transpose(1, 2).contiguous()
+        except Exception:
+            return None
     def forward(self, x):
         b, t, c = x.shape
         self.ln_f = nn.LayerNorm(config.n_embd)
         self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.post_init()
     def get_input_embeddings(self):
         """Return the object stored on ``self.tok_emb``."""
         embedding = self.tok_emb
         return embedding
     def set_input_embeddings(self, value):
         """Store ``value`` as ``tok_emb`` and point ``head.weight`` at its weight.

         NOTE(review): the sharing is unconditional; confirm that this is
         intended even when the config asks for untied embeddings.
         """
         self.tok_emb = value
         embedding = self.tok_emb
         output_head = self.head
         output_head.weight = embedding.weight
     def get_output_embeddings(self):
         """Return the object stored on ``self.head``."""
         output_head = self.head
         return output_head
     def set_input_embeddings(self, value):
         """Store ``value`` as ``tiny_gpt.tok_emb`` and share its weight with ``tiny_gpt.head``.

         NOTE(review): the sharing is unconditional; confirm that this is
         intended even when the config asks for untied embeddings.
         """
         backbone = self.tiny_gpt
         backbone.tok_emb = value
         shared_weight = self.tiny_gpt.tok_emb.weight
         self.tiny_gpt.head.weight = shared_weight
     def get_output_embeddings(self):
         """Return the object stored on ``self.tiny_gpt.head``."""
         backbone = self.tiny_gpt
         return backbone.head
             past_key_values=None,
             hidden_states=None,
             attentions=None,
+        )