SpiceeChat
/

Base-mini

@@ -6,23 +6,28 @@ import torch.nn.functional as F
 from transformers import PreTrainedModel, GenerationMixin
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from .configuration_tiny_gpt import TinyGPTConfig
 _FLASH2_KERNEL = None
 _FLASH3_KERNEL = None
 def _get_flash2_kernel():
     """Return the FlashAttention-2 kernel object, fetching it on the first call.
     The value returned by ``kernels.get_kernel("kernels-community/flash-attn2", version=1)``
     is kept in the module-level ``_FLASH2_KERNEL`` and handed back on later calls.
     """
     global _FLASH2_KERNEL
     if _FLASH2_KERNEL is not None:
         return _FLASH2_KERNEL
     # ``kernels`` is resolved by name at call time rather than at module import.
     hub = importlib.import_module("kernels")
     _FLASH2_KERNEL = hub.get_kernel("kernels-community/flash-attn2", version=1)
     return _FLASH2_KERNEL
 def _get_flash3_kernel():
     """Return the FlashAttention-3 kernel object, fetching it on the first call.
     The value returned by ``kernels.get_kernel("kernels-community/flash-attn3", version=1)``
     is kept in the module-level ``_FLASH3_KERNEL`` and handed back on later calls.
     """
     global _FLASH3_KERNEL
     if _FLASH3_KERNEL is not None:
         return _FLASH3_KERNEL
     # ``kernels`` is resolved by name at call time rather than at module import.
     hub = importlib.import_module("kernels")
     _FLASH3_KERNEL = hub.get_kernel("kernels-community/flash-attn3", version=1)
     return _FLASH3_KERNEL
 def _get_sageattn():
     """Import ``sageattention`` by name at call time and return its ``sageattn`` attribute."""
     return importlib.import_module("sageattention").sageattn
 class CausalSelfAttention(nn.Module):
     def __init__(self, config: TinyGPTConfig):
         super().__init__()
@@ -74,18 +79,21 @@ class CausalSelfAttention(nn.Module):
                     self.attention_backend = "torch"
                 else:
                     raise
     def _torch_attention(self, q, k, v, t):
         scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
         scores = scores.masked_fill(self.mask[:, :, :t, :t] == 0, float("-inf"))
         att = F.softmax(scores.float(), dim=-1).to(q.dtype)
         att = self.dropout(att)
         return att @ v
     def _sage_attention(self, q, k, v):
         if self.sageattn is None or not q.is_cuda:
             if self.torch_fallback:
                 return None
             raise RuntimeError("SageAttention requires CUDA + sageattention")
         return self.sageattn(q.contiguous(), k.contiguous(), v.contiguous(), tensor_layout="HND", is_causal=True)
     def _flash2_attention(self, q, k, v):
         if self.flash_kernel is None or not q.is_cuda:
             if self.torch_fallback:
@@ -97,6 +105,7 @@ class CausalSelfAttention(nn.Module):
         dropout_p = self.dropout_p if self.training else 0.0
         y = self.flash_kernel.flash_attn_func(q, k, v, dropout_p=dropout_p, causal=True)
         return y.transpose(1, 2).contiguous()
     def _flash3_attention(self, q, k, v):
         if self.flash_kernel is None or not q.is_cuda:
             if self.torch_fallback:
@@ -107,6 +116,7 @@ class CausalSelfAttention(nn.Module):
         v = v.transpose(1, 2).contiguous()
         y = self.flash_kernel.flash_attn_func(q, k, v, causal=True)
         return y.transpose(1, 2).contiguous()
     def forward(self, x):
         b, t, c = x.shape
         qkv = self.qkv(x)
@@ -130,18 +140,21 @@ class CausalSelfAttention(nn.Module):
             y = self._torch_attention(q, k, v, t)
         y = y.transpose(1, 2).contiguous().view(b, t, c)
         return self.proj(y)
 class MLP(nn.Module):
     """Two bias-free linear layers with a GELU in between, followed by dropout.
     The hidden width is ``4 * config.n_embd``; the dropout rate is ``config.dropout``.
     """
     def __init__(self, config: TinyGPTConfig):
         super().__init__()
         width = config.n_embd
         hidden_width = 4 * width
         self.fc = nn.Linear(width, hidden_width, bias=False)
         self.proj = nn.Linear(hidden_width, width, bias=False)
         self.dropout = nn.Dropout(config.dropout)
     def forward(self, x):
         """Apply ``fc``, GELU, ``proj`` and dropout to ``x``, in that order."""
         expanded = F.gelu(self.fc(x))
         return self.dropout(self.proj(expanded))
 class Block(nn.Module):
     def __init__(self, config: TinyGPTConfig):
         super().__init__()
@@ -149,14 +162,17 @@ class Block(nn.Module):
         self.attn = CausalSelfAttention(config)
         self.ln2 = nn.LayerNorm(config.n_embd)
         self.mlp = MLP(config)
     def forward(self, x):
         """Run the attention and MLP sub-layers, each as a residual update.
         Each sub-layer receives its input through ``ln1`` / ``ln2`` respectively and
         its output is added back onto that same input.
         """
         after_attn = x + self.attn(self.ln1(x))
         mlp_out = self.mlp(self.ln2(after_attn))
         return after_attn + mlp_out
 class TinyGPTPreTrainedModel(PreTrainedModel):
     config_class = TinyGPTConfig
     base_model_prefix = "tiny_gpt"
     supports_gradient_checkpointing = False
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
             nn.init.normal_(module.weight, mean=0.0, std=0.02)
@@ -164,8 +180,10 @@ class TinyGPTPreTrainedModel(PreTrainedModel):
                 nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             nn.init.normal_(module.weight, mean=0.0, std=0.02)
 class TinyGPTModel(TinyGPTPreTrainedModel):
     _tied_weights_keys = ["head.weight"]
     def __init__(self, config: TinyGPTConfig):
         super().__init__(config)
         self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
@@ -175,18 +193,24 @@ class TinyGPTModel(TinyGPTPreTrainedModel):
         self.ln_f = nn.LayerNorm(config.n_embd)
         self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.post_init()
-        self.tie_weights()
     def get_input_embeddings(self):
         """Return ``self.tok_emb``, the token embedding module."""
         return self.tok_emb
     def set_input_embeddings(self, value):
         """Replace ``self.tok_emb`` with ``value`` and call ``tie_weights()`` again."""
         self.tok_emb = value
         self.tie_weights()
     def get_output_embeddings(self):
         """Return ``self.head``, the output projection module."""
         return self.head
     def set_output_embeddings(self, new_embeddings):
         """Replace ``self.head`` with ``new_embeddings``; ``tie_weights()`` is not called here."""
         self.head = new_embeddings
     def tie_weights(self, *args, **kwargs):
         """Tie ``self.head`` to ``self.tok_emb`` through ``_tie_or_clone_weights``.
         Positional and keyword arguments are accepted and ignored.
         NOTE(review): ``_tie_or_clone_weights`` is presumably inherited from
         ``PreTrainedModel`` -- confirm it exists in the installed transformers version.
         """
-        self._tie_or_clone_weights(self.head, self.tok_emb)
     def forward(self, input_ids, attention_mask=None, return_dict=True, return_logits=False, **kwargs):
         b, t = input_ids.shape
         if t > self.config.ctx_len:
@@ -202,29 +226,47 @@ class TinyGPTModel(TinyGPTPreTrainedModel):
             return (hidden, logits) if return_logits else (hidden,)
         if return_logits:
             return hidden, logits
-        return BaseModelOutputWithPast(last_hidden_state=hidden, past_key_values=None, hidden_states=None, attentions=None)
 class TinyGPTForCausalLM(TinyGPTPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["tiny_gpt.head.weight"]
     def __init__(self, config: TinyGPTConfig):
         """Create the wrapped ``TinyGPTModel`` from ``config``, run ``post_init()``, then tie weights."""
         super().__init__(config)
         self.tiny_gpt = TinyGPTModel(config)
         self.post_init()
         # Explicit tie after ``post_init()``.
-        self.tie_weights()
     def get_input_embeddings(self):
         """Return the wrapped model's token embedding, ``self.tiny_gpt.tok_emb``."""
         return self.tiny_gpt.tok_emb
     def set_input_embeddings(self, value):
         """Replace ``self.tiny_gpt.tok_emb`` with ``value`` and call ``tie_weights()`` again."""
         self.tiny_gpt.tok_emb = value
         self.tie_weights()
     def get_output_embeddings(self):
         """Return the wrapped model's output projection, ``self.tiny_gpt.head``."""
         return self.tiny_gpt.head
     def set_output_embeddings(self, new_embeddings):
         """Replace ``self.tiny_gpt.head`` with ``new_embeddings``; ``tie_weights()`` is not called here."""
         self.tiny_gpt.head = new_embeddings
     def tie_weights(self, *args, **kwargs):
         """Tie ``self.tiny_gpt.head`` to ``self.tiny_gpt.tok_emb`` through ``_tie_or_clone_weights``.
         Positional and keyword arguments are accepted and ignored.
         NOTE(review): ``_tie_or_clone_weights`` is presumably inherited from
         ``PreTrainedModel`` -- confirm it exists in the installed transformers version.
         """
-        self._tie_or_clone_weights(self.tiny_gpt.head, self.tiny_gpt.tok_emb)
     def prepare_inputs_for_generation(self, input_ids, **kwargs):
         """Return ``{"input_ids": input_ids}``; every other keyword argument is dropped."""
         return {"input_ids": input_ids}
     def forward(self, input_ids, attention_mask=None, labels=None, return_dict=True, **kwargs):
-        hidden, logits = self.tiny_gpt(input_ids=input_ids, attention_mask=attention_mask, return_dict=True, return_logits=True)
         loss = None
         if labels is not None:
             shift_logits = logits[:, :-1, :].contiguous()
@@ -232,4 +274,10 @@ class TinyGPTForCausalLM(TinyGPTPreTrainedModel, GenerationMixin):
             loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)).float(), shift_labels.view(-1))
         if not return_dict:
             return ((loss, logits) if loss is not None else (logits,))
-        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=None, hidden_states=None, attentions=None)

 from transformers import PreTrainedModel, GenerationMixin
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from .configuration_tiny_gpt import TinyGPTConfig
 _FLASH2_KERNEL = None
 _FLASH3_KERNEL = None
 def _get_flash2_kernel():
     """Return the FlashAttention-2 kernel object, fetching it on the first call.
     The value returned by ``kernels.get_kernel("kernels-community/flash-attn2", version=1)``
     is kept in the module-level ``_FLASH2_KERNEL`` and handed back on later calls.
     """
     global _FLASH2_KERNEL
     if _FLASH2_KERNEL is not None:
         return _FLASH2_KERNEL
     # ``kernels`` is resolved by name at call time rather than at module import.
     hub = importlib.import_module("kernels")
     _FLASH2_KERNEL = hub.get_kernel("kernels-community/flash-attn2", version=1)
     return _FLASH2_KERNEL
 def _get_flash3_kernel():
     """Return the FlashAttention-3 kernel object, fetching it on the first call.
     The value returned by ``kernels.get_kernel("kernels-community/flash-attn3", version=1)``
     is kept in the module-level ``_FLASH3_KERNEL`` and handed back on later calls.
     """
     global _FLASH3_KERNEL
     if _FLASH3_KERNEL is not None:
         return _FLASH3_KERNEL
     # ``kernels`` is resolved by name at call time rather than at module import.
     hub = importlib.import_module("kernels")
     _FLASH3_KERNEL = hub.get_kernel("kernels-community/flash-attn3", version=1)
     return _FLASH3_KERNEL
 def _get_sageattn():
     """Import ``sageattention`` by name at call time and return its ``sageattn`` attribute."""
     return importlib.import_module("sageattention").sageattn
 class CausalSelfAttention(nn.Module):
     def __init__(self, config: TinyGPTConfig):
         super().__init__()
                     self.attention_backend = "torch"
                 else:
                     raise
     def _torch_attention(self, q, k, v, t):
         scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
         scores = scores.masked_fill(self.mask[:, :, :t, :t] == 0, float("-inf"))
         att = F.softmax(scores.float(), dim=-1).to(q.dtype)
         att = self.dropout(att)
         return att @ v
     def _sage_attention(self, q, k, v):
         if self.sageattn is None or not q.is_cuda:
             if self.torch_fallback:
                 return None
             raise RuntimeError("SageAttention requires CUDA + sageattention")
         return self.sageattn(q.contiguous(), k.contiguous(), v.contiguous(), tensor_layout="HND", is_causal=True)
     def _flash2_attention(self, q, k, v):
         if self.flash_kernel is None or not q.is_cuda:
             if self.torch_fallback:
         dropout_p = self.dropout_p if self.training else 0.0
         y = self.flash_kernel.flash_attn_func(q, k, v, dropout_p=dropout_p, causal=True)
         return y.transpose(1, 2).contiguous()
     def _flash3_attention(self, q, k, v):
         if self.flash_kernel is None or not q.is_cuda:
             if self.torch_fallback:
         v = v.transpose(1, 2).contiguous()
         y = self.flash_kernel.flash_attn_func(q, k, v, causal=True)
         return y.transpose(1, 2).contiguous()
     def forward(self, x):
         b, t, c = x.shape
         qkv = self.qkv(x)
             y = self._torch_attention(q, k, v, t)
         y = y.transpose(1, 2).contiguous().view(b, t, c)
         return self.proj(y)
 class MLP(nn.Module):
     """Two bias-free linear layers with a GELU in between, followed by dropout.
     The hidden width is ``4 * config.n_embd``; the dropout rate is ``config.dropout``.
     """
     def __init__(self, config: TinyGPTConfig):
         super().__init__()
         width = config.n_embd
         hidden_width = 4 * width
         self.fc = nn.Linear(width, hidden_width, bias=False)
         self.proj = nn.Linear(hidden_width, width, bias=False)
         self.dropout = nn.Dropout(config.dropout)
     def forward(self, x):
         """Apply ``fc``, GELU, ``proj`` and dropout to ``x``, in that order."""
         expanded = F.gelu(self.fc(x))
         return self.dropout(self.proj(expanded))
 class Block(nn.Module):
     def __init__(self, config: TinyGPTConfig):
         super().__init__()
         self.attn = CausalSelfAttention(config)
         self.ln2 = nn.LayerNorm(config.n_embd)
         self.mlp = MLP(config)
     def forward(self, x):
         """Run the attention and MLP sub-layers, each as a residual update.
         Each sub-layer receives its input through ``ln1`` / ``ln2`` respectively and
         its output is added back onto that same input.
         """
         after_attn = x + self.attn(self.ln1(x))
         mlp_out = self.mlp(self.ln2(after_attn))
         return after_attn + mlp_out
 class TinyGPTPreTrainedModel(PreTrainedModel):
     config_class = TinyGPTConfig
     base_model_prefix = "tiny_gpt"
     supports_gradient_checkpointing = False
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
             nn.init.normal_(module.weight, mean=0.0, std=0.02)
                 nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             nn.init.normal_(module.weight, mean=0.0, std=0.02)
 class TinyGPTModel(TinyGPTPreTrainedModel):
     _tied_weights_keys = ["head.weight"]
     def __init__(self, config: TinyGPTConfig):
         super().__init__(config)
         self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
         self.ln_f = nn.LayerNorm(config.n_embd)
         self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.post_init()
+        # tie_weights will be called by post_init, but we provide the override below.
     def get_input_embeddings(self):
         """Return ``self.tok_emb``, the token embedding module."""
         return self.tok_emb
     def set_input_embeddings(self, value):
         """Replace ``self.tok_emb`` with ``value`` and call ``tie_weights()`` again."""
         self.tok_emb = value
         self.tie_weights()
     def get_output_embeddings(self):
         """Return ``self.head``, the output projection module."""
         return self.head
     def set_output_embeddings(self, new_embeddings):
         """Replace ``self.head`` with ``new_embeddings``; ``tie_weights()`` is not called here."""
         self.head = new_embeddings
     def tie_weights(self, *args, **kwargs):
         """Make ``self.head`` use the same weight object as ``self.tok_emb``.
         Positional and keyword arguments are accepted and ignored.
         NOTE(review): the assignment is unconditional -- confirm whether a config
         flag such as ``tie_word_embeddings`` is meant to gate it.
         """
+        self.head.weight = self.tok_emb.weight
     def forward(self, input_ids, attention_mask=None, return_dict=True, return_logits=False, **kwargs):
         b, t = input_ids.shape
         if t > self.config.ctx_len:
             return (hidden, logits) if return_logits else (hidden,)
         if return_logits:
             return hidden, logits
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden,
+            past_key_values=None,
+            hidden_states=None,
+            attentions=None,
+        )
 class TinyGPTForCausalLM(TinyGPTPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["tiny_gpt.head.weight"]
     def __init__(self, config: TinyGPTConfig):
         """Create the wrapped ``TinyGPTModel`` from ``config`` and run ``post_init()``."""
         super().__init__(config)
         self.tiny_gpt = TinyGPTModel(config)
         self.post_init()
     def get_input_embeddings(self):
         """Return the wrapped model's token embedding, ``self.tiny_gpt.tok_emb``."""
         return self.tiny_gpt.tok_emb
     def set_input_embeddings(self, value):
         """Replace ``self.tiny_gpt.tok_emb`` with ``value`` and call ``tie_weights()`` again."""
         self.tiny_gpt.tok_emb = value
         self.tie_weights()
     def get_output_embeddings(self):
         """Return the wrapped model's output projection, ``self.tiny_gpt.head``."""
         return self.tiny_gpt.head
     def set_output_embeddings(self, new_embeddings):
         """Replace ``self.tiny_gpt.head`` with ``new_embeddings``; ``tie_weights()`` is not called here."""
         self.tiny_gpt.head = new_embeddings
     def tie_weights(self, *args, **kwargs):
         """Make ``self.tiny_gpt.head`` use the same weight object as ``self.tiny_gpt.tok_emb``.
         Positional and keyword arguments are accepted and ignored.
         NOTE(review): the assignment is unconditional -- confirm whether a config
         flag such as ``tie_word_embeddings`` is meant to gate it.
         """
+        self.tiny_gpt.head.weight = self.tiny_gpt.tok_emb.weight
     def prepare_inputs_for_generation(self, input_ids, **kwargs):
         """Return ``{"input_ids": input_ids}``; every other keyword argument is dropped."""
         return {"input_ids": input_ids}
     def forward(self, input_ids, attention_mask=None, labels=None, return_dict=True, **kwargs):
+        hidden, logits = self.tiny_gpt(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            return_dict=True,
+            return_logits=True,
+        )
         loss = None
         if labels is not None:
             shift_logits = logits[:, :-1, :].contiguous()
             loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)).float(), shift_labels.view(-1))
         if not return_dict:
             return ((loss, logits) if loss is not None else (logits,))
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=None,
+            hidden_states=None,
+            attentions=None,
+        )