Upload folder using huggingface_hub

- __pycache__/configuration_konkan.cpython-312.pyc +0 -0
- __pycache__/modeling_konkan.cpython-312.pyc +0 -0
- config.json +3 -28
- configuration_konkan.py +3 -16
- modeling_konkan.py +9 -40
- pytorch_model.bin +2 -2
- special_tokens_map.json +2 -6
- tokenizer.json +0 -18
- tokenizer_config.json +53 -4
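
A commit with this title is what huggingface_hub produces when a local checkpoint directory is pushed with HfApi.upload_folder. A minimal sketch; the local path and repo id below are placeholders, not taken from this commit:

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./konkan-gpt",          # local model directory (placeholder)
    repo_id="your-username/KonkanGPT",   # target model repo (placeholder)
    commit_message="Upload folder using huggingface_hub",
)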

__pycache__/configuration_konkan.cpython-312.pyc ADDED
Binary file (976 Bytes)

__pycache__/modeling_konkan.cpython-312.pyc ADDED
Binary file (9.9 kB)

config.json CHANGED
@@ -1,40 +1,15 @@
 {
-  "add_cross_attention": false,
   "architectures": [
     "KonkanGPT"
   ],
-  "auto_map": {
-    "AutoConfig": "configuration_konkan.KonkanSmallConfig",
-    "AutoModelForCausalLM": "modeling_konkan.KonkanGPT"
-  },
-  "bos_token_id": 0,
-  "cross_attention_hidden_size": null,
   "d_ff": 3072,
   "d_model": 768,
-  "decoder_start_token_id": null,
   "dropout": 0.1,
   "dtype": "float32",
-  "
-  "finetuning_task": null,
-  "hidden_size": 768,
-  "is_decoder": false,
-  "max_len": 1024,
+  "max_len": 2048,
   "model_type": "konkangpt",
   "n_heads": 12,
   "n_layers": 12,
-  "
-  "
-  "pad_token_id": 1,
-  "prefix": null,
-  "pruned_heads": {},
-  "sep_token_id": null,
-  "task_specific_params": null,
-  "tf_legacy_loss": false,
-  "tie_encoder_decoder": false,
-  "tie_word_embeddings": true,
-  "tokenizer_class": null,
-  "torchscript": false,
-  "transformers_version": "5.2.0",
-  "use_bfloat16": false,
-  "vocab_size": 32002
+  "transformers_version": "4.57.1",
+  "vocab_size": 32000
 }
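
With "auto_map" removed from config.json, AutoConfig and AutoModelForCausalLM can no longer resolve the custom classes from the repo via trust_remote_code, so the config class has to be imported (or registered) from the local configuration_konkan.py. A minimal sketch, assuming this repo's files sit in the working directory:

from transformers import AutoConfig
from configuration_konkan import KonkanSmallConfig

AutoConfig.register("konkangpt", KonkanSmallConfig)      # optional: map model_type -> class
config = KonkanSmallConfig.from_pretrained(".")          # reads the trimmed config.json
print(config.d_model, config.n_layers, config.max_len)   # 768 12 2048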

configuration_konkan.py CHANGED
@@ -1,19 +1,10 @@
+
 from transformers import PretrainedConfig
 
 class KonkanSmallConfig(PretrainedConfig):
     model_type = "konkangpt"
-
-    def __init__(
-        self,
-        vocab_size=32002,  # Changed from 32000 to 32002
-        d_model=768,
-        n_layers=12,
-        n_heads=12,
-        d_ff=3072,
-        max_len=1024,
-        dropout=0.1,
-        **kwargs
-    ):
+    def __init__(self, vocab_size=32002, d_model=768, n_layers=12, n_heads=12,
+                 d_ff=3072, max_len=2048, dropout=0.1, **kwargs):
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.d_model = d_model
@@ -22,7 +13,3 @@ class KonkanSmallConfig(PretrainedConfig):
         self.d_ff = d_ff
         self.max_len = max_len
         self.dropout = dropout
-
-        self.num_hidden_layers = n_layers
-        self.hidden_size = d_model
-        self.num_attention_heads = n_heads
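
Note that the compacted __init__ still defaults vocab_size to 32002 while the new config.json ships 32000; keys present in config.json override the constructor defaults on from_pretrained(). A quick check, assuming configuration_konkan.py is importable:

from configuration_konkan import KonkanSmallConfig

default_cfg = KonkanSmallConfig()
print(default_cfg.vocab_size, default_cfg.max_len)   # 32002 2048 (constructor defaults)

repo_cfg = KonkanSmallConfig.from_pretrained(".")
print(repo_cfg.vocab_size, repo_cfg.max_len)         # 32000 2048 (values from config.json win)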

modeling_konkan.py CHANGED
@@ -1,3 +1,4 @@
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -9,11 +10,12 @@ class RotaryEmbedding(nn.Module):
     def __init__(self, dim, max_seq_len=2048):
         super().__init__()
         inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
-        self.register_buffer("inv_freq", inv_freq)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
 
     def forward(self, x, seq_len):
-
-
+        device = x.device
+        t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(t, self.inv_freq.to(device))
         emb = torch.cat((freqs, freqs), dim=-1)
         return emb.cos(), emb.sin()
 
@@ -31,7 +33,6 @@ class RMSNorm(nn.Module):
         super().__init__()
         self.eps = eps
         self.weight = nn.Parameter(torch.ones(dim))
-
     def forward(self, x):
         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight
 
@@ -59,25 +60,23 @@ class KonkanBlock(nn.Module):
         residual = x
         x = self.input_layernorm(x)
         b, t, c = x.shape
-
         q = self.q_proj(x).reshape(b, t, self.n_heads, self.head_dim).transpose(1, 2)
         k = self.k_proj(x).reshape(b, t, self.n_heads, self.head_dim).transpose(1, 2)
         v = self.v_proj(x).reshape(b, t, self.n_heads, self.head_dim).transpose(1, 2)
-
         q = apply_rotary_pos_emb(q, cos, sin)
         k = apply_rotary_pos_emb(k, cos, sin)
 
+        # DTYPE FIX
+        q, k = q.to(v.dtype), k.to(v.dtype)
+
         y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
         y = y.transpose(1, 2).contiguous().reshape(b, t, c)
-
         x = residual + self.o_proj(y)
         x = x + self.down_proj(self.act(self.gate_up_proj(self.post_attention_layernorm(x))))
         return x
 
 class KonkanGPT(PreTrainedModel):
     config_class = KonkanSmallConfig
-    main_input_name = "input_ids"
-
     def __init__(self, config):
         super().__init__(config)
         self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
@@ -85,49 +84,19 @@ class KonkanGPT(PreTrainedModel):
         self.layers = nn.ModuleList([KonkanBlock(config) for _ in range(config.n_layers)])
         self.norm = RMSNorm(config.d_model)
         self.head = nn.Linear(config.d_model, config.vocab_size, bias=False)
-
         self.post_init()
-        self.tie_weights()
-
-    def get_input_embeddings(self):
-        return self.token_emb
 
-    def set_input_embeddings(self, value):
-        self.token_emb = value
-
-    def get_output_embeddings(self):
-        return self.head
-
-    def set_output_embeddings(self, new_embeddings):
-        self.head = new_embeddings
-
-    def tie_weights(self, **kwargs):  # Added **kwargs to catch extra arguments
-        """Standard HF method to link embeddings and head weights."""
-        if hasattr(self, "token_emb") and hasattr(self, "head"):
-            self.head.weight = self.token_emb.weight
-
-    def forward(self, input_ids, labels=None, attention_mask=None, **kwargs):
+    def forward(self, input_ids, labels=None, **kwargs):
         b, t = input_ids.shape
         cos, sin = self.rope(input_ids, t)
-
         mask = torch.tril(torch.ones(t, t, device=input_ids.device)).view(1, 1, t, t).bool()
-        if attention_mask is not None:
-            mask = mask & attention_mask.view(b, 1, 1, t).bool()
-
         x = self.token_emb(input_ids)
         for layer in self.layers:
             x = layer(x, cos, sin, mask)
-
         logits = self.head(self.norm(x))
-
         loss = None
         if labels is not None:
             shift_logits = logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
             loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-
         return CausalLMOutput(loss=loss, logits=logits)
-
-    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
-        # Important: Since we don't use KV Cache, we always send the full input_ids
-        return {"input_ids": input_ids, "attention_mask": attention_mask}
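
The new cast keeps q and k in v's dtype before scaled_dot_product_attention (the float32 RoPE cos/sin would otherwise upcast them under fp16/bf16), and the simplified forward builds a causal mask and returns a CausalLMOutput with an optional shifted cross-entropy loss. A minimal forward-pass sketch, assuming the repo files are importable and the checkpoint's shapes match the config:

import torch
from configuration_konkan import KonkanSmallConfig
from modeling_konkan import KonkanGPT

config = KonkanSmallConfig.from_pretrained(".")
model = KonkanGPT.from_pretrained(".", config=config).eval()

input_ids = torch.randint(0, config.vocab_size, (1, 16))   # dummy token ids
with torch.no_grad():
    out = model(input_ids, labels=input_ids)                # loss + logits
print(out.logits.shape, out.loss)                           # (1, 16, vocab_size), scalar loss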

pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:31377ee6924efa41f711a77bbd3d64a6176aca196c165db9c71e0d7260f74dd1
+size 649700976
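
As a rough consistency check, 649,700,976 bytes is about what ~162M float32 parameters occupy for this configuration, if the feed-forward block uses a fused gate_up_proj of width 2*d_ff and the output head is untied; those shapes are assumptions, since the full block definitions are not shown in this diff:

# Back-of-the-envelope parameter count (assumed layer shapes, float32 = 4 bytes/param).
d_model, d_ff, n_layers, vocab = 768, 3072, 12, 32000
attn = 4 * d_model * d_model                  # q, k, v, o projections
mlp = d_model * 2 * d_ff + d_ff * d_model     # fused gate_up_proj + down_proj (assumed)
emb = vocab * d_model                         # token embedding
head = vocab * d_model                        # untied LM head (assumed)
total = n_layers * (attn + mlp) + emb + head
print(total, total * 4)   # ~162.4M params, ~649.6 MB; remainder is norms, buffers, file overhead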

special_tokens_map.json CHANGED
@@ -26,9 +26,5 @@
     "normalized": false,
     "rstrip": false,
     "single_word": false
-  },
-  "additional_special_tokens": [
-    "<|user|>",
-    "<|assistant|>"
-  ]
-}
+  }
+}

tokenizer.json CHANGED
@@ -56,24 +56,6 @@
     "rstrip": false,
     "normalized": false,
     "special": true
-    },
-    {
-      "id": 32000,
-      "content": "<|user|>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 32001,
-      "content": "<|assistant|>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
     }
   ],
   "normalizer": null,

tokenizer_config.json CHANGED
@@ -1,11 +1,60 @@
 {
-  "
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[INST]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "[/INST]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
-  "
-  "model_max_length": 
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
-  "tokenizer_class": "
+  "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "[UNK]"
 }
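
After this change the chat-role tokens <|user|> and <|assistant|> are gone from tokenizer.json and special_tokens_map.json, the remaining six special tokens are declared in added_tokens_decoder, and the class is PreTrainedTokenizerFast, which lines the vocabulary up with the config's vocab_size of 32000. A small sanity check, assuming the updated files are loaded from this repo:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")   # resolves to PreTrainedTokenizerFast via tokenizer_config.json
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)   # <s> </s> <pad> [UNK]
print(len(tok))                                                     # expected 32000 after the removal
print(tok.convert_tokens_to_ids(["[INST]", "[/INST]"]))             # expected [4, 5] per added_tokens_decoder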