omdeep22 committed on
Commit
44c2574
·
verified ·
1 Parent(s): 489d2f9

Upload folder using huggingface_hub

Browse files
__pycache__/configuration_konkan.cpython-312.pyc ADDED
Binary file (976 Bytes). View file
 
__pycache__/modeling_konkan.cpython-312.pyc ADDED
Binary file (9.9 kB). View file
 
config.json CHANGED
@@ -1,40 +1,15 @@
1
  {
2
- "add_cross_attention": false,
3
  "architectures": [
4
  "KonkanGPT"
5
  ],
6
- "auto_map": {
7
- "AutoConfig": "configuration_konkan.KonkanSmallConfig",
8
- "AutoModelForCausalLM": "modeling_konkan.KonkanGPT"
9
- },
10
- "bos_token_id": 0,
11
- "cross_attention_hidden_size": null,
12
  "d_ff": 3072,
13
  "d_model": 768,
14
- "decoder_start_token_id": null,
15
  "dropout": 0.1,
16
  "dtype": "float32",
17
- "eos_token_id": 2,
18
- "finetuning_task": null,
19
- "hidden_size": 768,
20
- "is_decoder": false,
21
- "max_len": 1024,
22
  "model_type": "konkangpt",
23
  "n_heads": 12,
24
  "n_layers": 12,
25
- "num_attention_heads": 12,
26
- "num_hidden_layers": 12,
27
- "pad_token_id": 1,
28
- "prefix": null,
29
- "pruned_heads": {},
30
- "sep_token_id": null,
31
- "task_specific_params": null,
32
- "tf_legacy_loss": false,
33
- "tie_encoder_decoder": false,
34
- "tie_word_embeddings": true,
35
- "tokenizer_class": null,
36
- "torchscript": false,
37
- "transformers_version": "5.2.0",
38
- "use_bfloat16": false,
39
- "vocab_size": 32002
40
  }
 
1
  {
 
2
  "architectures": [
3
  "KonkanGPT"
4
  ],
 
 
 
 
 
 
5
  "d_ff": 3072,
6
  "d_model": 768,
 
7
  "dropout": 0.1,
8
  "dtype": "float32",
9
+ "max_len": 2048,
 
 
 
 
10
  "model_type": "konkangpt",
11
  "n_heads": 12,
12
  "n_layers": 12,
13
+ "transformers_version": "4.57.1",
14
+ "vocab_size": 32000
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  }
configuration_konkan.py CHANGED
@@ -1,19 +1,10 @@
 
1
  from transformers import PretrainedConfig
2
 
3
  class KonkanSmallConfig(PretrainedConfig):
4
  model_type = "konkangpt"
5
-
6
- def __init__(
7
- self,
8
- vocab_size=32002, # Changed from 32000 to 32002
9
- d_model=768,
10
- n_layers=12,
11
- n_heads=12,
12
- d_ff=3072,
13
- max_len=1024,
14
- dropout=0.1,
15
- **kwargs
16
- ):
17
  super().__init__(**kwargs)
18
  self.vocab_size = vocab_size
19
  self.d_model = d_model
@@ -22,7 +13,3 @@ class KonkanSmallConfig(PretrainedConfig):
22
  self.d_ff = d_ff
23
  self.max_len = max_len
24
  self.dropout = dropout
25
-
26
- self.num_hidden_layers = n_layers
27
- self.hidden_size = d_model
28
- self.num_attention_heads = n_heads
 
1
+
2
  from transformers import PretrainedConfig
3
 
4
  class KonkanSmallConfig(PretrainedConfig):
5
  model_type = "konkangpt"
6
+ def __init__(self, vocab_size=32002, d_model=768, n_layers=12, n_heads=12,
7
+ d_ff=3072, max_len=2048, dropout=0.1, **kwargs):
 
 
 
 
 
 
 
 
 
 
8
  super().__init__(**kwargs)
9
  self.vocab_size = vocab_size
10
  self.d_model = d_model
 
13
  self.d_ff = d_ff
14
  self.max_len = max_len
15
  self.dropout = dropout
 
 
 
 
modeling_konkan.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import torch
2
  import torch.nn as nn
3
  import torch.nn.functional as F
@@ -9,11 +10,12 @@ class RotaryEmbedding(nn.Module):
9
  def __init__(self, dim, max_seq_len=2048):
10
  super().__init__()
11
  inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
12
- self.register_buffer("inv_freq", inv_freq)
13
 
14
  def forward(self, x, seq_len):
15
- t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
16
- freqs = torch.outer(t, self.inv_freq)
 
17
  emb = torch.cat((freqs, freqs), dim=-1)
18
  return emb.cos(), emb.sin()
19
 
@@ -31,7 +33,6 @@ class RMSNorm(nn.Module):
31
  super().__init__()
32
  self.eps = eps
33
  self.weight = nn.Parameter(torch.ones(dim))
34
-
35
  def forward(self, x):
36
  return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight
37
 
@@ -59,25 +60,23 @@ class KonkanBlock(nn.Module):
59
  residual = x
60
  x = self.input_layernorm(x)
61
  b, t, c = x.shape
62
-
63
  q = self.q_proj(x).reshape(b, t, self.n_heads, self.head_dim).transpose(1, 2)
64
  k = self.k_proj(x).reshape(b, t, self.n_heads, self.head_dim).transpose(1, 2)
65
  v = self.v_proj(x).reshape(b, t, self.n_heads, self.head_dim).transpose(1, 2)
66
-
67
  q = apply_rotary_pos_emb(q, cos, sin)
68
  k = apply_rotary_pos_emb(k, cos, sin)
69
 
 
 
 
70
  y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
71
  y = y.transpose(1, 2).contiguous().reshape(b, t, c)
72
-
73
  x = residual + self.o_proj(y)
74
  x = x + self.down_proj(self.act(self.gate_up_proj(self.post_attention_layernorm(x))))
75
  return x
76
 
77
  class KonkanGPT(PreTrainedModel):
78
  config_class = KonkanSmallConfig
79
- main_input_name = "input_ids"
80
-
81
  def __init__(self, config):
82
  super().__init__(config)
83
  self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
@@ -85,49 +84,19 @@ class KonkanGPT(PreTrainedModel):
85
  self.layers = nn.ModuleList([KonkanBlock(config) for _ in range(config.n_layers)])
86
  self.norm = RMSNorm(config.d_model)
87
  self.head = nn.Linear(config.d_model, config.vocab_size, bias=False)
88
-
89
  self.post_init()
90
- self.tie_weights()
91
-
92
- def get_input_embeddings(self):
93
- return self.token_emb
94
 
95
- def set_input_embeddings(self, value):
96
- self.token_emb = value
97
-
98
- def get_output_embeddings(self):
99
- return self.head
100
-
101
- def set_output_embeddings(self, new_embeddings):
102
- self.head = new_embeddings
103
-
104
- def tie_weights(self, **kwargs): # Added **kwargs to catch extra arguments
105
- """Standard HF method to link embeddings and head weights."""
106
- if hasattr(self, "token_emb") and hasattr(self, "head"):
107
- self.head.weight = self.token_emb.weight
108
-
109
- def forward(self, input_ids, labels=None, attention_mask=None, **kwargs):
110
  b, t = input_ids.shape
111
  cos, sin = self.rope(input_ids, t)
112
-
113
  mask = torch.tril(torch.ones(t, t, device=input_ids.device)).view(1, 1, t, t).bool()
114
- if attention_mask is not None:
115
- mask = mask & attention_mask.view(b, 1, 1, t).bool()
116
-
117
  x = self.token_emb(input_ids)
118
  for layer in self.layers:
119
  x = layer(x, cos, sin, mask)
120
-
121
  logits = self.head(self.norm(x))
122
-
123
  loss = None
124
  if labels is not None:
125
  shift_logits = logits[..., :-1, :].contiguous()
126
  shift_labels = labels[..., 1:].contiguous()
127
  loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
128
-
129
  return CausalLMOutput(loss=loss, logits=logits)
130
-
131
- def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
132
- # Important: Since we don't use KV Cache, we always send the full input_ids
133
- return {"input_ids": input_ids, "attention_mask": attention_mask}
 
1
+
2
  import torch
3
  import torch.nn as nn
4
  import torch.nn.functional as F
 
10
  def __init__(self, dim, max_seq_len=2048):
11
  super().__init__()
12
  inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
13
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
14
 
15
  def forward(self, x, seq_len):
16
+ device = x.device
17
+ t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
18
+ freqs = torch.outer(t, self.inv_freq.to(device))
19
  emb = torch.cat((freqs, freqs), dim=-1)
20
  return emb.cos(), emb.sin()
21
 
 
33
  super().__init__()
34
  self.eps = eps
35
  self.weight = nn.Parameter(torch.ones(dim))
 
36
  def forward(self, x):
37
  return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight
38
 
 
60
  residual = x
61
  x = self.input_layernorm(x)
62
  b, t, c = x.shape
 
63
  q = self.q_proj(x).reshape(b, t, self.n_heads, self.head_dim).transpose(1, 2)
64
  k = self.k_proj(x).reshape(b, t, self.n_heads, self.head_dim).transpose(1, 2)
65
  v = self.v_proj(x).reshape(b, t, self.n_heads, self.head_dim).transpose(1, 2)
 
66
  q = apply_rotary_pos_emb(q, cos, sin)
67
  k = apply_rotary_pos_emb(k, cos, sin)
68
 
69
+ # DTYPE FIX
70
+ q, k = q.to(v.dtype), k.to(v.dtype)
71
+
72
  y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
73
  y = y.transpose(1, 2).contiguous().reshape(b, t, c)
 
74
  x = residual + self.o_proj(y)
75
  x = x + self.down_proj(self.act(self.gate_up_proj(self.post_attention_layernorm(x))))
76
  return x
77
 
78
  class KonkanGPT(PreTrainedModel):
79
  config_class = KonkanSmallConfig
 
 
80
  def __init__(self, config):
81
  super().__init__(config)
82
  self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
 
84
  self.layers = nn.ModuleList([KonkanBlock(config) for _ in range(config.n_layers)])
85
  self.norm = RMSNorm(config.d_model)
86
  self.head = nn.Linear(config.d_model, config.vocab_size, bias=False)
 
87
  self.post_init()
 
 
 
 
88
 
89
+ def forward(self, input_ids, labels=None, **kwargs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  b, t = input_ids.shape
91
  cos, sin = self.rope(input_ids, t)
 
92
  mask = torch.tril(torch.ones(t, t, device=input_ids.device)).view(1, 1, t, t).bool()
 
 
 
93
  x = self.token_emb(input_ids)
94
  for layer in self.layers:
95
  x = layer(x, cos, sin, mask)
 
96
  logits = self.head(self.norm(x))
 
97
  loss = None
98
  if labels is not None:
99
  shift_logits = logits[..., :-1, :].contiguous()
100
  shift_labels = labels[..., 1:].contiguous()
101
  loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
 
102
  return CausalLMOutput(loss=loss, logits=logits)
 
 
 
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:488d504c8ba2e2bb4306639d591345b3ed5360f79a15ad5bbf75e0165357e612
3
- size 551408560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31377ee6924efa41f711a77bbd3d64a6176aca196c165db9c71e0d7260f74dd1
3
+ size 649700976
special_tokens_map.json CHANGED
@@ -26,9 +26,5 @@
26
  "normalized": false,
27
  "rstrip": false,
28
  "single_word": false
29
- },
30
- "additional_special_tokens": [
31
- "<|user|>",
32
- "<|assistant|>"
33
- ]
34
- }
 
26
  "normalized": false,
27
  "rstrip": false,
28
  "single_word": false
29
+ }
30
+ }
 
 
 
 
tokenizer.json CHANGED
@@ -56,24 +56,6 @@
56
  "rstrip": false,
57
  "normalized": false,
58
  "special": true
59
- },
60
- {
61
- "id": 32000,
62
- "content": "<|user|>",
63
- "single_word": false,
64
- "lstrip": false,
65
- "rstrip": false,
66
- "normalized": false,
67
- "special": true
68
- },
69
- {
70
- "id": 32001,
71
- "content": "<|assistant|>",
72
- "single_word": false,
73
- "lstrip": false,
74
- "rstrip": false,
75
- "normalized": false,
76
- "special": true
77
  }
78
  ],
79
  "normalizer": null,
 
56
  "rstrip": false,
57
  "normalized": false,
58
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  }
60
  ],
61
  "normalizer": null,
tokenizer_config.json CHANGED
@@ -1,11 +1,60 @@
1
  {
2
- "backend": "tokenizers",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "bos_token": "<s>",
4
  "clean_up_tokenization_spaces": false,
5
  "eos_token": "</s>",
6
- "is_local": true,
7
- "model_max_length": 1024,
8
  "pad_token": "<pad>",
9
- "tokenizer_class": "TokenizersBackend",
10
  "unk_token": "[UNK]"
11
  }
 
1
  {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[INST]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[/INST]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
  "bos_token": "<s>",
53
  "clean_up_tokenization_spaces": false,
54
  "eos_token": "</s>",
55
+ "extra_special_tokens": {},
56
+ "model_max_length": 1000000000000000019884624838656,
57
  "pad_token": "<pad>",
58
+ "tokenizer_class": "PreTrainedTokenizerFast",
59
  "unk_token": "[UNK]"
60
  }