omdeep22 committed on
Commit
895eea2
·
verified ·
1 Parent(s): 044a8f5

Upload Gonyai-TEO2 — Konkani language model (251M)

Browse files
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - kok
4
+ tags:
5
+ - konkani
6
+ - goa
7
+ - causal-lm
8
+ - text-generation
9
+ license: mit
10
+ ---
11
+
12
+ # Gonyai-TEO2 — Konkani Language Model
13
+
14
+ **Gonyai** (गोण्याय) is a Konkani AI assistant trained on Goan culture,
15
+ history, and the Konkani language (Goan dialect, Devanagari script).
16
+
17
+ ## Quick Start
18
+
19
+ ```python
20
+ from transformers import AutoModelForCausalLM, AutoTokenizer
21
+
22
+ model_id = "omdeep22/Gonyai-teo2"
23
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
24
+ model = AutoModelForCausalLM.from_pretrained(
25
+ model_id, trust_remote_code=True).to("cuda")
26
+
27
+ response = model.chat(tokenizer, "गोंयच्या निसर्गाविशीं एक ओळ बरय.")
28
+ print(response)
29
+ ```
30
+
31
+ ## Multi-turn Conversation
32
+
33
+ ```python
34
+ messages = [
35
+ {"role": "user", "content": "गोंयचें फेमस जेवण कितें?"},
36
+ {"role": "assistant", "content": "शित-कडी, मासळें कालवण, बेबिंका आनी सोलकडी."},
37
+ {"role": "user", "content": "बेबिंका कशी करतात?"},
38
+ ]
39
+ response = model.chat(tokenizer, messages)
40
+ print(response)
41
+ ```
42
+
43
+ ## Reading Comprehension / RAG
44
+
45
+ ```python
46
+ passage = "गोंयांत काजूची लागवड खूब जाता. काजूपासून फेणी तयार करतात."
47
+ question = "काजूपासून कितें तयार करतात?"
48
+ prompt = f"हो उतारो वाच:\n\n{passage}\n\nप्रस्न: {question}"
49
+ response = model.chat(tokenizer, prompt)
50
+ print(response) # → "फेणी"
51
+ ```
52
+
53
+ ## Parameters
54
+
55
+ | | |
56
+ |--|--|
57
+ | Architecture | KonkanGPT (RoPE + RMSNorm + SwiGLU) |
58
+ | Parameters | ~251M |
59
+ | Layers | 24 transformer blocks |
60
+ | Context | 4096 tokens |
61
+ | Vocabulary | 32,000 (custom Konkani BPE) |
62
+ | Language | Konkani, Goan dialect, Devanagari |
chat_template.jinja ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>
2
+ ' + message['content'] + '
3
+ ' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>
4
+ ' + message['content'] + '
5
+ ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>
6
+ ' }}{% endif %}
config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "KonkanGPT"
4
+ ],
5
+ "model_type": "konkangpt",
6
+ "auto_map": {
7
+ "AutoConfig": "modeling_gonyai.KonkanGPTConfig",
8
+ "AutoModelForCausalLM": "modeling_gonyai.KonkanGPT"
9
+ },
10
+ "vocab_size": 32000,
11
+ "d_model": 768,
12
+ "n_layers": 24,
13
+ "n_heads": 12,
14
+ "d_ff": 3072,
15
+ "max_len": 4096,
16
+ "hidden_size": 768,
17
+ "num_hidden_layers": 24,
18
+ "pad_token_id": 1,
19
+ "bos_token_id": 1,
20
+ "eos_token_id": 2,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.40.0"
23
+ }
modeling_gonyai.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gonyai-TEO2 — KonkanGPT model class.
3
+ Auto-loaded via trust_remote_code=True.
4
+
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
+
7
+ model_id = "omdeep22/Gonyai-teo2"
8
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
9
+ model = AutoModelForCausalLM.from_pretrained(
10
+ model_id, trust_remote_code=True).to("cuda")
11
+
12
+ # Single turn
13
+ print(model.chat(tokenizer, "गोंय कसलें?"))
14
+
15
+ # Multi-turn
16
+ messages = [
17
+ {"role": "user", "content": "गोंयचें जेवण कितें?"},
18
+ {"role": "assistant", "content": "शित-कडी, मासळें कालवण..."},
19
+ {"role": "user", "content": "बेबिंका कशी करतात?"},
20
+ ]
21
+ print(model.chat(tokenizer, messages))
22
+ """
23
+
24
+ import torch
25
+ import torch.nn as nn
26
+ import torch.nn.functional as F
27
+ from transformers import PreTrainedModel, PretrainedConfig
28
+ from transformers.modeling_outputs import CausalLMOutputWithPast
29
+
30
# Chat-format role markers. These strings must match the special tokens used
# by chat_template.jinja and the tokenizer; _build_prompt and chat() rely on
# them to delimit turns and to detect where a generated response should stop.
USER_TOK = "<|user|>"
ASST_TOK = "<|assistant|>"
32
+
33
+
34
class KonkanGPTConfig(PretrainedConfig):
    """Configuration for the KonkanGPT architecture (Gonyai-TEO2).

    Mirrors the fields shipped in config.json. ``hidden_size`` and
    ``num_hidden_layers`` duplicate ``d_model`` / ``n_layers`` under the
    attribute names that Hugging Face tooling conventionally looks for.
    """

    model_type = "konkangpt"

    def __init__(
        self,
        vocab_size=32000,
        d_model=768,
        n_layers=24,
        n_heads=12,
        d_ff=3072,
        max_len=4096,
        pad_token_id=1,
        bos_token_id=1,
        eos_token_id=2,
        **kwargs,
    ):
        # Special-token ids are routed through PretrainedConfig so that the
        # generation utilities can discover them.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        for attr, value in (
            ("vocab_size", vocab_size),
            ("d_model", d_model),
            ("n_layers", n_layers),
            ("n_heads", n_heads),
            ("d_ff", d_ff),
            ("max_len", max_len),
            ("hidden_size", d_model),          # HF alias
            ("num_hidden_layers", n_layers),   # HF alias
        ):
            setattr(self, attr, value)
64
+
65
+
66
class RotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE) angle table.

    ``forward`` returns (cos, sin) tensors of shape (seq_len, dim).
    ``max_seq_len`` is currently unused; it is kept so the constructor call
    in KonkanGPT.__init__ stays valid.
    """

    def __init__(self, dim, max_seq_len=4096):
        super().__init__()
        # Standard RoPE inverse frequencies: 1 / 10000^(2i/dim) for even i.
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer("inv_freq", 1.0 / (10000 ** exponents))

    def forward(self, x, seq_len):
        # `x` is consulted only for its device.
        positions = torch.arange(
            seq_len, device=x.device, dtype=self.inv_freq.dtype)
        # Outer product via broadcasting: (seq_len, 1) * (1, dim/2).
        angles = positions[:, None] * self.inv_freq[None, :]
        table = torch.cat([angles, angles], dim=-1)
        return table.cos(), table.sin()
78
+
79
+
80
def rotate_half(x):
    """Map the two halves (a, b) of the last dimension to (-b, a)."""
    first, second = torch.chunk(x, 2, dim=-1)
    return torch.cat((second.neg(), first), dim=-1)
83
+
84
+
85
def apply_rope(x, cos, sin):
    """Apply rotary position embedding to `x`.

    `x` is (batch, heads, seq, head_dim); `cos`/`sin` are (seq_len, head_dim)
    tables from RotaryEmbedding, truncated to x's sequence length and
    broadcast over the batch and head dimensions.
    """
    seq = x.shape[-2]
    cos_b = cos[:seq, :][None, None, :, :]
    sin_b = sin[:seq, :][None, None, :, :]
    # Half-rotation inlined: channel halves (a, b) -> (-b, a).
    a, b = x.chunk(2, dim=-1)
    rotated = torch.cat((-b, a), dim=-1)
    return x * cos_b + rotated * sin_b
89
+
90
+
91
class RMSNorm(nn.Module):
    """Root-mean-square layer norm: no mean subtraction, no bias term."""

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps  # numerical floor under the rsqrt
        self.weight = nn.Parameter(torch.ones(dim))  # per-channel gain

    def forward(self, x):
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        normed = x * torch.rsqrt(mean_sq + self.eps)
        return normed * self.weight
100
+
101
+
102
class SwiGLU(nn.Module):
    """Gated activation: split the last dim into (value, gate) halves and
    return value * silu(gate). Consumes the fused gate_up_proj output."""

    def forward(self, x):
        value, gate = torch.chunk(x, 2, dim=-1)
        return value * F.silu(gate)
106
+
107
+
108
class KonkanBlock(nn.Module):
    """Pre-norm transformer block: RoPE attention + SwiGLU MLP, each residual."""

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        # Attention projections (bias-free).
        self.q_proj = nn.Linear(d_model, d_model, bias=False)
        self.k_proj = nn.Linear(d_model, d_model, bias=False)
        self.v_proj = nn.Linear(d_model, d_model, bias=False)
        self.o_proj = nn.Linear(d_model, d_model, bias=False)
        # Fused gate+up projection feeding SwiGLU, then the down projection.
        self.gate_up_proj = nn.Linear(d_model, 2 * d_ff, bias=False)
        self.down_proj = nn.Linear(d_ff, d_model, bias=False)
        self.input_layernorm = RMSNorm(d_model)
        self.post_attention_layernorm = RMSNorm(d_model)
        self.act = SwiGLU()

    def forward(self, x, cos, sin, mask):
        residual = x
        normed = self.input_layernorm(x)
        batch, seq, width = normed.shape

        def to_heads(t):
            # (batch, seq, d_model) -> (batch, heads, seq, head_dim)
            return t.reshape(
                batch, seq, self.n_heads, self.head_dim).transpose(1, 2)

        # Rotary embedding applied to queries and keys only.
        q = apply_rope(to_heads(self.q_proj(normed)), cos, sin)
        k = apply_rope(to_heads(self.k_proj(normed)), cos, sin)
        v = to_heads(self.v_proj(normed))

        attn = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
        attn = attn.transpose(1, 2).contiguous().reshape(batch, seq, width)
        x = residual + self.o_proj(attn)

        # Second residual branch: RMSNorm -> fused gate/up -> SwiGLU -> down.
        mlp = self.down_proj(self.act(self.gate_up_proj(
            self.post_attention_layernorm(x))))
        return x + mlp
140
+
141
+
142
class KonkanGPT(PreTrainedModel):
    """
    Gonyai-TEO2 — Konkani causal language model.

    A decoder-only transformer (RoPE + RMSNorm + SwiGLU blocks) plus a set of
    small shims so it loads cleanly through
    ``AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True)``
    across several transformers versions.
    """
    config_class = KonkanGPTConfig
    base_model_prefix = ""
    supports_gradient_checkpointing = False

    # Tells HF which weight is tied — prevents "missing key" warnings
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: KonkanGPTConfig):
        super().__init__(config)
        self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
        self.rope = RotaryEmbedding(
            config.d_model // config.n_heads, config.max_len)
        self.layers = nn.ModuleList([
            KonkanBlock(config.d_model, config.n_heads, config.d_ff)
            for _ in range(config.n_layers)
        ])
        self.norm = RMSNorm(config.d_model)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        # post_init() deliberately NOT called — weights come from checkpoint

    def _init_weights(self, module):
        """No-op — preserves loaded weights, prevents random re-init."""
        pass

    def tie_weights(self, *args, **kwargs):
        """
        Tie lm_head to the input embedding.

        Accepts arbitrary positional/keyword arguments because the calling
        signature varies by transformers version:
          - 4.40+ calls tie_weights(missing_keys=[...], recompute_mapping=False)
          - older versions call tie_weights()
        (The previous hard-coded two-kwarg signature would break on any other
        variant; *args/**kwargs matches what the docstring always promised.)
        """
        self.lm_head.weight = self.token_emb.weight

    @property
    def all_tied_weights_keys(self):
        """
        transformers >= 4.38 calls .keys() and .update() on this.
        Must be a dict: {tied_key: canonical_key}
        """
        if not hasattr(self, "_all_tied_weights_keys_dict"):
            self._all_tied_weights_keys_dict = {
                "lm_head.weight": "token_emb.weight"
            }
        return self._all_tied_weights_keys_dict

    @all_tied_weights_keys.setter
    def all_tied_weights_keys(self, value):
        """HF may set this to a set or dict depending on version."""
        if isinstance(value, dict):
            self._all_tied_weights_keys_dict = value
        elif hasattr(value, "__iter__"):
            # set, list, etc → convert to dict
            self._all_tied_weights_keys_dict = {
                k: "token_emb.weight" for k in value
            }
        else:
            self._all_tied_weights_keys_dict = {
                "lm_head.weight": "token_emb.weight"
            }

    def set_use_kernels(self, use_kernels=False, kernel_config=None):
        """
        Called by transformers 4.40+ after loading.
        No-op for custom models.
        """
        pass

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """
        Required by GenerationMixin (added automatically by HF).
        Returns minimal dict for our simple causal LM (no KV cache).
        """
        return {"input_ids": input_ids}

    def get_input_embeddings(self):
        return self.token_emb

    def set_input_embeddings(self, value):
        self.token_emb = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, value):
        self.lm_head = value

    def can_generate(self):
        """Tells HF this model supports .generate()"""
        return True

    def forward(self, input_ids=None, attention_mask=None,
                labels=None, **kwargs):
        """
        Run the full decoder stack.

        Args:
            input_ids      : (batch, seq) token ids
            attention_mask : optional (batch, seq) 1/0 padding mask; combined
                             with the causal mask (previously it was silently
                             ignored, giving wrong results for padded batches)
            labels         : optional (batch, seq) targets; -100 is ignored

        Returns:
            CausalLMOutputWithPast with logits (batch, seq, vocab) and, when
            labels are given, the shifted cross-entropy loss.
        """
        b, t = input_ids.shape
        cos, sin = self.rope(input_ids, t)
        # Causal (lower-triangular) mask, broadcast over batch and heads.
        mask = (torch.tril(torch.ones(t, t, device=input_ids.device))
                .view(1, 1, t, t).bool())
        if attention_mask is not None:
            # Also block attention to padded key positions.
            mask = mask & attention_mask[:, None, None, :].bool()
        x = self.token_emb(input_ids)
        for layer in self.layers:
            x = layer(x, cos, sin, mask)
        logits = self.lm_head(self.norm(x))
        loss = None
        if labels is not None:
            # Standard next-token shift: position i predicts label i+1.
            loss = F.cross_entropy(
                logits[:, :-1].reshape(-1, logits.size(-1)),
                labels[:, 1:].reshape(-1),
                ignore_index=-100,
            )
        return CausalLMOutputWithPast(loss=loss, logits=logits)

    def _build_prompt(self, messages):
        """
        Build prompt string from:
            str       — plain question (wrapped as user turn)
                        OR pre-formatted string (used as-is)
            list[dict]— multi-turn: [{"role": "user"|"assistant",
                                      "content": "..."}]

        Raises:
            ValueError: if messages is neither str nor list.
        """
        if isinstance(messages, str):
            # Already formatted → use as-is
            if USER_TOK in messages:
                return messages
            # Plain string → single user turn
            return f"{USER_TOK}\n{messages}\n{ASST_TOK}\n"

        if isinstance(messages, list):
            prompt = ""
            for msg in messages:
                role = msg.get("role", "user")
                content = msg.get("content", "").strip()
                if role == "user":
                    prompt += f"{USER_TOK}\n{content}\n"
                elif role == "assistant":
                    # Include prior assistant turns as context
                    prompt += f"{ASST_TOK}\n{content}\n"
            # End with assistant token to trigger generation
            if not prompt.rstrip().endswith(ASST_TOK):
                prompt += f"{ASST_TOK}\n"
            return prompt

        raise ValueError(
            f"messages must be str or list[dict], got {type(messages)}")

    @torch.no_grad()
    def chat(
        self,
        tokenizer,
        messages,
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.3,
    ):
        """
        Generate a Konkani response.

        Args:
            tokenizer         : the Gonyai tokenizer
            messages          : str or list[dict]
                                str  → single turn question
                                list → multi-turn conversation
            max_new_tokens    : max tokens to generate (default 300)
            temperature       : sampling temperature (default 0.7)
            top_p             : nucleus sampling (default 0.9)
            repetition_penalty: reduces loops (default 1.3, 1.0=off)

        Returns:
            str: the assistant's response
        """
        self.eval()
        device = next(self.parameters()).device
        eos_id = tokenizer.eos_token_id
        user_ids = tokenizer.encode(USER_TOK, add_special_tokens=False)

        prompt = self._build_prompt(messages)
        ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
        out = ids.clone()
        n_in = ids.shape[1]

        for _ in range(max_new_tokens):
            # No KV cache: re-run the full (max_len-truncated) context.
            ctx = out[:, -self.config.max_len:]
            logits = self(ctx).logits[:, -1, :].clone()

            # Repetition penalty (response tokens only)
            if repetition_penalty != 1.0 and out.shape[1] > n_in:
                for uid in out[0, n_in:].unique():
                    if logits[0, uid] > 0:
                        logits[0, uid] /= repetition_penalty
                    else:
                        logits[0, uid] *= repetition_penalty

            logits = logits / max(temperature, 1e-8)

            # Top-p nucleus sampling: drop tokens past the cumulative-prob cut
            sl, si = torch.sort(logits, descending=True)
            cp = torch.cumsum(F.softmax(sl, dim=-1), dim=-1)
            rm = torch.zeros_like(cp, dtype=torch.bool)
            rm[:, 1:] = cp[:, :-1] > top_p
            sl = sl.masked_fill(rm, -float("inf"))
            orig = torch.full_like(logits, -float("inf"))
            orig.scatter_(1, si, sl)
            probs = F.softmax(orig, dim=-1)
            # Greedy fallback if the filtered distribution degenerates.
            next_tok = (
                torch.multinomial(probs, 1)
                if not (probs.isnan().any() or probs.sum() < 1e-6)
                else logits.argmax(-1, keepdim=True)
            )
            tok_id = next_tok.item()

            # Stop on EOS or new user turn (first sub-token of USER_TOK)
            if tok_id == eos_id:
                break
            if user_ids and tok_id == user_ids[0]:
                break

            out = torch.cat([out, next_tok], dim=1)

        response = tokenizer.decode(
            out[0][n_in:], skip_special_tokens=True).strip()

        # Strip leaked special tokens
        for marker in [tokenizer.eos_token, USER_TOK, ASST_TOK]:
            if marker and marker in response:
                response = response.split(marker)[0].strip()

        return response
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8c6720ddee71d3e998d6834b7b7f8c59c973f0ed13152f90b45de1e18c02a8e
3
+ size 1102790067
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "</s>",
6
+ "is_local": true,
7
+ "model_max_length": 1000000000000000019884624838656,
8
+ "pad_token": "<pad>",
9
+ "tokenizer_class": "TokenizersBackend",
10
+ "unk_token": "[UNK]"
11
+ }