Upload Gonyai-TEO2 — Konkani language model (251M)

Browse files

Files changed (7) hide show

README.md +62 -0
chat_template.jinja +6 -0
config.json +23 -0
modeling_gonyai.py +373 -0
pytorch_model.bin +3 -0
tokenizer.json +0 -0
tokenizer_config.json +11 -0

README.md ADDED Viewed

	@@ -0,0 +1,62 @@

+---
+language:
+- kok
+tags:
+- konkani
+- goa
+- causal-lm
+- text-generation
+license: mit
+---
+# Gonyai-TEO2 — Konkani Language Model
+**Gonyai** (गोण्याय) is a Konkani AI assistant trained on Goan culture,
+history, and the Konkani language (Goan dialect, Devanagari script).
+## Quick Start
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_id  = "omdeep22/Gonyai-teo2"
+tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+model     = AutoModelForCausalLM.from_pretrained(
+                model_id, trust_remote_code=True).to("cuda")
+response = model.chat(tokenizer, "गोंयच्या निसर्गाविशीं एक ओळ बरय.")
+print(response)
+```
+## Multi-turn Conversation
+```python
+messages = [
+    {"role": "user",      "content": "गोंयचें फेमस जेवण कितें?"},
+    {"role": "assistant", "content": "शित-कडी, मासळें कालवण, बेबिंका आनी सोलकडी."},
+    {"role": "user",      "content": "बेबिंका कशी करतात?"},
+]
+response = model.chat(tokenizer, messages)
+print(response)
+```
+## Reading Comprehension / RAG
+```python
+passage  = "गोंयांत काजूची लागवड खूब जाता. काजूपासून फेणी तयार करतात."
+question = "काजूपासून कितें तयार करतात?"
+prompt   = f"हो उतारो वाच:\n\n{passage}\n\nप्रस्न: {question}"
+response = model.chat(tokenizer, prompt)
+print(response)  # → "फेणी"
+```
+## Parameters
+| Property | Value |
+|--|--|
+| Architecture | KonkanGPT (RoPE + RMSNorm + SwiGLU) |
+| Parameters | ~251M |
+| Layers | 24 transformer blocks |
+| Context | 4096 tokens |
+| Vocabulary | 32,000 (custom Konkani BPE) |
+| Language | Konkani, Goan dialect, Devanagari |

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,6 @@

+{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>
+' + message['content'] + '
+' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>
+' + message['content'] + '
+' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>
+' }}{% endif %}

config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "architectures": [
+    "KonkanGPT"
+  ],
+  "model_type": "konkangpt",
+  "auto_map": {
+    "AutoConfig": "modeling_gonyai.KonkanGPTConfig",
+    "AutoModelForCausalLM": "modeling_gonyai.KonkanGPT"
+  },
+  "vocab_size": 32000,
+  "d_model": 768,
+  "n_layers": 24,
+  "n_heads": 12,
+  "d_ff": 3072,
+  "max_len": 4096,
+  "hidden_size": 768,
+  "num_hidden_layers": 24,
+  "pad_token_id": 1,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.0"
+}

modeling_gonyai.py ADDED Viewed

	@@ -0,0 +1,373 @@

+"""
+Gonyai-TEO2 — KonkanGPT model class.
+Auto-loaded via trust_remote_code=True.
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    model_id  = "omdeep22/Gonyai-teo2"
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    model     = AutoModelForCausalLM.from_pretrained(
+                    model_id, trust_remote_code=True).to("cuda")
+    # Single turn
+    print(model.chat(tokenizer, "गोंय कसलें?"))
+    # Multi-turn
+    messages = [
+        {"role": "user",      "content": "गोंयचें जेवण कितें?"},
+        {"role": "assistant", "content": "शित-कडी, मासळें कालवण..."},
+        {"role": "user",      "content": "बेबिंका कशी करतात?"},
+    ]
+    print(model.chat(tokenizer, messages))
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutputWithPast
+# Chat-turn markers. _build_prompt writes each marker on its own line,
+# followed by that turn's content; chat() stops generating at USER_TOK.
+USER_TOK = "<|user|>"
+ASST_TOK = "<|assistant|>"
+class KonkanGPTConfig(PretrainedConfig):
+    model_type = "konkangpt"
+    def __init__(
+        self,
+        vocab_size   = 32000,
+        d_model      = 768,
+        n_layers     = 24,
+        n_heads      = 12,
+        d_ff         = 3072,
+        max_len      = 4096,
+        pad_token_id = 1,
+        bos_token_id = 1,
+        eos_token_id = 2,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+        self.vocab_size      = vocab_size
+        self.d_model         = d_model
+        self.n_layers        = n_layers
+        self.n_heads         = n_heads
+        self.d_ff            = d_ff
+        self.max_len         = max_len
+        self.hidden_size     = d_model       # HF alias
+        self.num_hidden_layers = n_layers    # HF alias
+class RotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_seq_len=4096):
+        super().__init__()
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+    def forward(self, x, seq_len):
+        t     = torch.arange(seq_len, device=x.device,
+                             dtype=self.inv_freq.dtype)
+        freqs = torch.outer(t, self.inv_freq)
+        emb   = torch.cat((freqs, freqs), dim=-1)
+        return emb.cos(), emb.sin()
+def rotate_half(x):
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rope(x, cos, sin):
+    cos = cos[:x.shape[-2], :].unsqueeze(0).unsqueeze(0)
+    sin = sin[:x.shape[-2], :].unsqueeze(0).unsqueeze(0)
+    return (x * cos) + (rotate_half(x) * sin)
+class RMSNorm(nn.Module):
+    def __init__(self, dim, eps=1e-6):
+        super().__init__()
+        self.eps    = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def forward(self, x):
+        return (x * torch.rsqrt(
+            x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight)
+class SwiGLU(nn.Module):
+    def forward(self, x):
+        x, gate = x.chunk(2, dim=-1)
+        return F.silu(gate) * x
+class KonkanBlock(nn.Module):
+    def __init__(self, d_model, n_heads, d_ff):
+        super().__init__()
+        self.n_heads  = n_heads
+        self.head_dim = d_model // n_heads
+        self.q_proj       = nn.Linear(d_model, d_model,  bias=False)
+        self.k_proj       = nn.Linear(d_model, d_model,  bias=False)
+        self.v_proj       = nn.Linear(d_model, d_model,  bias=False)
+        self.o_proj       = nn.Linear(d_model, d_model,  bias=False)
+        self.gate_up_proj = nn.Linear(d_model, 2 * d_ff, bias=False)
+        self.down_proj    = nn.Linear(d_ff,    d_model,  bias=False)
+        self.input_layernorm          = RMSNorm(d_model)
+        self.post_attention_layernorm = RMSNorm(d_model)
+        self.act = SwiGLU()
+    def forward(self, x, cos, sin, mask):
+        r = x
+        x = self.input_layernorm(x)
+        b, t, c = x.shape
+        q = self.q_proj(x).reshape(
+            b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        k = self.k_proj(x).reshape(
+            b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        v = self.v_proj(x).reshape(
+            b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        q, k = apply_rope(q, cos, sin), apply_rope(k, cos, sin)
+        y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
+        x = r + self.o_proj(
+            y.transpose(1, 2).contiguous().reshape(b, t, c))
+        return x + self.down_proj(
+            self.act(self.gate_up_proj(
+                self.post_attention_layernorm(x))))
+class KonkanGPT(PreTrainedModel):
+    """
+    Gonyai-TEO2 — Konkani language model.
+    Compatible with AutoModelForCausalLM via trust_remote_code=True.
+    """
+    config_class = KonkanGPTConfig
+    base_model_prefix = ""
+    supports_gradient_checkpointing = False
+    # Tells HF which weight is tied — prevents "missing key" warnings
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config: KonkanGPTConfig):
+        super().__init__(config)
+        self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
+        self.rope      = RotaryEmbedding(
+            config.d_model // config.n_heads, config.max_len)
+        self.layers    = nn.ModuleList([
+            KonkanBlock(config.d_model, config.n_heads, config.d_ff)
+            for _ in range(config.n_layers)
+        ])
+        self.norm    = RMSNorm(config.d_model)
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+        # post_init() deliberately NOT called — weights come from checkpoint
+    def _init_weights(self, module):
+        """No-op — preserves loaded weights, prevents random re-init."""
+        pass
+    def tie_weights(self, missing_keys=None, recompute_mapping=False):
+        """
+        Accept any kwargs transformers passes — signature varies by version.
+        Newer transformers (4.40+) calls:
+            tie_weights(missing_keys=[...], recompute_mapping=False)
+        Older transformers calls:
+            tie_weights()
+        Both work with **kwargs.
+        """
+        self.lm_head.weight = self.token_emb.weight
+    @property
+    def all_tied_weights_keys(self):
+        """
+        transformers >= 4.38 calls .keys() and .update() on this.
+        Must be a dict: {tied_key: canonical_key}
+        """
+        if not hasattr(self, "_all_tied_weights_keys_dict"):
+            self._all_tied_weights_keys_dict = {
+                "lm_head.weight": "token_emb.weight"
+            }
+        return self._all_tied_weights_keys_dict
+    @all_tied_weights_keys.setter
+    def all_tied_weights_keys(self, value):
+        """HF may set this to a set or dict depending on version."""
+        if isinstance(value, dict):
+            self._all_tied_weights_keys_dict = value
+        elif hasattr(value, "__iter__"):
+            # set, list, etc → convert to dict
+            self._all_tied_weights_keys_dict = {
+                k: "token_emb.weight" for k in value
+            }
+        else:
+            self._all_tied_weights_keys_dict = {
+                "lm_head.weight": "token_emb.weight"
+            }
+    def set_use_kernels(self, use_kernels=False, kernel_config=None):
+        """
+        Called by transformers 4.40+ after loading.
+        No-op for custom models.
+        """
+        pass
+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        """
+        Required by GenerationMixin (added automatically by HF).
+        Returns minimal dict for our simple causal LM.
+        """
+        return {"input_ids": input_ids}
+    def get_input_embeddings(self):
+        return self.token_emb
+    def set_input_embeddings(self, value):
+        self.token_emb = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, value):
+        self.lm_head = value
+    def can_generate(self):
+        """Tells HF this model supports .generate()"""
+        return True
+    def forward(self, input_ids=None, attention_mask=None,
+                labels=None, **kwargs):
+        b, t = input_ids.shape
+        cos, sin = self.rope(input_ids, t)
+        mask = (torch.tril(torch.ones(t, t, device=input_ids.device))
+                .view(1, 1, t, t).bool())
+        x = self.token_emb(input_ids)
+        for layer in self.layers:
+            x = layer(x, cos, sin, mask)
+        logits = self.lm_head(self.norm(x))
+        loss = None
+        if labels is not None:
+            loss = F.cross_entropy(
+                logits[:, :-1].reshape(-1, logits.size(-1)),
+                labels[:, 1:].reshape(-1),
+                ignore_index=-100,
+            )
+        return CausalLMOutputWithPast(loss=loss, logits=logits)
+    def _build_prompt(self, messages):
+        """
+        Build prompt string from:
+          str       — plain question (wrapped as user turn)
+                       OR pre-formatted string (used as-is)
+          list[dict]— multi-turn: [{"role": "user"|"assistant",
+                                     "content": "..."}]
+        """
+        if isinstance(messages, str):
+            # Already formatted → use as-is
+            if USER_TOK in messages:
+                return messages
+            # Plain string → single user turn
+            return f"{USER_TOK}\n{messages}\n{ASST_TOK}\n"
+        if isinstance(messages, list):
+            prompt = ""
+            for msg in messages:
+                role    = msg.get("role", "user")
+                content = msg.get("content", "").strip()
+                if role == "user":
+                    prompt += f"{USER_TOK}\n{content}\n"
+                elif role == "assistant":
+                    # Include prior assistant turns as context
+                    prompt += f"{ASST_TOK}\n{content}\n"
+            # End with assistant token to trigger generation
+            if not prompt.rstrip().endswith(ASST_TOK):
+                prompt += f"{ASST_TOK}\n"
+            return prompt
+        raise ValueError(
+            f"messages must be str or list[dict], got {type(messages)}")
+    @torch.no_grad()
+    def chat(
+        self,
+        tokenizer,
+        messages,
+        max_new_tokens     = 300,
+        temperature        = 0.7,
+        top_p              = 0.9,
+        repetition_penalty = 1.3,
+    ):
+        """
+        Generate a Konkani response.
+        Args:
+            tokenizer         : the Gonyai tokenizer
+            messages          : str or list[dict]
+                                  str  → single turn question
+                                  list → multi-turn conversation
+            max_new_tokens    : max tokens to generate (default 300)
+            temperature       : sampling temperature (default 0.7)
+            top_p             : nucleus sampling (default 0.9)
+            repetition_penalty: reduces loops (default 1.3, 1.0=off)
+        Returns:
+            str: the assistant's response
+        """
+        self.eval()
+        device   = next(self.parameters()).device
+        eos_id   = tokenizer.eos_token_id
+        user_ids = tokenizer.encode(USER_TOK, add_special_tokens=False)
+        prompt = self._build_prompt(messages)
+        ids    = tokenizer.encode(prompt, return_tensors="pt").to(device)
+        out    = ids.clone()
+        n_in   = ids.shape[1]
+        for _ in range(max_new_tokens):
+            ctx    = out[:, -self.config.max_len:]
+            logits = self(ctx).logits[:, -1, :].clone()
+            # Repetition penalty (response tokens only)
+            if repetition_penalty != 1.0 and out.shape[1] > n_in:
+                for uid in out[0, n_in:].unique():
+                    if logits[0, uid] > 0:
+                        logits[0, uid] /= repetition_penalty
+                    else:
+                        logits[0, uid] *= repetition_penalty
+            logits = logits / max(temperature, 1e-8)
+            # Top-p nucleus sampling
+            sl, si   = torch.sort(logits, descending=True)
+            cp       = torch.cumsum(F.softmax(sl, dim=-1), dim=-1)
+            rm       = torch.zeros_like(cp, dtype=torch.bool)
+            rm[:, 1:]= cp[:, :-1] > top_p
+            sl       = sl.masked_fill(rm, -float("inf"))
+            orig     = torch.full_like(logits, -float("inf"))
+            orig.scatter_(1, si, sl)
+            probs    = F.softmax(orig, dim=-1)
+            next_tok = (
+                torch.multinomial(probs, 1)
+                if not (probs.isnan().any() or probs.sum() < 1e-6)
+                else logits.argmax(-1, keepdim=True)
+            )
+            tok_id = next_tok.item()
+            # Stop on EOS or new user turn
+            if tok_id == eos_id:
+                break
+            if user_ids and tok_id == user_ids[0]:
+                break
+            out = torch.cat([out, next_tok], dim=1)
+        response = tokenizer.decode(
+            out[0][n_in:], skip_special_tokens=True).strip()
+        # Strip leaked special tokens
+        for marker in [tokenizer.eos_token, USER_TOK, ASST_TOK]:
+            if marker and marker in response:
+                response = response.split(marker)[0].strip()
+        return response

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8c6720ddee71d3e998d6834b7b7f8c59c973f0ed13152f90b45de1e18c02a8e
+size 1102790067

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "is_local": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "[UNK]"
+}