Initial model, 10,000 epochs
Files changed:

- README.md (+55, -3)
- config.json (+29)
- configuration_tamil_tiny_stories.py (+45)
- generation_config.json (+10)
- model.safetensors (+3)
- modeling_tamil_tiny_stories.py (+151)
- tokenization_tamil_tiny_stories.py (+72)
- tokenizer_config.json (+50)
- vocab.json (+453)

README.md
CHANGED

---
language:
- ta
license: mit
tags:
- tamil
- tinystories
- character-level
- causal-lm
- transformers
pipeline_tag: text-generation
library_name: transformers
---

# Tamil Tiny Stories

This repository contains a Hugging Face-compatible export of a custom Tamil Tiny Stories character-level causal language model.

## Model details

- Architecture: custom decoder-only transformer
- Tokenization: character-level
- Training data source: `neuralnets/multilingual-tinystories` Tamil split (`ta`)
- Original checkpoint format: PyTorch `.pth`
- Export format: Hugging Face Transformers with custom remote code

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "senthil090/tamil-tiny-stories"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

inputs = tokenizer("ஒரு நாள்", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
```

## Notes

- This model uses custom architecture files, so `trust_remote_code=True` is required.
- The tokenizer is character-based and may emit `<bos>` / `<eos>` tokens in raw decoded output; see the decoding snippet after this list.
- The exported vocabulary was reconstructed from the Tamil split of the source dataset, matching the training script.

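Since every encoded sequence is wrapped in `<bos>`/`<eos>`, a raw `decode` keeps those markers in the text. A minimal way to drop them, continuing the usage snippet above (`skip_special_tokens` is standard `transformers` decoding behavior, nothing specific to this export):

```python
# Decode without the <bos>/<eos>/<pad> markers.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
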
## Files

- `config.json`
- `model.safetensors`
- `configuration_tamil_tiny_stories.py`
- `modeling_tamil_tiny_stories.py`
- `tokenization_tamil_tiny_stories.py`
- `tokenizer_config.json`
- `vocab.json`

config.json
ADDED

{
  "architectures": [
    "TamilTinyStoriesForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "configuration_tamil_tiny_stories.TamilTinyStoriesConfig",
    "AutoModelForCausalLM": "modeling_tamil_tiny_stories.TamilTinyStoriesForCausalLM"
  },
  "block_size": 128,
  "bos_token_id": 448,
  "dropout": 0.0,
  "dtype": "float32",
  "eos_token_id": 449,
  "hidden_size": 128,
  "is_decoder": true,
  "max_position_embeddings": 128,
  "model_type": "tamil_tiny_stories",
  "n_embd": 128,
  "n_head": 4,
  "n_layer": 4,
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "original_vocab_size": 447,
  "pad_token_id": 447,
  "transformers_version": "5.3.0",
  "unk_token_id": 450,
  "use_cache": false,
  "vocab_size": 451
}

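The config pins the id layout: 447 corpus characters (`original_vocab_size`) plus `<pad>` = 447, `<bos>` = 448, `<eos>` = 449 and `<unk>` = 450, for a total `vocab_size` of 451. A quick sketch of loading and checking it, assuming the `senthil090/tamil-tiny-stories` repo id from the README:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("senthil090/tamil-tiny-stories", trust_remote_code=True)

# 447 base characters + 4 special tokens = 451 ids.
assert config.vocab_size == config.original_vocab_size + 4
print(config.n_layer, config.n_head, config.n_embd, config.block_size)  # 4 4 128 128
```
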
configuration_tamil_tiny_stories.py
ADDED

from transformers import PretrainedConfig


class TamilTinyStoriesConfig(PretrainedConfig):
    model_type = "tamil_tiny_stories"

    def __init__(
        self,
        vocab_size=0,
        original_vocab_size=None,
        block_size=128,
        n_embd=128,
        n_head=4,
        n_layer=4,
        dropout=0.0,
        bos_token_id=None,
        eos_token_id=None,
        pad_token_id=None,
        unk_token_id=None,
        use_cache=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.original_vocab_size = original_vocab_size if original_vocab_size is not None else vocab_size
        self.block_size = block_size
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_layer = n_layer
        self.dropout = dropout
        self.hidden_size = n_embd
        self.num_attention_heads = n_head
        self.num_hidden_layers = n_layer
        self.max_position_embeddings = block_size
        self.use_cache = use_cache
        self.is_decoder = True
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            unk_token_id=unk_token_id,
            **kwargs,
        )


TamilTinyStoriesConfig.register_for_auto_class()

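The config keeps the compact training-script hyperparameter names (`n_embd`, `n_head`, `n_layer`, `block_size`) and mirrors them onto the standard `transformers` attribute names, so generic tooling that expects `hidden_size` or `num_hidden_layers` still works. A small sketch, assuming the file is importable from the working directory:

```python
from configuration_tamil_tiny_stories import TamilTinyStoriesConfig

config = TamilTinyStoriesConfig(vocab_size=451, bos_token_id=448, eos_token_id=449,
                                pad_token_id=447, unk_token_id=450)

# Aliases set in __init__ for generic Transformers tooling.
assert config.hidden_size == config.n_embd == 128
assert config.num_hidden_layers == config.n_layer == 4
assert config.max_position_embeddings == config.block_size == 128
```
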
generation_config.json
ADDED

{
  "_from_model_config": true,
  "bos_token_id": 448,
  "eos_token_id": 449,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 447,
  "transformers_version": "5.3.0",
  "use_cache": false
}

model.safetensors
ADDED

version https://git-lfs.github.com/spec/v1
oid sha256:aef97155878655c2ff5326d0a3df32f8e9da4187cfe2211e0d7d6f3efac7f304
size 4755340

modeling_tamil_tiny_stories.py
ADDED

import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import GenerationMixin, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutput

from configuration_tamil_tiny_stories import TamilTinyStoriesConfig


class TamilTinyStoriesHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        head_size = config.n_embd // config.n_head
        self.key = nn.Linear(config.n_embd, head_size, bias=False)
        self.query = nn.Linear(config.n_embd, head_size, bias=False)
        self.value = nn.Linear(config.n_embd, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(config.block_size, config.block_size)))
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        _, t, c = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * c**-0.5
        wei = wei.masked_fill(self.tril[:t, :t] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v


class TamilTinyStoriesMultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.heads = nn.ModuleList([TamilTinyStoriesHead(config) for _ in range(config.n_head)])
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        return self.dropout(self.proj(out))


class TamilTinyStoriesFeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.ReLU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        return self.net(x)


class TamilTinyStoriesBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.sa = TamilTinyStoriesMultiHeadAttention(config)
        self.ffwd = TamilTinyStoriesFeedForward(config)
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class TamilTinyStoriesPreTrainedModel(PreTrainedModel):
    config_class = TamilTinyStoriesConfig
    base_model_prefix = "model"
    _no_split_modules = ["TamilTinyStoriesBlock"]

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)


class TamilTinyStoriesForCausalLM(TamilTinyStoriesPreTrainedModel, GenerationMixin):
    def __init__(self, config):
        super().__init__(config)
        self.token_embedding_table = nn.Embedding(config.vocab_size, config.n_embd)
        self.position_embedding_table = nn.Embedding(config.block_size, config.n_embd)
        self.blocks = nn.Sequential(*[TamilTinyStoriesBlock(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
        self.post_init()

    def get_input_embeddings(self):
        return self.token_embedding_table

    def set_input_embeddings(self, value):
        self.token_embedding_table = value

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        if input_ids is None:
            raise ValueError("input_ids must be provided")

        if input_ids.dim() == 1:
            input_ids = input_ids.unsqueeze(0)

        _, t = input_ids.shape
        if t > self.config.block_size:
            input_ids = input_ids[:, -self.config.block_size :]
            if labels is not None:
                labels = labels[:, -self.config.block_size :]
            t = input_ids.shape[1]

        positions = torch.arange(t, device=input_ids.device)
        tok_emb = self.token_embedding_table(input_ids)
        pos_emb = self.position_embedding_table(positions)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutput(loss=loss, logits=logits)

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
        if input_ids.shape[1] > self.config.block_size:
            input_ids = input_ids[:, -self.config.block_size :]
        if attention_mask is not None and attention_mask.shape[1] > self.config.block_size:
            attention_mask = attention_mask[:, -self.config.block_size :]
        if token_type_ids is not None and token_type_ids.shape[1] > self.config.block_size:
            token_type_ids = token_type_ids[:, -self.config.block_size :]
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }


TamilTinyStoriesForCausalLM.register_for_auto_class("AutoModelForCausalLM")

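The model is a small pre-norm decoder: per-head causal self-attention with an explicit `tril` mask, a ReLU feed-forward block, learned token and position embeddings, and a linear head over the 451-character vocabulary. A rough sanity-check sketch, assuming the two module files above are importable from the working directory (the printed parameter count is simply whatever `numel()` sums to, not a documented figure):

```python
import torch

from configuration_tamil_tiny_stories import TamilTinyStoriesConfig
from modeling_tamil_tiny_stories import TamilTinyStoriesForCausalLM

config = TamilTinyStoriesConfig(vocab_size=451, bos_token_id=448, eos_token_id=449,
                                pad_token_id=447, unk_token_id=450)
model = TamilTinyStoriesForCausalLM(config)

print(f"parameters: {sum(p.numel() for p in model.parameters()):,}")

# One forward pass: logits cover every position and every vocabulary id,
# and passing labels triggers the shifted cross-entropy loss.
ids = torch.randint(0, config.vocab_size, (1, 16))
out = model(input_ids=ids, labels=ids)
print(out.logits.shape)  # torch.Size([1, 16, 451])
print(out.loss)
```
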
tokenization_tamil_tiny_stories.py
ADDED

import json
import os

from transformers import PreTrainedTokenizer


VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}


class TamilTinyStoriesTokenizer(PreTrainedTokenizer):
    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_file, **kwargs):
        with open(vocab_file, "r", encoding="utf-8") as handle:
            self.encoder = json.load(handle)
        self.decoder = {index: token for token, index in self.encoder.items()}
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        vocab = dict(self.encoder)
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        return list(text)

    def _convert_token_to_id(self, token):
        if token in self.encoder:
            return self.encoder[token]
        if self.unk_token_id is not None:
            return self.unk_token_id
        raise ValueError(f"Token {token!r} is not in the vocabulary")

    def _convert_id_to_token(self, index):
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        if token_ids_1 is not None:
            raise ValueError("TamilTinyStoriesTokenizer does not support sequence pairs")

        token_ids = list(token_ids_0)
        if self.bos_token_id is not None:
            token_ids = [self.bos_token_id] + token_ids
        if self.eos_token_id is not None:
            token_ids = token_ids + [self.eos_token_id]
        return token_ids

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        if token_ids_1 is not None:
            raise ValueError("TamilTinyStoriesTokenizer does not support sequence pairs")
        return [0] * len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1))

    def save_vocabulary(self, save_directory, filename_prefix=None):
        os.makedirs(save_directory, exist_ok=True)
        filename = VOCAB_FILES_NAMES["vocab_file"]
        if filename_prefix:
            filename = f"{filename_prefix}-{filename}"
        vocab_path = os.path.join(save_directory, filename)
        with open(vocab_path, "w", encoding="utf-8") as handle:
            json.dump(self.encoder, handle, ensure_ascii=False, indent=2)
        return (vocab_path,)


TamilTinyStoriesTokenizer.register_for_auto_class()

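Tokenization is literally `list(text)`: each Unicode character becomes one id, anything missing from `vocab.json` falls back to `<unk>` (id 450), and `build_inputs_with_special_tokens` wraps every sequence in `<bos>`/`<eos>`. A small round-trip sketch, assuming the tokenizer is loaded with `trust_remote_code=True` as in the README:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("senthil090/tamil-tiny-stories", trust_remote_code=True)

enc = tokenizer("ஒரு")
print(enc["input_ids"])  # [448, 253, 271, 283, 449] -> <bos> ஒ ர ு <eos>
print(tokenizer.decode(enc["input_ids"], skip_special_tokens=True))  # ஒரு

# Characters absent from the vocabulary (plain ASCII letters, for example) map to <unk>.
print(tokenizer("a")["input_ids"])  # [448, 450, 449]
```
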
tokenizer_config.json
ADDED

{
  "added_tokens_decoder": {
    "447": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "448": {
      "content": "<bos>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "449": {
      "content": "<eos>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "450": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_tamil_tiny_stories.TamilTinyStoriesTokenizer",
      null
    ]
  },
  "backend": "custom",
  "bos_token": "<bos>",
  "eos_token": "<eos>",
  "model_max_length": 128,
  "pad_token": "<pad>",
  "padding_side": "left",
  "tokenizer_class": "TamilTinyStoriesTokenizer",
  "unk_token": "<unk>"
}

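`padding_side` is `left` and `model_max_length` matches the model's 128-character context window; left padding is the usual choice for batched generation with a decoder-only model, since it keeps the newest characters at the right edge of every row. A brief sketch of what batched encoding then looks like, with the same assumed repo id as above:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("senthil090/tamil-tiny-stories", trust_remote_code=True)

batch = tokenizer(["ஒரு நாள்", "ஒரு"], padding=True, return_tensors="pt")

# The shorter row is padded on the left with <pad> (id 447); the attention mask
# marks those positions with 0.
print(batch["input_ids"])
print(batch["attention_mask"])
```
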
vocab.json
ADDED

{
  "\n": 0, " ": 1, "!": 2, "\"": 3, "#": 4, "$": 5, "%": 6, "&": 7, "'": 8, "(": 9, ")": 10,
  "+": 11, ",": 12, "-": 13, ".": 14, "/": 15, "0": 16, "1": 17, "2": 18, "3": 19, "4": 20,
  "5": 21, "6": 22, "7": 23, "8": 24, "9": 25, ":": 26, ";": 27, "=": 28, "?": 29, "[": 30,
  "\\": 31, "]": 32, "_": 33, "`": 34, "{": 35, "|": 36, "}": 37, "~": 38, "×": 39, "ú": 40,
  "ā": 41, "ē": 42, "ξ": 43, "σ": 44,
  "آ": 45, "ؤ": 46, "ئ": 47, "ا": 48, "ب": 49, "ت": 50, "د": 51, "ر": 52, "ز": 53, "ض": 54,
  "ط": 55, "غ": 56, "ق": 57, "ل": 58, "م": 59, "ن": 60, "و": 61, "پ": 62, "ڈ": 63, "ڑ": 64,
  "ک": 65, "گ": 66, "ں": 67, "ھ": 68, "ہ": 69, "ی": 70, "۔": 71,
  "ँ": 72, "ं": 73, "ः": 74, "आ": 75, "इ": 76, "उ": 77, "ए": 78, "ओ": 79, "क": 80, "ख": 81,
  "ग": 82, "घ": 83, "च": 84, "छ": 85, "ज": 86, "ट": 87, "ठ": 88, "ड": 89, "ढ": 90, "ण": 91,
  "त": 92, "थ": 93, "द": 94, "ध": 95, "न": 96, "प": 97, "फ": 98, "ब": 99, "म": 100, "य": 101,
  "र": 102, "ल": 103, "ळ": 104, "व": 105, "श": 106, "ष": 107, "स": 108, "ह": 109, "़": 110,
  "ा": 111, "ि": 112, "ी": 113, "ु": 114, "ू": 115, "े": 116, "ै": 117, "ो": 118, "ौ": 119,
  "्": 120, "५": 121,
  "ঁ": 122, "ং": 123, "অ": 124, "আ": 125, "উ": 126, "ও": 127, "ক": 128, "খ": 129, "গ": 130,
  "ঘ": 131, "ঙ": 132, "চ": 133, "ছ": 134, "জ": 135, "ঝ": 136, "ঞ": 137, "ট": 138, "ড": 139,
  "ণ": 140, "ত": 141, "দ": 142, "ধ": 143, "ন": 144, "প": 145, "ফ": 146, "ব": 147, "ভ": 148,
  "ম": 149, "য": 150, "র": 151, "ল": 152, "শ": 153, "ষ": 154, "স": 155, "হ": 156, "়": 157,
  "া": 158, "ি": 159, "ী": 160, "ু": 161, "ূ": 162, "ৃ": 163, "ে": 164, "ো": 165, "ৌ": 166,
  "্": 167, "ৎ": 168, "য়": 169, "৪": 170,
  "ਂ": 171, "ਆ": 172, "ਇ": 173, "ਈ": 174, "ਕ": 175, "ਖ": 176, "ਚ": 177, "ਡ": 178, "ਤ": 179,
  "ਦ": 180, "ਧ": 181, "ਨ": 182, "ਪ": 183, "ਫ": 184, "ਮ": 185, "ਰ": 186, "ਲ": 187, "ਵ": 188,
  "ਸ": 189, "ਹ": 190, "਼": 191, "ਾ": 192, "ਿ": 193, "ੀ": 194, "ੁ": 195, "ੂ": 196, "ੇ": 197,
  "ੈ": 198, "ੋ": 199, "੍": 200, "ੰ": 201, "ੱ": 202,
  "ં": 203, "અ": 204, "આ": 205, "ઇ": 206, "ઉ": 207, "ઓ": 208, "ક": 209, "ખ": 210, "ગ": 211,
  "ઘ": 212, "ચ": 213, "જ": 214, "ઝ": 215, "ટ": 216, "ણ": 217, "ત": 218, "થ": 219, "દ": 220,
  "ધ": 221, "ન": 222, "પ": 223, "ભ": 224, "મ": 225, "ય": 226, "ર": 227, "લ": 228, "ળ": 229,
  "વ": 230, "શ": 231, "ષ": 232, "સ": 233, "હ": 234, "ા": 235, "િ": 236, "ી": 237, "ુ": 238,
  "ે": 239, "ો": 240, "ૌ": 241, "્": 242,
  "ஃ": 243, "அ": 244, "ஆ": 245, "இ": 246, "ஈ": 247, "உ": 248, "ஊ": 249, "எ": 250, "ஏ": 251,
  "ஐ": 252, "ஒ": 253, "ஓ": 254, "க": 255, "": 256, "ங": 257, "ச": 258, "ஜ": 259, "": 260,
  "ஞ": 261, "ட": 262, "": 263, "ண": 264, "த": 265, "ந": 266, "ன": 267, "ப": 268, "ம": 269,
  "ய": 270, "ர": 271, "ற": 272, "ல": 273, "ள": 274, "ழ": 275, "வ": 276, "ஷ": 277, "ஸ": 278,
  "ஹ": 279, "ா": 280, "ி": 281, "ீ": 282, "ு": 283, "ூ": 284, "ெ": 285, "ே": 286, "ை": 287,
  "ொ": 288, "ோ": 289, "ௌ": 290, "்": 291, "": 292,
  "ం": 293, "అ": 294, "ఆ": 295, "ఉ": 296, "ఎ": 297, "ఒ": 298, "క": 299, "గ": 300, "చ": 301,
  "జ": 302, "ఞ": 303, "ట": 304, "డ": 305, "ణ": 306, "త": 307, "థ": 308, "ద": 309, "ధ": 310,
  "న": 311, "ప": 312, "బ": 313, "భ": 314, "మ": 315, "య": 316, "ర": 317, "ల": 318, "ళ": 319,
  "వ": 320, "శ": 321, "ష": 322, "స": 323, "హ": 324, "ా": 325, "ి": 326, "ీ": 327, "ు": 328,
  "ూ": 329, "ె": 330, "ే": 331, "ై": 332, "ొ": 333, "ో": 334, "్": 335,
  "ಂ": 336, "ಅ": 337, "ಆ": 338, "ಇ": 339, "ಉ": 340, "ಎ": 341, "ಒ": 342, "ಕ": 343, "ಗ": 344,
  "ಚ": 345, "ಜ": 346, "ಟ": 347, "ಡ": 348, "ಣ": 349, "ತ": 350, "ಥ": 351, "ದ": 352, "ಧ": 353,
  "ನ": 354, "ಪ": 355, "ಬ": 356, "ಭ": 357, "ಮ": 358, "ಯ": 359, "ರ": 360, "ಲ": 361, "ಳ": 362,
  "ವ": 363, "ಶ": 364, "ಷ": 365, "ಸ": 366, "ಹ": 367, "ಾ": 368, "ಿ": 369, "ೀ": 370, "ು": 371,
  "ೂ": 372, "ೆ": 373, "ೇ": 374, "ೈ": 375, "ೊ": 376, "ೋ": 377, "್": 378, "ೕ": 379,
  "ം": 380, "അ": 381, "ആ": 382, "ഉ": 383, "എ": 384, "ഒ": 385, "ക": 386, "ഖ": 387, "ഗ": 388,
  "ങ": 389, "ച": 390, "ജ": 391, "ഞ": 392, "ട": 393, "ഠ": 394, "ഡ": 395, "ണ": 396, "ത": 397,
  "ഥ": 398, "ദ": 399, "ധ": 400, "ന": 401, "പ": 402, "ഭ": 403, "മ": 404, "യ": 405, "ര": 406,
  "റ": 407, "ല": 408, "ള": 409, "ഴ": 410, "വ": 411, "ശ": 412, "ഷ": 413, "സ": 414, "ഹ": 415,
  "ാ": 416, "ി": 417, "ീ": 418, "ു": 419, "ൂ": 420, "ൃ": 421, "െ": 422, "േ": 423, "ൊ": 424,
  "ോ": 425, "്": 426, "ൗ": 427, "ൺ": 428, "ൻ": 429, "ർ": 430, "ൽ": 431, "ൾ": 432,
  "ḷ": 433, "ṅ": 434, "–": 435, "—": 436, "‘": 437, "’": 438, "“": 439, "”": 440, "…": 441,
  "→": 442, "∞": 443, "⌛": 444, "⏰": 445, "⏳": 446,
  "<pad>": 447, "<bos>": 448, "<eos>": 449, "<unk>": 450
}

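`vocab.json` is the single source of truth for the id space: the corpus characters occupy ids 0-446 and the four special tokens take 447-450. A quick integrity-check sketch, assuming the file has been downloaded locally:

```python
import json

with open("vocab.json", encoding="utf-8") as handle:
    vocab = json.load(handle)

assert len(vocab) == 451
assert sorted(vocab.values()) == list(range(451))  # contiguous id space
assert [vocab[t] for t in ("<pad>", "<bos>", "<eos>", "<unk>")] == [447, 448, 449, 450]
```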