senthil090 commited on
Commit
b99ff82
·
verified ·
1 Parent(s): fa6627e

Initial model, trained for 10,000 epochs

Browse files
README.md CHANGED
@@ -1,3 +1,55 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - ta
4
+ license: mit
5
+ tags:
6
+ - tamil
7
+ - tinystories
8
+ - character-level
9
+ - causal-lm
10
+ - transformers
11
+ pipeline_tag: text-generation
12
+ library_name: transformers
13
+ ---
14
+
15
+ # Tamil Tiny Stories
16
+
17
+ This repository contains a Hugging Face-compatible export of a custom Tamil Tiny Stories character-level causal language model.
18
+
19
+ ## Model details
20
+
21
+ - Architecture: custom decoder-only transformer
22
+ - Tokenization: character-level
23
+ - Training data source: `neuralnets/multilingual-tinystories` Tamil split (`ta`)
24
+ - Original checkpoint format: PyTorch `.pth`
25
+ - Export format: Hugging Face Transformers with custom remote code
26
+
27
+ ## Usage
28
+
29
+ ```python
30
+ from transformers import AutoModelForCausalLM, AutoTokenizer
31
+
32
+ model_id = "senthil090/tamil-tiny-stories"
33
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
34
+ model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
35
+
36
+ inputs = tokenizer("ஒரு நாள்", return_tensors="pt")
37
+ outputs = model.generate(**inputs, max_new_tokens=100)
38
+ print(tokenizer.decode(outputs[0]))
39
+ ```
40
+
41
+ ## Notes
42
+
43
+ - This model uses custom architecture files, so `trust_remote_code=True` is required.
44
+ - The tokenizer is character-based and may emit `<bos>` / `<eos>` tokens in raw decoded output.
45
+ - The exported vocabulary was reconstructed from the Tamil split of the source dataset, matching the training script.
46
+
47
+ ## Files
48
+
49
+ - `config.json`
50
+ - `model.safetensors`
51
+ - `configuration_tamil_tiny_stories.py`
52
+ - `modeling_tamil_tiny_stories.py`
53
+ - `tokenization_tamil_tiny_stories.py`
54
+ - `tokenizer_config.json`
55
+ - `vocab.json`
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "TamilTinyStoriesForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_tamil_tiny_stories.TamilTinyStoriesConfig",
7
+ "AutoModelForCausalLM": "modeling_tamil_tiny_stories.TamilTinyStoriesForCausalLM"
8
+ },
9
+ "block_size": 128,
10
+ "bos_token_id": 448,
11
+ "dropout": 0.0,
12
+ "dtype": "float32",
13
+ "eos_token_id": 449,
14
+ "hidden_size": 128,
15
+ "is_decoder": true,
16
+ "max_position_embeddings": 128,
17
+ "model_type": "tamil_tiny_stories",
18
+ "n_embd": 128,
19
+ "n_head": 4,
20
+ "n_layer": 4,
21
+ "num_attention_heads": 4,
22
+ "num_hidden_layers": 4,
23
+ "original_vocab_size": 447,
24
+ "pad_token_id": 447,
25
+ "transformers_version": "5.3.0",
26
+ "unk_token_id": 450,
27
+ "use_cache": false,
28
+ "vocab_size": 451
29
+ }
configuration_tamil_tiny_stories.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
+
4
class TamilTinyStoriesConfig(PretrainedConfig):
    """Configuration for the Tamil Tiny Stories character-level decoder-only LM.

    Carries the hyper-parameters of the original training script
    (``block_size``, ``n_embd``, ``n_head``, ``n_layer``, ``dropout``) and
    mirrors them onto the standard Transformers attribute names
    (``hidden_size``, ``num_attention_heads``, ...) so generic tooling can
    read them.
    """

    model_type = "tamil_tiny_stories"

    def __init__(
        self,
        vocab_size=0,
        original_vocab_size=None,
        block_size=128,
        n_embd=128,
        n_head=4,
        n_layer=4,
        dropout=0.0,
        bos_token_id=None,
        eos_token_id=None,
        pad_token_id=None,
        unk_token_id=None,
        use_cache=False,
        **kwargs,
    ):
        # Core architecture knobs, named as in the training script.
        self.vocab_size = vocab_size
        # Vocabulary size before the special tokens were appended; defaults
        # to the full vocabulary size when not supplied.
        self.original_vocab_size = vocab_size if original_vocab_size is None else original_vocab_size
        self.block_size = block_size
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_layer = n_layer
        self.dropout = dropout
        # Standard Transformers aliases for the same quantities.
        self.hidden_size = n_embd
        self.num_attention_heads = n_head
        self.num_hidden_layers = n_layer
        self.max_position_embeddings = block_size
        self.use_cache = use_cache
        self.is_decoder = True
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            unk_token_id=unk_token_id,
            **kwargs,
        )


TamilTinyStoriesConfig.register_for_auto_class()
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 448,
4
+ "eos_token_id": 449,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 447,
8
+ "transformers_version": "5.3.0",
9
+ "use_cache": false
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aef97155878655c2ff5326d0a3df32f8e9da4187cfe2211e0d7d6f3efac7f304
3
+ size 4755340
modeling_tamil_tiny_stories.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn import functional as F
4
+ from transformers import GenerationMixin, PreTrainedModel
5
+ from transformers.modeling_outputs import CausalLMOutput
6
+
7
+ from configuration_tamil_tiny_stories import TamilTinyStoriesConfig
8
+
9
+
10
class TamilTinyStoriesHead(nn.Module):
    """One head of masked (causal) self-attention.

    NOTE(review): scores are scaled by ``n_embd ** -0.5`` (the full channel
    width of the input), not ``head_size ** -0.5``. This matches the code the
    checkpoint was trained with and must be preserved.
    """

    def __init__(self, config):
        super().__init__()
        head_size = config.n_embd // config.n_head
        self.key = nn.Linear(config.n_embd, head_size, bias=False)
        self.query = nn.Linear(config.n_embd, head_size, bias=False)
        self.value = nn.Linear(config.n_embd, head_size, bias=False)
        # Lower-triangular mask used to block attention to future positions.
        self.register_buffer("tril", torch.tril(torch.ones(config.block_size, config.block_size)))
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        seq_len, channels = x.shape[1], x.shape[2]
        keys = self.key(x)
        queries = self.query(x)
        # Scaled dot-product attention scores (see scaling note above).
        scores = queries @ keys.transpose(-2, -1) * channels**-0.5
        future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(future, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        return weights @ self.value(x)
30
+
31
+
32
class TamilTinyStoriesMultiHeadAttention(nn.Module):
    """Several attention heads run in parallel, concatenated, then projected."""

    def __init__(self, config):
        super().__init__()
        self.heads = nn.ModuleList(TamilTinyStoriesHead(config) for _ in range(config.n_head))
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        per_head = [attn_head(x) for attn_head in self.heads]
        combined = torch.cat(per_head, dim=-1)
        # Project the concatenated head outputs back to the residual width.
        return self.dropout(self.proj(combined))
42
+
43
+
44
class TamilTinyStoriesFeedForward(nn.Module):
    """Position-wise MLP: expand 4x, ReLU, project back, then dropout."""

    def __init__(self, config):
        super().__init__()
        hidden = 4 * config.n_embd
        self.net = nn.Sequential(
            nn.Linear(config.n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, config.n_embd),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        return self.net(x)
56
+
57
+
58
class TamilTinyStoriesBlock(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, config):
        super().__init__()
        self.sa = TamilTinyStoriesMultiHeadAttention(config)
        self.ffwd = TamilTinyStoriesFeedForward(config)
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)

    def forward(self, x):
        # LayerNorm is applied before each sub-layer (pre-norm residuals).
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))
70
+
71
+
72
class TamilTinyStoriesPreTrainedModel(PreTrainedModel):
    """Shared base class wiring the custom config into Transformers."""

    config_class = TamilTinyStoriesConfig
    base_model_prefix = "model"
    _no_split_modules = ["TamilTinyStoriesBlock"]

    def _init_weights(self, module):
        # Normal(0, 0.02) for embedding/linear weights, zeroed linear biases,
        # identity initialisation for LayerNorm.
        if isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
85
+
86
+
87
class TamilTinyStoriesForCausalLM(TamilTinyStoriesPreTrainedModel, GenerationMixin):
    """Character-level causal LM with learned token + position embeddings."""

    def __init__(self, config):
        super().__init__(config)
        self.token_embedding_table = nn.Embedding(config.vocab_size, config.n_embd)
        self.position_embedding_table = nn.Embedding(config.block_size, config.n_embd)
        self.blocks = nn.Sequential(*(TamilTinyStoriesBlock(config) for _ in range(config.n_layer)))
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
        self.post_init()

    def get_input_embeddings(self):
        return self.token_embedding_table

    def set_input_embeddings(self, value):
        self.token_embedding_table = value

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the decoder stack and optionally compute the LM loss.

        NOTE(review): ``attention_mask`` and ``token_type_ids`` are accepted
        for API compatibility but are not used — masking is purely causal.
        With left padding, pad positions are therefore attended to; confirm
        this matches the training setup before changing it.
        """
        if input_ids is None:
            raise ValueError("input_ids must be provided")

        # Accept an unbatched sequence by promoting it to batch size 1.
        if input_ids.dim() == 1:
            input_ids = input_ids.unsqueeze(0)

        # The model has no positions beyond block_size, so keep only the most
        # recent block_size tokens; labels are trimmed in lockstep.
        if input_ids.shape[1] > self.config.block_size:
            input_ids = input_ids[:, -self.config.block_size :]
            if labels is not None:
                labels = labels[:, -self.config.block_size :]
        seq_len = input_ids.shape[1]

        positions = torch.arange(seq_len, device=input_ids.device)
        hidden = self.token_embedding_table(input_ids) + self.position_embedding_table(positions)
        hidden = self.ln_f(self.blocks(hidden))
        logits = self.lm_head(hidden)

        loss = None
        if labels is not None:
            # Shift so that position i predicts token i+1; -100 labels ignored.
            shifted_logits = logits[:, :-1, :].contiguous()
            shifted_labels = labels[:, 1:].contiguous()
            loss = F.cross_entropy(
                shifted_logits.view(-1, shifted_logits.size(-1)),
                shifted_labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutput(loss=loss, logits=logits)

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
        """Trim every sequence tensor to the last block_size positions."""
        limit = self.config.block_size

        def _trim(tensor):
            # None passes through; anything longer than the context is cut.
            if tensor is not None and tensor.shape[1] > limit:
                return tensor[:, -limit:]
            return tensor

        return {
            "input_ids": _trim(input_ids),
            "attention_mask": _trim(attention_mask),
            "token_type_ids": _trim(token_type_ids),
        }


TamilTinyStoriesForCausalLM.register_for_auto_class("AutoModelForCausalLM")
tokenization_tamil_tiny_stories.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ from transformers import PreTrainedTokenizer
5
+
6
+
7
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
8
+
9
+
10
class TamilTinyStoriesTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer backed by a JSON char -> id vocabulary.

    Every Unicode code point of the input text is one token; special tokens
    (``<pad>``/``<bos>``/``<eos>``/``<unk>``) occupy the top of the id range.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_file, **kwargs):
        # Load the char -> id map before the base constructor runs, since the
        # base class may already resolve special tokens against the vocab.
        with open(vocab_file, "r", encoding="utf-8") as handle:
            self.encoder = json.load(handle)
        self.decoder = {index: token for token, index in self.encoder.items()}
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Size of the base vocabulary (without added tokens)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return the full token -> id mapping including added tokens."""
        vocab = dict(self.encoder)
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        # Character-level: one token per code point.
        return list(text)

    def _convert_token_to_id(self, token):
        """Map a token to its id, falling back to <unk> when configured."""
        if token in self.encoder:
            return self.encoder[token]
        if self.unk_token_id is not None:
            return self.unk_token_id
        raise ValueError(f"Token {token!r} is not in the vocabulary")

    def _convert_id_to_token(self, index):
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap a single sequence as ``<bos> ... <eos>`` (pairs unsupported)."""
        if token_ids_1 is not None:
            raise ValueError("TamilTinyStoriesTokenizer does not support sequence pairs")

        token_ids = list(token_ids_0)
        if self.bos_token_id is not None:
            token_ids = [self.bos_token_id] + token_ids
        if self.eos_token_id is not None:
            token_ids = token_ids + [self.eos_token_id]
        return token_ids

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """All-zero segment ids, one per token after special tokens are added."""
        if token_ids_1 is not None:
            raise ValueError("TamilTinyStoriesTokenizer does not support sequence pairs")
        return [0] * len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1))

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary JSON into ``save_directory``.

        Returns a 1-tuple with the path of the written file. Fix: the
        prefixed filename previously discarded the base name (it produced
        ``"{prefix}-(unknown)"``); it now follows the standard Transformers
        convention ``"{prefix}-vocab.json"``.
        """
        os.makedirs(save_directory, exist_ok=True)
        filename = VOCAB_FILES_NAMES["vocab_file"]
        if filename_prefix:
            filename = f"{filename_prefix}-{filename}"
        vocab_path = os.path.join(save_directory, filename)
        with open(vocab_path, "w", encoding="utf-8") as handle:
            json.dump(self.encoder, handle, ensure_ascii=False, indent=2)
        return (vocab_path,)


TamilTinyStoriesTokenizer.register_for_auto_class()
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "447": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "448": {
12
+ "content": "<bos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "449": {
20
+ "content": "<eos>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "450": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "auto_map": {
37
+ "AutoTokenizer": [
38
+ "tokenization_tamil_tiny_stories.TamilTinyStoriesTokenizer",
39
+ null
40
+ ]
41
+ },
42
+ "backend": "custom",
43
+ "bos_token": "<bos>",
44
+ "eos_token": "<eos>",
45
+ "model_max_length": 128,
46
+ "pad_token": "<pad>",
47
+ "padding_side": "left",
48
+ "tokenizer_class": "TamilTinyStoriesTokenizer",
49
+ "unk_token": "<unk>"
50
+ }
vocab.json ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "\n": 0,
3
+ " ": 1,
4
+ "!": 2,
5
+ "\"": 3,
6
+ "#": 4,
7
+ "$": 5,
8
+ "%": 6,
9
+ "&": 7,
10
+ "'": 8,
11
+ "(": 9,
12
+ ")": 10,
13
+ "+": 11,
14
+ ",": 12,
15
+ "-": 13,
16
+ ".": 14,
17
+ "/": 15,
18
+ "0": 16,
19
+ "1": 17,
20
+ "2": 18,
21
+ "3": 19,
22
+ "4": 20,
23
+ "5": 21,
24
+ "6": 22,
25
+ "7": 23,
26
+ "8": 24,
27
+ "9": 25,
28
+ ":": 26,
29
+ ";": 27,
30
+ "=": 28,
31
+ "?": 29,
32
+ "[": 30,
33
+ "\\": 31,
34
+ "]": 32,
35
+ "_": 33,
36
+ "`": 34,
37
+ "{": 35,
38
+ "|": 36,
39
+ "}": 37,
40
+ "~": 38,
41
+ "×": 39,
42
+ "ú": 40,
43
+ "ā": 41,
44
+ "ē": 42,
45
+ "ξ": 43,
46
+ "σ": 44,
47
+ "آ": 45,
48
+ "ؤ": 46,
49
+ "ئ": 47,
50
+ "ا": 48,
51
+ "ب": 49,
52
+ "ت": 50,
53
+ "د": 51,
54
+ "ر": 52,
55
+ "ز": 53,
56
+ "ض": 54,
57
+ "ط": 55,
58
+ "غ": 56,
59
+ "ق": 57,
60
+ "ل": 58,
61
+ "م": 59,
62
+ "ن": 60,
63
+ "و": 61,
64
+ "پ": 62,
65
+ "ڈ": 63,
66
+ "ڑ": 64,
67
+ "ک": 65,
68
+ "گ": 66,
69
+ "ں": 67,
70
+ "ھ": 68,
71
+ "ہ": 69,
72
+ "ی": 70,
73
+ "۔": 71,
74
+ "ँ": 72,
75
+ "ं": 73,
76
+ "ः": 74,
77
+ "आ": 75,
78
+ "इ": 76,
79
+ "उ": 77,
80
+ "ए": 78,
81
+ "ओ": 79,
82
+ "क": 80,
83
+ "ख": 81,
84
+ "ग": 82,
85
+ "घ": 83,
86
+ "च": 84,
87
+ "छ": 85,
88
+ "ज": 86,
89
+ "ट": 87,
90
+ "ठ": 88,
91
+ "ड": 89,
92
+ "ढ": 90,
93
+ "ण": 91,
94
+ "त": 92,
95
+ "थ": 93,
96
+ "द": 94,
97
+ "ध": 95,
98
+ "न": 96,
99
+ "प": 97,
100
+ "फ": 98,
101
+ "ब": 99,
102
+ "म": 100,
103
+ "य": 101,
104
+ "र": 102,
105
+ "ल": 103,
106
+ "ळ": 104,
107
+ "व": 105,
108
+ "श": 106,
109
+ "ष": 107,
110
+ "स": 108,
111
+ "ह": 109,
112
+ "़": 110,
113
+ "ा": 111,
114
+ "ि": 112,
115
+ "ी": 113,
116
+ "ु": 114,
117
+ "ू": 115,
118
+ "े": 116,
119
+ "ै": 117,
120
+ "ो": 118,
121
+ "ौ": 119,
122
+ "्": 120,
123
+ "५": 121,
124
+ "ঁ": 122,
125
+ "ং": 123,
126
+ "অ": 124,
127
+ "আ": 125,
128
+ "উ": 126,
129
+ "ও": 127,
130
+ "ক": 128,
131
+ "খ": 129,
132
+ "গ": 130,
133
+ "ঘ": 131,
134
+ "ঙ": 132,
135
+ "চ": 133,
136
+ "ছ": 134,
137
+ "জ": 135,
138
+ "ঝ": 136,
139
+ "ঞ": 137,
140
+ "ট": 138,
141
+ "ড": 139,
142
+ "ণ": 140,
143
+ "ত": 141,
144
+ "দ": 142,
145
+ "ধ": 143,
146
+ "ন": 144,
147
+ "প": 145,
148
+ "ফ": 146,
149
+ "ব": 147,
150
+ "ভ": 148,
151
+ "ম": 149,
152
+ "য": 150,
153
+ "র": 151,
154
+ "ল": 152,
155
+ "শ": 153,
156
+ "ষ": 154,
157
+ "স": 155,
158
+ "হ": 156,
159
+ "়": 157,
160
+ "া": 158,
161
+ "ি": 159,
162
+ "ী": 160,
163
+ "ু": 161,
164
+ "ূ": 162,
165
+ "ৃ": 163,
166
+ "ে": 164,
167
+ "ো": 165,
168
+ "ৌ": 166,
169
+ "্": 167,
170
+ "ৎ": 168,
171
+ "য়": 169,
172
+ "৪": 170,
173
+ "ਂ": 171,
174
+ "ਆ": 172,
175
+ "ਇ": 173,
176
+ "ਈ": 174,
177
+ "ਕ": 175,
178
+ "ਖ": 176,
179
+ "ਚ": 177,
180
+ "ਡ": 178,
181
+ "ਤ": 179,
182
+ "ਦ": 180,
183
+ "ਧ": 181,
184
+ "ਨ": 182,
185
+ "ਪ": 183,
186
+ "ਫ": 184,
187
+ "ਮ": 185,
188
+ "ਰ": 186,
189
+ "ਲ": 187,
190
+ "ਵ": 188,
191
+ "ਸ": 189,
192
+ "ਹ": 190,
193
+ "਼": 191,
194
+ "ਾ": 192,
195
+ "ਿ": 193,
196
+ "ੀ": 194,
197
+ "ੁ": 195,
198
+ "ੂ": 196,
199
+ "ੇ": 197,
200
+ "ੈ": 198,
201
+ "ੋ": 199,
202
+ "੍": 200,
203
+ "ੰ": 201,
204
+ "ੱ": 202,
205
+ "ં": 203,
206
+ "અ": 204,
207
+ "આ": 205,
208
+ "ઇ": 206,
209
+ "ઉ": 207,
210
+ "ઓ": 208,
211
+ "ક": 209,
212
+ "ખ": 210,
213
+ "ગ": 211,
214
+ "ઘ": 212,
215
+ "ચ": 213,
216
+ "જ": 214,
217
+ "ઝ": 215,
218
+ "ટ": 216,
219
+ "ણ": 217,
220
+ "ત": 218,
221
+ "થ": 219,
222
+ "દ": 220,
223
+ "ધ": 221,
224
+ "ન": 222,
225
+ "પ": 223,
226
+ "ભ": 224,
227
+ "મ": 225,
228
+ "ય": 226,
229
+ "ર": 227,
230
+ "લ": 228,
231
+ "ળ": 229,
232
+ "વ": 230,
233
+ "શ": 231,
234
+ "ષ": 232,
235
+ "સ": 233,
236
+ "હ": 234,
237
+ "ા": 235,
238
+ "િ": 236,
239
+ "ી": 237,
240
+ "ુ": 238,
241
+ "ે": 239,
242
+ "ો": 240,
243
+ "ૌ": 241,
244
+ "્": 242,
245
+ "ஃ": 243,
246
+ "அ": 244,
247
+ "ஆ": 245,
248
+ "இ": 246,
249
+ "ஈ": 247,
250
+ "உ": 248,
251
+ "ஊ": 249,
252
+ "எ": 250,
253
+ "ஏ": 251,
254
+ "ஐ": 252,
255
+ "ஒ": 253,
256
+ "ஓ": 254,
257
+ "க": 255,
258
+ "஖": 256,
259
+ "ங": 257,
260
+ "ச": 258,
261
+ "ஜ": 259,
262
+ "஝": 260,
263
+ "ஞ": 261,
264
+ "ட": 262,
265
+ "஡": 263,
266
+ "ண": 264,
267
+ "த": 265,
268
+ "ந": 266,
269
+ "ன": 267,
270
+ "ப": 268,
271
+ "ம": 269,
272
+ "ய": 270,
273
+ "ர": 271,
274
+ "ற": 272,
275
+ "ல": 273,
276
+ "ள": 274,
277
+ "ழ": 275,
278
+ "வ": 276,
279
+ "ஷ": 277,
280
+ "ஸ": 278,
281
+ "ஹ": 279,
282
+ "ா": 280,
283
+ "ி": 281,
284
+ "ீ": 282,
285
+ "ு": 283,
286
+ "ூ": 284,
287
+ "ெ": 285,
288
+ "ே": 286,
289
+ "ை": 287,
290
+ "ொ": 288,
291
+ "ோ": 289,
292
+ "ௌ": 290,
293
+ "்": 291,
294
+ "௔": 292,
295
+ "ం": 293,
296
+ "అ": 294,
297
+ "ఆ": 295,
298
+ "ఉ": 296,
299
+ "ఎ": 297,
300
+ "ఒ": 298,
301
+ "క": 299,
302
+ "గ": 300,
303
+ "చ": 301,
304
+ "జ": 302,
305
+ "ఞ": 303,
306
+ "ట": 304,
307
+ "డ": 305,
308
+ "ణ": 306,
309
+ "త": 307,
310
+ "థ": 308,
311
+ "ద": 309,
312
+ "ధ": 310,
313
+ "న": 311,
314
+ "ప": 312,
315
+ "బ": 313,
316
+ "భ": 314,
317
+ "మ": 315,
318
+ "య": 316,
319
+ "ర": 317,
320
+ "ల": 318,
321
+ "ళ": 319,
322
+ "వ": 320,
323
+ "శ": 321,
324
+ "ష": 322,
325
+ "స": 323,
326
+ "హ": 324,
327
+ "ా": 325,
328
+ "ి": 326,
329
+ "ీ": 327,
330
+ "ు": 328,
331
+ "ూ": 329,
332
+ "ె": 330,
333
+ "ే": 331,
334
+ "ై": 332,
335
+ "ొ": 333,
336
+ "ో": 334,
337
+ "్": 335,
338
+ "ಂ": 336,
339
+ "ಅ": 337,
340
+ "ಆ": 338,
341
+ "ಇ": 339,
342
+ "ಉ": 340,
343
+ "ಎ": 341,
344
+ "ಒ": 342,
345
+ "ಕ": 343,
346
+ "ಗ": 344,
347
+ "ಚ": 345,
348
+ "ಜ": 346,
349
+ "ಟ": 347,
350
+ "ಡ": 348,
351
+ "ಣ": 349,
352
+ "ತ": 350,
353
+ "ಥ": 351,
354
+ "ದ": 352,
355
+ "ಧ": 353,
356
+ "ನ": 354,
357
+ "ಪ": 355,
358
+ "ಬ": 356,
359
+ "ಭ": 357,
360
+ "ಮ": 358,
361
+ "ಯ": 359,
362
+ "ರ": 360,
363
+ "ಲ": 361,
364
+ "ಳ": 362,
365
+ "ವ": 363,
366
+ "ಶ": 364,
367
+ "ಷ": 365,
368
+ "ಸ": 366,
369
+ "ಹ": 367,
370
+ "ಾ": 368,
371
+ "ಿ": 369,
372
+ "ೀ": 370,
373
+ "ು": 371,
374
+ "ೂ": 372,
375
+ "ೆ": 373,
376
+ "ೇ": 374,
377
+ "ೈ": 375,
378
+ "ೊ": 376,
379
+ "ೋ": 377,
380
+ "್": 378,
381
+ "ೕ": 379,
382
+ "ം": 380,
383
+ "അ": 381,
384
+ "ആ": 382,
385
+ "ഉ": 383,
386
+ "എ": 384,
387
+ "ഒ": 385,
388
+ "ക": 386,
389
+ "ഖ": 387,
390
+ "ഗ": 388,
391
+ "ങ": 389,
392
+ "ച": 390,
393
+ "ജ": 391,
394
+ "ഞ": 392,
395
+ "ട": 393,
396
+ "ഠ": 394,
397
+ "ഡ": 395,
398
+ "ണ": 396,
399
+ "ത": 397,
400
+ "ഥ": 398,
401
+ "ദ": 399,
402
+ "ധ": 400,
403
+ "ന": 401,
404
+ "പ": 402,
405
+ "ഭ": 403,
406
+ "മ": 404,
407
+ "യ": 405,
408
+ "ര": 406,
409
+ "റ": 407,
410
+ "ല": 408,
411
+ "ള": 409,
412
+ "ഴ": 410,
413
+ "വ": 411,
414
+ "ശ": 412,
415
+ "ഷ": 413,
416
+ "സ": 414,
417
+ "ഹ": 415,
418
+ "ാ": 416,
419
+ "ി": 417,
420
+ "ീ": 418,
421
+ "ു": 419,
422
+ "ൂ": 420,
423
+ "ൃ": 421,
424
+ "െ": 422,
425
+ "േ": 423,
426
+ "ൊ": 424,
427
+ "ോ": 425,
428
+ "്": 426,
429
+ "ൗ": 427,
430
+ "ൺ": 428,
431
+ "ൻ": 429,
432
+ "ർ": 430,
433
+ "ൽ": 431,
434
+ "ൾ": 432,
435
+ "ḷ": 433,
436
+ "ṅ": 434,
437
+ "–": 435,
438
+ "—": 436,
439
+ "‘": 437,
440
+ "’": 438,
441
+ "“": 439,
442
+ "”": 440,
443
+ "…": 441,
444
+ "→": 442,
445
+ "∞": 443,
446
+ "⌛": 444,
447
+ "⏰": 445,
448
+ "⏳": 446,
449
+ "<pad>": 447,
450
+ "<bos>": 448,
451
+ "<eos>": 449,
452
+ "<unk>": 450
453
+ }