neshkatrapati committed on
Commit
ebb32fd
·
verified ·
1 Parent(s): e0ef6c3

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -40,6 +40,7 @@ Developed by **[Dvitva AI](https://dvitva.ai)**.
40
  | **Vocab size** | 86,075 (base + 4 chat tokens) |
41
  | **Tokenizer** | Morfessor + BPE (Telugu morpheme-aware) |
42
  | **Fine-tuning** | Full SFT on Telugu conversations |
 
43
  | **Developed by** | [Dvitva AI](https://dvitva.ai) |
44
 
45
  ## Chat Template
@@ -66,6 +67,8 @@ The model generates after `<|assistant|>` and stops at `<|end|>`.
66
 
67
  | Token | ID |
68
  |---|---|
 
 
69
  | `<|system|>` | 86071 |
70
  | `<|user|>` | 86072 |
71
  | `<|assistant|>` | 86073 |
@@ -126,7 +129,7 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=False))
126
 
127
  ### Using the CLI chat script
128
 
129
- For the best interactive experience, use the `chat.py` CLI from the [training repo](https://github.com/dvitvaai/telugu-lm):
130
 
131
  ```bash
132
  # Interactive multi-turn chat
@@ -156,12 +159,10 @@ This model uses a **Morfessor + BPE hybrid tokenizer** designed for Telugu:
156
 
157
  ```python
158
  import morfessor, re
159
- from huggingface_hub import hf_hub_download
160
 
161
- # Load Morfessor model from the repo
162
- morf_path = hf_hub_download(repo_id="dvitvaai/pothana-chat-300M", filename="morfessor_telugu.bin")
163
  io = morfessor.MorfessorIO()
164
- morf_model = io.read_binary_model_file(morf_path)
165
 
166
  TELUGU_RE = re.compile(r"[\u0C00-\u0C7F]+")
167
 
 
40
  | **Vocab size** | 86,075 (base + 4 chat tokens) |
41
  | **Tokenizer** | Morfessor + BPE (Telugu morpheme-aware) |
42
  | **Fine-tuning** | Full SFT on Telugu conversations |
43
+ | **Best val loss** | 2.4830389234855454 |
44
  | **Developed by** | [Dvitva AI](https://dvitva.ai) |
45
 
46
  ## Chat Template
 
67
 
68
  | Token | ID |
69
  |---|---|
70
+ | `<bos>` | 2 |
71
+ | `<eos>` | 3 |
72
  | `<|system|>` | 86071 |
73
  | `<|user|>` | 86072 |
74
  | `<|assistant|>` | 86073 |
 
129
 
130
  ### Using the CLI chat script
131
 
132
+ For the best experience, use the included `chat.py` CLI:
133
 
134
  ```bash
135
  # Interactive multi-turn chat
 
159
 
160
  ```python
161
  import morfessor, re
 
162
 
163
+ # Load Morfessor model
 
164
  io = morfessor.MorfessorIO()
165
+ morf_model = io.read_binary_model_file("morfessor_telugu.bin")
166
 
167
  TELUGU_RE = re.compile(r"[\u0C00-\u0C7F]+")
168
 
config.json CHANGED
@@ -21,10 +21,13 @@
21
  "tie_word_embeddings": true,
22
  "pad_token_id": 0,
23
  "bos_token_id": 2,
24
- "eos_token_id": [3, 86074],
 
 
 
25
  "attention_dropout": 0.0,
26
  "initializer_range": 0.02,
27
  "pretraining_tp": 1,
28
  "use_cache": true,
29
  "transformers_version": "4.40.0"
30
- }
 
21
  "tie_word_embeddings": true,
22
  "pad_token_id": 0,
23
  "bos_token_id": 2,
24
+ "eos_token_id": [
25
+ 3,
26
+ 86074
27
+ ],
28
  "attention_dropout": 0.0,
29
  "initializer_range": 0.02,
30
  "pretraining_tp": 1,
31
  "use_cache": true,
32
  "transformers_version": "4.40.0"
33
+ }
generation_config.json CHANGED
@@ -1,7 +1,10 @@
1
  {
2
  "_from_model_config": true,
3
  "bos_token_id": 2,
4
- "eos_token_id": [3, 86074],
 
 
 
5
  "pad_token_id": 0,
6
  "do_sample": true,
7
  "temperature": 0.7,
@@ -10,4 +13,4 @@
10
  "max_new_tokens": 256,
11
  "repetition_penalty": 1.1,
12
  "transformers_version": "4.40.0"
13
- }
 
1
  {
2
  "_from_model_config": true,
3
  "bos_token_id": 2,
4
+ "eos_token_id": [
5
+ 3,
6
+ 86074
7
+ ],
8
  "pad_token_id": 0,
9
  "do_sample": true,
10
  "temperature": 0.7,
 
13
  "max_new_tokens": 256,
14
  "repetition_penalty": 1.1,
15
  "transformers_version": "4.40.0"
16
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:236a8a7692f176c516db8a5c7448795000e1677de1c2798cb75c7d37aa6bee1f
3
  size 1380356280
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2fb678f8659adecbe45390748938a5b3ec93c7bc5d5fa133fbab54723f8b168
3
  size 1380356280
special_tokens_map.json CHANGED
@@ -3,5 +3,12 @@
3
  "eos_token": "<eos>",
4
  "unk_token": "<unk>",
5
  "pad_token": "<pad>",
6
- "additional_special_tokens": ["<|system|>", "<|user|>", "<|assistant|>", "<|end|>"]
7
- }
 
 
 
 
 
 
 
 
3
  "eos_token": "<eos>",
4
  "unk_token": "<unk>",
5
  "pad_token": "<pad>",
6
+ "additional_special_tokens": [
7
+ "<|system|>",
8
+ "<|user|>",
9
+ "<|assistant|>",
10
+ "<|end|>",
11
+ "<bos>",
12
+ "<eos>"
13
+ ]
14
+ }
tokenizer_class.py CHANGED
@@ -8,8 +8,14 @@ class TeluguTokenizer(PreTrainedTokenizerFast):
8
  Tokens ending with @@ are continuation pieces that join to the next token.
9
  This class overrides decode() to strip @@ markers and join morphemes:
10
  "రెడ్డి@@ గారు" → "రెడ్డిగారు"
 
 
 
11
  """
12
 
 
 
 
13
  def decode(self, token_ids, skip_special_tokens=False, **kwargs):
14
  text = super().decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
15
  # Strip @@ continuation markers:
@@ -17,4 +23,10 @@ class TeluguTokenizer(PreTrainedTokenizerFast):
17
  text = text.replace("@@ ", "")
18
  # Handle remaining @@ (before punctuation, end of string, etc.)
19
  text = text.replace("@@", "")
 
 
 
 
 
 
20
  return text
 
8
  Tokens ending with @@ are continuation pieces that join to the next token.
9
  This class overrides decode() to strip @@ markers and join morphemes:
10
  "రెడ్డి@@ గారు" → "రెడ్డిగారు"
11
+
12
+ Also strips chat special tokens (<|system|>, <|user|>, <|assistant|>, <|end|>)
13
+ from decoded output for clean text.
14
  """
15
 
16
+ # Chat special tokens to strip from output
17
+ _CHAT_SPECIALS = ["<|system|>", "<|user|>", "<|assistant|>", "<|end|>"]
18
+
19
  def decode(self, token_ids, skip_special_tokens=False, **kwargs):
20
  text = super().decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
21
  # Strip @@ continuation markers:
 
23
  text = text.replace("@@ ", "")
24
  # Handle remaining @@ (before punctuation, end of string, etc.)
25
  text = text.replace("@@", "")
26
+ # Strip chat special tokens
27
+ for special in self._CHAT_SPECIALS:
28
+ text = text.replace(special, "")
29
+ # Clean up extra whitespace from removed tokens
30
+ import re
31
+ text = re.sub(r" +", " ", text).strip()
32
  return text
tokenizer_config.json CHANGED
@@ -15,11 +15,18 @@
15
  "add_eos_token": false,
16
  "clean_up_tokenization_spaces": false,
17
  "model_max_length": 2048,
18
- "additional_special_tokens": ["<|system|>", "<|user|>", "<|assistant|>", "<|end|>"],
19
- "chat_template": "{% for message in messages %}{% if loop.first %}<bos>{% endif %}{% if message['role'] == 'system' %}<|system|> {{ message['content'] }} <|end|>{% elif message['role'] == 'user' %}<|user|> {{ message['content'] }} <|end|>{% elif message['role'] == 'assistant' %}<|assistant|> {{ message['content'] }} <|end|>{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
20
  "extra_info": {
21
  "type": "morfessor_bpe_telugu",
22
  "separator": "@@",
23
  "note": "This tokenizer expects Morfessor-segmented text as input. For raw Telugu text, run Morfessor segmentation first using the included morfessor_telugu.bin model. Tokens ending with '@@' are continuation pieces that join to the next token. The decoder handles @@ removal automatically."
24
- }
25
- }
 
 
 
 
 
 
 
 
 
 
15
  "add_eos_token": false,
16
  "clean_up_tokenization_spaces": false,
17
  "model_max_length": 2048,
 
 
18
  "extra_info": {
19
  "type": "morfessor_bpe_telugu",
20
  "separator": "@@",
21
  "note": "This tokenizer expects Morfessor-segmented text as input. For raw Telugu text, run Morfessor segmentation first using the included morfessor_telugu.bin model. Tokens ending with '@@' are continuation pieces that join to the next token. The decoder handles @@ removal automatically."
22
+ },
23
+ "additional_special_tokens": [
24
+ "<|system|>",
25
+ "<|user|>",
26
+ "<|assistant|>",
27
+ "<|end|>",
28
+ "<bos>",
29
+ "<eos>"
30
+ ],
31
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'system' %}<|system|>{{ message['content'] }}<|end|>{% elif message['role'] == 'user' %}<|user|>{{ message['content'] }}<|end|>{% elif message['role'] == 'assistant' %}<|assistant|>{{ message['content'] }}<|end|>{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}"
32
+ }