Upload folder using huggingface_hub
Browse files
- README.md +6 -5
- config.json +5 -2
- generation_config.json +5 -2
- model.safetensors +1 -1
- special_tokens_map.json +9 -2
- tokenizer_class.py +12 -0
- tokenizer_config.json +11 -4
README.md
CHANGED
|
@@ -40,6 +40,7 @@ Developed by **[Dvitva AI](https://dvitva.ai)**.
|
|
| 40 |
| **Vocab size** | 86,075 (base + 4 chat tokens) |
|
| 41 |
| **Tokenizer** | Morfessor + BPE (Telugu morpheme-aware) |
|
| 42 |
| **Fine-tuning** | Full SFT on Telugu conversations |
|
|
|
|
| 43 |
| **Developed by** | [Dvitva AI](https://dvitva.ai) |
|
| 44 |
|
| 45 |
## Chat Template
|
|
@@ -66,6 +67,8 @@ The model generates after `<|assistant|>` and stops at `<|end|>`.
|
|
| 66 |
|
| 67 |
| Token | ID |
|
| 68 |
|---|---|
|
|
|
|
|
|
|
| 69 |
| `<|system|>` | 86071 |
|
| 70 |
| `<|user|>` | 86072 |
|
| 71 |
| `<|assistant|>` | 86073 |
|
|
@@ -126,7 +129,7 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=False))
|
|
| 126 |
|
| 127 |
### Using the CLI chat script
|
| 128 |
|
| 129 |
-
For the best
|
| 130 |
|
| 131 |
```bash
|
| 132 |
# Interactive multi-turn chat
|
|
@@ -156,12 +159,10 @@ This model uses a **Morfessor + BPE hybrid tokenizer** designed for Telugu:
|
|
| 156 |
|
| 157 |
```python
|
| 158 |
import morfessor, re
|
| 159 |
-
from huggingface_hub import hf_hub_download
|
| 160 |
|
| 161 |
-
# Load Morfessor model
|
| 162 |
-
morf_path = hf_hub_download(repo_id="dvitvaai/pothana-chat-300M", filename="morfessor_telugu.bin")
|
| 163 |
io = morfessor.MorfessorIO()
|
| 164 |
-
morf_model = io.read_binary_model_file(morf_path)
|
| 165 |
|
| 166 |
TELUGU_RE = re.compile(r"[\u0C00-\u0C7F]+")
|
| 167 |
|
|
|
|
| 40 |
| **Vocab size** | 86,075 (base + 4 chat tokens) |
|
| 41 |
| **Tokenizer** | Morfessor + BPE (Telugu morpheme-aware) |
|
| 42 |
| **Fine-tuning** | Full SFT on Telugu conversations |
|
| 43 |
+
| **Best val loss** | 2.4830389234855454 |
|
| 44 |
| **Developed by** | [Dvitva AI](https://dvitva.ai) |
|
| 45 |
|
| 46 |
## Chat Template
|
|
|
|
| 67 |
|
| 68 |
| Token | ID |
|
| 69 |
|---|---|
|
| 70 |
+
| `<bos>` | 2 |
|
| 71 |
+
| `<eos>` | 3 |
|
| 72 |
| `<|system|>` | 86071 |
|
| 73 |
| `<|user|>` | 86072 |
|
| 74 |
| `<|assistant|>` | 86073 |
|
|
|
|
| 129 |
|
| 130 |
### Using the CLI chat script
|
| 131 |
|
| 132 |
+
For the best experience, use the included `chat.py` CLI:
|
| 133 |
|
| 134 |
```bash
|
| 135 |
# Interactive multi-turn chat
|
|
|
|
| 159 |
|
| 160 |
```python
|
| 161 |
import morfessor, re
|
|
|
|
| 162 |
|
| 163 |
+
# Load Morfessor model
|
|
|
|
| 164 |
io = morfessor.MorfessorIO()
|
| 165 |
+
morf_model = io.read_binary_model_file("morfessor_telugu.bin")
|
| 166 |
|
| 167 |
TELUGU_RE = re.compile(r"[\u0C00-\u0C7F]+")
|
| 168 |
|
config.json
CHANGED
|
@@ -21,10 +21,13 @@
|
|
| 21 |
"tie_word_embeddings": true,
|
| 22 |
"pad_token_id": 0,
|
| 23 |
"bos_token_id": 2,
|
| 24 |
-
"eos_token_id": [
|
|
|
|
|
|
|
|
|
|
| 25 |
"attention_dropout": 0.0,
|
| 26 |
"initializer_range": 0.02,
|
| 27 |
"pretraining_tp": 1,
|
| 28 |
"use_cache": true,
|
| 29 |
"transformers_version": "4.40.0"
|
| 30 |
-
}
|
|
|
|
| 21 |
"tie_word_embeddings": true,
|
| 22 |
"pad_token_id": 0,
|
| 23 |
"bos_token_id": 2,
|
| 24 |
+
"eos_token_id": [
|
| 25 |
+
3,
|
| 26 |
+
86074
|
| 27 |
+
],
|
| 28 |
"attention_dropout": 0.0,
|
| 29 |
"initializer_range": 0.02,
|
| 30 |
"pretraining_tp": 1,
|
| 31 |
"use_cache": true,
|
| 32 |
"transformers_version": "4.40.0"
|
| 33 |
+
}
|
generation_config.json
CHANGED
|
@@ -1,7 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"_from_model_config": true,
|
| 3 |
"bos_token_id": 2,
|
| 4 |
-
"eos_token_id": [
|
|
|
|
|
|
|
|
|
|
| 5 |
"pad_token_id": 0,
|
| 6 |
"do_sample": true,
|
| 7 |
"temperature": 0.7,
|
|
@@ -10,4 +13,4 @@
|
|
| 10 |
"max_new_tokens": 256,
|
| 11 |
"repetition_penalty": 1.1,
|
| 12 |
"transformers_version": "4.40.0"
|
| 13 |
-
}
|
|
|
|
| 1 |
{
|
| 2 |
"_from_model_config": true,
|
| 3 |
"bos_token_id": 2,
|
| 4 |
+
"eos_token_id": [
|
| 5 |
+
3,
|
| 6 |
+
86074
|
| 7 |
+
],
|
| 8 |
"pad_token_id": 0,
|
| 9 |
"do_sample": true,
|
| 10 |
"temperature": 0.7,
|
|
|
|
| 13 |
"max_new_tokens": 256,
|
| 14 |
"repetition_penalty": 1.1,
|
| 15 |
"transformers_version": "4.40.0"
|
| 16 |
+
}
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1380356280
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2fb678f8659adecbe45390748938a5b3ec93c7bc5d5fa133fbab54723f8b168
|
| 3 |
size 1380356280
|
special_tokens_map.json
CHANGED
|
@@ -3,5 +3,12 @@
|
|
| 3 |
"eos_token": "<eos>",
|
| 4 |
"unk_token": "<unk>",
|
| 5 |
"pad_token": "<pad>",
|
| 6 |
-
"additional_special_tokens": [
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
"eos_token": "<eos>",
|
| 4 |
"unk_token": "<unk>",
|
| 5 |
"pad_token": "<pad>",
|
| 6 |
+
"additional_special_tokens": [
|
| 7 |
+
"<|system|>",
|
| 8 |
+
"<|user|>",
|
| 9 |
+
"<|assistant|>",
|
| 10 |
+
"<|end|>",
|
| 11 |
+
"<bos>",
|
| 12 |
+
"<eos>"
|
| 13 |
+
]
|
| 14 |
+
}
|
tokenizer_class.py
CHANGED
|
@@ -8,8 +8,14 @@ class TeluguTokenizer(PreTrainedTokenizerFast):
|
|
| 8 |
Tokens ending with @@ are continuation pieces that join to the next token.
|
| 9 |
This class overrides decode() to strip @@ markers and join morphemes:
|
| 10 |
"రెడ్డి@@ గారు" → "రెడ్డిగారు"
|
|
|
|
|
|
|
|
|
|
| 11 |
"""
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
def decode(self, token_ids, skip_special_tokens=False, **kwargs):
|
| 14 |
text = super().decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
|
| 15 |
# Strip @@ continuation markers:
|
|
@@ -17,4 +23,10 @@ class TeluguTokenizer(PreTrainedTokenizerFast):
|
|
| 17 |
text = text.replace("@@ ", "")
|
| 18 |
# Handle remaining @@ (before punctuation, end of string, etc.)
|
| 19 |
text = text.replace("@@", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
return text
|
|
|
|
| 8 |
Tokens ending with @@ are continuation pieces that join to the next token.
|
| 9 |
This class overrides decode() to strip @@ markers and join morphemes:
|
| 10 |
"రెడ్డి@@ గారు" → "రెడ్డిగారు"
|
| 11 |
+
|
| 12 |
+
Also strips chat special tokens (<|system|>, <|user|>, <|assistant|>, <|end|>)
|
| 13 |
+
from decoded output for clean text.
|
| 14 |
"""
|
| 15 |
|
| 16 |
+
# Chat special tokens to strip from output
|
| 17 |
+
_CHAT_SPECIALS = ["<|system|>", "<|user|>", "<|assistant|>", "<|end|>"]
|
| 18 |
+
|
| 19 |
def decode(self, token_ids, skip_special_tokens=False, **kwargs):
|
| 20 |
text = super().decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
|
| 21 |
# Strip @@ continuation markers:
|
|
|
|
| 23 |
text = text.replace("@@ ", "")
|
| 24 |
# Handle remaining @@ (before punctuation, end of string, etc.)
|
| 25 |
text = text.replace("@@", "")
|
| 26 |
+
# Strip chat special tokens
|
| 27 |
+
for special in self._CHAT_SPECIALS:
|
| 28 |
+
text = text.replace(special, "")
|
| 29 |
+
# Clean up extra whitespace from removed tokens
|
| 30 |
+
import re
|
| 31 |
+
text = re.sub(r" +", " ", text).strip()
|
| 32 |
return text
|
tokenizer_config.json
CHANGED
|
@@ -15,11 +15,18 @@
|
|
| 15 |
"add_eos_token": false,
|
| 16 |
"clean_up_tokenization_spaces": false,
|
| 17 |
"model_max_length": 2048,
|
| 18 |
-
"additional_special_tokens": ["<|system|>", "<|user|>", "<|assistant|>", "<|end|>"],
|
| 19 |
-
"chat_template": "{% for message in messages %}{% if loop.first %}<bos>{% endif %}{% if message['role'] == 'system' %}<|system|> {{ message['content'] }} <|end|>{% elif message['role'] == 'user' %}<|user|> {{ message['content'] }} <|end|>{% elif message['role'] == 'assistant' %}<|assistant|> {{ message['content'] }} <|end|>{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
|
| 20 |
"extra_info": {
|
| 21 |
"type": "morfessor_bpe_telugu",
|
| 22 |
"separator": "@@",
|
| 23 |
"note": "This tokenizer expects Morfessor-segmented text as input. For raw Telugu text, run Morfessor segmentation first using the included morfessor_telugu.bin model. Tokens ending with '@@' are continuation pieces that join to the next token. The decoder handles @@ removal automatically."
|
| 24 |
-
}
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
"add_eos_token": false,
|
| 16 |
"clean_up_tokenization_spaces": false,
|
| 17 |
"model_max_length": 2048,
|
|
|
|
|
|
|
| 18 |
"extra_info": {
|
| 19 |
"type": "morfessor_bpe_telugu",
|
| 20 |
"separator": "@@",
|
| 21 |
"note": "This tokenizer expects Morfessor-segmented text as input. For raw Telugu text, run Morfessor segmentation first using the included morfessor_telugu.bin model. Tokens ending with '@@' are continuation pieces that join to the next token. The decoder handles @@ removal automatically."
|
| 22 |
+
},
|
| 23 |
+
"additional_special_tokens": [
|
| 24 |
+
"<|system|>",
|
| 25 |
+
"<|user|>",
|
| 26 |
+
"<|assistant|>",
|
| 27 |
+
"<|end|>",
|
| 28 |
+
"<bos>",
|
| 29 |
+
"<eos>"
|
| 30 |
+
],
|
| 31 |
+
"chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'system' %}<|system|>{{ message['content'] }}<|end|>{% elif message['role'] == 'user' %}<|user|>{{ message['content'] }}<|end|>{% elif message['role'] == 'assistant' %}<|assistant|>{{ message['content'] }}<|end|>{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}"
|
| 32 |
+
}
|