Add Kumru-based reasoning tokenizer with fixed system instruction template
Browse files
- README.md +13 -42
- chat_template.jinja +17 -0
- model_config_tokenizer_patch.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +16 -16
- tokenizer_patch_report.json +32 -0
README.md
CHANGED
|
@@ -1,48 +1,19 @@
|
|
| 1 |
-
-
|
| 2 |
-
library_name: tokenizers
|
| 3 |
-
language:
|
| 4 |
-
- tr
|
| 5 |
-
tags:
|
| 6 |
-
- turkish
|
| 7 |
-
- tokenizer
|
| 8 |
-
- byte-level-bpe
|
| 9 |
-
- rslm
|
| 10 |
-
---
|
| 11 |
|
| 12 |
-
|
| 13 |
|
| 14 |
-
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
| 21 |
-
- `
|
| 22 |
-
- `OzenliDerlem`
|
| 23 |
-
- `temiz-OSCAR`
|
| 24 |
-
- `temiz-mC4`
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
Target estimated tokens: `700,000,000` total, approximately `175,000,000` per subset.
|
| 29 |
-
|
| 30 |
-
## Vocab
|
| 31 |
-
|
| 32 |
-
- Requested vocab size: `65,536`
|
| 33 |
-
- Actual vocab size: `65,536`
|
| 34 |
-
- BPE min frequency: `3`
|
| 35 |
-
|
| 36 |
-
## Special tokens
|
| 37 |
-
|
| 38 |
-
- `<|pad|>`
|
| 39 |
-
- `<|bos|>`
|
| 40 |
-
- `<|eos|>`
|
| 41 |
-
- `<|unk|>`
|
| 42 |
-
- `<|system|>`
|
| 43 |
-
- `<|user|>`
|
| 44 |
-
- `<|assistant|>`
|
| 45 |
-
- `<|answer|>`
|
| 46 |
-
- `<|end|>`
|
| 47 |
-
- `<think>`
|
| 48 |
-
- `</think>`
|
|
|
|
| 1 |
+
# RSLM Kumru-based Reasoning Tokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
Base tokenizer: `vngrs-ai/Kumru-2B`
|
| 4 |
|
| 5 |
+
This tokenizer preserves the following core special-token IDs:
|
| 6 |
|
| 7 |
+
- `<|pad|>` = 0
|
| 8 |
+
- `<|unknown|>` = 1
|
| 9 |
+
- `<|begin_of_text|>` = 2
|
| 10 |
+
- `<|end_of_text|>` = 3
|
| 11 |
+
- `<|system_instruction|>` = 4
|
| 12 |
+
- `<|system_instruction_end|>` = 5
|
| 13 |
|
| 14 |
+
Reasoning tokens:
|
| 15 |
|
| 16 |
+
- `<think>` = 50176
|
| 17 |
+
- `</think>` = 50177
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
The system prompt is intended to appear exactly once, at the beginning of each rendered conversation.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% set sys_prompt = fixed_system_prompt|default('Sen Türkçe düşünen, güçlü muhakeme yapan ve açık cevap veren bir yardımcı modelsin.') -%}
|
| 2 |
+
<|begin_of_text|><|system_instruction|>
|
| 3 |
+
{{ sys_prompt }}<|system_instruction_end|>
|
| 4 |
+
{% for message in messages -%}
|
| 5 |
+
{% if message['role'] == 'system' -%}
|
| 6 |
+
{# Sistem mesajları burada tekrar yazılmaz; sabit sistem prompt yukarıda bir kere var. #}
|
| 7 |
+
{% elif message['role'] == 'user' -%}
|
| 8 |
+
<|user|>
|
| 9 |
+
{{ message['content'] }}<|end_of_turn|>
|
| 10 |
+
{% elif message['role'] == 'assistant' -%}
|
| 11 |
+
<|assistant|>
|
| 12 |
+
{{ message['content'] }}<|end_of_turn|>
|
| 13 |
+
{% endif -%}
|
| 14 |
+
{% endfor -%}
|
| 15 |
+
{% if add_generation_prompt -%}
|
| 16 |
+
<|assistant|>
|
| 17 |
+
{% endif -%}
|
model_config_tokenizer_patch.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"vocab_size": 50181,
|
| 3 |
+
"pad_token_id": 0,
|
| 4 |
+
"unk_token_id": 1,
|
| 5 |
+
"bos_token_id": 2,
|
| 6 |
+
"eos_token_id": 3
|
| 7 |
+
}
|
tokenizer.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
CHANGED
|
@@ -1,21 +1,21 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
| 3 |
-
"
|
| 4 |
"clean_up_tokenization_spaces": false,
|
| 5 |
-
"
|
| 6 |
-
"
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
"additional_special_tokens": [
|
| 12 |
-
"<|system|>",
|
| 13 |
"<|user|>",
|
| 14 |
"<|assistant|>",
|
| 15 |
-
"<|
|
| 16 |
-
"<|end|>",
|
| 17 |
-
"<think>",
|
| 18 |
-
"</think>"
|
| 19 |
],
|
| 20 |
-
"
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"bos_token": "<|begin_of_text|>",
|
| 4 |
"clean_up_tokenization_spaces": false,
|
| 5 |
+
"eos_token": "<|end_of_text|>",
|
| 6 |
+
"extra_special_tokens": [
|
| 7 |
+
"<|system_instruction|>",
|
| 8 |
+
"<|system_instruction_end|>",
|
| 9 |
+
"<think>",
|
| 10 |
+
"</think>",
|
|
|
|
|
|
|
| 11 |
"<|user|>",
|
| 12 |
"<|assistant|>",
|
| 13 |
+
"<|end_of_turn|>"
|
|
|
|
|
|
|
|
|
|
| 14 |
],
|
| 15 |
+
"model_max_length": 32768,
|
| 16 |
+
"pad_token": "<|pad|>",
|
| 17 |
+
"padding_side": "right",
|
| 18 |
+
"tokenizer_class": "TokenizersBackend",
|
| 19 |
+
"truncation_side": "right",
|
| 20 |
+
"unk_token": "<|unknown|>"
|
| 21 |
+
}
|
tokenizer_patch_report.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"created_at": "2026-05-23T20:37:51.478542Z",
|
| 3 |
+
"base_tokenizer_repo": "vngrs-ai/Kumru-2B",
|
| 4 |
+
"output_dir": "/kaggle/working/kumru-tokenizer-rslm",
|
| 5 |
+
"model_max_length": 32768,
|
| 6 |
+
"vocab_size": 50181,
|
| 7 |
+
"num_added_tokens": 5,
|
| 8 |
+
"core_token_ids": {
|
| 9 |
+
"<|pad|>": 0,
|
| 10 |
+
"<|unknown|>": 1,
|
| 11 |
+
"<|begin_of_text|>": 2,
|
| 12 |
+
"<|end_of_text|>": 3,
|
| 13 |
+
"<|system_instruction|>": 4,
|
| 14 |
+
"<|system_instruction_end|>": 5
|
| 15 |
+
},
|
| 16 |
+
"reasoning_token_ids": {
|
| 17 |
+
"<think>": 50176,
|
| 18 |
+
"</think>": 50177
|
| 19 |
+
},
|
| 20 |
+
"role_token_ids": {
|
| 21 |
+
"<|user|>": 50178,
|
| 22 |
+
"<|assistant|>": 50179,
|
| 23 |
+
"<|end_of_turn|>": 50180
|
| 24 |
+
},
|
| 25 |
+
"model_config_patch": {
|
| 26 |
+
"vocab_size": 50181,
|
| 27 |
+
"pad_token_id": 0,
|
| 28 |
+
"unk_token_id": 1,
|
| 29 |
+
"bos_token_id": 2,
|
| 30 |
+
"eos_token_id": 3
|
| 31 |
+
}
|
| 32 |
+
}
|