Efe2898 committed on
Commit
6404abb
·
verified ·
1 Parent(s): ad8b8fc

Add Kumru-based reasoning tokenizer with fixed system instruction template

Browse files
README.md CHANGED
@@ -1,48 +1,19 @@
1
- ---
2
- library_name: tokenizers
3
- language:
4
- - tr
5
- tags:
6
- - turkish
7
- - tokenizer
8
- - byte-level-bpe
9
- - rslm
10
- ---
11
 
12
- # RSLM Tokenizer 65K
13
 
14
- CPU-safe Byte-Level BPE tokenizer for RSLM.
15
 
16
- ## Training data
 
 
 
 
 
17
 
18
- Dataset: `turkish-nlp-suite/BellaTurca`
19
 
20
- Subsets:
21
- - `AkademikDerlem`
22
- - `OzenliDerlem`
23
- - `temiz-OSCAR`
24
- - `temiz-mC4`
25
 
26
- Column: `text`
27
-
28
- Target estimated tokens: `700,000,000` total, approximately `175,000,000` per subset.
29
-
30
- ## Vocab
31
-
32
- - Requested vocab size: `65,536`
33
- - Actual vocab size: `65,536`
34
- - BPE min frequency: `3`
35
-
36
- ## Special tokens
37
-
38
- - `<|pad|>`
39
- - `<|bos|>`
40
- - `<|eos|>`
41
- - `<|unk|>`
42
- - `<|system|>`
43
- - `<|user|>`
44
- - `<|assistant|>`
45
- - `<|answer|>`
46
- - `<|end|>`
47
- - `<think>`
48
- - `</think>`
 
1
+ # RSLM Kumru-based Reasoning Tokenizer
 
 
 
 
 
 
 
 
 
2
 
3
+ Base tokenizer: `vngrs-ai/Kumru-2B`
4
 
5
+ This tokenizer preserves the base tokenizer's core special-token IDs:
6
 
7
+ - `<|pad|>` = 0
8
+ - `<|unknown|>` = 1
9
+ - `<|begin_of_text|>` = 2
10
+ - `<|end_of_text|>` = 3
11
+ - `<|system_instruction|>` = 4
12
+ - `<|system_instruction_end|>` = 5
13
 
14
+ Reasoning tokens:
15
 
16
+ - `<think>` = 50176
17
+ - `</think>` = 50177
 
 
 
18
 
19
+ The system prompt is intended to appear exactly once, at the beginning of each rendered conversation; `system` messages passed in `messages` are not rendered again by the chat template.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
chat_template.jinja ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% set sys_prompt = fixed_system_prompt|default('Sen Türkçe düşünen, güçlü muhakeme yapan ve açık cevap veren bir yardımcı modelsin.') -%}
2
+ <|begin_of_text|><|system_instruction|>
3
+ {{ sys_prompt }}<|system_instruction_end|>
4
+ {% for message in messages -%}
5
+ {% if message['role'] == 'system' -%}
6
+ {# Sistem mesajları burada tekrar yazılmaz; sabit sistem prompt yukarıda bir kere var. #}
7
+ {% elif message['role'] == 'user' -%}
8
+ <|user|>
9
+ {{ message['content'] }}<|end_of_turn|>
10
+ {% elif message['role'] == 'assistant' -%}
11
+ <|assistant|>
12
+ {{ message['content'] }}<|end_of_turn|>
13
+ {% endif -%}
14
+ {% endfor -%}
15
+ {% if add_generation_prompt -%}
16
+ <|assistant|>
17
+ {% endif -%}
model_config_tokenizer_patch.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 50181,
3
+ "pad_token_id": 0,
4
+ "unk_token_id": 1,
5
+ "bos_token_id": 2,
6
+ "eos_token_id": 3
7
+ }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
- "model_max_length": 262144,
3
- "tokenizer_class": "PreTrainedTokenizerFast",
4
  "clean_up_tokenization_spaces": false,
5
- "padding_side": "right",
6
- "truncation_side": "right",
7
- "bos_token": "<|bos|>",
8
- "eos_token": "<|eos|>",
9
- "unk_token": "<|unk|>",
10
- "pad_token": "<|pad|>",
11
- "additional_special_tokens": [
12
- "<|system|>",
13
  "<|user|>",
14
  "<|assistant|>",
15
- "<|answer|>",
16
- "<|end|>",
17
- "<think>",
18
- "</think>"
19
  ],
20
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}<|system|>\n{{ message['content'] }}<|end|>\n{% elif message['role'] == 'user' %}<|user|>\n{{ message['content'] }}<|end|>\n{% elif message['role'] == 'assistant' %}<|assistant|>\n{{ message['content'] }}<|end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}"
21
- }
 
 
 
 
 
 
1
  {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|begin_of_text|>",
4
  "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|end_of_text|>",
6
+ "extra_special_tokens": [
7
+ "<|system_instruction|>",
8
+ "<|system_instruction_end|>",
9
+ "<think>",
10
+ "</think>",
 
 
11
  "<|user|>",
12
  "<|assistant|>",
13
+ "<|end_of_turn|>"
 
 
 
14
  ],
15
+ "model_max_length": 32768,
16
+ "pad_token": "<|pad|>",
17
+ "padding_side": "right",
18
+ "tokenizer_class": "TokenizersBackend",
19
+ "truncation_side": "right",
20
+ "unk_token": "<|unknown|>"
21
+ }
tokenizer_patch_report.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "created_at": "2026-05-23T20:37:51.478542Z",
3
+ "base_tokenizer_repo": "vngrs-ai/Kumru-2B",
4
+ "output_dir": "/kaggle/working/kumru-tokenizer-rslm",
5
+ "model_max_length": 32768,
6
+ "vocab_size": 50181,
7
+ "num_added_tokens": 5,
8
+ "core_token_ids": {
9
+ "<|pad|>": 0,
10
+ "<|unknown|>": 1,
11
+ "<|begin_of_text|>": 2,
12
+ "<|end_of_text|>": 3,
13
+ "<|system_instruction|>": 4,
14
+ "<|system_instruction_end|>": 5
15
+ },
16
+ "reasoning_token_ids": {
17
+ "<think>": 50176,
18
+ "</think>": 50177
19
+ },
20
+ "role_token_ids": {
21
+ "<|user|>": 50178,
22
+ "<|assistant|>": 50179,
23
+ "<|end_of_turn|>": 50180
24
+ },
25
+ "model_config_patch": {
26
+ "vocab_size": 50181,
27
+ "pad_token_id": 0,
28
+ "unk_token_id": 1,
29
+ "bos_token_id": 2,
30
+ "eos_token_id": 3
31
+ }
32
+ }