voidful committed on
Commit
989d8ec
·
verified ·
1 Parent(s): 82e506e

Add Open Formosa special tokens

Browse files
Files changed (6) hide show
  1. README.md +12 -32
  2. added_tokens.json +171 -0
  3. evaluation_report.json +19 -53
  4. special_tokens_map.json +140 -6
  5. tokenizer.json +1209 -3
  6. tokenizer_config.json +162 -12
README.md CHANGED
@@ -11,8 +11,12 @@ tags:
11
 
12
  # PangolinTokenizer
13
 
14
- Byte-level BPE tokenizer trained for Traditional Chinese, Taiwan text, multilingual
15
- text, rich transcription, OCR-style text, and generic multimodal control formats.
 
 
 
 
16
 
17
  ## Usage
18
 
@@ -29,36 +33,12 @@ ids = tokenizer.encode(text)
29
  decoded = tokenizer.decode(ids)
30
  ```
31
 
32
- ## Files
33
-
34
- - `config.json`
35
- - `tokenizer_config.json`
36
- - `tokenizer.json`
37
- - `vocab.json`
38
- - `merges.txt`
39
- - `special_tokens_map.json`
40
- - `evaluation_report.json`
41
 
42
- ## Tokenizer Details
43
-
44
- - Type: Byte-level BPE
45
- - Vocabulary size: 114,688
46
- - Learned merges: 114,397
47
  - Model max length metadata: 131,072
48
- - Minimum merge frequency: 5
49
- - Transformers class: `GPT2TokenizerFast`
50
  - `trust_remote_code`: not required
51
-
52
- ## Safety Notes
53
-
54
- This tokenizer intentionally does not include discrete audio codec token ranges.
55
- It also intentionally does not include dense timestamp token ranges. Audio should
56
- be represented through external references or embeddings outside this tokenizer.
57
-
58
- Evaluation checks confirmed:
59
-
60
- - Transformers `AutoTokenizer` loading works with `trust_remote_code=False`
61
- - Traditional Chinese and Bopomofo smoke roundtrip works
62
- - No tokens matching `<|audio_[0-9]+|>`
63
- - No dense timestamp token ranges matching `<|ts_[0-9]+|>`,
64
- `<|timestamp_[0-9]+|>`, or `<|time_[0-9]+|>`
 
11
 
12
  # PangolinTokenizer
13
 
14
+ Byte-level BPE tokenizer for Traditional Chinese, Taiwan text, multilingual text,
15
+ rich transcription, OCR-style text, and generic control formats.
16
+
17
+ This revision adds the Open Formosa required control tokens as special tokens.
18
+ The base BPE vocabulary size remains 114,688. The effective tokenizer length,
19
+ including added special tokens, is 114,822.
20
 
21
  ## Usage
22
 
 
33
  decoded = tokenizer.decode(ids)
34
  ```
35
 
36
+ ## Open Formosa Compatibility
 
 
 
 
 
 
 
 
37
 
38
+ - Required special tokens present: 157
39
+ - Required special tokens encode as single IDs: yes
40
+ - Standard special tokens: `<unk>`, `<s>`, `</s>`, `<pad>`
 
 
41
  - Model max length metadata: 131,072
 
 
42
  - `trust_remote_code`: not required
43
+ - No discrete audio codec token ranges are included.
44
+ - No dense timestamp token ranges are included.
 
 
 
 
 
 
 
 
 
 
 
 
added_tokens.json ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|pad|>": 114653,
3
+ "<|bos|>": 114654,
4
+ "<|eos|>": 114655,
5
+ "<|unk|>": 114656,
6
+ "<|system|>": 114657,
7
+ "<|user_channel|>": 114658,
8
+ "<|assistant_channel|>": 114659,
9
+ "<|task:speech_to_text|>": 114660,
10
+ "<|task:text_to_speech|>": 114661,
11
+ "<|input_audio_start|>": 114662,
12
+ "<|input_audio_end|>": 114663,
13
+ "<|audio_ref_start|>": 114664,
14
+ "<|audio_ref_end|>": 114665,
15
+ "<|audio_start|>": 114666,
16
+ "<|audio_end|>": 114667,
17
+ "<|speech_start|>": 114668,
18
+ "<|speech_end|>": 114669,
19
+ "<|transcript_start|>": 114670,
20
+ "<|transcript_end|>": 114671,
21
+ "<|segment_start|>": 114672,
22
+ "<|segment_end|>": 114673,
23
+ "<|speaker|>": 114674,
24
+ "<|start_time|>": 114675,
25
+ "<|end_time|>": 114676,
26
+ "<|duration|>": 114677,
27
+ "<|content|>": 114678,
28
+ "<|non_speech_event|>": 114679,
29
+ "<|retrieval_result_start|>": 114680,
30
+ "<|retrieval_result_end|>": 114681,
31
+ "<|ocr_start|>": 114682,
32
+ "<|ocr_end|>": 114683,
33
+ "<|image_start|>": 114684,
34
+ "<|image_end|>": 114685,
35
+ "<|video_start|>": 114686,
36
+ "<|video_end|>": 114687,
37
+ "<unk>": 114688,
38
+ "<s>": 114689,
39
+ "</s>": 114690,
40
+ "<pad>": 114691,
41
+ "<|user|>": 114692,
42
+ "<|assistant|>": 114693,
43
+ "<|tool_call|>": 114694,
44
+ "<|tool_response|>": 114695,
45
+ "<|endoftext|>": 114696,
46
+ "<think>": 114697,
47
+ "</think>": 114698,
48
+ "<|no_think|>": 114699,
49
+ "<|think|>": 114700,
50
+ "<|think_max|>": 114701,
51
+ "<|task:text_to_text|>": 114702,
52
+ "<|task:speech_to_speech|>": 114703,
53
+ "<|task:text_speech_to_text|>": 114704,
54
+ "<|task:text_speech_to_speech|>": 114705,
55
+ "<|task:full_duplex_speech|>": 114706,
56
+ "<|task:agent|>": 114707,
57
+ "<|task:tool_use|>": 114708,
58
+ "<|task:rag|>": 114709,
59
+ "<|task:code_execution|>": 114710,
60
+ "<|task:document_qa|>": 114711,
61
+ "<|task:data_analysis|>": 114712,
62
+ "<|task:workflow|>": 114713,
63
+ "<|reasoning_mode:none|>": 114714,
64
+ "<|reasoning_mode:short|>": 114715,
65
+ "<|reasoning_mode:deep|>": 114716,
66
+ "<|reasoning_mode:verify|>": 114717,
67
+ "<|private_reasoning_start|>": 114718,
68
+ "<|private_reasoning_end|>": 114719,
69
+ "<|reasoning_summary_start|>": 114720,
70
+ "<|reasoning_summary_end|>": 114721,
71
+ "<|plan_start|>": 114722,
72
+ "<|plan_end|>": 114723,
73
+ "<|step_start|>": 114724,
74
+ "<|step_end|>": 114725,
75
+ "<|action_start|>": 114726,
76
+ "<|action_end|>": 114727,
77
+ "<|observation_start|>": 114728,
78
+ "<|observation_end|>": 114729,
79
+ "<|reflection_start|>": 114730,
80
+ "<|reflection_end|>": 114731,
81
+ "<|verification_start|>": 114732,
82
+ "<|verification_end|>": 114733,
83
+ "<|tool_schema_start|>": 114734,
84
+ "<|tool_schema_end|>": 114735,
85
+ "<|tool_call_start|>": 114736,
86
+ "<|tool_call_end|>": 114737,
87
+ "<|tool_result_start|>": 114738,
88
+ "<|tool_result_end|>": 114739,
89
+ "<|tool_error_start|>": 114740,
90
+ "<|tool_error_end|>": 114741,
91
+ "<|retrieval_query_start|>": 114742,
92
+ "<|retrieval_query_end|>": 114743,
93
+ "<|citation_start|>": 114744,
94
+ "<|citation_end|>": 114745,
95
+ "<|memory_read_start|>": 114746,
96
+ "<|memory_read_end|>": 114747,
97
+ "<|memory_write_start|>": 114748,
98
+ "<|memory_write_end|>": 114749,
99
+ "<|final_answer_start|>": 114750,
100
+ "<|final_answer_end|>": 114751,
101
+ "<|json_start|>": 114752,
102
+ "<|json_end|>": 114753,
103
+ "<|code_start|>": 114754,
104
+ "<|code_end|>": 114755,
105
+ "<|markdown_start|>": 114756,
106
+ "<|markdown_end|>": 114757,
107
+ "<|duplex_start|>": 114758,
108
+ "<|duplex_end|>": 114759,
109
+ "<|system_channel|>": 114760,
110
+ "<|listen|>": 114761,
111
+ "<|speak|>": 114762,
112
+ "<|listen_speak|>": 114763,
113
+ "<|output_audio_start|>": 114764,
114
+ "<|output_audio_end|>": 114765,
115
+ "<|text_start|>": 114766,
116
+ "<|text_end|>": 114767,
117
+ "<|overlap|>": 114768,
118
+ "<|barge_in|>": 114769,
119
+ "<|interruption|>": 114770,
120
+ "<|interruption_repair|>": 114771,
121
+ "<|backchannel|>": 114772,
122
+ "<|turn_yield|>": 114773,
123
+ "<|hold|>": 114774,
124
+ "<|silence|>": 114775,
125
+ "<|non_speech|>": 114776,
126
+ "<|voice_reference_start|>": 114777,
127
+ "<|voice_reference_end|>": 114778,
128
+ "<|voice_reference|>": 114779,
129
+ "<|voice_switch|>": 114780,
130
+ "<|speaker_style|>": 114781,
131
+ "<|prosody_control|>": 114782,
132
+ "<|zh_tw|>": 114783,
133
+ "<|zh_hant|>": 114784,
134
+ "<|taigi|>": 114785,
135
+ "<|hakka|>": 114786,
136
+ "<|bopomofo|>": 114787,
137
+ "<|mixed_en|>": 114788,
138
+ "<|en|>": 114789,
139
+ "<|ja|>": 114790,
140
+ "<|ko|>": 114791,
141
+ "<|vi|>": 114792,
142
+ "<|id|>": 114793,
143
+ "<|th|>": 114794,
144
+ "<|asr|>": 114795,
145
+ "<|tts|>": 114796,
146
+ "<|speaker_0|>": 114797,
147
+ "<|speaker_1|>": 114798,
148
+ "<|speaker_2|>": 114799,
149
+ "<|speaker_3|>": 114800,
150
+ "<|timestamp|>": 114801,
151
+ "<|noise|>": 114802,
152
+ "<|laugh|>": 114803,
153
+ "<|breath|>": 114804,
154
+ "<|pause|>": 114805,
155
+ "<|prosody|>": 114806,
156
+ "<|pron|>": 114807,
157
+ "</|pron|>": 114808,
158
+ "<|image|>": 114809,
159
+ "<|ocr|>": 114810,
160
+ "<|bbox|>": 114811,
161
+ "<|line|>": 114812,
162
+ "<|table|>": 114813,
163
+ "<|row|>": 114814,
164
+ "<|col|>": 114815,
165
+ "<|cell|>": 114816,
166
+ "<|reading_order|>": 114817,
167
+ "<|source|>": 114818,
168
+ "<|cite|>": 114819,
169
+ "<|evidence|>": 114820,
170
+ "<|quote|>": 114821
171
+ }
evaluation_report.json CHANGED
@@ -1,58 +1,24 @@
1
  {
2
  "ok": true,
3
- "failures": [],
4
- "sections": {
5
- "Timestamp / rich transcription checks": {
6
- "rich_transcription_token_ids": {
7
- "<|transcript_start|>": 114670,
8
- "<|transcript_end|>": 114671,
9
- "<|segment_start|>": 114672,
10
- "<|segment_end|>": 114673,
11
- "<|speaker|>": 114674,
12
- "<|start_time|>": 114675,
13
- "<|end_time|>": 114676,
14
- "<|duration|>": 114677,
15
- "<|content|>": 114678,
16
- "<|non_speech_event|>": 114679
17
- },
18
- "timestamp_precision_digits": 2,
19
- "json_roundtrip_ok": true,
20
- "dense_timestamp_tokens_found": [],
21
- "missing_rich_transcription_tokens": [],
22
- "timestamp_strings_present": {
23
- "0.00": true,
24
- "3.42": true,
25
- "10.25": true,
26
- "3575.50": true
27
- },
28
- "non_speech_labels_present": {
29
- "[Silence]": true,
30
- "[Noise]": true,
31
- "[Music]": true,
32
- "[Unintelligible Speech]": true
33
- },
34
- "text_roundtrip_ok": {
35
- "traditional_chinese": true,
36
- "bopomofo_mixed_romanization": true,
37
- "json_syntax": true
38
- },
39
- "required_fields_ok": true,
40
- "parse_error": null
41
- }
42
  },
43
- "rich_transcription_token_ids": {
44
- "<|transcript_start|>": 114670,
45
- "<|transcript_end|>": 114671,
46
- "<|segment_start|>": 114672,
47
- "<|segment_end|>": 114673,
48
- "<|speaker|>": 114674,
49
- "<|start_time|>": 114675,
50
- "<|end_time|>": 114676,
51
- "<|duration|>": 114677,
52
- "<|content|>": 114678,
53
- "<|non_speech_event|>": 114679
54
  },
55
- "timestamp_precision_digits": 2,
56
- "json_roundtrip_ok": true,
57
- "dense_timestamp_tokens_found": []
 
 
58
  }
 
1
  {
2
  "ok": true,
3
+ "tokenizer": "voidful/PangolinTokenizer",
4
+ "base_vocab_size": 114688,
5
+ "effective_vocab_size": 114822,
6
+ "required_special_token_count": 157,
7
+ "required_special_tokens_missing": [],
8
+ "required_special_tokens_single_id": true,
9
+ "standard_special_tokens": {
10
+ "bos_token": "<s>",
11
+ "eos_token": "</s>",
12
+ "unk_token": "<unk>",
13
+ "pad_token": "<pad>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  },
15
+ "banned_token_matches": {
16
+ "audio_numeric_tokens": [],
17
+ "dense_timestamp_tokens": []
 
 
 
 
 
 
 
 
18
  },
19
+ "smoke_texts": {
20
+ "traditional_chinese": "台灣本土語言模型需要保留繁體中文、注音ㄅㄆㄇ與台語漢字。",
21
+ "duplex": "<|task:full_duplex_speech|><|audio_ref_start|>audio://utterance_000001<|audio_ref_end|>",
22
+ "tool_call": "<|tool_call_start|>{\"name\":\"search_documents\",\"arguments\":{\"query\":\"健保年金\"}}<|tool_call_end|>"
23
+ }
24
  }
special_tokens_map.json CHANGED
@@ -1,5 +1,13 @@
1
  {
 
 
 
 
2
  "additional_special_tokens": [
 
 
 
 
3
  "<|system|>",
4
  "<|user_channel|>",
5
  "<|assistant_channel|>",
@@ -30,10 +38,136 @@
30
  "<|image_start|>",
31
  "<|image_end|>",
32
  "<|video_start|>",
33
- "<|video_end|>"
34
- ],
35
- "pad_token": "<|pad|>",
36
- "bos_token": "<|bos|>",
37
- "eos_token": "<|eos|>",
38
- "unk_token": "<|unk|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  }
 
1
  {
2
+ "unk_token": "<unk>",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "pad_token": "<pad>",
6
  "additional_special_tokens": [
7
+ "<|bos|>",
8
+ "<|eos|>",
9
+ "<|unk|>",
10
+ "<|pad|>",
11
  "<|system|>",
12
  "<|user_channel|>",
13
  "<|assistant_channel|>",
 
38
  "<|image_start|>",
39
  "<|image_end|>",
40
  "<|video_start|>",
41
+ "<|video_end|>",
42
+ "<|user|>",
43
+ "<|assistant|>",
44
+ "<|tool_call|>",
45
+ "<|tool_response|>",
46
+ "<|endoftext|>",
47
+ "<think>",
48
+ "</think>",
49
+ "<|no_think|>",
50
+ "<|think|>",
51
+ "<|think_max|>",
52
+ "<|task:text_to_text|>",
53
+ "<|task:speech_to_speech|>",
54
+ "<|task:text_speech_to_text|>",
55
+ "<|task:text_speech_to_speech|>",
56
+ "<|task:full_duplex_speech|>",
57
+ "<|task:agent|>",
58
+ "<|task:tool_use|>",
59
+ "<|task:rag|>",
60
+ "<|task:code_execution|>",
61
+ "<|task:document_qa|>",
62
+ "<|task:data_analysis|>",
63
+ "<|task:workflow|>",
64
+ "<|reasoning_mode:none|>",
65
+ "<|reasoning_mode:short|>",
66
+ "<|reasoning_mode:deep|>",
67
+ "<|reasoning_mode:verify|>",
68
+ "<|private_reasoning_start|>",
69
+ "<|private_reasoning_end|>",
70
+ "<|reasoning_summary_start|>",
71
+ "<|reasoning_summary_end|>",
72
+ "<|plan_start|>",
73
+ "<|plan_end|>",
74
+ "<|step_start|>",
75
+ "<|step_end|>",
76
+ "<|action_start|>",
77
+ "<|action_end|>",
78
+ "<|observation_start|>",
79
+ "<|observation_end|>",
80
+ "<|reflection_start|>",
81
+ "<|reflection_end|>",
82
+ "<|verification_start|>",
83
+ "<|verification_end|>",
84
+ "<|tool_schema_start|>",
85
+ "<|tool_schema_end|>",
86
+ "<|tool_call_start|>",
87
+ "<|tool_call_end|>",
88
+ "<|tool_result_start|>",
89
+ "<|tool_result_end|>",
90
+ "<|tool_error_start|>",
91
+ "<|tool_error_end|>",
92
+ "<|retrieval_query_start|>",
93
+ "<|retrieval_query_end|>",
94
+ "<|citation_start|>",
95
+ "<|citation_end|>",
96
+ "<|memory_read_start|>",
97
+ "<|memory_read_end|>",
98
+ "<|memory_write_start|>",
99
+ "<|memory_write_end|>",
100
+ "<|final_answer_start|>",
101
+ "<|final_answer_end|>",
102
+ "<|json_start|>",
103
+ "<|json_end|>",
104
+ "<|code_start|>",
105
+ "<|code_end|>",
106
+ "<|markdown_start|>",
107
+ "<|markdown_end|>",
108
+ "<|duplex_start|>",
109
+ "<|duplex_end|>",
110
+ "<|system_channel|>",
111
+ "<|listen|>",
112
+ "<|speak|>",
113
+ "<|listen_speak|>",
114
+ "<|output_audio_start|>",
115
+ "<|output_audio_end|>",
116
+ "<|text_start|>",
117
+ "<|text_end|>",
118
+ "<|overlap|>",
119
+ "<|barge_in|>",
120
+ "<|interruption|>",
121
+ "<|interruption_repair|>",
122
+ "<|backchannel|>",
123
+ "<|turn_yield|>",
124
+ "<|hold|>",
125
+ "<|silence|>",
126
+ "<|non_speech|>",
127
+ "<|voice_reference_start|>",
128
+ "<|voice_reference_end|>",
129
+ "<|voice_reference|>",
130
+ "<|voice_switch|>",
131
+ "<|speaker_style|>",
132
+ "<|prosody_control|>",
133
+ "<|zh_tw|>",
134
+ "<|zh_hant|>",
135
+ "<|taigi|>",
136
+ "<|hakka|>",
137
+ "<|bopomofo|>",
138
+ "<|mixed_en|>",
139
+ "<|en|>",
140
+ "<|ja|>",
141
+ "<|ko|>",
142
+ "<|vi|>",
143
+ "<|id|>",
144
+ "<|th|>",
145
+ "<|asr|>",
146
+ "<|tts|>",
147
+ "<|speaker_0|>",
148
+ "<|speaker_1|>",
149
+ "<|speaker_2|>",
150
+ "<|speaker_3|>",
151
+ "<|timestamp|>",
152
+ "<|noise|>",
153
+ "<|laugh|>",
154
+ "<|breath|>",
155
+ "<|pause|>",
156
+ "<|prosody|>",
157
+ "<|pron|>",
158
+ "</|pron|>",
159
+ "<|image|>",
160
+ "<|ocr|>",
161
+ "<|bbox|>",
162
+ "<|line|>",
163
+ "<|table|>",
164
+ "<|row|>",
165
+ "<|col|>",
166
+ "<|cell|>",
167
+ "<|reading_order|>",
168
+ "<|source|>",
169
+ "<|cite|>",
170
+ "<|evidence|>",
171
+ "<|quote|>"
172
+ ]
173
  }
tokenizer.json CHANGED
@@ -317,6 +317,1212 @@
317
  "rstrip": false,
318
  "normalized": false,
319
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  }
321
  ],
322
  "normalizer": null,
@@ -361,9 +1567,9 @@
361
  "model": {
362
  "type": "BPE",
363
  "dropout": null,
364
- "unk_token": "<|unk|>",
365
- "continuing_subword_prefix": null,
366
- "end_of_word_suffix": null,
367
  "fuse_unk": false,
368
  "byte_fallback": false,
369
  "ignore_merges": false,
 
317
  "rstrip": false,
318
  "normalized": false,
319
  "special": true
320
+ },
321
+ {
322
+ "id": 114688,
323
+ "content": "<unk>",
324
+ "single_word": false,
325
+ "lstrip": false,
326
+ "rstrip": false,
327
+ "normalized": false,
328
+ "special": true
329
+ },
330
+ {
331
+ "id": 114689,
332
+ "content": "<s>",
333
+ "single_word": false,
334
+ "lstrip": false,
335
+ "rstrip": false,
336
+ "normalized": false,
337
+ "special": true
338
+ },
339
+ {
340
+ "id": 114690,
341
+ "content": "</s>",
342
+ "single_word": false,
343
+ "lstrip": false,
344
+ "rstrip": false,
345
+ "normalized": false,
346
+ "special": true
347
+ },
348
+ {
349
+ "id": 114691,
350
+ "content": "<pad>",
351
+ "single_word": false,
352
+ "lstrip": false,
353
+ "rstrip": false,
354
+ "normalized": false,
355
+ "special": true
356
+ },
357
+ {
358
+ "id": 114692,
359
+ "content": "<|user|>",
360
+ "single_word": false,
361
+ "lstrip": false,
362
+ "rstrip": false,
363
+ "normalized": false,
364
+ "special": true
365
+ },
366
+ {
367
+ "id": 114693,
368
+ "content": "<|assistant|>",
369
+ "single_word": false,
370
+ "lstrip": false,
371
+ "rstrip": false,
372
+ "normalized": false,
373
+ "special": true
374
+ },
375
+ {
376
+ "id": 114694,
377
+ "content": "<|tool_call|>",
378
+ "single_word": false,
379
+ "lstrip": false,
380
+ "rstrip": false,
381
+ "normalized": false,
382
+ "special": true
383
+ },
384
+ {
385
+ "id": 114695,
386
+ "content": "<|tool_response|>",
387
+ "single_word": false,
388
+ "lstrip": false,
389
+ "rstrip": false,
390
+ "normalized": false,
391
+ "special": true
392
+ },
393
+ {
394
+ "id": 114696,
395
+ "content": "<|endoftext|>",
396
+ "single_word": false,
397
+ "lstrip": false,
398
+ "rstrip": false,
399
+ "normalized": false,
400
+ "special": true
401
+ },
402
+ {
403
+ "id": 114697,
404
+ "content": "<think>",
405
+ "single_word": false,
406
+ "lstrip": false,
407
+ "rstrip": false,
408
+ "normalized": false,
409
+ "special": true
410
+ },
411
+ {
412
+ "id": 114698,
413
+ "content": "</think>",
414
+ "single_word": false,
415
+ "lstrip": false,
416
+ "rstrip": false,
417
+ "normalized": false,
418
+ "special": true
419
+ },
420
+ {
421
+ "id": 114699,
422
+ "content": "<|no_think|>",
423
+ "single_word": false,
424
+ "lstrip": false,
425
+ "rstrip": false,
426
+ "normalized": false,
427
+ "special": true
428
+ },
429
+ {
430
+ "id": 114700,
431
+ "content": "<|think|>",
432
+ "single_word": false,
433
+ "lstrip": false,
434
+ "rstrip": false,
435
+ "normalized": false,
436
+ "special": true
437
+ },
438
+ {
439
+ "id": 114701,
440
+ "content": "<|think_max|>",
441
+ "single_word": false,
442
+ "lstrip": false,
443
+ "rstrip": false,
444
+ "normalized": false,
445
+ "special": true
446
+ },
447
+ {
448
+ "id": 114702,
449
+ "content": "<|task:text_to_text|>",
450
+ "single_word": false,
451
+ "lstrip": false,
452
+ "rstrip": false,
453
+ "normalized": false,
454
+ "special": true
455
+ },
456
+ {
457
+ "id": 114703,
458
+ "content": "<|task:speech_to_speech|>",
459
+ "single_word": false,
460
+ "lstrip": false,
461
+ "rstrip": false,
462
+ "normalized": false,
463
+ "special": true
464
+ },
465
+ {
466
+ "id": 114704,
467
+ "content": "<|task:text_speech_to_text|>",
468
+ "single_word": false,
469
+ "lstrip": false,
470
+ "rstrip": false,
471
+ "normalized": false,
472
+ "special": true
473
+ },
474
+ {
475
+ "id": 114705,
476
+ "content": "<|task:text_speech_to_speech|>",
477
+ "single_word": false,
478
+ "lstrip": false,
479
+ "rstrip": false,
480
+ "normalized": false,
481
+ "special": true
482
+ },
483
+ {
484
+ "id": 114706,
485
+ "content": "<|task:full_duplex_speech|>",
486
+ "single_word": false,
487
+ "lstrip": false,
488
+ "rstrip": false,
489
+ "normalized": false,
490
+ "special": true
491
+ },
492
+ {
493
+ "id": 114707,
494
+ "content": "<|task:agent|>",
495
+ "single_word": false,
496
+ "lstrip": false,
497
+ "rstrip": false,
498
+ "normalized": false,
499
+ "special": true
500
+ },
501
+ {
502
+ "id": 114708,
503
+ "content": "<|task:tool_use|>",
504
+ "single_word": false,
505
+ "lstrip": false,
506
+ "rstrip": false,
507
+ "normalized": false,
508
+ "special": true
509
+ },
510
+ {
511
+ "id": 114709,
512
+ "content": "<|task:rag|>",
513
+ "single_word": false,
514
+ "lstrip": false,
515
+ "rstrip": false,
516
+ "normalized": false,
517
+ "special": true
518
+ },
519
+ {
520
+ "id": 114710,
521
+ "content": "<|task:code_execution|>",
522
+ "single_word": false,
523
+ "lstrip": false,
524
+ "rstrip": false,
525
+ "normalized": false,
526
+ "special": true
527
+ },
528
+ {
529
+ "id": 114711,
530
+ "content": "<|task:document_qa|>",
531
+ "single_word": false,
532
+ "lstrip": false,
533
+ "rstrip": false,
534
+ "normalized": false,
535
+ "special": true
536
+ },
537
+ {
538
+ "id": 114712,
539
+ "content": "<|task:data_analysis|>",
540
+ "single_word": false,
541
+ "lstrip": false,
542
+ "rstrip": false,
543
+ "normalized": false,
544
+ "special": true
545
+ },
546
+ {
547
+ "id": 114713,
548
+ "content": "<|task:workflow|>",
549
+ "single_word": false,
550
+ "lstrip": false,
551
+ "rstrip": false,
552
+ "normalized": false,
553
+ "special": true
554
+ },
555
+ {
556
+ "id": 114714,
557
+ "content": "<|reasoning_mode:none|>",
558
+ "single_word": false,
559
+ "lstrip": false,
560
+ "rstrip": false,
561
+ "normalized": false,
562
+ "special": true
563
+ },
564
+ {
565
+ "id": 114715,
566
+ "content": "<|reasoning_mode:short|>",
567
+ "single_word": false,
568
+ "lstrip": false,
569
+ "rstrip": false,
570
+ "normalized": false,
571
+ "special": true
572
+ },
573
+ {
574
+ "id": 114716,
575
+ "content": "<|reasoning_mode:deep|>",
576
+ "single_word": false,
577
+ "lstrip": false,
578
+ "rstrip": false,
579
+ "normalized": false,
580
+ "special": true
581
+ },
582
+ {
583
+ "id": 114717,
584
+ "content": "<|reasoning_mode:verify|>",
585
+ "single_word": false,
586
+ "lstrip": false,
587
+ "rstrip": false,
588
+ "normalized": false,
589
+ "special": true
590
+ },
591
+ {
592
+ "id": 114718,
593
+ "content": "<|private_reasoning_start|>",
594
+ "single_word": false,
595
+ "lstrip": false,
596
+ "rstrip": false,
597
+ "normalized": false,
598
+ "special": true
599
+ },
600
+ {
601
+ "id": 114719,
602
+ "content": "<|private_reasoning_end|>",
603
+ "single_word": false,
604
+ "lstrip": false,
605
+ "rstrip": false,
606
+ "normalized": false,
607
+ "special": true
608
+ },
609
+ {
610
+ "id": 114720,
611
+ "content": "<|reasoning_summary_start|>",
612
+ "single_word": false,
613
+ "lstrip": false,
614
+ "rstrip": false,
615
+ "normalized": false,
616
+ "special": true
617
+ },
618
+ {
619
+ "id": 114721,
620
+ "content": "<|reasoning_summary_end|>",
621
+ "single_word": false,
622
+ "lstrip": false,
623
+ "rstrip": false,
624
+ "normalized": false,
625
+ "special": true
626
+ },
627
+ {
628
+ "id": 114722,
629
+ "content": "<|plan_start|>",
630
+ "single_word": false,
631
+ "lstrip": false,
632
+ "rstrip": false,
633
+ "normalized": false,
634
+ "special": true
635
+ },
636
+ {
637
+ "id": 114723,
638
+ "content": "<|plan_end|>",
639
+ "single_word": false,
640
+ "lstrip": false,
641
+ "rstrip": false,
642
+ "normalized": false,
643
+ "special": true
644
+ },
645
+ {
646
+ "id": 114724,
647
+ "content": "<|step_start|>",
648
+ "single_word": false,
649
+ "lstrip": false,
650
+ "rstrip": false,
651
+ "normalized": false,
652
+ "special": true
653
+ },
654
+ {
655
+ "id": 114725,
656
+ "content": "<|step_end|>",
657
+ "single_word": false,
658
+ "lstrip": false,
659
+ "rstrip": false,
660
+ "normalized": false,
661
+ "special": true
662
+ },
663
+ {
664
+ "id": 114726,
665
+ "content": "<|action_start|>",
666
+ "single_word": false,
667
+ "lstrip": false,
668
+ "rstrip": false,
669
+ "normalized": false,
670
+ "special": true
671
+ },
672
+ {
673
+ "id": 114727,
674
+ "content": "<|action_end|>",
675
+ "single_word": false,
676
+ "lstrip": false,
677
+ "rstrip": false,
678
+ "normalized": false,
679
+ "special": true
680
+ },
681
+ {
682
+ "id": 114728,
683
+ "content": "<|observation_start|>",
684
+ "single_word": false,
685
+ "lstrip": false,
686
+ "rstrip": false,
687
+ "normalized": false,
688
+ "special": true
689
+ },
690
+ {
691
+ "id": 114729,
692
+ "content": "<|observation_end|>",
693
+ "single_word": false,
694
+ "lstrip": false,
695
+ "rstrip": false,
696
+ "normalized": false,
697
+ "special": true
698
+ },
699
+ {
700
+ "id": 114730,
701
+ "content": "<|reflection_start|>",
702
+ "single_word": false,
703
+ "lstrip": false,
704
+ "rstrip": false,
705
+ "normalized": false,
706
+ "special": true
707
+ },
708
+ {
709
+ "id": 114731,
710
+ "content": "<|reflection_end|>",
711
+ "single_word": false,
712
+ "lstrip": false,
713
+ "rstrip": false,
714
+ "normalized": false,
715
+ "special": true
716
+ },
717
+ {
718
+ "id": 114732,
719
+ "content": "<|verification_start|>",
720
+ "single_word": false,
721
+ "lstrip": false,
722
+ "rstrip": false,
723
+ "normalized": false,
724
+ "special": true
725
+ },
726
+ {
727
+ "id": 114733,
728
+ "content": "<|verification_end|>",
729
+ "single_word": false,
730
+ "lstrip": false,
731
+ "rstrip": false,
732
+ "normalized": false,
733
+ "special": true
734
+ },
735
+ {
736
+ "id": 114734,
737
+ "content": "<|tool_schema_start|>",
738
+ "single_word": false,
739
+ "lstrip": false,
740
+ "rstrip": false,
741
+ "normalized": false,
742
+ "special": true
743
+ },
744
+ {
745
+ "id": 114735,
746
+ "content": "<|tool_schema_end|>",
747
+ "single_word": false,
748
+ "lstrip": false,
749
+ "rstrip": false,
750
+ "normalized": false,
751
+ "special": true
752
+ },
753
+ {
754
+ "id": 114736,
755
+ "content": "<|tool_call_start|>",
756
+ "single_word": false,
757
+ "lstrip": false,
758
+ "rstrip": false,
759
+ "normalized": false,
760
+ "special": true
761
+ },
762
+ {
763
+ "id": 114737,
764
+ "content": "<|tool_call_end|>",
765
+ "single_word": false,
766
+ "lstrip": false,
767
+ "rstrip": false,
768
+ "normalized": false,
769
+ "special": true
770
+ },
771
+ {
772
+ "id": 114738,
773
+ "content": "<|tool_result_start|>",
774
+ "single_word": false,
775
+ "lstrip": false,
776
+ "rstrip": false,
777
+ "normalized": false,
778
+ "special": true
779
+ },
780
+ {
781
+ "id": 114739,
782
+ "content": "<|tool_result_end|>",
783
+ "single_word": false,
784
+ "lstrip": false,
785
+ "rstrip": false,
786
+ "normalized": false,
787
+ "special": true
788
+ },
789
+ {
790
+ "id": 114740,
791
+ "content": "<|tool_error_start|>",
792
+ "single_word": false,
793
+ "lstrip": false,
794
+ "rstrip": false,
795
+ "normalized": false,
796
+ "special": true
797
+ },
798
+ {
799
+ "id": 114741,
800
+ "content": "<|tool_error_end|>",
801
+ "single_word": false,
802
+ "lstrip": false,
803
+ "rstrip": false,
804
+ "normalized": false,
805
+ "special": true
806
+ },
807
+ {
808
+ "id": 114742,
809
+ "content": "<|retrieval_query_start|>",
810
+ "single_word": false,
811
+ "lstrip": false,
812
+ "rstrip": false,
813
+ "normalized": false,
814
+ "special": true
815
+ },
816
+ {
817
+ "id": 114743,
818
+ "content": "<|retrieval_query_end|>",
819
+ "single_word": false,
820
+ "lstrip": false,
821
+ "rstrip": false,
822
+ "normalized": false,
823
+ "special": true
824
+ },
825
+ {
826
+ "id": 114744,
827
+ "content": "<|citation_start|>",
828
+ "single_word": false,
829
+ "lstrip": false,
830
+ "rstrip": false,
831
+ "normalized": false,
832
+ "special": true
833
+ },
834
+ {
835
+ "id": 114745,
836
+ "content": "<|citation_end|>",
837
+ "single_word": false,
838
+ "lstrip": false,
839
+ "rstrip": false,
840
+ "normalized": false,
841
+ "special": true
842
+ },
843
+ {
844
+ "id": 114746,
845
+ "content": "<|memory_read_start|>",
846
+ "single_word": false,
847
+ "lstrip": false,
848
+ "rstrip": false,
849
+ "normalized": false,
850
+ "special": true
851
+ },
852
+ {
853
+ "id": 114747,
854
+ "content": "<|memory_read_end|>",
855
+ "single_word": false,
856
+ "lstrip": false,
857
+ "rstrip": false,
858
+ "normalized": false,
859
+ "special": true
860
+ },
861
+ {
862
+ "id": 114748,
863
+ "content": "<|memory_write_start|>",
864
+ "single_word": false,
865
+ "lstrip": false,
866
+ "rstrip": false,
867
+ "normalized": false,
868
+ "special": true
869
+ },
870
+ {
871
+ "id": 114749,
872
+ "content": "<|memory_write_end|>",
873
+ "single_word": false,
874
+ "lstrip": false,
875
+ "rstrip": false,
876
+ "normalized": false,
877
+ "special": true
878
+ },
879
+ {
880
+ "id": 114750,
881
+ "content": "<|final_answer_start|>",
882
+ "single_word": false,
883
+ "lstrip": false,
884
+ "rstrip": false,
885
+ "normalized": false,
886
+ "special": true
887
+ },
888
+ {
889
+ "id": 114751,
890
+ "content": "<|final_answer_end|>",
891
+ "single_word": false,
892
+ "lstrip": false,
893
+ "rstrip": false,
894
+ "normalized": false,
895
+ "special": true
896
+ },
897
+ {
898
+ "id": 114752,
899
+ "content": "<|json_start|>",
900
+ "single_word": false,
901
+ "lstrip": false,
902
+ "rstrip": false,
903
+ "normalized": false,
904
+ "special": true
905
+ },
906
+ {
907
+ "id": 114753,
908
+ "content": "<|json_end|>",
909
+ "single_word": false,
910
+ "lstrip": false,
911
+ "rstrip": false,
912
+ "normalized": false,
913
+ "special": true
914
+ },
915
+ {
916
+ "id": 114754,
917
+ "content": "<|code_start|>",
918
+ "single_word": false,
919
+ "lstrip": false,
920
+ "rstrip": false,
921
+ "normalized": false,
922
+ "special": true
923
+ },
924
+ {
925
+ "id": 114755,
926
+ "content": "<|code_end|>",
927
+ "single_word": false,
928
+ "lstrip": false,
929
+ "rstrip": false,
930
+ "normalized": false,
931
+ "special": true
932
+ },
933
+ {
934
+ "id": 114756,
935
+ "content": "<|markdown_start|>",
936
+ "single_word": false,
937
+ "lstrip": false,
938
+ "rstrip": false,
939
+ "normalized": false,
940
+ "special": true
941
+ },
942
+ {
943
+ "id": 114757,
944
+ "content": "<|markdown_end|>",
945
+ "single_word": false,
946
+ "lstrip": false,
947
+ "rstrip": false,
948
+ "normalized": false,
949
+ "special": true
950
+ },
951
+ {
952
+ "id": 114758,
953
+ "content": "<|duplex_start|>",
954
+ "single_word": false,
955
+ "lstrip": false,
956
+ "rstrip": false,
957
+ "normalized": false,
958
+ "special": true
959
+ },
960
+ {
961
+ "id": 114759,
962
+ "content": "<|duplex_end|>",
963
+ "single_word": false,
964
+ "lstrip": false,
965
+ "rstrip": false,
966
+ "normalized": false,
967
+ "special": true
968
+ },
969
+ {
970
+ "id": 114760,
971
+ "content": "<|system_channel|>",
972
+ "single_word": false,
973
+ "lstrip": false,
974
+ "rstrip": false,
975
+ "normalized": false,
976
+ "special": true
977
+ },
978
+ {
979
+ "id": 114761,
980
+ "content": "<|listen|>",
981
+ "single_word": false,
982
+ "lstrip": false,
983
+ "rstrip": false,
984
+ "normalized": false,
985
+ "special": true
986
+ },
987
+ {
988
+ "id": 114762,
989
+ "content": "<|speak|>",
990
+ "single_word": false,
991
+ "lstrip": false,
992
+ "rstrip": false,
993
+ "normalized": false,
994
+ "special": true
995
+ },
996
+ {
997
+ "id": 114763,
998
+ "content": "<|listen_speak|>",
999
+ "single_word": false,
1000
+ "lstrip": false,
1001
+ "rstrip": false,
1002
+ "normalized": false,
1003
+ "special": true
1004
+ },
1005
+ {
1006
+ "id": 114764,
1007
+ "content": "<|output_audio_start|>",
1008
+ "single_word": false,
1009
+ "lstrip": false,
1010
+ "rstrip": false,
1011
+ "normalized": false,
1012
+ "special": true
1013
+ },
1014
+ {
1015
+ "id": 114765,
1016
+ "content": "<|output_audio_end|>",
1017
+ "single_word": false,
1018
+ "lstrip": false,
1019
+ "rstrip": false,
1020
+ "normalized": false,
1021
+ "special": true
1022
+ },
1023
+ {
1024
+ "id": 114766,
1025
+ "content": "<|text_start|>",
1026
+ "single_word": false,
1027
+ "lstrip": false,
1028
+ "rstrip": false,
1029
+ "normalized": false,
1030
+ "special": true
1031
+ },
1032
+ {
1033
+ "id": 114767,
1034
+ "content": "<|text_end|>",
1035
+ "single_word": false,
1036
+ "lstrip": false,
1037
+ "rstrip": false,
1038
+ "normalized": false,
1039
+ "special": true
1040
+ },
1041
+ {
1042
+ "id": 114768,
1043
+ "content": "<|overlap|>",
1044
+ "single_word": false,
1045
+ "lstrip": false,
1046
+ "rstrip": false,
1047
+ "normalized": false,
1048
+ "special": true
1049
+ },
1050
+ {
1051
+ "id": 114769,
1052
+ "content": "<|barge_in|>",
1053
+ "single_word": false,
1054
+ "lstrip": false,
1055
+ "rstrip": false,
1056
+ "normalized": false,
1057
+ "special": true
1058
+ },
1059
+ {
1060
+ "id": 114770,
1061
+ "content": "<|interruption|>",
1062
+ "single_word": false,
1063
+ "lstrip": false,
1064
+ "rstrip": false,
1065
+ "normalized": false,
1066
+ "special": true
1067
+ },
1068
+ {
1069
+ "id": 114771,
1070
+ "content": "<|interruption_repair|>",
1071
+ "single_word": false,
1072
+ "lstrip": false,
1073
+ "rstrip": false,
1074
+ "normalized": false,
1075
+ "special": true
1076
+ },
1077
+ {
1078
+ "id": 114772,
1079
+ "content": "<|backchannel|>",
1080
+ "single_word": false,
1081
+ "lstrip": false,
1082
+ "rstrip": false,
1083
+ "normalized": false,
1084
+ "special": true
1085
+ },
1086
+ {
1087
+ "id": 114773,
1088
+ "content": "<|turn_yield|>",
1089
+ "single_word": false,
1090
+ "lstrip": false,
1091
+ "rstrip": false,
1092
+ "normalized": false,
1093
+ "special": true
1094
+ },
1095
+ {
1096
+ "id": 114774,
1097
+ "content": "<|hold|>",
1098
+ "single_word": false,
1099
+ "lstrip": false,
1100
+ "rstrip": false,
1101
+ "normalized": false,
1102
+ "special": true
1103
+ },
1104
+ {
1105
+ "id": 114775,
1106
+ "content": "<|silence|>",
1107
+ "single_word": false,
1108
+ "lstrip": false,
1109
+ "rstrip": false,
1110
+ "normalized": false,
1111
+ "special": true
1112
+ },
1113
+ {
1114
+ "id": 114776,
1115
+ "content": "<|non_speech|>",
1116
+ "single_word": false,
1117
+ "lstrip": false,
1118
+ "rstrip": false,
1119
+ "normalized": false,
1120
+ "special": true
1121
+ },
1122
+ {
1123
+ "id": 114777,
1124
+ "content": "<|voice_reference_start|>",
1125
+ "single_word": false,
1126
+ "lstrip": false,
1127
+ "rstrip": false,
1128
+ "normalized": false,
1129
+ "special": true
1130
+ },
1131
+ {
1132
+ "id": 114778,
1133
+ "content": "<|voice_reference_end|>",
1134
+ "single_word": false,
1135
+ "lstrip": false,
1136
+ "rstrip": false,
1137
+ "normalized": false,
1138
+ "special": true
1139
+ },
1140
+ {
1141
+ "id": 114779,
1142
+ "content": "<|voice_reference|>",
1143
+ "single_word": false,
1144
+ "lstrip": false,
1145
+ "rstrip": false,
1146
+ "normalized": false,
1147
+ "special": true
1148
+ },
1149
+ {
1150
+ "id": 114780,
1151
+ "content": "<|voice_switch|>",
1152
+ "single_word": false,
1153
+ "lstrip": false,
1154
+ "rstrip": false,
1155
+ "normalized": false,
1156
+ "special": true
1157
+ },
1158
+ {
1159
+ "id": 114781,
1160
+ "content": "<|speaker_style|>",
1161
+ "single_word": false,
1162
+ "lstrip": false,
1163
+ "rstrip": false,
1164
+ "normalized": false,
1165
+ "special": true
1166
+ },
1167
+ {
1168
+ "id": 114782,
1169
+ "content": "<|prosody_control|>",
1170
+ "single_word": false,
1171
+ "lstrip": false,
1172
+ "rstrip": false,
1173
+ "normalized": false,
1174
+ "special": true
1175
+ },
1176
+ {
1177
+ "id": 114783,
1178
+ "content": "<|zh_tw|>",
1179
+ "single_word": false,
1180
+ "lstrip": false,
1181
+ "rstrip": false,
1182
+ "normalized": false,
1183
+ "special": true
1184
+ },
1185
+ {
1186
+ "id": 114784,
1187
+ "content": "<|zh_hant|>",
1188
+ "single_word": false,
1189
+ "lstrip": false,
1190
+ "rstrip": false,
1191
+ "normalized": false,
1192
+ "special": true
1193
+ },
1194
+ {
1195
+ "id": 114785,
1196
+ "content": "<|taigi|>",
1197
+ "single_word": false,
1198
+ "lstrip": false,
1199
+ "rstrip": false,
1200
+ "normalized": false,
1201
+ "special": true
1202
+ },
1203
+ {
1204
+ "id": 114786,
1205
+ "content": "<|hakka|>",
1206
+ "single_word": false,
1207
+ "lstrip": false,
1208
+ "rstrip": false,
1209
+ "normalized": false,
1210
+ "special": true
1211
+ },
1212
+ {
1213
+ "id": 114787,
1214
+ "content": "<|bopomofo|>",
1215
+ "single_word": false,
1216
+ "lstrip": false,
1217
+ "rstrip": false,
1218
+ "normalized": false,
1219
+ "special": true
1220
+ },
1221
+ {
1222
+ "id": 114788,
1223
+ "content": "<|mixed_en|>",
1224
+ "single_word": false,
1225
+ "lstrip": false,
1226
+ "rstrip": false,
1227
+ "normalized": false,
1228
+ "special": true
1229
+ },
1230
+ {
1231
+ "id": 114789,
1232
+ "content": "<|en|>",
1233
+ "single_word": false,
1234
+ "lstrip": false,
1235
+ "rstrip": false,
1236
+ "normalized": false,
1237
+ "special": true
1238
+ },
1239
+ {
1240
+ "id": 114790,
1241
+ "content": "<|ja|>",
1242
+ "single_word": false,
1243
+ "lstrip": false,
1244
+ "rstrip": false,
1245
+ "normalized": false,
1246
+ "special": true
1247
+ },
1248
+ {
1249
+ "id": 114791,
1250
+ "content": "<|ko|>",
1251
+ "single_word": false,
1252
+ "lstrip": false,
1253
+ "rstrip": false,
1254
+ "normalized": false,
1255
+ "special": true
1256
+ },
1257
+ {
1258
+ "id": 114792,
1259
+ "content": "<|vi|>",
1260
+ "single_word": false,
1261
+ "lstrip": false,
1262
+ "rstrip": false,
1263
+ "normalized": false,
1264
+ "special": true
1265
+ },
1266
+ {
1267
+ "id": 114793,
1268
+ "content": "<|id|>",
1269
+ "single_word": false,
1270
+ "lstrip": false,
1271
+ "rstrip": false,
1272
+ "normalized": false,
1273
+ "special": true
1274
+ },
1275
+ {
1276
+ "id": 114794,
1277
+ "content": "<|th|>",
1278
+ "single_word": false,
1279
+ "lstrip": false,
1280
+ "rstrip": false,
1281
+ "normalized": false,
1282
+ "special": true
1283
+ },
1284
+ {
1285
+ "id": 114795,
1286
+ "content": "<|asr|>",
1287
+ "single_word": false,
1288
+ "lstrip": false,
1289
+ "rstrip": false,
1290
+ "normalized": false,
1291
+ "special": true
1292
+ },
1293
+ {
1294
+ "id": 114796,
1295
+ "content": "<|tts|>",
1296
+ "single_word": false,
1297
+ "lstrip": false,
1298
+ "rstrip": false,
1299
+ "normalized": false,
1300
+ "special": true
1301
+ },
1302
+ {
1303
+ "id": 114797,
1304
+ "content": "<|speaker_0|>",
1305
+ "single_word": false,
1306
+ "lstrip": false,
1307
+ "rstrip": false,
1308
+ "normalized": false,
1309
+ "special": true
1310
+ },
1311
+ {
1312
+ "id": 114798,
1313
+ "content": "<|speaker_1|>",
1314
+ "single_word": false,
1315
+ "lstrip": false,
1316
+ "rstrip": false,
1317
+ "normalized": false,
1318
+ "special": true
1319
+ },
1320
+ {
1321
+ "id": 114799,
1322
+ "content": "<|speaker_2|>",
1323
+ "single_word": false,
1324
+ "lstrip": false,
1325
+ "rstrip": false,
1326
+ "normalized": false,
1327
+ "special": true
1328
+ },
1329
+ {
1330
+ "id": 114800,
1331
+ "content": "<|speaker_3|>",
1332
+ "single_word": false,
1333
+ "lstrip": false,
1334
+ "rstrip": false,
1335
+ "normalized": false,
1336
+ "special": true
1337
+ },
1338
+ {
1339
+ "id": 114801,
1340
+ "content": "<|timestamp|>",
1341
+ "single_word": false,
1342
+ "lstrip": false,
1343
+ "rstrip": false,
1344
+ "normalized": false,
1345
+ "special": true
1346
+ },
1347
+ {
1348
+ "id": 114802,
1349
+ "content": "<|noise|>",
1350
+ "single_word": false,
1351
+ "lstrip": false,
1352
+ "rstrip": false,
1353
+ "normalized": false,
1354
+ "special": true
1355
+ },
1356
+ {
1357
+ "id": 114803,
1358
+ "content": "<|laugh|>",
1359
+ "single_word": false,
1360
+ "lstrip": false,
1361
+ "rstrip": false,
1362
+ "normalized": false,
1363
+ "special": true
1364
+ },
1365
+ {
1366
+ "id": 114804,
1367
+ "content": "<|breath|>",
1368
+ "single_word": false,
1369
+ "lstrip": false,
1370
+ "rstrip": false,
1371
+ "normalized": false,
1372
+ "special": true
1373
+ },
1374
+ {
1375
+ "id": 114805,
1376
+ "content": "<|pause|>",
1377
+ "single_word": false,
1378
+ "lstrip": false,
1379
+ "rstrip": false,
1380
+ "normalized": false,
1381
+ "special": true
1382
+ },
1383
+ {
1384
+ "id": 114806,
1385
+ "content": "<|prosody|>",
1386
+ "single_word": false,
1387
+ "lstrip": false,
1388
+ "rstrip": false,
1389
+ "normalized": false,
1390
+ "special": true
1391
+ },
1392
+ {
1393
+ "id": 114807,
1394
+ "content": "<|pron|>",
1395
+ "single_word": false,
1396
+ "lstrip": false,
1397
+ "rstrip": false,
1398
+ "normalized": false,
1399
+ "special": true
1400
+ },
1401
+ {
1402
+ "id": 114808,
1403
+ "content": "</|pron|>",
1404
+ "single_word": false,
1405
+ "lstrip": false,
1406
+ "rstrip": false,
1407
+ "normalized": false,
1408
+ "special": true
1409
+ },
1410
+ {
1411
+ "id": 114809,
1412
+ "content": "<|image|>",
1413
+ "single_word": false,
1414
+ "lstrip": false,
1415
+ "rstrip": false,
1416
+ "normalized": false,
1417
+ "special": true
1418
+ },
1419
+ {
1420
+ "id": 114810,
1421
+ "content": "<|ocr|>",
1422
+ "single_word": false,
1423
+ "lstrip": false,
1424
+ "rstrip": false,
1425
+ "normalized": false,
1426
+ "special": true
1427
+ },
1428
+ {
1429
+ "id": 114811,
1430
+ "content": "<|bbox|>",
1431
+ "single_word": false,
1432
+ "lstrip": false,
1433
+ "rstrip": false,
1434
+ "normalized": false,
1435
+ "special": true
1436
+ },
1437
+ {
1438
+ "id": 114812,
1439
+ "content": "<|line|>",
1440
+ "single_word": false,
1441
+ "lstrip": false,
1442
+ "rstrip": false,
1443
+ "normalized": false,
1444
+ "special": true
1445
+ },
1446
+ {
1447
+ "id": 114813,
1448
+ "content": "<|table|>",
1449
+ "single_word": false,
1450
+ "lstrip": false,
1451
+ "rstrip": false,
1452
+ "normalized": false,
1453
+ "special": true
1454
+ },
1455
+ {
1456
+ "id": 114814,
1457
+ "content": "<|row|>",
1458
+ "single_word": false,
1459
+ "lstrip": false,
1460
+ "rstrip": false,
1461
+ "normalized": false,
1462
+ "special": true
1463
+ },
1464
+ {
1465
+ "id": 114815,
1466
+ "content": "<|col|>",
1467
+ "single_word": false,
1468
+ "lstrip": false,
1469
+ "rstrip": false,
1470
+ "normalized": false,
1471
+ "special": true
1472
+ },
1473
+ {
1474
+ "id": 114816,
1475
+ "content": "<|cell|>",
1476
+ "single_word": false,
1477
+ "lstrip": false,
1478
+ "rstrip": false,
1479
+ "normalized": false,
1480
+ "special": true
1481
+ },
1482
+ {
1483
+ "id": 114817,
1484
+ "content": "<|reading_order|>",
1485
+ "single_word": false,
1486
+ "lstrip": false,
1487
+ "rstrip": false,
1488
+ "normalized": false,
1489
+ "special": true
1490
+ },
1491
+ {
1492
+ "id": 114818,
1493
+ "content": "<|source|>",
1494
+ "single_word": false,
1495
+ "lstrip": false,
1496
+ "rstrip": false,
1497
+ "normalized": false,
1498
+ "special": true
1499
+ },
1500
+ {
1501
+ "id": 114819,
1502
+ "content": "<|cite|>",
1503
+ "single_word": false,
1504
+ "lstrip": false,
1505
+ "rstrip": false,
1506
+ "normalized": false,
1507
+ "special": true
1508
+ },
1509
+ {
1510
+ "id": 114820,
1511
+ "content": "<|evidence|>",
1512
+ "single_word": false,
1513
+ "lstrip": false,
1514
+ "rstrip": false,
1515
+ "normalized": false,
1516
+ "special": true
1517
+ },
1518
+ {
1519
+ "id": 114821,
1520
+ "content": "<|quote|>",
1521
+ "single_word": false,
1522
+ "lstrip": false,
1523
+ "rstrip": false,
1524
+ "normalized": false,
1525
+ "special": true
1526
  }
1527
  ],
1528
  "normalizer": null,
 
1567
  "model": {
1568
  "type": "BPE",
1569
  "dropout": null,
1570
+ "unk_token": null,
1571
+ "continuing_subword_prefix": "",
1572
+ "end_of_word_suffix": "",
1573
  "fuse_unk": false,
1574
  "byte_fallback": false,
1575
  "ignore_merges": false,
tokenizer_config.json CHANGED
@@ -1,8 +1,15 @@
1
  {
 
2
  "backend": "tokenizers",
3
- "bos_token": "<|bos|>",
4
- "eos_token": "<|eos|>",
 
 
5
  "extra_special_tokens": [
 
 
 
 
6
  "<|system|>",
7
  "<|user_channel|>",
8
  "<|assistant_channel|>",
@@ -33,11 +40,147 @@
33
  "<|image_start|>",
34
  "<|image_end|>",
35
  "<|video_start|>",
36
- "<|video_end|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ],
 
 
 
38
  "model_max_length": 131072,
39
  "model_type": "byte_level_bpe",
40
- "pad_token": "<|pad|>",
 
 
 
41
  "rich_transcription": {
42
  "allow_non_speech_events": true,
43
  "compact_json": true,
@@ -88,13 +231,20 @@
88
  "<|video_end|>"
89
  ],
90
  "strict_no_dense_timestamp_tokens": true,
91
- "tokenizer_class": "GPT2TokenizerFast",
92
- "unk_token": "<|unk|>",
93
- "vocab_size": 114688,
94
- "padding_side": "right",
95
  "truncation_side": "right",
96
- "clean_up_tokenization_spaces": false,
97
- "no_audio_codec_tokens": true,
98
- "no_dense_timestamp_tokens": true,
99
- "fix_mistral_regex": true
 
 
 
 
 
 
 
 
 
 
100
  }
 
1
  {
2
+ "add_prefix_space": false,
3
  "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "</s>",
7
+ "errors": "replace",
8
  "extra_special_tokens": [
9
+ "<|bos|>",
10
+ "<|eos|>",
11
+ "<|unk|>",
12
+ "<|pad|>",
13
  "<|system|>",
14
  "<|user_channel|>",
15
  "<|assistant_channel|>",
 
40
  "<|image_start|>",
41
  "<|image_end|>",
42
  "<|video_start|>",
43
+ "<|video_end|>",
44
+ "<|user|>",
45
+ "<|assistant|>",
46
+ "<|tool_call|>",
47
+ "<|tool_response|>",
48
+ "<|endoftext|>",
49
+ "<think>",
50
+ "</think>",
51
+ "<|no_think|>",
52
+ "<|think|>",
53
+ "<|think_max|>",
54
+ "<|task:text_to_text|>",
55
+ "<|task:speech_to_speech|>",
56
+ "<|task:text_speech_to_text|>",
57
+ "<|task:text_speech_to_speech|>",
58
+ "<|task:full_duplex_speech|>",
59
+ "<|task:agent|>",
60
+ "<|task:tool_use|>",
61
+ "<|task:rag|>",
62
+ "<|task:code_execution|>",
63
+ "<|task:document_qa|>",
64
+ "<|task:data_analysis|>",
65
+ "<|task:workflow|>",
66
+ "<|reasoning_mode:none|>",
67
+ "<|reasoning_mode:short|>",
68
+ "<|reasoning_mode:deep|>",
69
+ "<|reasoning_mode:verify|>",
70
+ "<|private_reasoning_start|>",
71
+ "<|private_reasoning_end|>",
72
+ "<|reasoning_summary_start|>",
73
+ "<|reasoning_summary_end|>",
74
+ "<|plan_start|>",
75
+ "<|plan_end|>",
76
+ "<|step_start|>",
77
+ "<|step_end|>",
78
+ "<|action_start|>",
79
+ "<|action_end|>",
80
+ "<|observation_start|>",
81
+ "<|observation_end|>",
82
+ "<|reflection_start|>",
83
+ "<|reflection_end|>",
84
+ "<|verification_start|>",
85
+ "<|verification_end|>",
86
+ "<|tool_schema_start|>",
87
+ "<|tool_schema_end|>",
88
+ "<|tool_call_start|>",
89
+ "<|tool_call_end|>",
90
+ "<|tool_result_start|>",
91
+ "<|tool_result_end|>",
92
+ "<|tool_error_start|>",
93
+ "<|tool_error_end|>",
94
+ "<|retrieval_query_start|>",
95
+ "<|retrieval_query_end|>",
96
+ "<|citation_start|>",
97
+ "<|citation_end|>",
98
+ "<|memory_read_start|>",
99
+ "<|memory_read_end|>",
100
+ "<|memory_write_start|>",
101
+ "<|memory_write_end|>",
102
+ "<|final_answer_start|>",
103
+ "<|final_answer_end|>",
104
+ "<|json_start|>",
105
+ "<|json_end|>",
106
+ "<|code_start|>",
107
+ "<|code_end|>",
108
+ "<|markdown_start|>",
109
+ "<|markdown_end|>",
110
+ "<|duplex_start|>",
111
+ "<|duplex_end|>",
112
+ "<|system_channel|>",
113
+ "<|listen|>",
114
+ "<|speak|>",
115
+ "<|listen_speak|>",
116
+ "<|output_audio_start|>",
117
+ "<|output_audio_end|>",
118
+ "<|text_start|>",
119
+ "<|text_end|>",
120
+ "<|overlap|>",
121
+ "<|barge_in|>",
122
+ "<|interruption|>",
123
+ "<|interruption_repair|>",
124
+ "<|backchannel|>",
125
+ "<|turn_yield|>",
126
+ "<|hold|>",
127
+ "<|silence|>",
128
+ "<|non_speech|>",
129
+ "<|voice_reference_start|>",
130
+ "<|voice_reference_end|>",
131
+ "<|voice_reference|>",
132
+ "<|voice_switch|>",
133
+ "<|speaker_style|>",
134
+ "<|prosody_control|>",
135
+ "<|zh_tw|>",
136
+ "<|zh_hant|>",
137
+ "<|taigi|>",
138
+ "<|hakka|>",
139
+ "<|bopomofo|>",
140
+ "<|mixed_en|>",
141
+ "<|en|>",
142
+ "<|ja|>",
143
+ "<|ko|>",
144
+ "<|vi|>",
145
+ "<|id|>",
146
+ "<|th|>",
147
+ "<|asr|>",
148
+ "<|tts|>",
149
+ "<|speaker_0|>",
150
+ "<|speaker_1|>",
151
+ "<|speaker_2|>",
152
+ "<|speaker_3|>",
153
+ "<|timestamp|>",
154
+ "<|noise|>",
155
+ "<|laugh|>",
156
+ "<|breath|>",
157
+ "<|pause|>",
158
+ "<|prosody|>",
159
+ "<|pron|>",
160
+ "</|pron|>",
161
+ "<|image|>",
162
+ "<|ocr|>",
163
+ "<|bbox|>",
164
+ "<|line|>",
165
+ "<|table|>",
166
+ "<|row|>",
167
+ "<|col|>",
168
+ "<|cell|>",
169
+ "<|reading_order|>",
170
+ "<|source|>",
171
+ "<|cite|>",
172
+ "<|evidence|>",
173
+ "<|quote|>"
174
  ],
175
+ "fix_mistral_regex": true,
176
+ "is_local": false,
177
+ "local_files_only": false,
178
  "model_max_length": 131072,
179
  "model_type": "byte_level_bpe",
180
+ "no_audio_codec_tokens": true,
181
+ "no_dense_timestamp_tokens": true,
182
+ "pad_token": "<pad>",
183
+ "padding_side": "right",
184
  "rich_transcription": {
185
  "allow_non_speech_events": true,
186
  "compact_json": true,
 
231
  "<|video_end|>"
232
  ],
233
  "strict_no_dense_timestamp_tokens": true,
234
+ "tokenizer_class": "GPT2Tokenizer",
 
 
 
235
  "truncation_side": "right",
236
+ "unk_token": "<unk>",
237
+ "vocab_size": 114688,
238
+ "effective_vocab_size": 114822,
239
+ "open_formosa": {
240
+ "required_special_token_count": 157,
241
+ "required_special_tokens_present": true,
242
+ "required_special_tokens_single_id": true,
243
+ "standard_special_tokens": {
244
+ "unk_token": "<unk>",
245
+ "bos_token": "<s>",
246
+ "eos_token": "</s>",
247
+ "pad_token": "<pad>"
248
+ }
249
+ }
250
  }