voidful committed on
Commit
8953d03
·
verified ·
1 Parent(s): 989d8ec

Make tokenizer config compatible with Transformers 4.x

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +13 -180
tokenizer_config.json CHANGED
@@ -3,175 +3,9 @@
3
  "backend": "tokenizers",
4
  "bos_token": "<s>",
5
  "clean_up_tokenization_spaces": false,
 
6
  "eos_token": "</s>",
7
  "errors": "replace",
8
- "extra_special_tokens": [
9
- "<|bos|>",
10
- "<|eos|>",
11
- "<|unk|>",
12
- "<|pad|>",
13
- "<|system|>",
14
- "<|user_channel|>",
15
- "<|assistant_channel|>",
16
- "<|task:speech_to_text|>",
17
- "<|task:text_to_speech|>",
18
- "<|input_audio_start|>",
19
- "<|input_audio_end|>",
20
- "<|audio_ref_start|>",
21
- "<|audio_ref_end|>",
22
- "<|audio_start|>",
23
- "<|audio_end|>",
24
- "<|speech_start|>",
25
- "<|speech_end|>",
26
- "<|transcript_start|>",
27
- "<|transcript_end|>",
28
- "<|segment_start|>",
29
- "<|segment_end|>",
30
- "<|speaker|>",
31
- "<|start_time|>",
32
- "<|end_time|>",
33
- "<|duration|>",
34
- "<|content|>",
35
- "<|non_speech_event|>",
36
- "<|retrieval_result_start|>",
37
- "<|retrieval_result_end|>",
38
- "<|ocr_start|>",
39
- "<|ocr_end|>",
40
- "<|image_start|>",
41
- "<|image_end|>",
42
- "<|video_start|>",
43
- "<|video_end|>",
44
- "<|user|>",
45
- "<|assistant|>",
46
- "<|tool_call|>",
47
- "<|tool_response|>",
48
- "<|endoftext|>",
49
- "<think>",
50
- "</think>",
51
- "<|no_think|>",
52
- "<|think|>",
53
- "<|think_max|>",
54
- "<|task:text_to_text|>",
55
- "<|task:speech_to_speech|>",
56
- "<|task:text_speech_to_text|>",
57
- "<|task:text_speech_to_speech|>",
58
- "<|task:full_duplex_speech|>",
59
- "<|task:agent|>",
60
- "<|task:tool_use|>",
61
- "<|task:rag|>",
62
- "<|task:code_execution|>",
63
- "<|task:document_qa|>",
64
- "<|task:data_analysis|>",
65
- "<|task:workflow|>",
66
- "<|reasoning_mode:none|>",
67
- "<|reasoning_mode:short|>",
68
- "<|reasoning_mode:deep|>",
69
- "<|reasoning_mode:verify|>",
70
- "<|private_reasoning_start|>",
71
- "<|private_reasoning_end|>",
72
- "<|reasoning_summary_start|>",
73
- "<|reasoning_summary_end|>",
74
- "<|plan_start|>",
75
- "<|plan_end|>",
76
- "<|step_start|>",
77
- "<|step_end|>",
78
- "<|action_start|>",
79
- "<|action_end|>",
80
- "<|observation_start|>",
81
- "<|observation_end|>",
82
- "<|reflection_start|>",
83
- "<|reflection_end|>",
84
- "<|verification_start|>",
85
- "<|verification_end|>",
86
- "<|tool_schema_start|>",
87
- "<|tool_schema_end|>",
88
- "<|tool_call_start|>",
89
- "<|tool_call_end|>",
90
- "<|tool_result_start|>",
91
- "<|tool_result_end|>",
92
- "<|tool_error_start|>",
93
- "<|tool_error_end|>",
94
- "<|retrieval_query_start|>",
95
- "<|retrieval_query_end|>",
96
- "<|citation_start|>",
97
- "<|citation_end|>",
98
- "<|memory_read_start|>",
99
- "<|memory_read_end|>",
100
- "<|memory_write_start|>",
101
- "<|memory_write_end|>",
102
- "<|final_answer_start|>",
103
- "<|final_answer_end|>",
104
- "<|json_start|>",
105
- "<|json_end|>",
106
- "<|code_start|>",
107
- "<|code_end|>",
108
- "<|markdown_start|>",
109
- "<|markdown_end|>",
110
- "<|duplex_start|>",
111
- "<|duplex_end|>",
112
- "<|system_channel|>",
113
- "<|listen|>",
114
- "<|speak|>",
115
- "<|listen_speak|>",
116
- "<|output_audio_start|>",
117
- "<|output_audio_end|>",
118
- "<|text_start|>",
119
- "<|text_end|>",
120
- "<|overlap|>",
121
- "<|barge_in|>",
122
- "<|interruption|>",
123
- "<|interruption_repair|>",
124
- "<|backchannel|>",
125
- "<|turn_yield|>",
126
- "<|hold|>",
127
- "<|silence|>",
128
- "<|non_speech|>",
129
- "<|voice_reference_start|>",
130
- "<|voice_reference_end|>",
131
- "<|voice_reference|>",
132
- "<|voice_switch|>",
133
- "<|speaker_style|>",
134
- "<|prosody_control|>",
135
- "<|zh_tw|>",
136
- "<|zh_hant|>",
137
- "<|taigi|>",
138
- "<|hakka|>",
139
- "<|bopomofo|>",
140
- "<|mixed_en|>",
141
- "<|en|>",
142
- "<|ja|>",
143
- "<|ko|>",
144
- "<|vi|>",
145
- "<|id|>",
146
- "<|th|>",
147
- "<|asr|>",
148
- "<|tts|>",
149
- "<|speaker_0|>",
150
- "<|speaker_1|>",
151
- "<|speaker_2|>",
152
- "<|speaker_3|>",
153
- "<|timestamp|>",
154
- "<|noise|>",
155
- "<|laugh|>",
156
- "<|breath|>",
157
- "<|pause|>",
158
- "<|prosody|>",
159
- "<|pron|>",
160
- "</|pron|>",
161
- "<|image|>",
162
- "<|ocr|>",
163
- "<|bbox|>",
164
- "<|line|>",
165
- "<|table|>",
166
- "<|row|>",
167
- "<|col|>",
168
- "<|cell|>",
169
- "<|reading_order|>",
170
- "<|source|>",
171
- "<|cite|>",
172
- "<|evidence|>",
173
- "<|quote|>"
174
- ],
175
  "fix_mistral_regex": true,
176
  "is_local": false,
177
  "local_files_only": false,
@@ -179,6 +13,17 @@
179
  "model_type": "byte_level_bpe",
180
  "no_audio_codec_tokens": true,
181
  "no_dense_timestamp_tokens": true,
 
 
 
 
 
 
 
 
 
 
 
182
  "pad_token": "<pad>",
183
  "padding_side": "right",
184
  "rich_transcription": {
@@ -234,17 +79,5 @@
234
  "tokenizer_class": "GPT2Tokenizer",
235
  "truncation_side": "right",
236
  "unk_token": "<unk>",
237
- "vocab_size": 114688,
238
- "effective_vocab_size": 114822,
239
- "open_formosa": {
240
- "required_special_token_count": 157,
241
- "required_special_tokens_present": true,
242
- "required_special_tokens_single_id": true,
243
- "standard_special_tokens": {
244
- "unk_token": "<unk>",
245
- "bos_token": "<s>",
246
- "eos_token": "</s>",
247
- "pad_token": "<pad>"
248
- }
249
- }
250
  }
 
3
  "backend": "tokenizers",
4
  "bos_token": "<s>",
5
  "clean_up_tokenization_spaces": false,
6
+ "effective_vocab_size": 114822,
7
  "eos_token": "</s>",
8
  "errors": "replace",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  "fix_mistral_regex": true,
10
  "is_local": false,
11
  "local_files_only": false,
 
13
  "model_type": "byte_level_bpe",
14
  "no_audio_codec_tokens": true,
15
  "no_dense_timestamp_tokens": true,
16
+ "open_formosa": {
17
+ "required_special_token_count": 157,
18
+ "required_special_tokens_present": true,
19
+ "required_special_tokens_single_id": true,
20
+ "standard_special_tokens": {
21
+ "bos_token": "<s>",
22
+ "eos_token": "</s>",
23
+ "pad_token": "<pad>",
24
+ "unk_token": "<unk>"
25
+ }
26
+ },
27
  "pad_token": "<pad>",
28
  "padding_side": "right",
29
  "rich_transcription": {
 
79
  "tokenizer_class": "GPT2Tokenizer",
80
  "truncation_side": "right",
81
  "unk_token": "<unk>",
82
+ "vocab_size": 114688
 
 
 
 
 
 
 
 
 
 
 
 
83
  }