OpenFormosa
/

PangolinTokenizer

@@ -3,175 +3,9 @@
   "backend": "tokenizers",
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
   "errors": "replace",
-  "extra_special_tokens": [
-    "<|bos|>",
-    "<|eos|>",
-    "<|unk|>",
-    "<|pad|>",
-    "<|system|>",
-    "<|user_channel|>",
-    "<|assistant_channel|>",
-    "<|task:speech_to_text|>",
-    "<|task:text_to_speech|>",
-    "<|input_audio_start|>",
-    "<|input_audio_end|>",
-    "<|audio_ref_start|>",
-    "<|audio_ref_end|>",
-    "<|audio_start|>",
-    "<|audio_end|>",
-    "<|speech_start|>",
-    "<|speech_end|>",
-    "<|transcript_start|>",
-    "<|transcript_end|>",
-    "<|segment_start|>",
-    "<|segment_end|>",
-    "<|speaker|>",
-    "<|start_time|>",
-    "<|end_time|>",
-    "<|duration|>",
-    "<|content|>",
-    "<|non_speech_event|>",
-    "<|retrieval_result_start|>",
-    "<|retrieval_result_end|>",
-    "<|ocr_start|>",
-    "<|ocr_end|>",
-    "<|image_start|>",
-    "<|image_end|>",
-    "<|video_start|>",
-    "<|video_end|>",
-    "<|user|>",
-    "<|assistant|>",
-    "<|tool_call|>",
-    "<|tool_response|>",
-    "<|endoftext|>",
-    "<think>",
-    "</think>",
-    "<|no_think|>",
-    "<|think|>",
-    "<|think_max|>",
-    "<|task:text_to_text|>",
-    "<|task:speech_to_speech|>",
-    "<|task:text_speech_to_text|>",
-    "<|task:text_speech_to_speech|>",
-    "<|task:full_duplex_speech|>",
-    "<|task:agent|>",
-    "<|task:tool_use|>",
-    "<|task:rag|>",
-    "<|task:code_execution|>",
-    "<|task:document_qa|>",
-    "<|task:data_analysis|>",
-    "<|task:workflow|>",
-    "<|reasoning_mode:none|>",
-    "<|reasoning_mode:short|>",
-    "<|reasoning_mode:deep|>",
-    "<|reasoning_mode:verify|>",
-    "<|private_reasoning_start|>",
-    "<|private_reasoning_end|>",
-    "<|reasoning_summary_start|>",
-    "<|reasoning_summary_end|>",
-    "<|plan_start|>",
-    "<|plan_end|>",
-    "<|step_start|>",
-    "<|step_end|>",
-    "<|action_start|>",
-    "<|action_end|>",
-    "<|observation_start|>",
-    "<|observation_end|>",
-    "<|reflection_start|>",
-    "<|reflection_end|>",
-    "<|verification_start|>",
-    "<|verification_end|>",
-    "<|tool_schema_start|>",
-    "<|tool_schema_end|>",
-    "<|tool_call_start|>",
-    "<|tool_call_end|>",
-    "<|tool_result_start|>",
-    "<|tool_result_end|>",
-    "<|tool_error_start|>",
-    "<|tool_error_end|>",
-    "<|retrieval_query_start|>",
-    "<|retrieval_query_end|>",
-    "<|citation_start|>",
-    "<|citation_end|>",
-    "<|memory_read_start|>",
-    "<|memory_read_end|>",
-    "<|memory_write_start|>",
-    "<|memory_write_end|>",
-    "<|final_answer_start|>",
-    "<|final_answer_end|>",
-    "<|json_start|>",
-    "<|json_end|>",
-    "<|code_start|>",
-    "<|code_end|>",
-    "<|markdown_start|>",
-    "<|markdown_end|>",
-    "<|duplex_start|>",
-    "<|duplex_end|>",
-    "<|system_channel|>",
-    "<|listen|>",
-    "<|speak|>",
-    "<|listen_speak|>",
-    "<|output_audio_start|>",
-    "<|output_audio_end|>",
-    "<|text_start|>",
-    "<|text_end|>",
-    "<|overlap|>",
-    "<|barge_in|>",
-    "<|interruption|>",
-    "<|interruption_repair|>",
-    "<|backchannel|>",
-    "<|turn_yield|>",
-    "<|hold|>",
-    "<|silence|>",
-    "<|non_speech|>",
-    "<|voice_reference_start|>",
-    "<|voice_reference_end|>",
-    "<|voice_reference|>",
-    "<|voice_switch|>",
-    "<|speaker_style|>",
-    "<|prosody_control|>",
-    "<|zh_tw|>",
-    "<|zh_hant|>",
-    "<|taigi|>",
-    "<|hakka|>",
-    "<|bopomofo|>",
-    "<|mixed_en|>",
-    "<|en|>",
-    "<|ja|>",
-    "<|ko|>",
-    "<|vi|>",
-    "<|id|>",
-    "<|th|>",
-    "<|asr|>",
-    "<|tts|>",
-    "<|speaker_0|>",
-    "<|speaker_1|>",
-    "<|speaker_2|>",
-    "<|speaker_3|>",
-    "<|timestamp|>",
-    "<|noise|>",
-    "<|laugh|>",
-    "<|breath|>",
-    "<|pause|>",
-    "<|prosody|>",
-    "<|pron|>",
-    "</|pron|>",
-    "<|image|>",
-    "<|ocr|>",
-    "<|bbox|>",
-    "<|line|>",
-    "<|table|>",
-    "<|row|>",
-    "<|col|>",
-    "<|cell|>",
-    "<|reading_order|>",
-    "<|source|>",
-    "<|cite|>",
-    "<|evidence|>",
-    "<|quote|>"
-  ],
   "fix_mistral_regex": true,
   "is_local": false,
   "local_files_only": false,
@@ -179,6 +13,17 @@
   "model_type": "byte_level_bpe",
   "no_audio_codec_tokens": true,
   "no_dense_timestamp_tokens": true,
   "pad_token": "<pad>",
   "padding_side": "right",
   "rich_transcription": {
@@ -234,17 +79,5 @@
   "tokenizer_class": "GPT2Tokenizer",
   "truncation_side": "right",
   "unk_token": "<unk>",
-  "vocab_size": 114688,
-  "effective_vocab_size": 114822,
-  "open_formosa": {
-    "required_special_token_count": 157,
-    "required_special_tokens_present": true,
-    "required_special_tokens_single_id": true,
-    "standard_special_tokens": {
-      "unk_token": "<unk>",
-      "bos_token": "<s>",
-      "eos_token": "</s>",
-      "pad_token": "<pad>"
-    }
-  }
 }

   "backend": "tokenizers",
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
+  "effective_vocab_size": 114822,
   "eos_token": "</s>",
   "errors": "replace",
   "fix_mistral_regex": true,
   "is_local": false,
   "local_files_only": false,
   "model_type": "byte_level_bpe",
   "no_audio_codec_tokens": true,
   "no_dense_timestamp_tokens": true,
+  "open_formosa": {
+    "required_special_token_count": 157,
+    "required_special_tokens_present": true,
+    "required_special_tokens_single_id": true,
+    "standard_special_tokens": {
+      "bos_token": "<s>",
+      "eos_token": "</s>",
+      "pad_token": "<pad>",
+      "unk_token": "<unk>"
+    }
+  },
   "pad_token": "<pad>",
   "padding_side": "right",
   "rich_transcription": {
   "tokenizer_class": "GPT2Tokenizer",
   "truncation_side": "right",
   "unk_token": "<unk>",
+  "vocab_size": 114688
 }