Upload processor
Browse files
- .gitattributes +1 -0
- chat_template.jinja +26 -0
- processor_config.json +18 -0
- tokenizer.json +3 -0
- tokenizer_config.json +30 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- set system_prompt = system_prompt | default("You are a helpful assistant that transcribes audio input into text output in JSON format.") -%}
|
| 2 |
+
<|im_start|>system
|
| 3 |
+
{{ system_prompt }}<|im_end|>
|
| 4 |
+
{%- set audio_token = audio_token | default("<|box_start|>") -%}
|
| 5 |
+
{%- set audio_start_token = "<|object_ref_start|>" -%}
|
| 6 |
+
{%- set audio_end_token = "<|object_ref_end|>" -%}
|
| 7 |
+
{%- for message in messages -%}
|
| 8 |
+
{%- if message['role'] == 'user' -%}
|
| 9 |
+
{{ '
|
| 10 |
+
' }}<|im_start|>user{{ '
|
| 11 |
+
' }}{%- set text_items = message['content'] | selectattr('type', 'equalto', 'text') | list -%}
|
| 12 |
+
{%- set context_text = text_items[0]['text'] if text_items else none -%}
|
| 13 |
+
{%- for item in message['content'] -%}
|
| 14 |
+
{%- if item['type'] == 'audio' -%}
|
| 15 |
+
{{ audio_start_token }}{{ audio_token }}{{ audio_end_token }}{{ "
|
| 16 |
+
" }}{%- if context_text -%}
|
| 17 |
+
This is a <|AUDIO_DURATION|> seconds audio, with extra info: {{ context_text }}
|
| 18 |
+
|
| 19 |
+
Please transcribe it with these keys: Start time, End time, Speaker ID, Content{%- else -%}
|
| 20 |
+
This is a <|AUDIO_DURATION|> seconds audio, please transcribe it with these keys: Start time, End time, Speaker ID, Content{%- endif -%}
|
| 21 |
+
{%- endif -%}
|
| 22 |
+
{%- endfor -%}
|
| 23 |
+
<|im_end|>{{ '
|
| 24 |
+
' }}
|
| 25 |
+
{%- endif -%}
|
| 26 |
+
{%- endfor -%}
|
processor_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_bos_token": "<|object_ref_start|>",
|
| 3 |
+
"audio_duration_token": "<|AUDIO_DURATION|>",
|
| 4 |
+
"audio_eos_token": "<|object_ref_end|>",
|
| 5 |
+
"audio_token": "<|box_start|>",
|
| 6 |
+
"feature_extractor": {
|
| 7 |
+
"eps": 1e-06,
|
| 8 |
+
"feature_extractor_type": "VibeVoiceAcousticTokenizerFeatureExtractor",
|
| 9 |
+
"feature_size": 1,
|
| 10 |
+
"normalize_audio": true,
|
| 11 |
+
"padding_side": "right",
|
| 12 |
+
"padding_value": 0.0,
|
| 13 |
+
"return_attention_mask": true,
|
| 14 |
+
"sampling_rate": 24000,
|
| 15 |
+
"target_dB_FS": -25
|
| 16 |
+
},
|
| 17 |
+
"processor_class": "VibeVoiceAsrProcessor"
|
| 18 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
|
| 3 |
+
size 11421892
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|endoftext|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<|im_start|>",
|
| 10 |
+
"<|im_end|>",
|
| 11 |
+
"<|object_ref_start|>",
|
| 12 |
+
"<|object_ref_end|>",
|
| 13 |
+
"<|box_start|>",
|
| 14 |
+
"<|box_end|>",
|
| 15 |
+
"<|quad_start|>",
|
| 16 |
+
"<|quad_end|>",
|
| 17 |
+
"<|vision_start|>",
|
| 18 |
+
"<|vision_end|>",
|
| 19 |
+
"<|vision_pad|>",
|
| 20 |
+
"<|image_pad|>",
|
| 21 |
+
"<|video_pad|>"
|
| 22 |
+
],
|
| 23 |
+
"is_local": false,
|
| 24 |
+
"model_max_length": 131072,
|
| 25 |
+
"pad_token": "<|endoftext|>",
|
| 26 |
+
"processor_class": "VibeVoiceAsrProcessor",
|
| 27 |
+
"split_special_tokens": false,
|
| 28 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 29 |
+
"unk_token": null
|
| 30 |
+
}
|