Dubedo committed on
Commit
942690e
·
verified ·
1 Parent(s): 171f1d7

Upload processor

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- set system_prompt = system_prompt | default("You are a helpful assistant that transcribes audio input into text output in JSON format.") -%}
2
+ <|im_start|>system
3
+ {{ system_prompt }}<|im_end|>
4
+ {%- set audio_token = audio_token | default("<|box_start|>") -%}
5
+ {%- set audio_start_token = "<|object_ref_start|>" -%}
6
+ {%- set audio_end_token = "<|object_ref_end|>" -%}
7
+ {%- for message in messages -%}
8
+ {%- if message['role'] == 'user' -%}
9
+ {{ '
10
+ ' }}<|im_start|>user{{ '
11
+ ' }}{%- set text_items = message['content'] | selectattr('type', 'equalto', 'text') | list -%}
12
+ {%- set context_text = text_items[0]['text'] if text_items else none -%}
13
+ {%- for item in message['content'] -%}
14
+ {%- if item['type'] == 'audio' -%}
15
+ {{ audio_start_token }}{{ audio_token }}{{ audio_end_token }}{{ "
16
+ " }}{%- if context_text -%}
17
+ This is a <|AUDIO_DURATION|> seconds audio, with extra info: {{ context_text }}
18
+
19
+ Please transcribe it with these keys: Start time, End time, Speaker ID, Content{%- else -%}
20
+ This is a <|AUDIO_DURATION|> seconds audio, please transcribe it with these keys: Start time, End time, Speaker ID, Content{%- endif -%}
21
+ {%- endif -%}
22
+ {%- endfor -%}
23
+ <|im_end|>{{ '
24
+ ' }}
25
+ {%- endif -%}
26
+ {%- endfor -%}
processor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_bos_token": "<|object_ref_start|>",
3
+ "audio_duration_token": "<|AUDIO_DURATION|>",
4
+ "audio_eos_token": "<|object_ref_end|>",
5
+ "audio_token": "<|box_start|>",
6
+ "feature_extractor": {
7
+ "eps": 1e-06,
8
+ "feature_extractor_type": "VibeVoiceAcousticTokenizerFeatureExtractor",
9
+ "feature_size": 1,
10
+ "normalize_audio": true,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "return_attention_mask": true,
14
+ "sampling_rate": 24000,
15
+ "target_dB_FS": -25
16
+ },
17
+ "processor_class": "VibeVoiceAsrProcessor"
18
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "processor_class": "VibeVoiceAsrProcessor",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "Qwen2Tokenizer",
29
+ "unk_token": null
30
+ }