eustlb HF Staff committed on
Commit
b08eccf
·
verified ·
1 Parent(s): 270644e

Upload processor

Browse files
chat_template.jinja ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro to_text(content) -%}
2
+ {%- if content is string -%}
3
+ {{- content -}}
4
+ {%- elif content is iterable and content is not mapping -%}
5
+ {%- for item in content -%}
6
+ {%- if item is mapping and item.type == 'text' and item.text is defined -%}
7
+ {{- item.text -}}
8
+ {%- elif item is mapping and (item.type == 'audio' or 'audio' in item) -%}
9
+ <|begin_of_audio|><|pad|><|end_of_audio|><|user|>
10
+ {% elif item is string -%}
11
+ {{- item -}}
12
+ {%- endif -%}
13
+ {%- endfor -%}
14
+ {%- else -%}
15
+ {{- content -}}
16
+ {%- endif -%}
17
+ {%- endmacro -%}
18
+
19
+ {%- for m in messages -%}
20
+ {%- if m.role == 'system' -%}
21
+ <|system|>
22
+ {{ to_text(m.content) | trim }}
23
+
24
+ {%- elif m.role == 'user' -%}
25
+ <|user|>
26
+ {{ to_text(m.content) | trim }}
27
+
28
+ {%- elif m.role == 'assistant' -%}
29
+ <|assistant|>
30
+ {{ to_text(m.content) | trim }}
31
+
32
+ {%- endif -%}
33
+ {%- endfor -%}
34
+
35
+ {%- if add_generation_prompt -%}
36
+ <|assistant|>
37
+ {% endif -%}
processor_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_bos_token": "<|begin_of_audio|>",
3
+ "audio_eos_token": "<|end_of_audio|>",
4
+ "audio_token": "<|pad|>",
5
+ "feature_extractor": {
6
+ "chunk_length": 30,
7
+ "dither": 0.0,
8
+ "feature_extractor_type": "WhisperFeatureExtractor",
9
+ "feature_size": 128,
10
+ "hop_length": 160,
11
+ "n_fft": 400,
12
+ "n_samples": 480000,
13
+ "nb_max_frames": 3000,
14
+ "padding_side": "right",
15
+ "padding_value": 0.0,
16
+ "return_attention_mask": false,
17
+ "sampling_rate": 16000
18
+ },
19
+ "processor_class": "GlmasrProcessor"
20
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "clean_up_tokenization_spaces": false,
4
+ "do_lower_case": false,
5
+ "eos_token": "<|endoftext|>",
6
+ "extra_special_tokens": [
7
+ "<|endoftext|>",
8
+ "[MASK]",
9
+ "[gMASK]",
10
+ "[sMASK]",
11
+ "<sop>",
12
+ "<eop>",
13
+ "<|system|>",
14
+ "<|user|>",
15
+ "<|assistant|>",
16
+ "<|observation|>",
17
+ "<|begin_of_image|>",
18
+ "<|end_of_image|>"
19
+ ],
20
+ "is_local": false,
21
+ "model_input_names": [
22
+ "input_ids",
23
+ "attention_mask"
24
+ ],
25
+ "model_max_length": 65536,
26
+ "model_specific_special_tokens": {},
27
+ "pad_token": "<|endoftext|>",
28
+ "padding_side": "left",
29
+ "processor_class": "GlmasrProcessor",
30
+ "remove_space": false,
31
+ "tokenizer_class": "TokenizersBackend"
32
+ }