controlllmai Xenova HF Staff commited on
Commit
783fec2
·
0 Parent(s):

Duplicate from onnx-community/Voxtral-Mini-3B-2507-ONNX

Browse files

Co-authored-by: Joshua <Xenova@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +65 -0
  2. README.md +178 -0
  3. chat_template.jinja +28 -0
  4. config.json +81 -0
  5. generation_config.json +6 -0
  6. onnx/audio_encoder.onnx +3 -0
  7. onnx/audio_encoder.onnx_data +3 -0
  8. onnx/audio_encoder.onnx_data_1 +3 -0
  9. onnx/audio_encoder_bnb4.onnx +3 -0
  10. onnx/audio_encoder_bnb4.onnx_data +3 -0
  11. onnx/audio_encoder_fp16.onnx +3 -0
  12. onnx/audio_encoder_fp16.onnx_data +3 -0
  13. onnx/audio_encoder_int8.onnx +3 -0
  14. onnx/audio_encoder_int8.onnx_data +3 -0
  15. onnx/audio_encoder_q4.onnx +3 -0
  16. onnx/audio_encoder_q4.onnx_data +3 -0
  17. onnx/audio_encoder_q4f16.onnx +3 -0
  18. onnx/audio_encoder_q4f16.onnx_data +3 -0
  19. onnx/audio_encoder_quantized.onnx +3 -0
  20. onnx/audio_encoder_quantized.onnx_data +3 -0
  21. onnx/audio_encoder_uint8.onnx +3 -0
  22. onnx/audio_encoder_uint8.onnx_data +3 -0
  23. onnx/decoder_model_merged.onnx +3 -0
  24. onnx/decoder_model_merged.onnx_data +3 -0
  25. onnx/decoder_model_merged.onnx_data_1 +3 -0
  26. onnx/decoder_model_merged.onnx_data_2 +3 -0
  27. onnx/decoder_model_merged.onnx_data_3 +3 -0
  28. onnx/decoder_model_merged.onnx_data_4 +3 -0
  29. onnx/decoder_model_merged.onnx_data_5 +3 -0
  30. onnx/decoder_model_merged.onnx_data_6 +3 -0
  31. onnx/decoder_model_merged.onnx_data_7 +3 -0
  32. onnx/decoder_model_merged_fp16.onnx +3 -0
  33. onnx/decoder_model_merged_fp16.onnx_data +3 -0
  34. onnx/decoder_model_merged_fp16.onnx_data_1 +3 -0
  35. onnx/decoder_model_merged_fp16.onnx_data_2 +3 -0
  36. onnx/decoder_model_merged_fp16.onnx_data_3 +3 -0
  37. onnx/decoder_model_merged_q4.onnx +3 -0
  38. onnx/decoder_model_merged_q4.onnx_data +3 -0
  39. onnx/decoder_model_merged_q4.onnx_data_1 +3 -0
  40. onnx/decoder_model_merged_q4f16.onnx +3 -0
  41. onnx/decoder_model_merged_q4f16.onnx_data +3 -0
  42. onnx/embed_tokens.onnx +3 -0
  43. onnx/embed_tokens.onnx_data +3 -0
  44. onnx/embed_tokens_fp16.onnx +3 -0
  45. onnx/embed_tokens_fp16.onnx_data +3 -0
  46. onnx/embed_tokens_q4.onnx +3 -0
  47. onnx/embed_tokens_q4.onnx_data +3 -0
  48. onnx/embed_tokens_quantized.onnx +3 -0
  49. onnx/embed_tokens_quantized.onnx_data +3 -0
  50. preprocessor_config.json +15 -0
.gitattributes ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ onnx/decoder_model_merged.onnx_data filter=lfs diff=lfs merge=lfs -text
37
+ onnx/decoder_model_merged.onnx_data_1 filter=lfs diff=lfs merge=lfs -text
38
+ onnx/decoder_model_merged.onnx_data_2 filter=lfs diff=lfs merge=lfs -text
39
+ onnx/decoder_model_merged.onnx_data_3 filter=lfs diff=lfs merge=lfs -text
40
+ onnx/decoder_model_merged.onnx_data_4 filter=lfs diff=lfs merge=lfs -text
41
+ onnx/decoder_model_merged.onnx_data_5 filter=lfs diff=lfs merge=lfs -text
42
+ onnx/decoder_model_merged.onnx_data_6 filter=lfs diff=lfs merge=lfs -text
43
+ onnx/decoder_model_merged.onnx_data_7 filter=lfs diff=lfs merge=lfs -text
44
+ onnx/decoder_model_merged_fp16.onnx_data filter=lfs diff=lfs merge=lfs -text
45
+ onnx/decoder_model_merged_fp16.onnx_data_1 filter=lfs diff=lfs merge=lfs -text
46
+ onnx/decoder_model_merged_fp16.onnx_data_2 filter=lfs diff=lfs merge=lfs -text
47
+ onnx/decoder_model_merged_fp16.onnx_data_3 filter=lfs diff=lfs merge=lfs -text
48
+ onnx/decoder_model_merged_q4.onnx_data filter=lfs diff=lfs merge=lfs -text
49
+ onnx/decoder_model_merged_q4.onnx_data_1 filter=lfs diff=lfs merge=lfs -text
50
+ onnx/decoder_model_merged_q4f16.onnx_data filter=lfs diff=lfs merge=lfs -text
51
+ onnx/embed_tokens.onnx_data filter=lfs diff=lfs merge=lfs -text
52
+ onnx/embed_tokens_fp16.onnx_data filter=lfs diff=lfs merge=lfs -text
53
+ tekken.json filter=lfs diff=lfs merge=lfs -text
54
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
55
+ onnx/audio_encoder.onnx_data filter=lfs diff=lfs merge=lfs -text
56
+ onnx/audio_encoder.onnx_data_1 filter=lfs diff=lfs merge=lfs -text
57
+ onnx/embed_tokens_q4.onnx_data filter=lfs diff=lfs merge=lfs -text
58
+ onnx/embed_tokens_quantized.onnx_data filter=lfs diff=lfs merge=lfs -text
59
+ onnx/audio_encoder_bnb4.onnx_data filter=lfs diff=lfs merge=lfs -text
60
+ onnx/audio_encoder_fp16.onnx_data filter=lfs diff=lfs merge=lfs -text
61
+ onnx/audio_encoder_int8.onnx_data filter=lfs diff=lfs merge=lfs -text
62
+ onnx/audio_encoder_q4.onnx_data filter=lfs diff=lfs merge=lfs -text
63
+ onnx/audio_encoder_q4f16.onnx_data filter=lfs diff=lfs merge=lfs -text
64
+ onnx/audio_encoder_quantized.onnx_data filter=lfs diff=lfs merge=lfs -text
65
+ onnx/audio_encoder_uint8.onnx_data filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ - fr
5
+ - de
6
+ - es
7
+ - it
8
+ - pt
9
+ - nl
10
+ - hi
11
+ license: apache-2.0
12
+ library_name: transformers.js
13
+ base_model:
14
+ - mistralai/Voxtral-Mini-3B-2507
15
+ pipeline_tag: audio-text-to-text
16
+ ---
17
+
18
+ # Voxtral Mini 1.0 (3B) - 2507
19
+
20
+ Voxtral Mini is an enhancement of [Ministral 3B](https://mistral.ai/news/ministraux), incorporating state-of-the-art audio input capabilities while retaining best-in-class text performance. It excels at speech transcription, translation and audio understanding.
21
+
22
+ This repository contains ONNX weights for the original model, [mistralai/Voxtral-Mini-3B-2507](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507).
23
+
24
+ Learn more about Voxtral in their blog post [here](https://mistral.ai/news/voxtral).
25
+
26
+ ## Key Features
27
+
28
+ Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
29
+ - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
30
+ - **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding
31
+ - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
32
+ - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
33
+ - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
34
+ - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B
35
+
36
+ ## Benchmark Results
37
+
38
+ ### Audio
39
+
40
+ Average word error rate (WER) over the FLEURS, Mozilla Common Voice and Multilingual LibriSpeech benchmarks:
41
+
42
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64161701107962562e9b1006/puASxtajF1lDeGYPrRK5y.png)
43
+
44
+ ### Text
45
+
46
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/5dfcb1aada6d0311fd3d5448/iH9V8JVtMoaGlqJd6FIri.png)
47
+
48
+ ## Usage
49
+
50
+ **Notes**:
51
+
52
+ - `temperature=0.2` and `top_p=0.95` for chat completion (*e.g. Audio Understanding*) and `temperature=0.0` for transcription
53
+ - Multiple audios per message and multiple user turns with audio are supported
54
+ - System prompts are not yet supported
55
+
56
+
57
+ ### Transformers.js
58
+
59
+ #### Online demo
60
+
61
+ Try it out with our [online demo](https://huggingface.co/spaces/webml-community/Voxtral-WebGPU):
62
+
63
+ <video controls src="https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/3z0psEz3VS4kbscvXEE4n.mp4"></video>
64
+
65
+
66
+ #### Code snippets
67
+
68
+ If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using:
69
+ ```bash
70
+ npm i @huggingface/transformers
71
+ ```
72
+
73
+ **Example**: Transcription
74
+
75
+ ```js
76
+ import { VoxtralForConditionalGeneration, VoxtralProcessor, TextStreamer, read_audio } from "@huggingface/transformers";
77
+
78
+ // Load the processor and model
79
+ const model_id = "onnx-community/Voxtral-Mini-3B-2507-ONNX";
80
+ const processor = await VoxtralProcessor.from_pretrained(model_id);
81
+ const model = await VoxtralForConditionalGeneration.from_pretrained(
82
+ model_id,
83
+ {
84
+ dtype: {
85
+ embed_tokens: "fp16", // "fp32", "fp16", "q8", "q4"
86
+ audio_encoder: "q4", // "fp32", "fp16", "q8", "q4", "q4f16"
87
+ decoder_model_merged: "q4", // "q4", "q4f16"
88
+ },
89
+ device: "webgpu",
90
+ },
91
+ );
92
+
93
+ // Prepare the conversation
94
+ const conversation = [
95
+ {
96
+ "role": "user",
97
+ "content": [
98
+ { "type": "audio" },
99
+ { "type": "text", "text": "lang:en [TRANSCRIBE]" },
100
+ ],
101
+ }
102
+ ];
103
+ const text = processor.apply_chat_template(conversation, { tokenize: false });
104
+ const audio = await read_audio("https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/mlk.wav", 16000);
105
+ const inputs = await processor(text, audio);
106
+
107
+ // Generate the response
108
+ const generated_ids = await model.generate({
109
+ ...inputs,
110
+ max_new_tokens: 256,
111
+ streamer: new TextStreamer(processor.tokenizer, { skip_special_tokens: true, skip_prompt: true }),
112
+ });
113
+
114
+ // Decode the generated tokens
115
+ const new_tokens = generated_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
116
+ const generated_texts = processor.batch_decode(
117
+ new_tokens,
118
+ { skip_special_tokens: true },
119
+ );
120
+ console.log(generated_texts[0]);
121
+ // I have a dream that one day this nation will rise up and live out the true meaning of its creed.
122
+ ```
123
+
124
+
125
+ **Example**: Audio understanding
126
+
127
+ ```js
128
+ import { VoxtralForConditionalGeneration, VoxtralProcessor, TextStreamer, read_audio } from "@huggingface/transformers";
129
+
130
+ // Load the processor and model
131
+ const model_id = "onnx-community/Voxtral-Mini-3B-2507-ONNX";
132
+ const processor = await VoxtralProcessor.from_pretrained(model_id);
133
+ const model = await VoxtralForConditionalGeneration.from_pretrained(
134
+ model_id,
135
+ {
136
+ dtype: {
137
+ embed_tokens: "fp16", // "fp32", "fp16", "q8", "q4"
138
+ audio_encoder: "q4", // "fp32", "fp16", "q8", "q4", "q4f16"
139
+ decoder_model_merged: "q4", // "q4", "q4f16"
140
+ },
141
+ device: "webgpu",
142
+ },
143
+ );
144
+
145
+ // Prepare the conversation
146
+ const conversation = [
147
+ {
148
+ "role": "user",
149
+ "content": [
150
+ { "type": "audio" },
151
+ { "type": "audio" },
152
+ { "type": "text", "text": "Describe these two audio clips in detail." },
153
+ ],
154
+ }
155
+ ];
156
+ const text = processor.apply_chat_template(conversation, { tokenize: false });
157
+ const audio = await Promise.all([
158
+ read_audio("https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav", 16000),
159
+ read_audio("https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/mlk.wav", 16000),
160
+ ]);
161
+ const inputs = await processor(text, audio);
162
+
163
+ // Generate the response
164
+ const generated_ids = await model.generate({
165
+ ...inputs,
166
+ max_new_tokens: 256,
167
+ streamer: new TextStreamer(processor.tokenizer, { skip_special_tokens: true, skip_prompt: true }),
168
+ });
169
+
170
+ // Decode the generated tokens
171
+ const new_tokens = generated_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
172
+ const generated_texts = processor.batch_decode(
173
+ new_tokens,
174
+ { skip_special_tokens: true },
175
+ );
176
+ console.log(generated_texts[0]);
177
+ // The first audio clip is a speech by a leader, likely a politician or a public figure, addressing a large audience. The speaker begins by encouraging the listeners to ask not what their country can do for them, but what they can do for their country. This is a call to action and a reminder of the individual's responsibility to contribute to the nation's well-being. The second audio clip is a passionate speech by a different leader, possibly a civil rights activist or a community organizer. This speaker expresses a dream of a nation that will rise up and live out the true meaning of its creed, suggesting a vision of a more just and equitable society.
178
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- for message in messages -%}
3
+ {%- if (message["role"] == "user") == (loop.index % 2 == 0) -%}
4
+ {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") -}}
5
+ {%- endif -%}
6
+ {%- if message["role"] == "user" -%}
7
+ {{- "[INST]" -}}
8
+ {%- if message["content"] is string -%}
9
+ {{- message["content"] -}}
10
+ {%- else -%}
11
+ {%- for item in message["content"] -%}
12
+ {%- if item["type"] == "audio" -%}
13
+ {{- "[AUDIO]" -}}
14
+ {%- elif item["type"] == "text" -%}
15
+ {{- item["text"] -}}
16
+ {%- endif -%}
17
+ {%- endfor -%}
18
+ {%- endif -%}
19
+ {{- "[/INST]" -}}
20
+ {%- elif message["role"] == "assistant" -%}
21
+ {%- if message["content"] is not string -%}
22
+ {{- raise_exception("Assistant message content should be a string.") -}}
23
+ {%- endif -%}
24
+ {{- message["content"] + eos_token -}}
25
+ {%- else -%}
26
+ {{- raise_exception("Only user and assistant roles are supported!") -}}
27
+ {%- endif -%}
28
+ {%- endfor -%}
config.json ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "VoxtralForConditionalGeneration"
4
+ ],
5
+ "audio_config": {
6
+ "activation_dropout": 0.0,
7
+ "activation_function": "gelu",
8
+ "attention_dropout": 0.0,
9
+ "dropout": 0.0,
10
+ "head_dim": 64,
11
+ "hidden_size": 1280,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5120,
14
+ "layerdrop": 0.0,
15
+ "max_source_positions": 1500,
16
+ "model_type": "voxtral_encoder",
17
+ "num_attention_heads": 20,
18
+ "num_hidden_layers": 32,
19
+ "num_key_value_heads": 20,
20
+ "num_mel_bins": 128,
21
+ "scale_embedding": false,
22
+ "vocab_size": 51866
23
+ },
24
+ "audio_token_id": 24,
25
+ "hidden_size": 3072,
26
+ "model_type": "voxtral",
27
+ "projector_hidden_act": "gelu",
28
+ "text_config": {
29
+ "attention_bias": false,
30
+ "attention_dropout": 0.0,
31
+ "head_dim": 128,
32
+ "hidden_act": "silu",
33
+ "hidden_size": 3072,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 8192,
36
+ "max_position_embeddings": 131072,
37
+ "mlp_bias": false,
38
+ "model_type": "llama",
39
+ "num_attention_heads": 32,
40
+ "num_hidden_layers": 30,
41
+ "num_key_value_heads": 8,
42
+ "pretraining_tp": 1,
43
+ "rms_norm_eps": 1e-05,
44
+ "rope_scaling": null,
45
+ "rope_theta": 100000000.0,
46
+ "sliding_window": null,
47
+ "use_cache": true,
48
+ "vocab_size": 131072
49
+ },
50
+ "torch_dtype": "bfloat16",
51
+ "transformers_version": "4.54.0.dev0",
52
+
53
+ "transformers.js_config": {
54
+ "dtype": {
55
+ "embed_tokens": "fp16"
56
+ },
57
+ "kv_cache_dtype": {
58
+ "q4f16": "float16",
59
+ "fp16": "float16"
60
+ },
61
+ "use_external_data_format": {
62
+ "audio_encoder.onnx": 2,
63
+ "audio_encoder_fp16.onnx": 1,
64
+ "audio_encoder_int8.onnx": 1,
65
+ "audio_encoder_uint8.onnx": 1,
66
+ "audio_encoder_quantized.onnx": 1,
67
+ "audio_encoder_q4.onnx": 1,
68
+ "audio_encoder_q4f16.onnx": 1,
69
+ "audio_encoder_bnb4.onnx": 1,
70
+ "decoder_model_merged.onnx": 8,
71
+ "decoder_model_merged_fp16.onnx": 4,
72
+ "decoder_model_merged_q4.onnx": 2,
73
+ "decoder_model_merged_q4f16.onnx": 1,
74
+ "embed_tokens.onnx": 1,
75
+ "embed_tokens_fp16.onnx": 1,
76
+ "embed_tokens_quantized.onnx": 1,
77
+ "embed_tokens_q4.onnx": 1
78
+ }
79
+ },
80
+ "vocab_size": 131072
81
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "pad_token_id": 11,
5
+ "transformers_version": "4.54.0.dev0"
6
+ }
onnx/audio_encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47fc3e2c046b8c9ba19dc4ffe195b6606840edc74179e69bd4f7261686cff511
3
+ size 357190
onnx/audio_encoder.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb1a2baa8eb44a51cbef5cc91ead26a7226c8004d0084b873dedbc4efd1712ba
3
+ size 2095319040
onnx/audio_encoder.onnx_data_1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2125dbd94d2405064f503c9ca13c1efdddfdd9df2854aeb76e0a2ac77111dc29
3
+ size 553220096
onnx/audio_encoder_bnb4.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4418e53b50540e6581cbce958a1d705c31741965ed01fd42346d47254006d6a
3
+ size 404516
onnx/audio_encoder_bnb4.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5652ecf0027b5e65f797c1f444c538e6673940fe9116c769c426890f8d54fd9e
3
+ size 399343616
onnx/audio_encoder_fp16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8367654c6e1b64e799fd581bc607aca3aa77b6bcb0a532a6326a1932aa6728b
3
+ size 359615
onnx/audio_encoder_fp16.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1c5a7fb2d6784483943d30040b2c17d5186cbd5f7ab14e027c3abd7e566a32e
3
+ size 1324269568
onnx/audio_encoder_int8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddb743e492888cf8eb3fbc77efbfd6a5154f4b21a39a8d3e22cf29832719d283
3
+ size 600567
onnx/audio_encoder_int8.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf06e83302b02f9fb59751eaf594d8677c1d3d5ae2541d8b28cceedaeeb0f819
3
+ size 669384704
onnx/audio_encoder_q4.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38b4e8bc6b7dda75a3c52e358a44a8c4e144b4878ff8867bd0cb41a402d4af81
3
+ size 401545
onnx/audio_encoder_q4.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07537c1791da2d198edca192b7b15c4199c1c70c3eb095d7a432431bc52e1ae5
3
+ size 440238080
onnx/audio_encoder_q4f16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:156a6291053180757636c0caf5f6be3a4e08b90afd93d77646a6bd609ace206e
3
+ size 403958
onnx/audio_encoder_q4f16.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62eba37624325b3b21c51edf6c6823aaa8ce9657979dbffc31751c7f0d8d9709
3
+ size 383696896
onnx/audio_encoder_quantized.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4db2e57b57f836f3f12deecba456785c888ddc4768cbe1d6c28e379ee479dc79
3
+ size 603112
onnx/audio_encoder_quantized.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29a112e764c22a77d20fe5b4925c709a2188a3e5264ef76116420db469fb7fdf
3
+ size 669384704
onnx/audio_encoder_uint8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:644e50318802c691623b85fd940ac60164cd904b72d1d81f658fbbab22c6506d
3
+ size 601155
onnx/audio_encoder_uint8.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29a112e764c22a77d20fe5b4925c709a2188a3e5264ef76116420db469fb7fdf
3
+ size 669384704
onnx/decoder_model_merged.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e101f5f00fad06ff4de9ba06a18e529b1b238c77cecbaf7a83f2dd21f336209
3
+ size 226717
onnx/decoder_model_merged.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f08d2cb408774380ae5f25aedb2b29f08b2a67167bfaddb83833127d0988272d
3
+ size 2005000192
onnx/decoder_model_merged.onnx_data_1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8824af4dd1a6bda4c348924701d2b78e9b9b982eee6e79c00c311fbe71436720
3
+ size 2038554624
onnx/decoder_model_merged.onnx_data_2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:906367bee2870dedf649e48c59cbba1602b0c009b80a2fe75d1e3ead80ed4eee
3
+ size 2088873984
onnx/decoder_model_merged.onnx_data_3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f0dd468461fb57d33b5a3cca33442bcf7f53c73d9a5818419d0cdb16c6340e5
3
+ size 2063720448
onnx/decoder_model_merged.onnx_data_4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e8db09e022033239103c7ab042e8297b0768d0c4566d6fa9b64611dd62f3c13
3
+ size 2038542336
onnx/decoder_model_merged.onnx_data_5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:531cd0d701944232b7487648b7154e19fab693b9ee120469543362e380923fb8
3
+ size 2038554624
onnx/decoder_model_merged.onnx_data_6 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19ce9b0777369a103d6c6f2700909b11c3ed9ac8c17ced14730829482d3d4a6a
3
+ size 629182464
onnx/decoder_model_merged.onnx_data_7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:152030f3a13ed418571c431be4275b634eb40ea554ef74f917526ee896413962
3
+ size 1610612736
onnx/decoder_model_merged_fp16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c77e254b27448c04b6f027c8040ce4f2429f999453b496ea2af8e79aa4a9c195
3
+ size 228186
onnx/decoder_model_merged_fp16.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ae4f71eb0994d28e915b5a5e12c3b1bbfb81b7ced25be4bd6698d60e71cb276
3
+ size 2072109056
onnx/decoder_model_merged_fp16.onnx_data_1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f207b9da920da6f837d35f38611c65ef3e28e089b91d19272273c4deac797204
3
+ size 2088886272
onnx/decoder_model_merged_fp16.onnx_data_2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf5cab82a3871665c485cf00d199aa55cac4249487219928cb0632527ea7228f
3
+ size 2076297216
onnx/decoder_model_merged_fp16.onnx_data_3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28b309c4dde22701d67a210b1c90bca5ae143fc2de33b98ed43ef14488880fa6
3
+ size 1019228160
onnx/decoder_model_merged_q4.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25b33b24b1737410defd6035e84664f3810d73b1591b4f350dda1911910c0175
3
+ size 306657
onnx/decoder_model_merged_q4.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:224f9159d5c1843babd6cc99f220c14ce42f7400c56a1cf5102adea8ec029b08
3
+ size 2073260032
onnx/decoder_model_merged_q4.onnx_data_1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdd87806c69de54c19576e425639109e2961196c4618843e0c0ce389af93b531
3
+ size 251658240
onnx/decoder_model_merged_q4f16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00ef33f598043ce7640b2353ca13511bb40c2ce0ecc049cc4930f8fa515cd8c2
3
+ size 308330
onnx/decoder_model_merged_q4f16.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35f8d88fe9d0af7759bbcd7e34843687e8768a4088c229757601aa6a1be1dc2a
3
+ size 2065283072
onnx/embed_tokens.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1b1350a79d62150ebb3899cbf824ccf74820b94ca23b9f0ff67b69834499944
3
+ size 299
onnx/embed_tokens.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3801b085f29d922117bcc4e0c484f1652de50b015d8ff6721303584a0d3e914f
3
+ size 1610612736
onnx/embed_tokens_fp16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed64cab8592c4731ac4c8b4fc59baad6756f040a642390ee07c7e9c9ec56879a
3
+ size 494
onnx/embed_tokens_fp16.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f8ec7534e487edaec0d707980ecf4cd4bf66db7ecc6ef8cfb5f717a565a0616
3
+ size 805306368
onnx/embed_tokens_q4.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0aced72ee68efcb5feaccbe5d3bb454e8a6d44cec9cfd6f5aece7272255d43a3
3
+ size 542
onnx/embed_tokens_q4.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a2dbc92241c261aa99b0fb920ccfdcf9c4f2d3b97ed51c06d670c99e633e594
3
+ size 251658240
onnx/embed_tokens_quantized.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:198b066c7f5bef2776e001934ac2eff76ea89e93d072d577ace5ba16ab08281e
3
+ size 552
onnx/embed_tokens_quantized.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:681ef4fdd1f9e7857872405b174dc68955cd75428e35591376c2630f7b5bdc01
3
+ size 402653184
preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 128,
6
+ "hop_length": 160,
7
+ "n_fft": 400,
8
+ "n_samples": 480000,
9
+ "nb_max_frames": 3000,
10
+ "padding_side": "right",
11
+ "padding_value": 0.0,
12
+ "processor_class": "VoxtralProcessor",
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ }