shreyask committed on
Commit
c57eb15
·
verified ·
1 Parent(s): d231d47

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tekken.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: mlx-audio
3
+ tags:
4
+ - mlx
5
+ - speech-to-text
6
+ - speech
7
+ - transcription
8
+ - asr
9
+ - stt
10
+ - mlx-audio
11
+ ---
12
+ # shreyask/voxtral-mini-4b-realtime-mlx-int4
13
+
14
+ This model was converted to MLX format and quantized to 4 bits (affine mode, group size 64) from [`shreyask/voxtral-mini-4b-realtime-mlx-fp16`](https://huggingface.co/shreyask/voxtral-mini-4b-realtime-mlx-fp16) using mlx-audio version **0.3.2**.
15
+
16
+ Refer to the [original model card](https://huggingface.co/shreyask/voxtral-mini-4b-realtime-mlx-fp16) for more details on the model.
17
+
18
+ ## Use with mlx-audio
19
+
20
+ ```bash
21
+ pip install -U mlx-audio
22
+ ```
23
+
24
+ ### CLI Example:
25
+ ```bash
26
+ python -m mlx_audio.stt.generate --model shreyask/voxtral-mini-4b-realtime-mlx-int4 --audio "audio.wav"
27
+ ```
28
+
29
+ ### Python Example:
30
+ ```python
31
+ from mlx_audio.stt.utils import load_model
32
+ from mlx_audio.stt.generate import generate_transcription
33
+
34
+ model = load_model("shreyask/voxtral-mini-4b-realtime-mlx-int4")
35
+ transcription = generate_transcription(
36
+ model=model,
37
+ audio_path="path_to_audio.wav",
38
+ output_path="path_to_output.txt",
39
+ format="txt",
40
+ verbose=True,
41
+ )
42
+ print(transcription.text)
43
+
44
+ ```
config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "decoder": {
3
+ "dim": 3072,
4
+ "n_layers": 26,
5
+ "head_dim": 128,
6
+ "hidden_dim": 9216,
7
+ "n_heads": 32,
8
+ "n_kv_heads": 8,
9
+ "vocab_size": 131072,
10
+ "norm_eps": 1e-05,
11
+ "rope_theta": 1000000.0,
12
+ "sliding_window": 8192,
13
+ "tied_embeddings": true,
14
+ "ada_rms_norm_t_cond": true,
15
+ "ada_rms_norm_t_cond_dim": 32
16
+ },
17
+ "encoder_args": {
18
+ "audio_encoding_args": {
19
+ "sampling_rate": 16000,
20
+ "frame_rate": 12.5,
21
+ "num_mel_bins": 128,
22
+ "hop_length": 160,
23
+ "window_size": 400,
24
+ "chunk_length_s": null,
25
+ "global_log_mel_max": 1.5,
26
+ "transcription_format": "streaming"
27
+ },
28
+ "dim": 1280,
29
+ "n_layers": 32,
30
+ "head_dim": 64,
31
+ "hidden_dim": 5120,
32
+ "n_heads": 32,
33
+ "vocab_size": 131072,
34
+ "n_kv_heads": 32,
35
+ "use_biases": true,
36
+ "use_cache": false,
37
+ "rope_theta": 1000000.0,
38
+ "causal": true,
39
+ "norm_eps": 1e-05,
40
+ "pos_embed": "rope",
41
+ "max_source_positions": null,
42
+ "ffn_type": "swiglu",
43
+ "norm_type": "rms_norm",
44
+ "sliding_window": 750,
45
+ "downsample_factor": 4
46
+ },
47
+ "model_type": "voxtral_realtime",
48
+ "quantization": {
49
+ "group_size": 64,
50
+ "bits": 4,
51
+ "mode": "affine"
52
+ },
53
+ "quantization_config": {
54
+ "group_size": 64,
55
+ "bits": 4,
56
+ "mode": "affine"
57
+ }
58
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f59b425d8a1ceb2de795454558be63937cf75b59f9c9bc77accd85aaf32af05
3
+ size 3133798126
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
tekken.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8434af1d39eba99f0ef46cf1450bf1a63fa941a26933a1ef5dbbf4adf0d00e44
3
+ size 14910348