Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

.gitattributes +1 -0
README.md +44 -0
config.json +58 -0
model.safetensors +3 -0
model.safetensors.index.json +0 -0
tekken.json +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tekken.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,44 @@

+---
+library_name: mlx-audio
+tags:
+- mlx
+- speech-to-text
+- speech
+- transcription
+- asr
+- stt
+- mlx-audio
+---
+# shreyask/voxtral-mini-4b-realtime-mlx-int4
+This model is a 4-bit quantized (affine, group size 64) MLX version of [`shreyask/voxtral-mini-4b-realtime-mlx-fp16`](https://huggingface.co/shreyask/voxtral-mini-4b-realtime-mlx-fp16), produced using mlx-audio version **0.3.2**.
+Refer to the [original model card](https://huggingface.co/shreyask/voxtral-mini-4b-realtime-mlx-fp16) for more details on the model.
+## Use with mlx-audio
+```bash
+pip install -U mlx-audio
+```
+### CLI Example:
+```bash
+python -m mlx_audio.stt.generate --model shreyask/voxtral-mini-4b-realtime-mlx-int4 --audio "audio.wav"
+```
+### Python Example:
+```python
+from mlx_audio.stt.utils import load_model
+from mlx_audio.stt.generate import generate_transcription
+model = load_model("shreyask/voxtral-mini-4b-realtime-mlx-int4")
+transcription = generate_transcription(
+    model=model,
+    audio_path="path_to_audio.wav",
+    output_path="path_to_output.txt",
+    format="txt",
+    verbose=True,
+)
+print(transcription.text)
+```

config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+    "decoder": {
+        "dim": 3072,
+        "n_layers": 26,
+        "head_dim": 128,
+        "hidden_dim": 9216,
+        "n_heads": 32,
+        "n_kv_heads": 8,
+        "vocab_size": 131072,
+        "norm_eps": 1e-05,
+        "rope_theta": 1000000.0,
+        "sliding_window": 8192,
+        "tied_embeddings": true,
+        "ada_rms_norm_t_cond": true,
+        "ada_rms_norm_t_cond_dim": 32
+    },
+    "encoder_args": {
+        "audio_encoding_args": {
+            "sampling_rate": 16000,
+            "frame_rate": 12.5,
+            "num_mel_bins": 128,
+            "hop_length": 160,
+            "window_size": 400,
+            "chunk_length_s": null,
+            "global_log_mel_max": 1.5,
+            "transcription_format": "streaming"
+        },
+        "dim": 1280,
+        "n_layers": 32,
+        "head_dim": 64,
+        "hidden_dim": 5120,
+        "n_heads": 32,
+        "vocab_size": 131072,
+        "n_kv_heads": 32,
+        "use_biases": true,
+        "use_cache": false,
+        "rope_theta": 1000000.0,
+        "causal": true,
+        "norm_eps": 1e-05,
+        "pos_embed": "rope",
+        "max_source_positions": null,
+        "ffn_type": "swiglu",
+        "norm_type": "rms_norm",
+        "sliding_window": 750,
+        "downsample_factor": 4
+    },
+    "model_type": "voxtral_realtime",
+    "quantization": {
+        "group_size": 64,
+        "bits": 4,
+        "mode": "affine"
+    },
+    "quantization_config": {
+        "group_size": 64,
+        "bits": 4,
+        "mode": "affine"
+    }
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f59b425d8a1ceb2de795454558be63937cf75b59f9c9bc77accd85aaf32af05
+size 3133798126

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tekken.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8434af1d39eba99f0ef46cf1450bf1a63fa941a26933a1ef5dbbf4adf0d00e44
+size 14910348