trishtan committed
Commit a9d84c0 · 0 parents

Duplicate from trishtan/voxtral-sentinel-4b
Files changed (7)
  1. .gitattributes +36 -0
  2. README.md +263 -0
  3. config.json +66 -0
  4. generation_config.json +12 -0
  5. model.safetensors +3 -0
  6. tekken.json +3 -0
  7. training_args.bin +3 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tekken.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,263 @@
---
base_model: mistralai/Voxtral-Mini-4B-Realtime-2602
library_name: transformers
model_name: voxtral-sentinel-4b
datasets:
- trishtan/voxtral-forensic-ds
tags:
- audio
- multimodal
- emotion-recognition
- customer-support
- emergency-services
- sft
- trl
- hf_jobs
language:
- en
license: apache-2.0
---

# Model Card for voxtral-sentinel-4b

**voxtral-sentinel-4b** is a fine-tuned version of [mistralai/Voxtral-Mini-4B-Realtime-2602](https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602), trained with [TRL](https://github.com/huggingface/trl) and specialised for real-time audio understanding in high-stakes operational environments. Given a raw audio recording, the model produces a structured output containing a verbatim transcript, a contextual analysis of speaker emotion and situation, and a recommended action — enabling autonomous routing and triage without human-in-the-loop intervention.

Built for two primary verticals:

- **Automated customer support** — classify caller intent and emotional state to route calls, trigger escalations, or generate automated responses in real time
- **Emergency services & safety** — identify distress, urgency, and situational context from audio to assist dispatchers or fully autonomous response systems

---

## Model Details

| Property | Value |
|---|---|
| **Base model** | mistralai/Voxtral-Mini-4B-Realtime-2602 |
| **Model type** | Audio-to-text (multimodal) |
| **Parameters** | ~4B |
| **Fine-tune method** | Full fine-tune (no LoRA) |
| **Precision** | bfloat16 |
| **Training hardware** | NVIDIA A100 |
| **Framework** | Transformers + TRL SFTTrainer |
| **Language** | English |
| **License** | Apache 2.0 (see base model license) |

---

## Training

[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/s222458666/voxtral-sentinel/runs/uouz4iq1)

[<img src="https://img.shields.io/badge/GitHub-Forensic--Audio-181717?logo=github&style=flat" alt="View on GitHub" width="150" height="24"/>](https://github.com/SageRish/Forensic-Audio)

### Dataset

Fine-tuned on [voxtral-forensic-ds](https://huggingface.co/datasets/trishtan/voxtral-forensic-ds), a curated dataset of ~9,984 audio samples with structured annotations. Each sample pairs a raw audio clip with a ground-truth output in the following canonical format:

```
### TRANSCRIPT:
<verbatim transcription of the audio>

### ANALYSIS:
<contextual analysis of speaker emotion, tone, and situation>

### CONCLUSION:
<recommended action or classification>
```

The dataset was derived from [MELD (Multimodal EmotionLines Dataset)](https://huggingface.co/datasets/ajyy/MELD_audio), which contains emotionally rich conversational audio from multi-speaker dialogue scenarios, and [DCASE 2025 Task 1](https://dcase.community/challenge2025/task-low-complexity-acoustic-scene-classification-with-device-information) (Acoustic Scene Classification). Annotations were generated and standardised using automated pipelines with LLM-assisted formatting normalisation.

A 90/10 train/eval split was used with a fixed seed (42) for reproducibility. The final training dataset and held-out eval split are available at [trishtan/voxtral-forensic-ds](https://huggingface.co/datasets/trishtan/voxtral-forensic-ds).
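
The split can be reproduced deterministically. The sketch below is an illustrative pure-Python version (the card does not publish the actual pipeline, which may instead use `datasets.Dataset.train_test_split`; `train_eval_split` is a hypothetical helper, and the sample count is taken from the figure above):

```python
import random

def train_eval_split(samples, eval_frac=0.1, seed=42):
    """Deterministic 90/10 split; illustrative only, not the released pipeline."""
    idx = list(range(len(samples)))
    random.Random(seed).shuffle(idx)          # fixed seed => reproducible order
    eval_idx = set(idx[:int(len(samples) * eval_frac)])
    train = [s for i, s in enumerate(samples) if i not in eval_idx]
    evals = [s for i, s in enumerate(samples) if i in eval_idx]
    return train, evals

train, evals = train_eval_split(list(range(9984)))
print(len(train), len(evals))  # 8986 998
```

Re-running with the same seed yields the identical partition, which is what makes the held-out eval split stable across runs.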

### Hyperparameters

| Parameter | Value |
|---|---|
| Epochs | 5 (early stopping at eval loss < 1.15) |
| Learning rate | 5e-6 |
| LR scheduler | Cosine |
| Warmup ratio | 0.05 |
| Batch size (per device) | 2 |
| Gradient accumulation steps | 4 |
| Effective batch size | 8 |
| Max grad norm | 1.0 |
| Precision | bf16 |
| Eval strategy | Every 100 steps |
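
The effective batch size follows from the other rows, assuming a single A100 as stated in the hardware row (the device count is an assumption, not stated explicitly in the card):

```python
per_device_batch = 2   # "Batch size (per device)"
grad_accum_steps = 4   # "Gradient accumulation steps"
num_devices = 1        # assumption: one A100

effective_batch = per_device_batch * grad_accum_steps * num_devices
print(effective_batch)  # 8
```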

### Training Results

| Metric | Value |
|---|---|
| Final eval loss | 1.148 |
| Final eval mean token accuracy | 74.35% |
| Train/eval accuracy gap | ~0% |
| Stopped at epoch | 2.75 (early stopping) |

The near-zero gap between train and eval accuracy across all runs indicates the model generalises well to unseen audio with no measurable overfitting.

---

## Usage

```python
import torch
import soundfile as sf
import numpy as np
from transformers import AutoProcessor, VoxtralRealtimeForConditionalGeneration

model_id = "trishtan/voxtral-sentinel-4b"

processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = VoxtralRealtimeForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Load your audio (16 kHz mono; resample first if your file differs)
audio, sr = sf.read("your_audio.wav")
if audio.ndim > 1:
    audio = audio.mean(axis=1)  # downmix to mono
audio = audio.astype(np.float32)

PROMPT = "[INST] Analyze this recording for forensic indicators. [/INST]"

audio_inputs = processor.feature_extractor(
    [audio], sampling_rate=16000, return_tensors="pt", padding=True,
)
text_inputs = processor.tokenizer(
    [PROMPT], return_tensors="pt", padding=True,
)
inputs = {**audio_inputs, **text_inputs}
inputs = {k: v.to(model.device) for k, v in inputs.items()}

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)

response = processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(response)
```

### Expected Output Format

```
### TRANSCRIPT:
I need help immediately, my neighbour hasn't responded in hours and I can hear something...

### ANALYSIS:
The speaker exhibits elevated vocal stress indicators including increased speech rate and
pitch variance. Tone suggests genuine distress rather than rehearsed or non-urgent
communication. Situational context implies potential welfare concern for a third party.

### CONCLUSION:
Escalate to emergency services. Flag as high-priority welfare check. Do not route to
standard support queue.
```
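
Downstream systems typically need to split this output into its three sections. A minimal parser for the canonical format might look like the following (illustrative only; `parse_sentinel_output` is a hypothetical helper, not part of the released tooling):

```python
import re

def parse_sentinel_output(text):
    """Split the model's canonical output into transcript/analysis/conclusion."""
    sections = {}
    # Each section runs from its header to the next header (or end of text).
    pattern = r"### (TRANSCRIPT|ANALYSIS|CONCLUSION):\n(.*?)(?=\n### |\Z)"
    for name, body in re.findall(pattern, text, flags=re.DOTALL):
        sections[name.lower()] = body.strip()
    return sections

example = """### TRANSCRIPT:
I need help immediately...

### ANALYSIS:
The speaker exhibits elevated vocal stress indicators.

### CONCLUSION:
Escalate to emergency services."""

parsed = parse_sentinel_output(example)
print(parsed["conclusion"])  # Escalate to emergency services.
```

A robust deployment should also handle malformed outputs (missing sections, repeated headers) rather than assuming the model always emits the canonical format.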

---

## Intended Use

### In Scope
- Real-time audio triage in customer service pipelines
- Emergency call classification and dispatcher assistance
- Automated sentiment and intent detection from voice
- Proof-of-concept and research into multimodal audio understanding

### Out of Scope
- Medical diagnosis or clinical decision-making
- Surveillance or non-consensual audio analysis
- Languages other than English
- Audio clips under 3 seconds (insufficient signal for reliable analysis)

---

## Limitations

- **Short audio clips** — clips under 3 seconds are padded with silence to the model's required 15-second input window. Analysis quality degrades significantly for very short recordings.
- **Single-language** — trained exclusively on English-language audio. Performance on accented, non-native, or non-English speech is untested.
- **Emotional diversity** — training data skews toward conversational emotional registers. Performance on domain-specific audio (medical, legal, industrial) may vary.
- **Not a safety-critical system** — outputs should be reviewed by human operators in any deployment where errors have real-world consequences.
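
The silence-padding behaviour for short clips can be sketched with NumPy. This is an assumption-laden illustration: the 15-second window and 16 kHz rate come from this card, but the actual preprocessing code is not published and `pad_to_window` is a hypothetical helper:

```python
import numpy as np

TARGET_SECONDS = 15   # model's input window, per the limitation above
SAMPLE_RATE = 16_000  # expected input sample rate

def pad_to_window(audio: np.ndarray) -> np.ndarray:
    """Right-pad a mono float32 clip with silence to the 15 s window."""
    target_len = TARGET_SECONDS * SAMPLE_RATE
    if len(audio) >= target_len:
        return audio[:target_len]  # truncate clips longer than the window
    return np.pad(audio, (0, target_len - len(audio)))

clip = np.zeros(2 * SAMPLE_RATE, dtype=np.float32)  # a 2-second clip
padded = pad_to_window(clip)
print(len(padded) / SAMPLE_RATE)  # 15.0
```

A 2-second clip therefore arrives at the model as 13 seconds of silence plus 2 seconds of signal, which is why very short recordings carry so little usable information.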

---

## Data Attribution

This model was fine-tuned using audio data derived from:

**MELD — Multimodal EmotionLines Dataset**
Poria, S., Hazarika, D., Majumder, N., Naik, G., Cambria, E., & Mihalcea, R. (2019).
MELD: A multimodal multi-party dataset for emotion recognition in conversations.
In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (pp. 527–536).
Hugging Face: [ajyy/MELD_audio](https://huggingface.co/datasets/ajyy/MELD_audio)

**DCASE 2025 Challenge — Task 1: Acoustic Scene Classification**
Mesaros, A., Heittola, T., & Virtanen, T. (2018).
A multi-device dataset for urban acoustic scene classification.
In Proceedings of the Detection and Classification of Acoustic Scenes and Events 2018 Workshop (DCASE2018) (pp. 9–13).
URL: https://dcase.community/documents/workshop2018/proceedings/DCASE2018Workshop_Mesaros_8.pdf

---

## Framework Versions

- TRL: 0.29.0
- Transformers: 5.2.0
- PyTorch: 2.10.0
- Datasets: 4.6.1
- Tokenizers: 0.22.2

---

## Citation

```bibtex
@misc{voxtral-sentinel-4b,
  author = {trishtan},
  title = {voxtral-sentinel-4b: Fine-tuned Voxtral for Audio Triage},
  year = {2026},
  publisher = {Hugging Face},
  url = {https://huggingface.co/trishtan/voxtral-sentinel-4b}
}

@inproceedings{poria2019meld,
  title = {{MELD}: A multimodal multi-party dataset for emotion recognition in conversations},
  author = {Poria, Soujanya and Hazarika, Devamanyu and Majumder, Navonil and Naik, Gautam and Cambria, Erik and Mihalcea, Rada},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  pages = {527--536},
  year = {2019}
}

@inproceedings{Mesaros2018_DCASE,
  author = {Mesaros, Annamaria and Heittola, Toni and Virtanen, Tuomas},
  title = {A multi-device dataset for urban acoustic scene classification},
  booktitle = {Proceedings of the Detection and Classification of Acoustic Scenes and Events 2018 Workshop (DCASE2018)},
  month = {November},
  year = {2018},
  pages = {9--13},
  keywords = {Acoustic scene classification, DCASE challenge, public datasets, multi-device data},
  url = {https://dcase.community/documents/workshop2018/proceedings/DCASE2018Workshop_Mesaros_8.pdf}
}

@software{vonwerra2020trl,
  title = {{TRL: Transformers Reinforcement Learning}},
  author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
  license = {Apache-2.0},
  url = {https://github.com/huggingface/trl},
  year = {2020}
}
```

---

## Acknowledgements

Built on [Voxtral-Mini-4B-Realtime](https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602) by Mistral AI.
Fine-tuning infrastructure: Hugging Face Transformers, TRL, and Accelerate.
config.json ADDED
@@ -0,0 +1,66 @@
{
  "architectures": [
    "VoxtralRealtimeForConditionalGeneration"
  ],
  "audio_config": {
    "activation_function": "gelu",
    "attention_dropout": 0.0,
    "dtype": "bfloat16",
    "head_dim": 64,
    "hidden_act": "silu",
    "hidden_size": 1280,
    "initializer_range": 0.02,
    "intermediate_size": 5120,
    "max_position_embeddings": 1500,
    "model_type": "voxtral_realtime_encoder",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 32,
    "num_mel_bins": 128,
    "rms_norm_eps": 1e-05,
    "rope_parameters": {
      "rope_theta": 1000000.0,
      "rope_type": "default"
    },
    "sliding_window": 750,
    "vocab_size": 131072
  },
  "audio_length_per_tok": 8,
  "bos_token_id": 1,
  "default_num_delay_tokens": 6,
  "downsample_factor": 4,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "hidden_size": 3072,
  "model_type": "voxtral_realtime",
  "pad_token_id": 11,
  "projector_hidden_act": "gelu",
  "text_config": {
    "attention_dropout": 0.0,
    "bos_token_id": 1,
    "dtype": "bfloat16",
    "eos_token_id": 2,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 3072,
    "initializer_range": 0.02,
    "intermediate_size": 9216,
    "max_position_embeddings": 131072,
    "model_type": "voxtral_realtime_text",
    "num_attention_heads": 32,
    "num_hidden_layers": 26,
    "num_key_value_heads": 8,
    "pad_token_id": null,
    "rms_norm_eps": 1e-05,
    "rope_parameters": {
      "rope_theta": 1000000.0,
      "rope_type": "default"
    },
    "sliding_window": 8192,
    "tie_word_embeddings": true,
    "use_cache": true,
    "vocab_size": 131072
  },
  "transformers_version": "5.2.0",
  "use_cache": false
}
generation_config.json ADDED
@@ -0,0 +1,12 @@
{
  "bos_token_id": 1,
  "eos_token_id": [
    2,
    2
  ],
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 11,
  "transformers_version": "5.2.0",
  "use_cache": true
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6ee63e48011841bbf002d90f2baacd1c0474c78dddd288093cf55e645e6f363a
size 8859446848
tekken.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8434af1d39eba99f0ef46cf1450bf1a63fa941a26933a1ef5dbbf4adf0d00e44
size 14910348
training_args.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c1b4951fcbeb59169efa164df8cea10ef700d2574b2aa3da47ab8b6d0e914d01
size 5713