Automatic Speech Recognition
NeMo
Hindi
speech
conformer
ctc
hindi
speech-tagging
entity-recognition
emotion-detection
Eval Results (legacy)
Instructions to use WhissleAI/STT-meta-HI with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- NeMo
How to use WhissleAI/STT-meta-HI with NeMo:
```python
import nemo.collections.asr as nemo_asr

asr_model = nemo_asr.models.ASRModel.from_pretrained("WhissleAI/STT-meta-HI")
transcriptions = asr_model.transcribe(["file.wav"])
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "wer": 0.1808, | |
| "cer": 0.0675, | |
| "samples": 35993, | |
| "tags": { | |
| "AGE": { | |
| "accuracy": 0.4256, | |
| "macro_f1": 0.1202, | |
| "weighted_f1": 0.255, | |
| "per_class": { | |
| "AGE_0_18": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 15 | |
| }, | |
| "AGE_18_30": { | |
| "precision": 1.0, | |
| "recall": 0.0005, | |
| "f1": 0.001, | |
| "support": 1921 | |
| }, | |
| "AGE_30_45": { | |
| "precision": 0.1585, | |
| "recall": 0.0012, | |
| "f1": 0.0024, | |
| "support": 10930 | |
| }, | |
| "AGE_45_60": { | |
| "precision": 0.4262, | |
| "recall": 0.9995, | |
| "f1": 0.5976, | |
| "support": 15313 | |
| }, | |
| "AGE_60PLUS": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 7814 | |
| } | |
| }, | |
| "confusion_matrix": [ | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 15, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 1, | |
| 57, | |
| 1863, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 13, | |
| 10917, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 7, | |
| 15306, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 5, | |
| 7809, | |
| 0 | |
| ] | |
| ], | |
| "labels": [ | |
| "AGE_0_18", | |
| "AGE_18_30", | |
| "AGE_30_45", | |
| "AGE_45_60", | |
| "AGE_60PLUS" | |
| ] | |
| }, | |
| "GENDER": { | |
| "accuracy": 0.6329, | |
| "macro_f1": 0.2586, | |
| "weighted_f1": 0.4908, | |
| "per_class": { | |
| "GENDER_FEMALE": { | |
| "precision": 0.6329, | |
| "recall": 0.9999, | |
| "f1": 0.7751, | |
| "support": 22778 | |
| }, | |
| "GENDER_MALE": { | |
| "precision": 0.625, | |
| "recall": 0.0004, | |
| "f1": 0.0008, | |
| "support": 13184 | |
| }, | |
| "GENDER_OTHER": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 31 | |
| } | |
| }, | |
| "confusion_matrix": [ | |
| [ | |
| 22775, | |
| 3, | |
| 0 | |
| ], | |
| [ | |
| 13179, | |
| 5, | |
| 0 | |
| ], | |
| [ | |
| 31, | |
| 0, | |
| 0 | |
| ] | |
| ], | |
| "labels": [ | |
| "GENDER_FEMALE", | |
| "GENDER_MALE", | |
| "GENDER_OTHER" | |
| ] | |
| }, | |
| "EMOTION": { | |
| "accuracy": 0.4756, | |
| "macro_f1": 0.2235, | |
| "weighted_f1": 0.4216, | |
| "per_class": { | |
| "EMOTION_ANGRY": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 145 | |
| }, | |
| "EMOTION_HAPPY": { | |
| "precision": 0.5653, | |
| "recall": 0.2127, | |
| "f1": 0.3091, | |
| "support": 19184 | |
| }, | |
| "EMOTION_NEUTRAL": { | |
| "precision": 0.4531, | |
| "recall": 0.8248, | |
| "f1": 0.5849, | |
| "support": 15806 | |
| }, | |
| "EMOTION_SAD": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 858 | |
| } | |
| }, | |
| "confusion_matrix": [ | |
| [ | |
| 0, | |
| 27, | |
| 118, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 4080, | |
| 15104, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 2769, | |
| 13037, | |
| 0 | |
| ], | |
| [ | |
| 0, | |
| 342, | |
| 516, | |
| 0 | |
| ] | |
| ], | |
| "labels": [ | |
| "EMOTION_ANGRY", | |
| "EMOTION_HAPPY", | |
| "EMOTION_NEUTRAL", | |
| "EMOTION_SAD" | |
| ] | |
| }, | |
| "INTENT": { | |
| "accuracy": 0.0, | |
| "macro_f1": 0.0, | |
| "weighted_f1": 0.0, | |
| "per_class": { | |
| "INTENT_ASSERTION": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 1 | |
| }, | |
| "INTENT_COMMAND": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 240 | |
| }, | |
| "INTENT_EXPLAIN": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 5 | |
| }, | |
| "INTENT_GREETING": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 2 | |
| }, | |
| "INTENT_INFORM": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 35132 | |
| }, | |
| "INTENT_QUESTION": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 55 | |
| }, | |
| "INTENT_REQUEST": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 57 | |
| }, | |
| "INTENT_THANK": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 1 | |
| }, | |
| "NONE": { | |
| "precision": 0.0, | |
| "recall": 0.0, | |
| "f1": 0.0, | |
| "support": 0 | |
| } | |
| }, | |
| "confusion_matrix": [ | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 1 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 240 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 5 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 2 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 35132 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 55 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 57 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 1 | |
| ], | |
| [ | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0, | |
| 0 | |
| ] | |
| ], | |
| "labels": [ | |
| "INTENT_ASSERTION", | |
| "INTENT_COMMAND", | |
| "INTENT_EXPLAIN", | |
| "INTENT_GREETING", | |
| "INTENT_INFORM", | |
| "INTENT_QUESTION", | |
| "INTENT_REQUEST", | |
| "INTENT_THANK", | |
| "NONE" | |
| ] | |
| } | |
| }, | |
| "entity": { | |
| "precision": 0.559, | |
| "recall": 0.075, | |
| "f1": 0.1323 | |
| } | |
| } |