Create tutor/asr_adapt.py
Browse files- tutor/asr_adapt.py +82 -0
tutor/asr_adapt.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tutor/asr_adapt.py — ASR helpers (Whisper optional, graceful fallback)."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
import re
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import Optional, Tuple
|
| 6 |
+
from tutor.lang_detect import detect as lang_detect
|
| 7 |
+
|
| 8 |
+
# A clip whose RMS amplitude is below this value is classed as silence.
SILENCE_RMS: float = 0.005
# A clip shorter than this many seconds is classed as silence outright.
SILENCE_MIN_SECS: float = 0.3
|
| 10 |
+
|
| 11 |
+
_WORD_MAP = {
|
| 12 |
+
# English
|
| 13 |
+
"zero":0,"one":1,"two":2,"three":3,"four":4,"five":5,
|
| 14 |
+
"six":6,"seven":7,"eight":8,"nine":9,"ten":10,
|
| 15 |
+
# French
|
| 16 |
+
"zéro":0,"un":1,"une":1,"deux":2,"trois":3,"quatre":4,
|
| 17 |
+
"cinq":5,"sept":7,"huit":8,"neuf":9,"dix":10,
|
| 18 |
+
# Kinyarwanda
|
| 19 |
+
"zeru":0,"rimwe":1,"ebyiri":2,"eshatu":3,"ine":4,"eshanu":5,
|
| 20 |
+
"gatandatu":6,"indwi":7,"umunani":8,"icyenda":9,"icumi":10,
|
| 21 |
+
"mbiri":2,"gatatu":3,"kane":4,"gatanu":5,
|
| 22 |
+
# Kiswahili
|
| 23 |
+
"sifuri":0,"moja":1,"mbili":2,"tatu":3,"nne":4,"tano":5,
|
| 24 |
+
"sita":6,"saba":7,"nane":8,"tisa":9,"kumi":10,
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
_whisper_model = None
|
| 28 |
+
_whisper_ok = False
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def is_silence(audio_f32: np.ndarray, sr: int = 16000) -> bool:
    """Return True when the clip is too short or too quiet to hold speech.

    Args:
        audio_f32: Audio samples. The RMS threshold presumably assumes
            samples normalised to [-1, 1] -- TODO confirm with callers.
        sr: Sample rate in Hz, used to turn SILENCE_MIN_SECS into a
            minimum sample count.

    Returns:
        True if the clip has fewer than ``SILENCE_MIN_SECS * sr`` samples or
        its RMS amplitude is below ``SILENCE_RMS``; False otherwise.
    """
    # Work in float64: squaring integer PCM in its own dtype overflows, and
    # accumulating a float32 mean loses precision on long clips.
    samples = np.asarray(audio_f32, dtype=np.float64)
    if len(samples) < int(SILENCE_MIN_SECS * sr):
        return True
    rms = float(np.sqrt(np.mean(np.square(samples))))
    return rms < SILENCE_RMS
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def extract_integer(text: str) -> Optional[int]:
    """Pull the first small integer out of a transcript.

    A standalone 1-2 digit number anywhere in the text wins; otherwise the
    first word found in ``_WORD_MAP`` is used.

    Args:
        text: Transcript text; may be empty or None.

    Returns:
        The integer found, or None when the text is empty or contains
        neither a 1-2 digit number nor a known number word.
    """
    import unicodedata

    if not text:
        return None
    # Compose accents (e.g. "e" + U+0301 -> "é") so decomposed input still
    # matches the precomposed keys and the letter class in the regex below.
    normalised = unicodedata.normalize("NFC", text)
    digits = re.search(r"\b(\d{1,2})\b", normalised)
    if digits:
        return int(digits.group(1))
    for word in re.findall(r"[a-zA-ZÀ-öø-ÿ]+", normalised.lower()):
        value = _WORD_MAP.get(word)
        if value is not None:  # 0 is a valid answer, so test against None
            return value
    return None
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# Set once a load has been tried, so a failed import or model load is not
# repeated on every call.
_whisper_load_attempted = False


def _load_whisper() -> bool:
    """Load the Whisper "tiny" model on first use and report availability.

    The outcome is cached in module state: a model that is already present,
    or a previous attempt (successful or not), short-circuits the call.

    Returns:
        True if a Whisper model is loaded and usable, False otherwise.
    """
    import logging

    global _whisper_model, _whisper_ok, _whisper_load_attempted
    if _whisper_model is not None or _whisper_load_attempted:
        return _whisper_ok
    _whisper_load_attempted = True
    try:
        import whisper  # type: ignore
        _whisper_model = whisper.load_model("tiny")
        _whisper_ok = True
    except Exception:
        # Whisper is optional: record the failure and let callers fall back.
        logging.getLogger(__name__).warning(
            "Whisper could not be loaded; speech recognition is disabled",
            exc_info=True,
        )
        _whisper_ok = False
    return _whisper_ok
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
_LANG_TO_WHISPER = {"en":"en","fr":"fr","kin":"rw","sw":"sw"}
|
| 63 |
+
_WHISPER_TO_LANG = {"en":"en","fr":"fr","rw":"kin","sw":"sw"}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _whisper_language_for(lang_hint: str) -> Optional[str]:
    """Return the Whisper code for *lang_hint*, or None to auto-detect."""
    code = _LANG_TO_WHISPER.get(lang_hint, "en")
    try:
        from whisper.tokenizer import LANGUAGES  # type: ignore
    except Exception:
        # Cannot validate against Whisper's table; pass the code through.
        return code
    # Whisper raises ValueError for codes outside its table, which would
    # turn every utterance in that language into an empty transcript.
    return code if code in LANGUAGES else None


def transcribe(audio_f32: np.ndarray, lang_hint: str = "en", sample_rate: int = 16000
               ) -> Tuple[str, str, float]:
    """Transcribe a clip with Whisper, falling back to an empty result.

    Args:
        audio_f32: Audio samples for one utterance.
        lang_hint: Tutor language tag ("en", "fr", "kin", "sw") used to pick
            the Whisper language; unknown tags are treated as English.
        sample_rate: Sample rate in Hz, used only for the silence check.
            NOTE(review): the audio is handed to Whisper without resampling;
            presumably callers supply 16 kHz audio -- confirm.

    Returns:
        ``(text, language, confidence)``. On success the confidence is the
        fixed value 0.9 and the language is mapped back from Whisper's
        result. When the clip is silent, Whisper is unavailable, or
        transcription fails, the result is ``("", lang_hint, 0.0)``.
    """
    import logging

    if is_silence(audio_f32, sample_rate):
        return "", lang_hint, 0.0
    if not _load_whisper():
        return "", lang_hint, 0.0
    try:
        # Whisper's model weights are float32; other dtypes (e.g. float64)
        # fail inside the model, so coerce up front.
        samples = np.ascontiguousarray(audio_f32, dtype=np.float32)
        result = _whisper_model.transcribe(
            samples,
            language=_whisper_language_for(lang_hint),
            fp16=False, task="transcribe")
        text = result.get("text", "").strip()
        detected = _WHISPER_TO_LANG.get(result.get("language", "en"), lang_hint)
        return text, detected, 0.9
    except Exception:
        # Best-effort by design: report the failure instead of hiding it.
        logging.getLogger(__name__).warning(
            "Whisper transcription failed; returning empty transcript",
            exc_info=True,
        )
    return "", lang_hint, 0.0
|