# iris/core/stt.py
# Marcus Ramalho
# Iris: hands-free live mode, money/bill reading, accessible UI, Qwen3-VL-2B
# df6b3ac
"""Speech-to-text (Whisper via faster-whisper). Portuguese by default.
No torch needed — faster-whisper runs on CTranslate2. The model runs on CPU by
default; set IRIS_STT_DEVICE=cuda to use the GPU (falls back to CPU if CUDA fails to load).
"""
import os
# Process-wide cache for the Whisper model; populated by _load() on first use.
_model = None


def _load():
    """Return the shared Whisper model, constructing it on first use.

    Configuration is read from the environment:

    * ``IRIS_STT_MODEL``  -- model size/name, default ``"small"``.
    * ``IRIS_STT_DEVICE`` -- ``"cuda"`` to try the GPU; default ``"cpu"``.
      The value is trimmed and lower-cased before comparison, and any value
      other than ``"cuda"`` selects the CPU model.

    When CUDA is requested but the model cannot be constructed there, the
    failure is logged and a CPU int8 model is loaded instead.

    Returns:
        The cached ``WhisperModel`` instance.
    """
    global _model
    if _model is not None:
        return _model

    # Imported inside the function, so faster-whisper is only needed once a
    # model is actually requested.
    from faster_whisper import WhisperModel

    size = os.environ.get("IRIS_STT_MODEL", "small")
    # Normalise so values such as "CUDA" or " cuda " are honoured too.
    device = os.environ.get("IRIS_STT_DEVICE", "cpu").strip().lower()

    if device == "cuda":
        # CTranslate2 needs CUDA 12 libs (cublas/cudnn). Fall back to CPU if missing.
        try:
            _model = WhisperModel(size, device="cuda", compute_type="float16")
        except Exception as exc:
            # Deliberately broad: whatever goes wrong on the GPU path, the CPU
            # model is still usable. Report it so the fallback is not silent.
            print(f"[stt] CUDA init failed ({exc!r}); falling back to CPU", flush=True)

    if _model is None:
        _model = WhisperModel(size, device="cpu", compute_type="int8")
    return _model
def transcribe(audio_path: str, language: str = "pt") -> str:
    """Transcribe an audio file to text in a fixed language.

    Args:
        audio_path: Path to the audio file. An empty/``None`` value, a
            missing path, or a path that is not a regular file (for example
            a directory) is logged and yields ``""``.
        language: Language code passed to the model; defaults to ``"pt"``.

    Returns:
        The segment texts joined by single spaces, or ``""`` when there is
        no usable audio.
    """
    # isfile (not exists) so a directory never reaches the decoder.
    if not audio_path or not os.path.isfile(audio_path):
        print(f"[stt] no audio: {audio_path!r}", flush=True)
        return ""

    segments, info = _load().transcribe(audio_path, language=language)
    # Segment texts may carry their own surrounding whitespace; strip each one
    # and drop empties so the joined result never contains doubled spaces.
    pieces = (s.text.strip() for s in segments)
    text = " ".join(piece for piece in pieces if piece)
    print(f"[stt] {audio_path} ({getattr(info, 'duration', '?')}s) -> {text!r}", flush=True)
    return text
def transcribe_auto(audio_path: str):
    """Transcribe WITHOUT forcing a language.

    Used for choosing the language by voice.

    Args:
        audio_path: Path to the audio file. An empty/``None`` value, a
            missing path, or a path that is not a regular file (for example
            a directory) is logged and yields ``("", "en")``.

    Returns:
        A ``(text, detected_language)`` tuple. ``detected_language`` is taken
        from the transcription info's ``language`` attribute and falls back
        to ``"en"`` when that attribute is absent.
    """
    # isfile (not exists) so a directory never reaches the decoder.
    if not audio_path or not os.path.isfile(audio_path):
        print(f"[stt-auto] no audio: {audio_path!r}", flush=True)
        return "", "en"

    # language=None leaves the language unforced; the result is read back
    # from `info` below.
    segments, info = _load().transcribe(audio_path, language=None)
    # Segment texts may carry their own surrounding whitespace; strip each one
    # and drop empties so the joined result never contains doubled spaces.
    pieces = (s.text.strip() for s in segments)
    text = " ".join(piece for piece in pieces if piece)
    lang = getattr(info, "language", "en")
    print(f"[stt-auto] -> {text!r} (lang={lang})", flush=True)
    return text, lang