"""Speech-to-text (Whisper via faster-whisper). Portuguese by default. No torch needed — faster-whisper runs on CTranslate2 (GPU if available, else CPU). """ import os _model = None def _load(): global _model if _model is None: from faster_whisper import WhisperModel size = os.environ.get("IRIS_STT_MODEL", "small") # CTranslate2 needs CUDA 12 libs (cublas/cudnn). Fall back to CPU if missing. device = os.environ.get("IRIS_STT_DEVICE", "cpu") if device == "cuda": try: _model = WhisperModel(size, device="cuda", compute_type="float16") except Exception: device = "cpu" if device != "cuda": _model = WhisperModel(size, device="cpu", compute_type="int8") return _model def transcribe(audio_path: str, language: str = "pt") -> str: if not audio_path or not os.path.exists(audio_path): print(f"[stt] no audio: {audio_path!r}", flush=True) return "" segments, info = _load().transcribe(audio_path, language=language) text = " ".join(s.text for s in segments).strip() print(f"[stt] {audio_path} ({getattr(info, 'duration', '?')}s) -> {text!r}", flush=True) return text def transcribe_auto(audio_path: str): """Transcribe WITHOUT forcing a language; returns (text, detected_language). Used for choosing the language by voice.""" if not audio_path or not os.path.exists(audio_path): return "", "en" segments, info = _load().transcribe(audio_path, language=None) text = " ".join(s.text for s in segments).strip() lang = getattr(info, "language", "en") print(f"[stt-auto] -> {text!r} (lang={lang})", flush=True) return text, lang