File size: 1,736 Bytes
df6b3ac
26dae50
df6b3ac
26dae50
 
 
 
 
 
 
 
 
 
 
df6b3ac
26dae50
 
 
 
 
 
 
 
 
 
 
 
 
df6b3ac
26dae50
 
 
 
 
df6b3ac
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
"""Speech-to-text (Whisper via faster-whisper). Portuguese by default.

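No torch needed — faster-whisper runs on CTranslate2. The model runs on CPU by
default; set IRIS_STT_DEVICE=cuda to try the GPU (falls back to CPU if the CUDA
model cannot be loaded). IRIS_STT_MODEL selects the model size (default "small").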
"""
import os

# Module-wide cache for the WhisperModel instance. Stays None until _load()
# builds the model on first use; later calls reuse the same object.
_model = None


def _load():
    """Return the shared WhisperModel, building it on the first call.

    Configuration is read from the environment when the model is first built:

    * ``IRIS_STT_MODEL``: model size/name passed to ``WhisperModel``
      (default ``"small"``).
    * ``IRIS_STT_DEVICE``: ``"cuda"`` (case-insensitive) to try the GPU with
      float16; any other value (default ``"cpu"``) uses the CPU with int8.

    If the CUDA model cannot be constructed, the failure is logged and the
    CPU model is loaded instead.

    Returns:
        The cached ``faster_whisper.WhisperModel`` instance.
    """
    global _model
    if _model is not None:
        return _model

    # Imported here rather than at module level, so importing this module
    # does not itself import faster-whisper.
    from faster_whisper import WhisperModel

    size = os.environ.get("IRIS_STT_MODEL", "small")
    # Normalise so values such as "CUDA" or " cuda " still select the GPU.
    device = os.environ.get("IRIS_STT_DEVICE", "cpu").strip().lower()

    if device == "cuda":
        # CTranslate2 needs CUDA 12 libs (cublas/cudnn). Fall back to CPU if missing.
        try:
            _model = WhisperModel(size, device="cuda", compute_type="float16")
        except Exception as exc:
            # Best-effort fallback, but say why so a silent CPU run is diagnosable.
            print(f"[stt] CUDA model load failed ({exc!r}); falling back to CPU", flush=True)

    if _model is None:
        _model = WhisperModel(size, device="cpu", compute_type="int8")
    return _model


def transcribe(audio_path: str, language: str = "pt") -> str:
    """Transcribe an audio file in a fixed language.

    Args:
        audio_path: Path to the audio file. Empty/``None`` values and paths
            that are not an existing regular file are treated as "no audio".
        language: Language code forwarded to the model (default ``"pt"``).

    Returns:
        The recognised text with segments joined by single spaces, or ``""``
        when there is no audio file to read.
    """
    # isfile (not exists): a directory path is rejected here instead of being
    # handed to the decoder.
    if not audio_path or not os.path.isfile(audio_path):
        print(f"[stt] no audio: {audio_path!r}", flush=True)
        return ""
    segments, info = _load().transcribe(audio_path, language=language)
    # Segment text may carry its own leading/trailing whitespace; strip each
    # piece and drop empty ones so the joined text has no doubled spaces.
    pieces = (s.text.strip() for s in segments)
    text = " ".join(piece for piece in pieces if piece)
    print(f"[stt] {audio_path} ({getattr(info, 'duration', '?')}s) -> {text!r}", flush=True)
    return text


def transcribe_auto(audio_path: str):
    """Transcribe WITHOUT forcing a language; used for choosing the language by voice.

    Args:
        audio_path: Path to the audio file. Empty/``None`` values and paths
            that are not an existing regular file are treated as "no audio".

    Returns:
        A ``(text, detected_language)`` tuple. ``text`` is the recognised
        speech with segments joined by single spaces. When there is no audio
        file to read the result is ``("", "en")``.
    """
    # isfile (not exists): a directory path is rejected here instead of being
    # handed to the decoder.
    if not audio_path or not os.path.isfile(audio_path):
        print(f"[stt-auto] no audio: {audio_path!r}", flush=True)
        return "", "en"
    # language=None lets the model pick the language itself.
    segments, info = _load().transcribe(audio_path, language=None)
    # Segment text may carry its own leading/trailing whitespace; strip each
    # piece and drop empty ones so the joined text has no doubled spaces.
    pieces = (s.text.strip() for s in segments)
    text = " ".join(piece for piece in pieces if piece)
    lang = getattr(info, "language", "en")
    print(f"[stt-auto] -> {text!r} (lang={lang})", flush=True)
    return text, lang