"""Local speech-to-text via faster-whisper (CTranslate2). The model is loaded lazily and cached. On a CUDA box we prefer ``int8_float16``; if CUDA is unavailable or its libraries are missing we fall back to CPU ``int8`` automatically, so transcription always works. """ from __future__ import annotations from collections.abc import Iterator from dataclasses import dataclass, field from pathlib import Path from typing import Callable from . import config # Cached (model, (device, compute_type)). _MODEL = None _MODEL_INFO: tuple[str, str] | None = None # Set once a CUDA runtime failure is seen, to skip the GPU on later calls. _FORCE_CPU = False @dataclass class TranscriptSegment: start: float end: float text: str @dataclass class Transcript: segments: list[TranscriptSegment] = field(default_factory=list) language: str = "" device: str = "" @property def text(self) -> str: return " ".join(seg.text.strip() for seg in self.segments).strip() def to_timestamped_text(self) -> str: lines = [] for seg in self.segments: mm, ss = divmod(int(seg.start), 60) lines.append(f"[{mm:02d}:{ss:02d}] {seg.text.strip()}") return "\n".join(lines) def _candidate_configs() -> Iterator[tuple[str, str]]: """Yield (device, compute_type) attempts in priority order.""" override = config.WHISPER_COMPUTE_TYPE device = config.WHISPER_DEVICE if device == "cpu" or _FORCE_CPU: yield ("cpu", override or "int8") return # "cuda" or "auto": try GPU first, then always fall back to CPU int8. yield ("cuda", override or "int8_float16") yield ("cpu", "int8") def _is_cuda_error(exc: Exception) -> bool: msg = str(exc).lower() return any(tok in msg for tok in ("cublas", "cudnn", "cuda", ".dll", "gpu")) def get_model(): """Load (once) and return the faster-whisper model with its resolved config.""" global _MODEL, _MODEL_INFO if _MODEL is not None: return _MODEL, _MODEL_INFO from faster_whisper import WhisperModel errors: list[str] = [] for device, compute_type in _candidate_configs(): try: _MODEL = WhisperModel( config.WHISPER_MODEL, device=device, compute_type=compute_type ) _MODEL_INFO = (device, compute_type) return _MODEL, _MODEL_INFO except Exception as exc: # CUDA libs missing, OOM, etc. errors.append(f" {device}/{compute_type}: {exc}") raise RuntimeError( "Could not load the Whisper model. Attempts:\n" + "\n".join(errors) ) def _run_transcribe( media_path: str | Path, progress: Callable[[float, str], None] | None ) -> Transcript: model, info = get_model() device = info[0] if info else "" segment_iter, meta = model.transcribe(str(media_path), vad_filter=True, beam_size=5) total = float(getattr(meta, "duration", 0.0)) or 0.0 # The CUDA libraries are loaded lazily on first encode, so a missing-cuBLAS # error surfaces while consuming this generator — not at model construction. segments: list[TranscriptSegment] = [] for seg in segment_iter: segments.append( TranscriptSegment(start=float(seg.start), end=float(seg.end), text=seg.text) ) if progress and total: frac = min(seg.end / total, 1.0) progress(frac, f"Transcribing… {int(frac * 100)}%") return Transcript( segments=segments, language=getattr(meta, "language", "") or "", device=device, ) def transcribe( media_path: str | Path, *, progress: Callable[[float, str], None] | None = None, ) -> Transcript: """Transcribe an audio/video file into timestamped segments. Falls back from GPU to CPU automatically if a CUDA runtime error (e.g. missing cuBLAS/cuDNN) occurs during inference. """ global _MODEL, _MODEL_INFO, _FORCE_CPU try: return _run_transcribe(media_path, progress) except RuntimeError as exc: on_gpu = bool(_MODEL_INFO and _MODEL_INFO[0] == "cuda") if not (on_gpu and _is_cuda_error(exc)): raise # Drop the GPU model and retry once on CPU. _MODEL, _MODEL_INFO, _FORCE_CPU = None, None, True if progress: progress(0.0, "GPU unavailable — falling back to CPU…") return _run_transcribe(media_path, progress)