Spaces:

build-small-hackathon
/

iris

Running on Zero

iris / core /stt.py

Marcus Ramalho

Iris: hands-free live mode, money/bill reading, accessible UI, Qwen3-VL-2B

df6b3ac 1 day ago

1.74 kB

	"""Speech-to-text (Whisper via faster-whisper). Portuguese by default.

	No torch needed — faster-whisper runs on CTranslate2. CPU is the default; set
	IRIS_STT_DEVICE=cuda to try the GPU (falls back to CPU if CUDA cannot be used).
	"""
	import os

	_model = None  # process-wide WhisperModel cache, filled by the first _load() call


	def _load():
	    """Return the shared WhisperModel, creating it on first use.

	    Configuration is read from the environment at first call:

	    * ``IRIS_STT_MODEL``  - model size/name passed to WhisperModel (default "small").
	    * ``IRIS_STT_DEVICE`` - "cuda" to try the GPU with float16; any other value
	      (default "cpu") loads the model on CPU with int8.

	    If the CUDA load raises, the error is logged and the CPU/int8 model is
	    loaded instead. Later calls return the cached instance unchanged.
	    """
	    global _model
	    if _model is not None:
	        return _model

	    # Imported lazily so importing this module does not require faster-whisper.
	    from faster_whisper import WhisperModel

	    size = os.environ.get("IRIS_STT_MODEL", "small")
	    device = os.environ.get("IRIS_STT_DEVICE", "cpu")

	    if device == "cuda":
	        # CTranslate2 needs CUDA 12 libs (cublas/cudnn). Fall back to CPU if missing.
	        # The catch is deliberately broad (best-effort GPU), but the reason is
	        # logged so a silent downgrade to CPU can be diagnosed.
	        try:
	            _model = WhisperModel(size, device="cuda", compute_type="float16")
	        except Exception as exc:
	            print(f"[stt] CUDA init failed ({exc!r}); falling back to CPU", flush=True)

	    if _model is None:
	        _model = WhisperModel(size, device="cpu", compute_type="int8")
	    return _model


	def transcribe(audio_path: str, language: str = "pt") -> str:
	    """Transcribe an audio file in a fixed language and return the text.

	    Args:
	        audio_path: Path to the audio file. Empty/None, missing paths and
	            non-file paths (e.g. directories) are treated as "no audio".
	        language: Language code forced on the model (default "pt").

	    Returns:
	        The segment texts joined by single spaces, or "" when there is no
	        usable audio file.
	    """
	    # isfile (not exists): a directory path must not reach the model.
	    if not audio_path or not os.path.isfile(audio_path):
	        print(f"[stt] no audio: {audio_path!r}", flush=True)
	        return ""

	    segments, info = _load().transcribe(audio_path, language=language)
	    # Whisper segment texts usually carry their own leading space; strip each
	    # piece and drop empties so the join does not produce doubled spaces.
	    pieces = (seg.text.strip() for seg in segments)
	    text = " ".join(piece for piece in pieces if piece)
	    print(f"[stt] {audio_path} ({getattr(info, 'duration', '?')}s) -> {text!r}", flush=True)
	    return text


	def transcribe_auto(audio_path: str):
	    """Transcribe WITHOUT forcing a language; return (text, detected_language).

	    Used for choosing the language by voice.

	    Args:
	        audio_path: Path to the audio file. Empty/None, missing paths and
	            non-file paths (e.g. directories) are treated as "no audio".

	    Returns:
	        A ``(text, lang)`` tuple. ``text`` is the segment texts joined by
	        single spaces; ``lang`` is the language reported by the model, or
	        "en" when none is reported. With no usable audio file the result is
	        ``("", "en")``.
	    """
	    # isfile (not exists): a directory path must not reach the model.
	    if not audio_path or not os.path.isfile(audio_path):
	        print(f"[stt-auto] no audio: {audio_path!r}", flush=True)
	        return "", "en"

	    # language=None lets the model detect the spoken language itself.
	    segments, info = _load().transcribe(audio_path, language=None)
	    # Whisper segment texts usually carry their own leading space; strip each
	    # piece and drop empties so the join does not produce doubled spaces.
	    pieces = (seg.text.strip() for seg in segments)
	    text = " ".join(piece for piece in pieces if piece)
	    lang = getattr(info, "language", None) or "en"
	    print(f"[stt-auto] -> {text!r} (lang={lang})", flush=True)
	    return text, lang