Spaces:

build-small-hackathon
/

iris

Running on Zero

App Files Files Community

iris / app.py

nextmarte

docs: plainer voice, drop AI writing tics and negation framing

5d19f12 about 11 hours ago

raw

history blame contribute delete

7.77 kB

	"""Iris — your father's eyes, by voice (HF Space, ZeroGPU).

	gr.Server: serves a custom voice-first frontend (frontend/index.html) and exposes
	the API endpoints. Pipeline: Whisper (STT) -> Qwen3-VL (description / VQA) -> Piper (TTS).
	"""
	import difflib
	import os
	import re
	import tempfile
	import time
	import unicodedata
	from pathlib import Path

	os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

	from fastapi.responses import HTMLResponse # noqa: E402
	from fastapi.staticfiles import StaticFiles # noqa: E402
	from gradio import Server # noqa: E402
	from gradio.data_classes import FileData # noqa: E402

	from core import stt, trace, tts, vlm # noqa: E402

	# Directory containing this file; the frontend and asset paths below are built from it.
	HERE = Path(__file__).parent
	# Server instance on which this file registers its API endpoints and routes.
	app = Server(title="Iris")


	def _path(f):
	    """Return the file path carried by a FileData payload.

	    gr.Server may deliver the payload as a plain dict or as an object, so both
	    shapes are accepted. ``None`` is passed straight through.
	    """
	    if f is None:
	        return None
	    if isinstance(f, dict):
	        return f["path"]
	    return f.path


	# Voice commands that switch live mode off, tolerant of transcription errors.
	# detect_command() searches this against text already passed through _norm()
	# (lowercased, accents removed), so the alternatives are written unaccented.
	# The alternatives must be separated by a bare "|" (an escaped "\|" would match a
	# literal pipe), and the verb stems take "\w*" so every ending is accepted
	# ("desliga", "desligar", "desligue", "cancela", "cancelar", ...).
	_OFF_RE = re.compile(
	    r"\b(pare|parar|para de|deslig\w*|cancela\w*|stop|turn off|"
	    r"silenci\w*|quieto|chega|modo manual|manual)\b"
	)


	def _norm(s: str) -> str:
	    """Lowercase *s*, remove accents, and trim surrounding blanks and '.!?,'."""
	    # NFKD splits an accented letter into base letter + combining mark(s).
	    decomposed = unicodedata.normalize("NFKD", s.lower())
	    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
	    unaccented = "".join(base_chars)
	    trimmed = unaccented.strip()
	    trimmed = trimmed.strip(".!?,")
	    # Removing punctuation can expose new leading/trailing blanks.
	    return trimmed.strip()


	def detect_command(text: str):
	    """Map a (possibly mis-transcribed) utterance to a live-mode command.

	    Returns "live_off" or "live_on", or None when the utterance is empty or is
	    not recognised as a command. The text is normalised with _norm() first, and
	    the "off" patterns are checked before the "on" ones.
	    """
	    if not text:
	        return None
	    t = _norm(text)
	    if _OFF_RE.search(t):
	        return "live_off"
	    # Phrases that switch live mode on; a few are known mis-transcriptions
	    # (e.g. "ao fio" for "ao vivo").
	    on_phrases = (
	        "ao vivo", "ao fio", "modo ao", "descreva sempre", "descreva tudo",
	        "modo continuo", "tempo real", "automatico",
	    )
	    if any(phrase in t for phrase in on_phrases):
	        return "live_on"
	    # "live" must be a whole word: a plain substring test also fires on ordinary
	    # questions containing "olive", "deliver", "alive", ... and would swallow
	    # them as a command instead of answering them.
	    if re.search(r"\blive\b", t):
	        return "live_on"
	    # Last resort: the whole utterance is a near miss of "modo ao vivo".
	    if difflib.SequenceMatcher(None, t, "modo ao vivo").ratio() >= 0.7:
	        return "live_on"
	    return None


	def choose_lang(text: str, detected: str) -> str:
	    """Decide between 'pt' and 'en'.

	    A language keyword in what the person said wins (English keywords are
	    checked first); otherwise Whisper's guess is used, and anything other than
	    'pt' falls back to 'en'.
	    """
	    spoken = text.lower() if text else ""
	    if any(key in spoken for key in ("ingl", "english")):
	        return "en"
	    if any(key in spoken for key in ("portug", "brasil")):
	        return "pt"
	    if detected == "pt":
	        return "pt"
	    return "en"


	@app.api(name="describe")
	def describe(image: FileData, audio: FileData | None = None, lang: str = "pt",
	             qtext: str = "") -> dict:
	    """Answer a question about a camera frame, or recognise a live-mode command.

	    The question is `qtext` (text from browser speech recognition) when it is
	    non-blank; otherwise the recorded `audio` is transcribed. With neither, the
	    question is the empty string.

	    Returns:
	        For a voice command: {"command", "question", "answer": "", "audio": None}.
	        Otherwise: {"question", "answer", "audio"}, where "audio" wraps the
	        synthesized speech file, or is None when synthesis returned nothing.
	    """
	    t0 = time.time()
	    if qtext and qtext.strip():
	        question = qtext.strip()
	    else:
	        apath = _path(audio)
	        question = stt.transcribe(apath, language=lang) if apath else ""
	    # A live-mode command short-circuits: no model call, no speech, no trace entry.
	    cmd = detect_command(question)
	    if cmd:
	        print(f"[describe] command {cmd} <- {question!r}", flush=True)
	        return {"command": cmd, "question": question, "answer": "", "audio": None}
	    answer = vlm.describe(_path(image), question, lang=lang)
	    # Blank model output -> speak a fixed apology. `or ""` also covers a None
	    # answer (NOTE(review): confirm whether vlm.describe can return None).
	    if not (answer or "").strip():
	        answer = "I couldn't describe that." if lang == "en" else "Não consegui descrever isso."
	    wav = tts.synthesize(answer, lang=lang)
	    print(f"[describe] q={question!r} a={answer!r}", flush=True)
	    trace.log("describe", lang, question, answer, time.time() - t0, vlm.MODEL_ID)
	    return {
	        "question": question,
	        "answer": answer,
	        "audio": FileData(path=wav) if wav else None,
	    }


	# Live mode: describe ONLY new/relevant things, sparingly (avoid verbosity).
	# WATCH_PT / WATCH_EN are the system-prompt templates watch() uses when it has
	# history but no detector hint. `{prev}` is filled with what was already said,
	# and the model is told to answer NADA / NONE when the scene is unchanged.
	WATCH_PT = (
	"Você observa o ambiente para uma pessoa cega. "
	"Coisas já ditas (não repita): \"{prev}\". "
	"Olhe a cena AGORA. Se apareceu QUALQUER objeto ou pessoa novo — mesmo pequeno "
	"(óculos, celular, copo, papel, comida) — diga em UMA frase curta o que é. "
	"Só responda NADA se a cena estiver basicamente igual ao que já foi dito."
	)
	WATCH_EN = (
	"You watch the environment for a blind person. "
	"Already said (do not repeat): \"{prev}\". "
	"Look at the scene NOW. If ANY new object or person appeared — even small "
	"(glasses, a phone, a cup, paper, food) — say what it is in ONE short sentence. "
	"Only reply NONE if the scene is basically the same as what was already said."
	)
	# Lowercased answers that is_quiet() treats as "nothing new to report".
	_QUIET = {"nada", "none", "nenhum", "nothing", "sem novidade", "no change"}


	def is_quiet(answer: str) -> bool:
	    """Tell whether a live-mode answer amounts to 'nothing new to report'.

	    The answer is trimmed, lowercased and stripped of surrounding '.' / '!'.
	    It is quiet when nothing is left or when what is left is listed in _QUIET.
	    """
	    cleaned = (answer or "").strip().lower()
	    cleaned = cleaned.strip(".!").strip()
	    if not cleaned:
	        return True
	    return cleaned in _QUIET


	# System-prompt templates watch() uses when the caller passes a non-empty `hint`.
	# The only placeholder is `{prev}` (what was already said); the hint text itself
	# is not inserted into the prompt.
	HINT_PT = ("Uma pessoa entrou no campo de visão. Em UMA frase de no máximo 12 palavras, "
	"avise de forma útil para um cego (ex.: 'Há alguém à sua frente.'). Descreva só o "
	"que você vê com CERTEZA; não invente; não repita o que já foi dito: \"{prev}\".")
	HINT_EN = ("A person entered the field of view. In ONE sentence of at most 12 words, alert a "
	"blind person usefully (e.g. 'Someone is in front of you.'). Describe only what you "
	"see for CERTAIN; do not invent; do not repeat what was already said: \"{prev}\".")


	@app.api(name="watch")
	def watch(image: FileData, prev: str = "", lang: str = "pt", hint: str = "") -> dict:
	    """Live mode: decide whether the current frame deserves a spoken update.

	    Three cases, in order:
	      * no `prev` history  -> describe the scene as a baseline;
	      * `hint` given       -> the in-browser detector flagged something new, so
	                              describe it using the HINT_* prompt (trust the gate);
	      * otherwise          -> pixel-diff fallback: ask what is new using the
	                              WATCH_* prompt and stay quiet unless something is.

	    Returns {"speak": False, "answer": ""} when there is nothing to say, else
	    {"speak": True, "answer": ..., "audio": ...} with "audio" None if synthesis
	    returned nothing.
	    """
	    t0 = time.time()
	    prev = (prev or "").strip()
	    hint = (hint or "").strip()
	    frame = _path(image)
	    english = lang == "en"

	    if not prev:
	        # First call: let the user hear the surroundings.
	        answer = vlm.describe(frame, lang=lang)
	    elif hint:
	        # The detector already decided something new is here; just describe it.
	        template = HINT_EN if english else HINT_PT
	        answer = vlm.describe(frame, lang=lang,
	                              system=template.format(hint=hint, prev=prev))
	    else:
	        template = WATCH_EN if english else WATCH_PT
	        system_prompt = template.format(prev=prev)
	        if english:
	            q = "What is new?"
	        else:
	            q = "O que há de novo?"
	        answer = vlm.describe(frame, question=q, lang=lang, system=system_prompt)

	    if is_quiet(answer):
	        print(f"[watch] quiet ({answer!r})", flush=True)
	        return {"speak": False, "answer": ""}

	    wav = tts.synthesize(answer, lang=lang)
	    print(f"[watch] SPEAK: {answer!r}", flush=True)
	    trace.log("watch", lang, "", answer, time.time() - t0, vlm.MODEL_ID)
	    audio = FileData(path=wav) if wav else None
	    return {"speak": True, "answer": answer, "audio": audio}


	@app.api(name="detect_lang")
	def detect_lang(audio: FileData) -> dict:
	    """Pick the session language from a spoken 'português' or 'english'.

	    Returns {"lang": "pt" | "en", "text": <transcription>}.
	    """
	    audio_path = _path(audio)
	    text, detected = stt.transcribe_auto(audio_path)
	    lang = choose_lang(text, detected)
	    result = {"lang": lang, "text": text}
	    print(f"[detect_lang] {text!r} (det={detected}) -> {lang}", flush=True)
	    return result


	# Expose the files under frontend/ at the /static URL prefix.
	app.mount("/static", StaticFiles(directory=str(HERE / "frontend")), name="static")


	@app.get("/")
	def index():
	    """Serve the frontend page; frontend/index.html is re-read on every request."""
	    page = HERE / "frontend" / "index.html"
	    html = page.read_text(encoding="utf-8")
	    return HTMLResponse(html)


	if __name__ == "__main__":
	    # Optional warmup, enabled with IRIS_WARMUP=1: push one sample through the
	    # vision model, then synthesize and transcribe a short phrase. Best effort:
	    # a failure is reported and the server still starts.
	    warmup_requested = os.environ.get("IRIS_WARMUP") == "1"
	    if warmup_requested:
	        print("Warmup...", flush=True)
	        try:
	            sample = HERE / "assets" / "warmup.jpg"
	            vlm.describe(str(sample), "test")
	            spoken = tts.synthesize("test")
	            stt.transcribe(spoken)
	            print("Warmup OK", flush=True)
	        except Exception as e:
	            print("Warmup failed:", e, flush=True)

	    # Port precedence: GRADIO_SERVER_PORT, then PORT, then 7860.
	    fallback_port = os.environ.get("PORT", 7860)
	    port = int(os.environ.get("GRADIO_SERVER_PORT", fallback_port))
	    app.launch(
	        server_name="0.0.0.0",
	        server_port=port,
	        show_error=True,
	        allowed_paths=[tempfile.gettempdir()],
	    )