"""Iris — your father's eyes, by voice (HF Space, ZeroGPU). gr.Server: serves a custom voice-first frontend (frontend/index.html) and exposes the API endpoints. Pipeline: Whisper (STT) -> Qwen3-VL (description / VQA) -> Piper (TTS). """ import difflib import os import re import tempfile import time import unicodedata from pathlib import Path os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") from fastapi.responses import HTMLResponse # noqa: E402 from fastapi.staticfiles import StaticFiles # noqa: E402 from gradio import Server # noqa: E402 from gradio.data_classes import FileData # noqa: E402 from core import stt, trace, tts, vlm # noqa: E402 HERE = Path(__file__).parent app = Server(title="Iris") def _path(f): """gr.Server delivers FileData as a dict; accept dict or object.""" if f is None: return None return f["path"] if isinstance(f, dict) else f.path # voice commands (live mode on/off), tolerant of transcription errors (e.g. "modo ao fio") _OFF_RE = re.compile(r"\b(pare|parar|para de|deslig\w*|cancela\w*|stop|turn off|" r"silenci\w*|quieto|chega|modo manual|manual)\b") def _norm(s: str) -> str: s = unicodedata.normalize("NFKD", s.lower()) s = "".join(c for c in s if not unicodedata.combining(c)) return s.strip().strip(".!?,").strip() def detect_command(text: str): """Map a (possibly mis-transcribed) utterance to a live-mode command.""" if not text: return None t = _norm(text) if _OFF_RE.search(t): return "live_off" on = ("ao vivo" in t or "ao fio" in t or "modo ao" in t or "descreva sempre" in t or "descreva tudo" in t or "modo continuo" in t or "tempo real" in t or "live" in t or "automatico" in t or difflib.SequenceMatcher(None, t, "modo ao vivo").ratio() >= 0.7) return "live_on" if on else None def choose_lang(text: str, detected: str) -> str: """Pick 'pt'/'en' from what the person said (keywords) or Whisper's guess.""" low = (text or "").lower() if "ingl" in low or "english" in low: return "en" if "portug" in low or "brasil" in low: return "pt" return "pt" if detected == "pt" else "en" @app.api(name="describe") def describe(image: FileData, audio: FileData | None = None, lang: str = "pt", qtext: str = "") -> dict: """Camera frame + a question (as recorded audio OR as text from browser speech recognition) -> a command OR a PT/EN description + audio.""" t0 = time.time() if qtext and qtext.strip(): question = qtext.strip() else: apath = _path(audio) question = stt.transcribe(apath, language=lang) if apath else "" cmd = detect_command(question) if cmd: print(f"[describe] command {cmd} <- {question!r}", flush=True) return {"command": cmd, "question": question, "answer": "", "audio": None} answer = vlm.describe(_path(image), question, lang=lang) if not answer.strip(): answer = "I couldn't describe that." if lang == "en" else "Não consegui descrever isso." wav = tts.synthesize(answer, lang=lang) print(f"[describe] q={question!r} a={answer!r}", flush=True) trace.log("describe", lang, question, answer, time.time() - t0, vlm.MODEL_ID) return { "question": question, "answer": answer, "audio": FileData(path=wav) if wav else None, } # Live mode: describe ONLY new/relevant things, sparingly (avoid verbosity). WATCH_PT = ( "Você observa o ambiente para uma pessoa cega. " "Coisas já ditas (não repita): \"{prev}\". " "Olhe a cena AGORA. Se apareceu QUALQUER objeto ou pessoa novo — mesmo pequeno " "(óculos, celular, copo, papel, comida) — diga em UMA frase curta o que é. " "Só responda NADA se a cena estiver basicamente igual ao que já foi dito." ) WATCH_EN = ( "You watch the environment for a blind person. " "Already said (do not repeat): \"{prev}\". " "Look at the scene NOW. If ANY new object or person appeared — even small " "(glasses, a phone, a cup, paper, food) — say what it is in ONE short sentence. " "Only reply NONE if the scene is basically the same as what was already said." ) _QUIET = {"nada", "none", "nenhum", "nothing", "sem novidade", "no change"} def is_quiet(answer: str) -> bool: """True when the live-mode answer means 'nothing new to report'.""" low = (answer or "").strip().lower().strip(".!").strip() return (not low) or low in _QUIET HINT_PT = ("Uma pessoa entrou no campo de visão. Em UMA frase de no máximo 12 palavras, " "avise de forma útil para um cego (ex.: 'Há alguém à sua frente.'). Descreva só o " "que você vê com CERTEZA; não invente; não repita o que já foi dito: \"{prev}\".") HINT_EN = ("A person entered the field of view. In ONE sentence of at most 12 words, alert a " "blind person usefully (e.g. 'Someone is in front of you.'). Describe only what you " "see for CERTAIN; do not invent; do not repeat what was already said: \"{prev}\".") @app.api(name="watch") def watch(image: FileData, prev: str = "", lang: str = "pt", hint: str = "") -> dict: """Live mode. First call (no history) describes the scene as a baseline. When the in-browser detector flags a new object (`hint`), describe it (trust the gate). Otherwise (pixel-diff fallback) speak ONLY if something new entered, else stay quiet.""" t0 = time.time() prev = (prev or "").strip() hint = (hint or "").strip() if not prev: # baseline: describe the current scene so the user hears the surroundings answer = vlm.describe(_path(image), lang=lang) elif hint: # the detector already decided something new is here -> just describe it tmpl = HINT_EN if lang == "en" else HINT_PT answer = vlm.describe(_path(image), lang=lang, system=tmpl.format(hint=hint, prev=prev)) else: tmpl = WATCH_EN if lang == "en" else WATCH_PT sys = tmpl.format(prev=prev) q = "What is new?" if lang == "en" else "O que há de novo?" answer = vlm.describe(_path(image), question=q, lang=lang, system=sys) if is_quiet(answer): print(f"[watch] quiet ({answer!r})", flush=True) return {"speak": False, "answer": ""} wav = tts.synthesize(answer, lang=lang) print(f"[watch] SPEAK: {answer!r}", flush=True) trace.log("watch", lang, "", answer, time.time() - t0, vlm.MODEL_ID) return {"speak": True, "answer": answer, "audio": FileData(path=wav) if wav else None} @app.api(name="detect_lang") def detect_lang(audio: FileData) -> dict: """Choose the language by voice: the person says 'português' or 'english'.""" text, detected = stt.transcribe_auto(_path(audio)) lang = choose_lang(text, detected) print(f"[detect_lang] {text!r} (det={detected}) -> {lang}", flush=True) return {"lang": lang, "text": text} app.mount("/static", StaticFiles(directory=str(HERE / "frontend")), name="static") @app.get("/") def index(): return HTMLResponse((HERE / "frontend" / "index.html").read_text(encoding="utf-8")) if __name__ == "__main__": if os.environ.get("IRIS_WARMUP") == "1": print("Warmup...", flush=True) try: vlm.describe(str(HERE / "assets" / "warmup.jpg"), "test") stt.transcribe(tts.synthesize("test")) print("Warmup OK", flush=True) except Exception as e: print("Warmup failed:", e, flush=True) port = int(os.environ.get("GRADIO_SERVER_PORT", os.environ.get("PORT", 7860))) app.launch(server_name="0.0.0.0", server_port=port, show_error=True, allowed_paths=[tempfile.gettempdir()])