Spaces:
Running on Zero
Running on Zero
| """Iris — your father's eyes, by voice (HF Space, ZeroGPU). | |
| gr.Server: serves a custom voice-first frontend (frontend/index.html) and exposes | |
| the API endpoints. Pipeline: Whisper (STT) -> Qwen3-VL (description / VQA) -> Piper (TTS). | |
| """ | |
| import difflib | |
| import os | |
| import re | |
| import tempfile | |
| import time | |
| import unicodedata | |
| from pathlib import Path | |
| os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") | |
| from fastapi.responses import HTMLResponse # noqa: E402 | |
| from fastapi.staticfiles import StaticFiles # noqa: E402 | |
| from gradio import Server # noqa: E402 | |
| from gradio.data_classes import FileData # noqa: E402 | |
| from core import stt, trace, tts, vlm # noqa: E402 | |
# Directory that contains this file; the frontend/ and assets/ folders used below
# are resolved relative to it.
HERE = Path(__file__).parent
# Server instance: the static frontend is mounted on it and it is launched from the
# __main__ guard at the bottom of the file.
app = Server(title="Iris")
| def _path(f): | |
| """gr.Server delivers FileData as a dict; accept dict or object.""" | |
| if f is None: | |
| return None | |
| return f["path"] if isinstance(f, dict) else f.path | |
# Voice commands (live mode on/off), tolerant of transcription errors (e.g. "modo ao fio").
# _OFF_RE holds the PT/EN words that turn live mode off. detect_command() searches it
# against text that already went through _norm(), which is why the alternatives are
# written in lowercase and without accents.
_OFF_RE = re.compile(r"\b(pare|parar|para de|deslig\w*|cancela\w*|stop|turn off|"
                     r"silenci\w*|quieto|chega|modo manual|manual)\b")
| def _norm(s: str) -> str: | |
| s = unicodedata.normalize("NFKD", s.lower()) | |
| s = "".join(c for c in s if not unicodedata.combining(c)) | |
| return s.strip().strip(".!?,").strip() | |
# Substrings (looked up in _norm()-ed text) that switch live mode on. Some entries,
# e.g. "ao fio", are mis-transcriptions of "ao vivo" that should still be accepted.
_ON_PHRASES = ("ao vivo", "ao fio", "modo ao", "descreva sempre", "descreva tudo",
               "modo continuo", "tempo real", "automatico")
# "live" has to be a whole word: as a bare substring it also fires on everyday
# words such as "oliveira", "alive" or "delivery", turning a normal question into
# a command.
_LIVE_WORD_RE = re.compile(r"\blive\b")


def detect_command(text: str) -> str | None:
    """Map a (possibly mis-transcribed) utterance to a live-mode command.

    Args:
        text: Raw transcription of what the person said; may be empty or None.

    Returns:
        ``"live_off"`` when a stop word from ``_OFF_RE`` is present,
        ``"live_on"`` when a live-mode phrase is present or the whole utterance
        is close enough to "modo ao vivo", otherwise ``None``.
    """
    if not text:
        return None
    t = _norm(text)
    # "Off" wins over "on" so that a request to stop is never misread.
    if _OFF_RE.search(t):
        return "live_off"
    if any(phrase in t for phrase in _ON_PHRASES):
        return "live_on"
    if _LIVE_WORD_RE.search(t):
        return "live_on"
    # Fuzzy fallback for garbled transcriptions of the canonical phrase.
    if difflib.SequenceMatcher(None, t, "modo ao vivo").ratio() >= 0.7:
        return "live_on"
    return None
def choose_lang(text: str, detected: str) -> str:
    """Decide between 'pt' and 'en'.

    A language named in the utterance wins (English cues are checked before
    Portuguese ones). Without any cue, the detected language is used: 'pt'
    stays 'pt', anything else becomes 'en'.
    """
    spoken = (text or "").lower()
    keyword_table = (
        ("en", ("ingl", "english")),
        ("pt", ("portug", "brasil")),
    )
    for code, cues in keyword_table:
        if any(cue in spoken for cue in cues):
            return code
    if detected == "pt":
        return "pt"
    return "en"
def describe(image: FileData, audio: FileData | None = None, lang: str = "pt",
             qtext: str = "") -> dict:
    """Answer a question about a camera frame, or recognise a voice command.

    The question comes from ``qtext`` when it is non-blank (text produced by the
    browser's speech recognition); otherwise the recorded ``audio`` is
    transcribed, and with neither the question is empty.

    Returns:
        For a live-mode command: ``{"command", "question", "answer": "",
        "audio": None}``. Otherwise ``{"question", "answer", "audio"}`` where
        ``audio`` is a FileData for the synthesized speech, or None when
        synthesis produced nothing.
    """
    started = time.time()

    typed = qtext.strip() if qtext else ""
    if typed:
        question = typed
    else:
        audio_path = _path(audio)
        question = stt.transcribe(audio_path, language=lang) if audio_path else ""

    # A recognised command short-circuits: no vision model call, no speech.
    command = detect_command(question)
    if command:
        print(f"[describe] command {command} <- {question!r}", flush=True)
        return {"command": command, "question": question, "answer": "", "audio": None}

    answer = vlm.describe(_path(image), question, lang=lang)
    if not answer.strip():
        # Blank model output: fall back to a fixed apology in the active language.
        if lang == "en":
            answer = "I couldn't describe that."
        else:
            answer = "Não consegui descrever isso."

    wav = tts.synthesize(answer, lang=lang)
    print(f"[describe] q={question!r} a={answer!r}", flush=True)
    trace.log("describe", lang, question, answer, time.time() - started, vlm.MODEL_ID)

    speech = FileData(path=wav) if wav else None
    return {"question": question, "answer": answer, "audio": speech}
# Live mode: describe ONLY new/relevant things, sparingly (avoid verbosity).
# System prompts for the live-mode branch of watch() that runs without a detector
# hint. {prev} is filled through str.format with what was already spoken. The model
# is told to answer NADA / NONE when nothing changed, which is_quiet() recognises.
WATCH_PT = (
    "Você observa o ambiente para uma pessoa cega. "
    "Coisas já ditas (não repita): \"{prev}\". "
    "Olhe a cena AGORA. Se apareceu QUALQUER objeto ou pessoa novo — mesmo pequeno "
    "(óculos, celular, copo, papel, comida) — diga em UMA frase curta o que é. "
    "Só responda NADA se a cena estiver basicamente igual ao que já foi dito."
)
WATCH_EN = (
    "You watch the environment for a blind person. "
    "Already said (do not repeat): \"{prev}\". "
    "Look at the scene NOW. If ANY new object or person appeared — even small "
    "(glasses, a phone, a cup, paper, food) — say what it is in ONE short sentence. "
    "Only reply NONE if the scene is basically the same as what was already said."
)
# Normalized replies that mean "nothing new to report" in live mode.
_QUIET = {"nada", "none", "nenhum", "nothing", "sem novidade", "no change"}

# Characters trimmed from both ends of a reply before it is compared with _QUIET:
# whitespace, sentence punctuation, quotes and markdown emphasis. Models often
# wrap the keyword ("NADA", **None**, Nada…), and such a reply must not be spoken.
_QUIET_TRIM = " \t\r\n.!?,;:\"'`*“”‘’«»…"


def is_quiet(answer: str) -> bool:
    """Return True when the live-mode answer means 'nothing new to report'.

    Args:
        answer: Raw model reply; ``None`` and blank strings count as quiet.

    Returns:
        True if the reply is empty or, once lowercased and stripped of the
        surrounding characters in ``_QUIET_TRIM``, equals an entry of ``_QUIET``.
    """
    low = (answer or "").lower().strip(_QUIET_TRIM)
    return (not low) or low in _QUIET
# System prompts for the watch() branch taken when the caller passes a non-empty
# `hint`. Only {prev} is a placeholder here: watch() also passes hint= to
# str.format, but the templates never interpolate it.
# NOTE(review): the wording always announces a person, whatever the hint says —
# confirm that the detector only ever flags people.
HINT_PT = ("Uma pessoa entrou no campo de visão. Em UMA frase de no máximo 12 palavras, "
           "avise de forma útil para um cego (ex.: 'Há alguém à sua frente.'). Descreva só o "
           "que você vê com CERTEZA; não invente; não repita o que já foi dito: \"{prev}\".")
HINT_EN = ("A person entered the field of view. In ONE sentence of at most 12 words, alert a "
           "blind person usefully (e.g. 'Someone is in front of you.'). Describe only what you "
           "see for CERTAIN; do not invent; do not repeat what was already said: \"{prev}\".")
def watch(image: FileData, prev: str = "", lang: str = "pt", hint: str = "") -> dict:
    """Live mode: decide whether there is something worth saying about a frame.

    Three cases, chosen from the (stripped) ``prev`` and ``hint``:
      * no ``prev``  -> baseline description of the current scene;
      * ``hint`` set -> the in-browser detector flagged something, so the HINT_*
        prompt is used to describe it;
      * otherwise    -> pixel-diff fallback: the WATCH_* prompt asks only for
        what is new.

    Returns:
        ``{"speak": False, "answer": ""}`` when is_quiet() judges the reply to be
        empty or "nothing new"; else ``{"speak": True, "answer", "audio"}`` with
        ``audio`` a FileData of the synthesized speech, or None if none was made.
    """
    started = time.time()
    history = (prev or "").strip()
    detector_hint = (hint or "").strip()
    english = lang == "en"

    # Work out the extra arguments for the single vision-model call below.
    if not history:
        # Baseline: describe the current scene so the user hears the surroundings.
        extra = {}
    elif detector_hint:
        # The detector already decided something new is here -> just describe it.
        template = HINT_EN if english else HINT_PT
        extra = {"system": template.format(hint=detector_hint, prev=history)}
    else:
        template = WATCH_EN if english else WATCH_PT
        extra = {
            "question": "What is new?" if english else "O que há de novo?",
            "system": template.format(prev=history),
        }
    answer = vlm.describe(_path(image), lang=lang, **extra)

    if is_quiet(answer):
        print(f"[watch] quiet ({answer!r})", flush=True)
        return {"speak": False, "answer": ""}

    wav = tts.synthesize(answer, lang=lang)
    print(f"[watch] SPEAK: {answer!r}", flush=True)
    trace.log("watch", lang, "", answer, time.time() - started, vlm.MODEL_ID)

    speech = FileData(path=wav) if wav else None
    return {"speak": True, "answer": answer, "audio": speech}
def detect_lang(audio: FileData) -> dict:
    """Pick the session language from a spoken clip.

    The clip is transcribed with automatic language detection; choose_lang()
    then prefers a language named in the transcript ('português' / 'english')
    over the detected one.

    Returns:
        ``{"lang": <"pt" or "en">, "text": <transcript>}``.
    """
    transcript, whisper_guess = stt.transcribe_auto(_path(audio))
    chosen = choose_lang(transcript, whisper_guess)
    print(f"[detect_lang] {transcript!r} (det={whisper_guess}) -> {chosen}", flush=True)
    return {"lang": chosen, "text": transcript}
# Serve the files in the frontend/ directory under the /static URL prefix.
app.mount("/static", StaticFiles(directory=str(HERE / "frontend")), name="static")
def index():
    """Return frontend/index.html as an HTML response (read from disk on each call)."""
    page = HERE / "frontend" / "index.html"
    html = page.read_text(encoding="utf-8")
    return HTMLResponse(html)
if __name__ == "__main__":
    # Optional warmup (IRIS_WARMUP=1): run each model once so that the first real
    # request does not pay the start-up cost. A failure is reported, never fatal.
    warmup_requested = os.environ.get("IRIS_WARMUP") == "1"
    if warmup_requested:
        print("Warmup...", flush=True)
        try:
            warmup_image = HERE / "assets" / "warmup.jpg"
            vlm.describe(str(warmup_image), "test")
            stt.transcribe(tts.synthesize("test"))
            print("Warmup OK", flush=True)
        except Exception as exc:
            print("Warmup failed:", exc, flush=True)

    # Port precedence: GRADIO_SERVER_PORT, then PORT, then 7860.
    fallback_port = os.environ.get("PORT", 7860)
    port = int(os.environ.get("GRADIO_SERVER_PORT", fallback_port))
    # The temp directory is whitelisted via allowed_paths; presumably that is
    # where the synthesized audio is written — confirm against core.tts.
    app.launch(
        server_name="0.0.0.0",
        server_port=port,
        show_error=True,
        allowed_paths=[tempfile.gettempdir()],
    )