iris / app.py
nextmarte's picture
docs: plainer voice, drop AI writing tics and negation framing
5d19f12
"""Iris — your father's eyes, by voice (HF Space, ZeroGPU).
gr.Server: serves a custom voice-first frontend (frontend/index.html) and exposes
the API endpoints. Pipeline: Whisper (STT) -> Qwen3-VL (description / VQA) -> Piper (TTS).
"""
import difflib
import os
import re
import tempfile
import time
import unicodedata
from pathlib import Path
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
from fastapi.responses import HTMLResponse # noqa: E402
from fastapi.staticfiles import StaticFiles # noqa: E402
from gradio import Server # noqa: E402
from gradio.data_classes import FileData # noqa: E402
from core import stt, trace, tts, vlm # noqa: E402
# Directory containing this file; the frontend/ and assets/ paths used below are
# resolved relative to it.
HERE = Path(__file__).parent
# Application object: the API endpoints, the static mount and the "/" route
# defined further down are all registered on it.
app = Server(title="Iris")
def _path(f):
    """Return the filesystem path carried by an uploaded-file payload.

    The payload may arrive either as a plain dict with a ``"path"`` key or as
    an object exposing a ``.path`` attribute; ``None`` is passed through so
    callers can treat optional uploads uniformly.
    """
    if f is None:
        return None
    if isinstance(f, dict):
        return f["path"]
    return f.path
# Voice commands switch live mode on or off and must tolerate transcription
# errors (e.g. "modo ao vivo" heard as "modo ao fio").
# _OFF_RE holds the "turn live mode off" vocabulary (PT + EN). detect_command()
# applies it to text already passed through _norm() (lowercased, accents
# removed), which is why the alternatives are written without accents.
_OFF_RE = re.compile(r"\b(pare|parar|para de|deslig\w*|cancela\w*|stop|turn off|"
                     r"silenci\w*|quieto|chega|modo manual|manual)\b")
def _norm(s: str) -> str:
    """Normalize an utterance for keyword matching.

    Lowercases the text, removes diacritics (NFKD decomposition followed by
    dropping combining marks), then trims surrounding whitespace and any
    leading/trailing ``. ! ? ,`` punctuation.
    """
    decomposed = unicodedata.normalize("NFKD", s.lower())
    # Combining marks have a non-zero canonical combining class; keep the rest.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    plain = "".join(base_chars)
    trimmed = plain.strip()
    without_punct = trimmed.strip(".!?,")
    return without_punct.strip()
def detect_command(text: str) -> str | None:
    """Map a (possibly mis-transcribed) utterance to a live-mode command.

    The utterance is normalized with ``_norm`` (lowercase, no accents) and then
    checked in this order:

    1. any "off" word from ``_OFF_RE``  -> ``"live_off"``
       (checked first, so "desligar modo ao vivo" turns live mode off);
    2. any known "on" phrase, the whole word "live", or an utterance that is
       at least 70% similar to "modo ao vivo"  -> ``"live_on"``.

    Returns:
        ``"live_off"``, ``"live_on"``, or ``None`` when the utterance is empty
        or is not a command (i.e. it is a regular question).
    """
    if not text:
        return None
    t = _norm(text)
    if _OFF_RE.search(t):
        return "live_off"

    # Phrases matched as plain substrings, which keeps tolerance for
    # transcription glitches such as "modo ao fio".
    on_phrases = (
        "ao vivo", "ao fio", "modo ao", "descreva sempre", "descreva tudo",
        "modo continuo", "tempo real", "automatico",
    )
    on = (
        any(phrase in t for phrase in on_phrases)
        # "live" must be a whole word: a bare substring test also fires on
        # ordinary questions containing "olive", "deliver", "alive", ... and
        # would swallow them as a live-mode command.
        or re.search(r"\blive\b", t) is not None
        # Fuzzy fallback: the whole utterance resembles "modo ao vivo".
        or difflib.SequenceMatcher(None, t, "modo ao vivo").ratio() >= 0.7
    )
    return "live_on" if on else None
def choose_lang(text: str, detected: str) -> str:
    """Decide between ``'pt'`` and ``'en'`` for the session.

    An explicit mention in the utterance wins: "ingl..." / "english" selects
    English, "portug..." / "brasil" selects Portuguese (English is tested
    first). Without such a keyword the speech recognizer's detected language
    is used, and anything other than ``"pt"`` falls back to English.
    """
    spoken = text.lower() if text else ""

    if any(marker in spoken for marker in ("ingl", "english")):
        return "en"
    if any(marker in spoken for marker in ("portug", "brasil")):
        return "pt"

    if detected == "pt":
        return "pt"
    return "en"
@app.api(name="describe")
def describe(image: FileData, audio: FileData | None = None, lang: str = "pt",
             qtext: str = "") -> dict:
    """Answer a question about a camera frame, or report a voice command.

    The question comes from ``qtext`` when it is non-blank (text produced by
    the browser's speech recognition); otherwise the recorded ``audio`` is
    transcribed, and with neither the question is empty.

    Returns:
        * If the question is a live-mode command: a dict with ``command``,
          ``question``, an empty ``answer`` and ``audio`` set to ``None``.
          No description is generated in this case.
        * Otherwise: a dict with ``question``, ``answer`` and ``audio``
          (a ``FileData`` for the synthesized speech, or ``None`` when the
          synthesizer returned nothing).
    """
    started = time.time()

    typed = qtext.strip() if qtext else ""
    if typed:
        question = typed
    else:
        audio_path = _path(audio)
        question = stt.transcribe(audio_path, language=lang) if audio_path else ""

    # Commands short-circuit: the frame is not described at all.
    command = detect_command(question)
    if command:
        print(f"[describe] command {command} <- {question!r}", flush=True)
        return {"command": command, "question": question, "answer": "", "audio": None}

    answer = vlm.describe(_path(image), question, lang=lang)
    if not answer.strip():
        # Blank model output: speak a fixed apology in the session language.
        if lang == "en":
            answer = "I couldn't describe that."
        else:
            answer = "Não consegui descrever isso."

    wav = tts.synthesize(answer, lang=lang)
    print(f"[describe] q={question!r} a={answer!r}", flush=True)
    trace.log("describe", lang, question, answer, time.time() - started, vlm.MODEL_ID)

    speech = FileData(path=wav) if wav else None
    return {"question": question, "answer": answer, "audio": speech}
# Live mode: describe ONLY new/relevant things, sparingly (avoid verbosity).
# System prompts used by watch() when there is history (`prev`) but no detector
# hint. `{prev}` is filled with what was already said via str.format. The prompt
# asks the model to answer NADA / NONE when nothing changed; is_quiet() treats
# those replies as silence.
WATCH_PT = (
    "Você observa o ambiente para uma pessoa cega. "
    "Coisas já ditas (não repita): \"{prev}\". "
    "Olhe a cena AGORA. Se apareceu QUALQUER objeto ou pessoa novo — mesmo pequeno "
    "(óculos, celular, copo, papel, comida) — diga em UMA frase curta o que é. "
    "Só responda NADA se a cena estiver basicamente igual ao que já foi dito."
)
# English counterpart of WATCH_PT, selected when lang == "en".
WATCH_EN = (
    "You watch the environment for a blind person. "
    "Already said (do not repeat): \"{prev}\". "
    "Look at the scene NOW. If ANY new object or person appeared — even small "
    "(glasses, a phone, a cup, paper, food) — say what it is in ONE short sentence. "
    "Only reply NONE if the scene is basically the same as what was already said."
)
# Replies (already lowercased and trimmed) that mean "nothing new to report".
_QUIET = {"nada", "none", "nenhum", "nothing", "sem novidade", "no change"}


def is_quiet(answer: str) -> bool:
    """Tell whether a live-mode answer should be treated as silence.

    An answer is quiet when it is missing/empty, or when, after trimming
    whitespace, lowercasing and removing leading/trailing ``.`` and ``!``,
    nothing is left or what is left is one of the ``_QUIET`` phrases.
    """
    if not answer:
        return True
    cleaned = answer.strip().lower().strip(".!").strip()
    return cleaned == "" or cleaned in _QUIET
# System prompts used by watch() when the browser passes a non-empty `hint`.
# The only placeholder is `{prev}` (what was already said); the hint text itself
# is not interpolated, and the wording always announces that a person entered.
HINT_PT = ("Uma pessoa entrou no campo de visão. Em UMA frase de no máximo 12 palavras, "
           "avise de forma útil para um cego (ex.: 'Há alguém à sua frente.'). Descreva só o "
           "que você vê com CERTEZA; não invente; não repita o que já foi dito: \"{prev}\".")
# English counterpart of HINT_PT, selected when lang == "en".
HINT_EN = ("A person entered the field of view. In ONE sentence of at most 12 words, alert a "
           "blind person usefully (e.g. 'Someone is in front of you.'). Describe only what you "
           "see for CERTAIN; do not invent; do not repeat what was already said: \"{prev}\".")
@app.api(name="watch")
def watch(image: FileData, prev: str = "", lang: str = "pt", hint: str = "") -> dict:
    """Live mode: decide whether the current frame is worth speaking about.

    Three cases, chosen from the (trimmed) ``prev`` and ``hint`` values:

    * no ``prev``  -> baseline: plain description of the current scene;
    * ``hint`` set -> the in-browser detector flagged something new, so the
      model is prompted with the HINT template (only ``prev`` is inserted into
      the prompt; ``hint`` acts as a switch);
    * otherwise    -> pixel-diff fallback: the WATCH template asks the model to
      report only what is new, or to say nothing.

    Returns:
        ``{"speak": False, "answer": ""}`` when the answer counts as quiet,
        else ``{"speak": True, "answer": ..., "audio": ...}`` where ``audio``
        is a ``FileData`` or ``None`` if no speech file was produced.
    """
    started = time.time()
    prev = (prev or "").strip()
    hint = (hint or "").strip()

    english = lang == "en"
    frame = _path(image)

    if not prev:
        # Baseline: describe the surroundings so the user hears where they are.
        answer = vlm.describe(frame, lang=lang)
    elif hint:
        # The detector already decided something new is here: trust it.
        template = HINT_EN if english else HINT_PT
        prompt = template.format(hint=hint, prev=prev)
        answer = vlm.describe(frame, lang=lang, system=prompt)
    else:
        template = WATCH_EN if english else WATCH_PT
        prompt = template.format(prev=prev)
        question = "What is new?" if english else "O que há de novo?"
        answer = vlm.describe(frame, question=question, lang=lang, system=prompt)

    if is_quiet(answer):
        print(f"[watch] quiet ({answer!r})", flush=True)
        return {"speak": False, "answer": ""}

    wav = tts.synthesize(answer, lang=lang)
    print(f"[watch] SPEAK: {answer!r}", flush=True)
    trace.log("watch", lang, "", answer, time.time() - started, vlm.MODEL_ID)

    speech = FileData(path=wav) if wav else None
    return {"speak": True, "answer": answer, "audio": speech}
@app.api(name="detect_lang")
def detect_lang(audio: FileData) -> dict:
    """Pick the session language from a spoken sample.

    The recording is transcribed with automatic language detection, then
    ``choose_lang`` decides between ``'pt'`` and ``'en'`` from the words said
    (e.g. "português" / "english") or from the detected language.

    Returns:
        A dict with ``lang`` (the chosen code) and ``text`` (the transcript).
    """
    transcript, detected_lang = stt.transcribe_auto(_path(audio))
    chosen = choose_lang(transcript, detected_lang)
    print(f"[detect_lang] {transcript!r} (det={detected_lang}) -> {chosen}", flush=True)
    return {"lang": chosen, "text": transcript}
# Expose the files under frontend/ (next to this module) at the /static URL prefix.
app.mount("/static", StaticFiles(directory=str(HERE / "frontend")), name="static")
@app.get("/")
def index():
    """Serve the voice-first frontend page (frontend/index.html) at the root URL.

    The file is read from disk as UTF-8 on every request.
    """
    page = HERE / "frontend" / "index.html"
    html = page.read_text(encoding="utf-8")
    return HTMLResponse(html)
if __name__ == "__main__":
    # Optional warmup (IRIS_WARMUP=1): run each pipeline stage once before
    # serving. Best-effort only; a failure is reported and startup continues.
    if os.environ.get("IRIS_WARMUP") == "1":
        print("Warmup...", flush=True)
        try:
            vlm.describe(str(HERE / "assets" / "warmup.jpg"), "test")
            stt.transcribe(tts.synthesize("test"))
            print("Warmup OK", flush=True)
        except Exception as exc:
            print("Warmup failed:", exc, flush=True)

    # Port precedence: GRADIO_SERVER_PORT, then PORT, then 7860.
    fallback_port = os.environ.get("PORT", 7860)
    port = int(os.environ.get("GRADIO_SERVER_PORT", fallback_port))

    # NOTE(review): the temp dir is whitelisted presumably because generated
    # audio files are written there; confirm against core.tts.
    app.launch(
        server_name="0.0.0.0",
        server_port=port,
        show_error=True,
        allowed_paths=[tempfile.gettempdir()],
    )