Spaces:

build-small-hackathon
/

iris

Running on Zero

File size: 7,765 Bytes

df6b3ac
26dae50
df6b3ac
 
26dae50
df6b3ac
26dae50
df6b3ac
26dae50
f7a11cd
df6b3ac
26dae50
 
 
 
 
 
 
 
 
f7a11cd
26dae50
 
 
 
 
 
df6b3ac
26dae50
 
 
 
 
5d19f12
df6b3ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7a11cd
 
 
 
 
 
 
 
 
 
26dae50
df6b3ac
 
 
 
f7a11cd
df6b3ac
 
 
 
 
 
 
 
 
 
26dae50
df6b3ac
 
26dae50
f7a11cd
26dae50
 
 
 
 
 
 
df6b3ac
 
19657cc
 
 
 
 
df6b3ac
 
19657cc
 
 
 
 
df6b3ac
 
 
 
f7a11cd
 
 
 
 
 
19657cc
 
 
 
 
 
 
 
df6b3ac
19657cc
 
 
 
f7a11cd
df6b3ac
19657cc
df6b3ac
 
 
19657cc
 
 
 
df6b3ac
 
 
 
 
f7a11cd
df6b3ac
 
 
 
f7a11cd
df6b3ac
 
 
 
 
 
 
f7a11cd
df6b3ac
 
 
 
26dae50
 
 
 
 
 
 
 
 
 
 
 
137241f
df6b3ac
26dae50
 
df6b3ac
26dae50

"""Iris — your father's eyes, by voice (HF Space, ZeroGPU).

gr.Server: serves a custom voice-first frontend (frontend/index.html) and exposes
the API endpoints. Pipeline: Whisper (STT) -> Qwen3-VL (description / VQA) -> Piper (TTS).
"""
import difflib
import os
import re
import tempfile
import time
import unicodedata
from pathlib import Path

os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

from fastapi.responses import HTMLResponse  # noqa: E402
from fastapi.staticfiles import StaticFiles  # noqa: E402
from gradio import Server  # noqa: E402
from gradio.data_classes import FileData  # noqa: E402

from core import stt, trace, tts, vlm  # noqa: E402

# Directory containing this file; the frontend and warm-up asset paths used
# further down are built relative to it.
HERE = Path(__file__).parent
# Server instance on which the API endpoints, the static mount and the "/"
# route below are registered.
app = Server(title="Iris")


def _path(f):
    """gr.Server delivers FileData as a dict; accept dict or object."""
    if f is None:
        return None
    return f["path"] if isinstance(f, dict) else f.path


# Voice commands (live mode on/off), tolerant of transcription errors (e.g. "modo ao fio").
# All patterns below are matched against text that went through _norm(), i.e.
# lower-cased with accents removed, so they are written without diacritics.
_OFF_RE = re.compile(r"\b(pare|parar|para de|deslig\w*|cancela\w*|stop|turn off|"
                     r"silenci\w*|quieto|chega|modo manual|manual)\b")

# Phrases that switch live mode on. They are plain substring tests on purpose,
# so partial or slightly garbled transcriptions still match.
_ON_PHRASES = (
    "ao vivo",
    "ao fio",
    "modo ao",
    "descreva sempre",
    "descreva tudo",
    "modo continuo",
    "tempo real",
    "automatico",
)

# "live" has to be a whole word: as a bare substring it also fires on everyday
# questions containing "olive", "alive" or "deliver", which would then be
# swallowed as a command instead of being answered.
_LIVE_RE = re.compile(r"\blive\b")


def _norm(s: str) -> str:
    """Lower-case *s*, drop accents, and trim surrounding whitespace and ``.!?,``."""
    s = unicodedata.normalize("NFKD", s.lower())
    # After NFKD an accented letter is base letter + combining mark; drop the marks.
    s = "".join(c for c in s if not unicodedata.combining(c))
    return s.strip().strip(".!?,").strip()


def detect_command(text: str):
    """Map a (possibly mis-transcribed) utterance to a live-mode command.

    Returns ``"live_off"``, ``"live_on"``, or ``None`` when the utterance is
    not a command (including empty/``None`` input). "Off" wording is checked
    first, so it wins when both kinds of wording appear in one utterance.
    """
    if not text:
        return None
    t = _norm(text)
    if _OFF_RE.search(t):
        return "live_off"
    if any(phrase in t for phrase in _ON_PHRASES) or _LIVE_RE.search(t):
        return "live_on"
    # Last resort: the whole utterance is a near miss of "modo ao vivo".
    if difflib.SequenceMatcher(None, t, "modo ao vivo").ratio() >= 0.7:
        return "live_on"
    return None


def choose_lang(text: str, detected: str) -> str:
    """Decide between ``'pt'`` and ``'en'``.

    A language named in the utterance wins (English markers are checked before
    Portuguese ones). Otherwise the detected code is used: ``'pt'`` stays
    ``'pt'`` and anything else becomes ``'en'``.
    """
    spoken = text.lower() if text else ""
    keyword_table = (
        ("en", ("ingl", "english")),
        ("pt", ("portug", "brasil")),
    )
    for code, markers in keyword_table:
        if any(marker in spoken for marker in markers):
            return code
    if detected == "pt":
        return "pt"
    return "en"


@app.api(name="describe")
def describe(image: FileData, audio: FileData | None = None, lang: str = "pt",
            qtext: str = "") -> dict:
    """Answer a question about a camera frame, or recognise a voice command.

    The question comes from ``qtext`` when it is non-blank; otherwise the
    recorded ``audio`` (if any) is transcribed. If the question is a live-mode
    command, a dict carrying ``command`` is returned and nothing is described.
    Otherwise the result holds the question, the answer text and synthesized
    audio (``None`` when synthesis yields nothing).
    """
    started = time.time()

    # Typed/browser-recognised text takes priority over the audio recording.
    typed = qtext.strip() if qtext else ""
    if typed:
        question = typed
    else:
        audio_path = _path(audio)
        if audio_path:
            question = stt.transcribe(audio_path, language=lang)
        else:
            question = ""

    command = detect_command(question)
    if command:
        print(f"[describe] command {command} <- {question!r}", flush=True)
        return {"command": command, "question": question, "answer": "", "audio": None}

    answer = vlm.describe(_path(image), question, lang=lang)
    if not answer.strip():
        # Blank model output: fall back to a fixed apology in the session language.
        if lang == "en":
            answer = "I couldn't describe that."
        else:
            answer = "Não consegui descrever isso."

    speech = tts.synthesize(answer, lang=lang)
    print(f"[describe] q={question!r} a={answer!r}", flush=True)
    trace.log("describe", lang, question, answer, time.time() - started, vlm.MODEL_ID)

    audio_out = FileData(path=speech) if speech else None
    return {
        "question": question,
        "answer": answer,
        "audio": audio_out,
    }


# Live mode: describe ONLY new/relevant things, sparingly (avoid verbosity).
# System-prompt templates (Portuguese / English) used by watch() when there is
# history but no detector hint. "{prev}" is filled via str.format with what has
# already been said. The model is told to answer NADA / NONE when nothing changed.
WATCH_PT = (
    "Você observa o ambiente para uma pessoa cega. "
    "Coisas já ditas (não repita): \"{prev}\". "
    "Olhe a cena AGORA. Se apareceu QUALQUER objeto ou pessoa novo — mesmo pequeno "
    "(óculos, celular, copo, papel, comida) — diga em UMA frase curta o que é. "
    "Só responda NADA se a cena estiver basicamente igual ao que já foi dito."
)
WATCH_EN = (
    "You watch the environment for a blind person. "
    "Already said (do not repeat): \"{prev}\". "
    "Look at the scene NOW. If ANY new object or person appeared — even small "
    "(glasses, a phone, a cup, paper, food) — say what it is in ONE short sentence. "
    "Only reply NONE if the scene is basically the same as what was already said."
)
# Replies (lower-cased, outer "." / "!" removed) that count as "nothing new".
_QUIET = {
    "nada",
    "none",
    "nenhum",
    "nothing",
    "sem novidade",
    "no change",
}


def is_quiet(answer: str) -> bool:
    """Tell whether a live-mode answer amounts to 'nothing new to report'.

    Empty/``None`` answers are quiet, as is any answer that, once trimmed,
    lower-cased and stripped of surrounding ``.``/``!``, is one of ``_QUIET``.
    """
    if not answer:
        return True
    core = answer.strip().lower()
    core = core.strip(".!").strip()
    if core == "":
        return True
    return core in _QUIET


# System-prompt templates (Portuguese / English) used by watch() when the
# caller supplies a non-empty hint. The only placeholder is "{prev}" (what was
# already said); the wording is fixed to "a person entered the field of view".
HINT_PT = ("Uma pessoa entrou no campo de visão. Em UMA frase de no máximo 12 palavras, "
           "avise de forma útil para um cego (ex.: 'Há alguém à sua frente.'). Descreva só o "
           "que você vê com CERTEZA; não invente; não repita o que já foi dito: \"{prev}\".")
HINT_EN = ("A person entered the field of view. In ONE sentence of at most 12 words, alert a "
           "blind person usefully (e.g. 'Someone is in front of you.'). Describe only what you "
           "see for CERTAIN; do not invent; do not repeat what was already said: \"{prev}\".")


@app.api(name="watch")
def watch(image: FileData, prev: str = "", lang: str = "pt", hint: str = "") -> dict:
    """Live mode: decide whether the current frame deserves a spoken update.

    Three cases, in order:
    * no history (``prev`` blank) -> describe the scene as a baseline;
    * a ``hint`` was supplied -> describe using the HINT_* prompt;
    * otherwise -> ask what is new using the WATCH_* prompt.

    Returns ``{"speak": False, "answer": ""}`` when the answer counts as quiet,
    else ``speak=True`` with the answer text and synthesized audio (``None``
    when synthesis yields nothing).
    """
    started = time.time()
    history = (prev or "").strip()
    flagged = (hint or "").strip()
    english = lang == "en"
    frame = _path(image)

    if not history:
        # Baseline: describe the current scene so the user hears the surroundings.
        answer = vlm.describe(frame, lang=lang)
    elif flagged:
        # The caller already decided something new is here -> just describe it.
        # The HINT_* templates only contain {prev}; the extra `hint` keyword is
        # accepted by str.format and left unused.
        template = HINT_EN if english else HINT_PT
        answer = vlm.describe(frame, lang=lang,
                              system=template.format(hint=flagged, prev=history))
    else:
        template = WATCH_EN if english else WATCH_PT
        system_prompt = template.format(prev=history)
        query = "What is new?" if english else "O que há de novo?"
        answer = vlm.describe(frame, question=query, lang=lang, system=system_prompt)

    if is_quiet(answer):
        print(f"[watch] quiet ({answer!r})", flush=True)
        return {"speak": False, "answer": ""}

    speech = tts.synthesize(answer, lang=lang)
    print(f"[watch] SPEAK: {answer!r}", flush=True)
    trace.log("watch", lang, "", answer, time.time() - started, vlm.MODEL_ID)
    audio_out = FileData(path=speech) if speech else None
    return {"speak": True, "answer": answer, "audio": audio_out}


@app.api(name="detect_lang")
def detect_lang(audio: FileData) -> dict:
    """Pick the session language from a spoken 'português' / 'english'.

    Transcribes the recording with automatic language detection, then lets
    ``choose_lang`` decide. Returns the chosen code and the transcript.
    """
    transcript, guessed = stt.transcribe_auto(_path(audio))
    chosen = choose_lang(transcript, guessed)
    print(f"[detect_lang] {transcript!r} (det={guessed}) -> {chosen}", flush=True)
    return {"lang": chosen, "text": transcript}


# Serve the files in the frontend/ directory (next to this module) under /static.
app.mount("/static", StaticFiles(directory=str(HERE / "frontend")), name="static")


@app.get("/")
def index():
    """Return frontend/index.html as the root page (read from disk on each request)."""
    page = HERE / "frontend" / "index.html"
    html = page.read_text(encoding="utf-8")
    return HTMLResponse(html)


if __name__ == "__main__":
    # Optional warm-up, enabled with IRIS_WARMUP=1: run one pass through the
    # VLM, TTS and STT calls before serving. A failure is reported but does
    # not prevent the server from starting.
    warmup_requested = os.environ.get("IRIS_WARMUP") == "1"
    if warmup_requested:
        print("Warmup...", flush=True)
        try:
            sample_image = HERE / "assets" / "warmup.jpg"
            vlm.describe(str(sample_image), "test")
            stt.transcribe(tts.synthesize("test"))
            print("Warmup OK", flush=True)
        except Exception as e:
            print("Warmup failed:", e, flush=True)

    # Port precedence: GRADIO_SERVER_PORT, then PORT, then 7860.
    fallback_port = os.environ.get("PORT", 7860)
    port = int(os.environ.get("GRADIO_SERVER_PORT", fallback_port))
    app.launch(
        server_name="0.0.0.0",
        server_port=port,
        show_error=True,
        allowed_paths=[tempfile.gettempdir()],
    )