Spaces:
Running on Zero
Running on Zero
| import { Client, handle_file } from "https://esm.sh/@gradio/client"; | |
// Shorthand: look up a DOM element by its id.
const $ = (id) => document.getElementById(id);

// Element handles used throughout the app (one lookup each, at module load).
const cam = $("cam");
const canvas = $("canvas");
const statusEl = $("status");
const answerEl = $("answer");
const hintEl = $("hint");
const langBtn = $("lang");
const player = $("player");
const stage = $("stage");
const btnAsk = $("btn-ask");
const btnDescribe = $("btn-describe");
const btnLive = $("btn-live");
const a11yBtn = $("a11y");
/**
 * Best-effort haptic feedback: calls `navigator.vibrate(ms)` when the method
 * exists and ignores any error it throws.
 * @param {number|number[]} ms - value forwarded unchanged to `navigator.vibrate`.
 */
function vibrate(ms) {
  try {
    if (navigator.vibrate) {
      navigator.vibrate(ms);
    }
  } catch (e) {
    // haptics are optional; a failure here must never break an interaction
  }
}
// ---- language (auto-detected from the browser; switchable by voice or button) ----
// Start in Portuguese when the browser locale begins with "pt", otherwise English.
const browserLocale = (navigator.language || "en").toLowerCase();
let lang = browserLocale.startsWith("pt") ? "pt" : "en";

// UI strings, keyed first by language code and then by message id.
const T = {
  en: {
    idle: "Iris",
    listening: "Listening…",
    thinking: "Thinking…",
    hint: "Tap: describe · Hold: ask · Double-tap: live",
    camErr: "Camera blocked — allow access",
    err: "Something went wrong",
    langListen: "Listening… say your language",
    welcome: "Welcome to Iris. Tap the screen to describe what is in front of you. Hold to ask a question. Double-tap to turn live mode on or off, which announces new things around you.",
    liveOn: "Live mode on.",
    liveOff: "Live mode off.",
    confLang: "English selected.",
    startTap: "Iris. Tap the screen to begin.",
    welcomeAsk: "Welcome to Iris, in English. Tap the screen to describe what is in front of you, hold to ask a question, double-tap for live mode. To use Portuguese instead, say 'português' now.",
    bAsk: "Ask",
    bDescribe: "Describe",
    bLive: "Live",
  },
  pt: {
    idle: "Iris",
    listening: "Ouvindo…",
    thinking: "Pensando…",
    hint: "Toque: descrever · Segurar: perguntar · Toque duplo: ao vivo",
    camErr: "Câmera bloqueada — permita o acesso",
    err: "Algo deu errado",
    langListen: "Ouvindo… diga seu idioma",
    welcome: "Bem-vindo ao Iris. Toque na tela para descrever o que está à sua frente. Segure para fazer uma pergunta. Toque duas vezes para ligar ou desligar o modo ao vivo, que avisa o que aparece de novo à sua volta.",
    liveOn: "Modo ao vivo ligado.",
    liveOff: "Modo ao vivo desligado.",
    confLang: "Português selecionado.",
    startTap: "Iris. Toque na tela para começar.",
    welcomeAsk: "Bem-vindo ao Iris, em português. Toque na tela para descrever o que está à sua frente, segure para perguntar, e toque duas vezes para o modo ao vivo. Para usar em inglês, diga 'inglês' agora.",
    bAsk: "Perguntar",
    bDescribe: "Descrever",
    bLive: "Ao vivo",
  },
};

// Look up a message id in the table of the currently selected language.
const t = (k) => {
  const table = T[lang];
  return table[k];
};

// Bilingual prompt string (not referenced anywhere in the visible part of this file).
const LANG_PROMPT = "Segure e diga seu idioma · Hold and say your language";
/**
 * Publish the UI state on <body data-state> and optionally update the status line.
 * @param {string} s - state name; any falsy value is stored as "".
 * @param {string} [msg] - status text; when omitted (undefined) the status line is left as is.
 *   While live mode is on the text is prefixed with "● ".
 */
function setState(s, msg) {
  document.body.dataset.state = s ? s : "";
  if (msg === undefined) return;
  const livePrefix = liveOn ? "● " : "";
  statusEl.textContent = livePrefix + msg;
}
// UI speech via the browser voice (instructions/confirmations)
/**
 * Speak `text` with the browser speech synthesizer in the current UI language.
 * Anything already being spoken or queued is cancelled first. Errors are logged, not thrown.
 * @param {string} text - sentence to speak.
 */
function speak(text) {
  try {
    speechSynthesis.cancel();
    const utterance = new SpeechSynthesisUtterance(text);
    if (lang === "pt") {
      utterance.lang = "pt-BR";
    } else {
      utterance.lang = "en-US";
    }
    speechSynthesis.speak(utterance);
  } catch (e) {
    console.error("speak:", e);
  }
}
// ---- state ----
// Whether the first-run welcome has already been delivered (persisted in localStorage).
let onboarded = false;
try {
  const stored = localStorage.getItem("iris_onboarded");
  onboarded = stored === "1";
} catch (e) {
  // storage unavailable: behave as a first run
}

// "onboarding" until the welcome has been spoken, then "normal".
let mode = onboarded ? "normal" : "onboarding";

// True while a backend request is in flight.
let busy = false;

// Live mode flag and its polling interval handle.
let liveOn = false;
let liveTimer = null;

// Press-and-hold tracking on the stage.
let holding = false;
let recording = false;
let holdTimer = null;

// How long a press must last before it starts recording a question.
const HOLD_MS = 350;
// Language button: flip between English and Portuguese and refresh every visible label.
langBtn.onclick = (e) => {
  e.stopPropagation(); // keep the click from reaching ancestor handlers
  if (lang === "en") {
    lang = "pt";
  } else {
    lang = "en";
  }
  document.documentElement.lang = lang;
  langBtn.textContent = lang.toUpperCase();
  hintEl.textContent = onboarded ? t("hint") : "";
  updateLabels();
  if (busy) return; // leave the in-progress status text alone
  const idleText = onboarded ? t("idle") : t("startTap");
  setState("", idleText);
};
// ---- unlock audio (autoplay) on the first tap ----
// A header-only WAV with no samples, used to start the <audio> element once.
const SILENT = "data:audio/wav;base64,UklGRiQAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQAAAAA=";
let audioUnlocked = false;

/**
 * Play the silent clip through `player` exactly once. Later calls are no-ops,
 * and a rejected play() promise is ignored.
 */
function unlockAudio() {
  if (!audioUnlocked) {
    audioUnlocked = true;
    player.src = SILENT;
    const attempt = player.play();
    attempt.catch(() => {});
  }
}
// ---- live camera ----
/**
 * Attach a camera stream to the `cam` video element.
 * Tries the rear ("environment") camera first, then any camera. If both
 * requests fail, the camera-error message is shown in the status line.
 */
async function startCamera() {
  const attempts = [
    { video: { facingMode: { ideal: "environment" } }, audio: false },
    { video: true, audio: false },
  ];
  for (let i = 0; i < attempts.length; i++) {
    try {
      cam.srcObject = await navigator.mediaDevices.getUserMedia(attempts[i]);
      return;
    } catch (e) {
      // fall through to the next, less specific, constraint set
    }
  }
  setState("", t("camErr"));
}
/**
 * Capture the current video frame as a JPEG blob (quality 0.85).
 * @returns {Promise<Blob|null>} the encoded frame, or null when the video has
 *   no dimensions yet.
 */
function grabFrame() {
  const width = cam.videoWidth;
  const height = cam.videoHeight;
  if (!width || !height) {
    return Promise.resolve(null);
  }
  canvas.width = width;
  canvas.height = height;
  const ctx = canvas.getContext("2d");
  ctx.drawImage(cam, 0, 0, width, height);
  return new Promise((resolve) => {
    canvas.toBlob(resolve, "image/jpeg", 0.85);
  });
}
// ---- microphone ----
let micStream = null, chunks = [], recorder = null;

/**
 * Start recording microphone audio into `chunks`.
 *
 * The microphone stream is requested on first use and reused afterwards; it is
 * requested again if the cached stream is no longer active (all tracks ended).
 *
 * @returns {Promise<boolean>} true when recording started, false on ANY failure
 *   (permission denied, MediaRecorder unsupported, start() rejected). Callers
 *   assign the result straight to a flag, so this function must never throw.
 */
async function startRec() {
  try {
    if (!micStream || !micStream.active) {
      micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
    }
    chunks = [];
    recorder = new MediaRecorder(micStream);
    recorder.ondataavailable = (e) => { if (e.data.size) chunks.push(e.data); };
    recorder.start();
    return true;
  } catch (e) {
    // Previously only getUserMedia was guarded; the MediaRecorder constructor and
    // start() can throw too, which surfaced as unhandled rejections in callers.
    console.warn("startRec:", e);
    return false;
  }
}
/**
 * Stop the active recording.
 * @returns {Promise<Blob|null>} the recorded audio (typed with the recorder's
 *   mimeType, or "audio/webm" when that is empty); null when there is no active
 *   recorder or no data was captured.
 */
function stopRec() {
  return new Promise((resolve) => {
    const nothingToStop = !recorder || recorder.state === "inactive";
    if (nothingToStop) {
      resolve(null);
      return;
    }
    recorder.onstop = () => {
      if (!chunks.length) {
        resolve(null);
        return;
      }
      const type = recorder.mimeType || "audio/webm";
      resolve(new Blob(chunks, { type }));
    };
    recorder.stop();
  });
}
// ---- backend ----
let client = null;

/**
 * Send one camera frame (plus an optional question) to the "/describe"
 * endpoint, show the answer and play the returned audio.
 *
 * Does nothing while another request is in flight (`busy`). If the response
 * carries a `command`, it is forwarded to handleCommand() instead of being
 * spoken.
 *
 * @param {Blob} frame - JPEG frame to describe.
 * @param {Blob|null} audio - recorded spoken question, if any.
 * @param {string} [qtext=""] - question as text, if any.
 */
async function send(frame, audio, qtext = "") {
  if (busy) return;
  busy = true;
  answerEl.textContent = "";
  setState("thinking", t("thinking"));
  try {
    const payload = { image: handle_file(frame), lang };
    if (audio) payload.audio = handle_file(audio);
    if (qtext) payload.qtext = qtext;
    const result = await client.predict("/describe", payload);
    const out = Array.isArray(result.data) ? result.data[0] : result.data;
    console.log("Iris result:", out);
    // release `busy` before dispatching: the command handler may schedule work that checks it
    if (out && out.command) { busy = false; handleCommand(out.command); return; }
    answerEl.textContent = (out && out.answer) || "";
    setState("speaking", "");
    const a = out && out.audio;
    let url = a && a.url;
    if (!url && a && a.path) url = window.location.origin + "/gradio_api/file=" + a.path;
    if (!url) { resetSoon(); return; }
    pauseSR(); // recognition stays off while the answer is spoken
    player.src = url;
    try {
      await player.play();
    } catch (err) {
      console.error("play:", err);
      // Playback never started, so "ended" will never fire: without this the UI
      // stays in "speaking" and speech recognition stays paused.
      resetSoon();
    }
  } catch (e) {
    console.error("describe:", e);
    setState("", t("err"));
    resetSoon();
  } finally {
    busy = false;
  }
}
/**
 * After 600 ms, resume speech recognition and, unless a request is in flight,
 * return the status line to idle.
 */
function resetSoon() {
  setTimeout(() => {
    resumeSR();
    if (busy) return;
    setState("", t("idle"));
  }, 600);
}

// When an answer finishes playing: go idle now (unless busy) and resume
// speech recognition 700 ms later.
player.addEventListener("ended", () => {
  setTimeout(() => resumeSR(), 700);
  if (busy) return;
  setState("", t("idle"));
});
// ---- first-run onboarding: default to the browser language, speak it, offer to switch ----
/**
 * Speak the first-run welcome and, once it ends, listen for a language switch.
 *
 * Onboarding is marked complete only when speech actually STARTS, so if the
 * browser blocks speech on page load the mode stays "onboarding" and the first
 * tap retries. When the browser has no speech synthesis at all, speech can
 * never start, so onboarding is completed silently instead of trapping the
 * user in "onboarding" forever.
 */
function onboard() {
  if (onboarded) return;

  // Flip to normal mode, persist the flag and reveal the hint / button labels.
  const completeOnboarding = () => {
    onboarded = true; mode = "normal";
    try { localStorage.setItem("iris_onboarded", "1"); } catch (e) {}
    hintEl.textContent = t("hint");
    updateLabels();
  };

  const canSpeak = typeof speechSynthesis !== "undefined"
    && typeof SpeechSynthesisUtterance !== "undefined";
  if (!canSpeak) {
    completeOnboarding();
    setState("", t("idle"));
    return;
  }

  try {
    const u = new SpeechSynthesisUtterance(t("welcomeAsk"));
    u.lang = lang === "pt" ? "pt-BR" : "en-US";
    u.onstart = () => {
      completeOnboarding();
      setState("speaking", t("idle"));
    };
    u.onend = listenForLangChoice; // after the welcome, listen for a language switch
    speechSynthesis.cancel();
    speechSynthesis.speak(u);
  } catch (e) {
    // will retry on the first tap
    console.warn("onboard:", e);
  }
}
/**
 * Run a one-shot speech recognition (5 s at most) and switch the UI language
 * if the user says a language name. Without a SpeechRecognition
 * implementation it only resets the status to idle.
 */
function listenForLangChoice() {
  const Recognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  if (!Recognition) {
    setState("", t("idle"));
    return;
  }

  const rec = new Recognition();
  rec.lang = lang === "pt" ? "pt-BR" : "en-US";
  rec.continuous = false;
  rec.interimResults = false;

  let settled = false; // true once a result, an error or the timeout has been handled
  const stopQuietly = () => {
    try { rec.stop(); } catch (e) {}
  };
  const goIdleOnce = () => {
    if (settled) return;
    settled = true;
    setState("", t("idle"));
  };

  // Give up after 5 seconds of listening.
  const deadline = setTimeout(() => {
    stopQuietly();
    goIdleOnce();
  }, 5000);

  rec.onresult = (e) => {
    settled = true;
    clearTimeout(deadline);
    const heard = (e.results[0][0].transcript || "").toLowerCase();
    const wantsEnglish = heard.includes("ingl") || heard.includes("english");
    const wantsPortuguese = heard.includes("portug") || heard.includes("brasil");
    if (wantsEnglish) {
      switchLang("en");
    } else if (wantsPortuguese) {
      switchLang("pt");
    } else {
      setState("", t("idle"));
    }
    stopQuietly();
  };
  rec.onerror = () => {
    clearTimeout(deadline);
    goIdleOnce();
  };

  setState("listening", t("listening"));
  try {
    rec.start();
  } catch (e) {
    goIdleOnce();
  }
}
/**
 * Select `newLang`, refresh the language-dependent UI when it actually changed,
 * then speak the confirmation followed by the welcome text (always, even when
 * the language was already selected).
 * @param {string} newLang - "en" or "pt".
 */
function switchLang(newLang) {
  const changed = newLang !== lang;
  if (changed) {
    lang = newLang;
    document.documentElement.lang = lang;
    langBtn.textContent = lang.toUpperCase();
    hintEl.textContent = t("hint");
    updateLabels();
  }
  setState("", t("idle"));
  const announcement = t("confLang") + " " + t("welcome");
  speak(announcement);
}
// ---- live mode (toggled by voice command or double-tap) ----
/**
 * Apply a command returned by the backend. Only "live_on" and "live_off" are
 * recognized; any other value is ignored.
 * @param {string} cmd - command name.
 */
function handleCommand(cmd) {
  switch (cmd) {
    case "live_on":
      setLive(true);
      break;
    case "live_off":
      setLive(false);
      break;
    default:
      break;
  }
}
// check the scene every 3.5s
const LIVE_INTERVAL = 3500;
// mean per-pixel change (0-255) to trigger the VLM
const LIVE_THRESHOLD = 15;
// quiet window after an auto-alert -> gap for the user to ask
const LIVE_COOLDOWN = 5000;

let lastSig = null;
// recent descriptions (anti-repetition)
let history = [];
// pause auto-alerts after an alert OR a user question (their question wins)
let suppressLiveUntil = 0;

// in-browser object detection gates the live mode (semantic change, not pixels).
// Falls back to the cheap pixel-diff if the detector can't load.
let detector = null;
let detectorTried = false;
// class name -> last tick index it was seen
let seenClasses = new Map();
let tickN = 0;
// ticks a class is remembered before it can re-trigger
const CLASS_TTL = 4;
// ticks a person must be absent before a re-entry counts as new (~14s)
const PERSON_GONE = 4;
/**
 * Load the COCO-SSD object detector once. Skipped when a detector already
 * exists, a load was already attempted, or `window.cocoSsd` is not available.
 * A failed load is logged and leaves `detector` null.
 */
async function loadDetector() {
  const alreadyHandled = detector || detectorTried;
  if (alreadyHandled || !window.cocoSsd) {
    return;
  }
  detectorTried = true;
  try {
    const options = { base: "lite_mobilenet_v2" };
    detector = await window.cocoSsd.load(options);
    console.log("Iris: object detector ready");
  } catch (e) {
    console.warn("detector load failed, using pixel-diff:", e);
  }
}
/**
 * Turn live mode on or off: update the Live button, give haptic and spoken
 * feedback, reset the live-mode bookkeeping and start or stop the periodic
 * scene check and hands-free listening.
 * @param {boolean} on - desired live-mode state.
 */
function setLive(on) {
  liveOn = on;
  const pressed = on ? "true" : "false";
  btnLive.setAttribute("aria-pressed", pressed);
  btnLive.classList.toggle("on", on);
  if (on) {
    vibrate([20, 40, 20]);
    speak(t("liveOn"));
  } else {
    vibrate(20);
    speak(t("liveOff"));
  }
  setState("", t("idle"));

  // always start from a clean slate, whichever direction we toggle
  clearInterval(liveTimer);
  liveTimer = null;
  lastSig = null;
  history = [];
  seenClasses = new Map();

  if (!on) {
    stopListening();
    return;
  }

  loadDetector();
  liveTimer = setInterval(liveTick, LIVE_INTERVAL);
  startListening();
  // baseline: describe the scene once shortly after turning on
  setTimeout(async () => {
    if (!liveOn || busy || !player.paused) return;
    const baseline = await grabFrame();
    if (baseline) sendWatch(baseline);
  }, 900);
}
// ---- hands-free continuous listening (live mode) via Web Speech API ----
let recog = null;
let listening = false;
let srPaused = false;

/**
 * Build a continuous speech recognizer that forwards each final transcript to
 * onVoice() and restarts itself while listening is enabled and not paused.
 * @returns {object|null} the recognizer, or null when the browser has no
 *   SpeechRecognition implementation.
 */
function setupRecognition() {
  const Recognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  if (!Recognition) {
    console.warn("SpeechRecognition not supported in this browser");
    return null;
  }
  const rec = new Recognition();
  rec.continuous = true;
  rec.interimResults = false;
  rec.onresult = (e) => {
    if (srPaused || busy) return;
    const latest = e.results[e.results.length - 1];
    const heard = latest[0].transcript.trim();
    if (heard) onVoice(heard);
  };
  // keep alive
  rec.onend = () => {
    if (!listening || srPaused) return;
    try { rec.start(); } catch (e) {}
  };
  rec.onerror = (e) => {
    console.log("SR:", e.error);
  };
  return rec;
}
/**
 * Start hands-free listening in the current UI language, creating the
 * recognizer on first use. No-op when recognition is unsupported or already
 * listening.
 */
function startListening() {
  if (!recog) {
    recog = setupRecognition();
  }
  if (!recog) return;
  if (listening) return;
  listening = true;
  recog.lang = lang === "pt" ? "pt-BR" : "en-US";
  try {
    recog.start();
  } catch (e) {
    // start() failed; nothing else is done here
  }
}
/**
 * Stop hands-free listening. Clearing `listening` first keeps the recognizer's
 * onend handler from restarting it.
 */
function stopListening() {
  listening = false;
  if (!recog) return;
  try {
    recog.stop();
  } catch (e) {
    // stop() errors are ignored
  }
}
// stop the recognizer WHILE Iris speaks (so it never transcribes its own voice)
/** Mark recognition as paused and stop the recognizer if one exists. */
function pauseSR() {
  srPaused = true;
  if (!recog) return;
  try {
    recog.stop();
  } catch (e) {}
}

/** Clear the paused flag and restart the recognizer if listening is enabled. */
function resumeSR() {
  srPaused = false;
  if (!listening || !recog) return;
  try {
    recog.start();
  } catch (e) {}
}
/**
 * Handle a spoken question transcribed by the browser: mute live alerts for
 * 12 s, then send the current frame together with the text. While another
 * request is in flight it retries every 700 ms, up to three retries.
 * @param {string} txt - transcribed question.
 * @param {number} [tries=0] - retries already made.
 */
async function onVoice(txt, tries = 0) {
  // the user's question wins: mute auto-alerts for a while
  suppressLiveUntil = performance.now() + 12000;
  if (!busy) {
    const snapshot = await grabFrame();
    // text question (from browser speech recognition)
    if (snapshot) send(snapshot, null, txt);
    return;
  }
  // retry if a watch is in flight
  if (tries < 3) {
    setTimeout(() => onVoice(txt, tries + 1), 700);
  }
}
// cheap visual signature (32x32 gray) to detect change without calling the model
/**
 * Downscale the current video frame to 32x32 and reduce it to grayscale.
 * @returns {Uint8Array|null} 1024 gray values (the mean of R, G and B), or
 *   null when the video has no width yet.
 */
function frameSignature() {
  if (!cam.videoWidth) return null;
  const thumb = document.createElement("canvas");
  thumb.width = 32;
  thumb.height = 32;
  const ctx = thumb.getContext("2d");
  ctx.drawImage(cam, 0, 0, 32, 32);
  const rgba = ctx.getImageData(0, 0, 32, 32).data;
  const gray = new Uint8Array(1024);
  // rgba holds 4 bytes per pixel; the alpha byte is skipped
  for (let px = 0, off = 0; px < 1024; px++, off += 4) {
    gray[px] = (rgba[off] + rgba[off + 1] + rgba[off + 2]) / 3;
  }
  return gray;
}
/**
 * Mean absolute per-element difference between two frame signatures.
 *
 * @param {ArrayLike<number>|null} a - first signature.
 * @param {ArrayLike<number>|null} b - second signature.
 * @returns {number} the mean difference, or the sentinel 999 ("treat as
 *   changed") when the two cannot be compared: one is missing, they are empty,
 *   or their lengths differ. Previously the last two cases produced NaN, which
 *   compares false against any threshold and so could never register a change.
 */
function changeAmount(a, b) {
  if (!a || !b || a.length === 0 || a.length !== b.length) return 999;
  let total = 0;
  for (let i = 0; i < a.length; i++) total += Math.abs(a[i] - b[i]);
  return total / a.length;
}
/**
 * One live-mode poll. Runs the object detector on the camera and, when a
 * person appears after having been absent for more than PERSON_GONE ticks (or
 * for the first time), sends the frame to sendWatch() with the hint "person".
 *
 * Skipped entirely while a request is in flight, outside normal mode, while
 * audio is playing, during the suppression window, or without a detector.
 */
async function liveTick() {
  if (busy || mode !== "normal" || !liveOn || !player.paused) return;
  if (performance.now() < suppressLiveUntil) return; // user just asked / we just spoke -> stay quiet, let them talk
  if (!detector || !cam.videoWidth) return; // alerts are person-arrival only -> need the detector
  tickN++;
  // Live alerts fire ONLY when a NEW person enters (per the spec: rare ambient alerts, not narration).
  // Re-describing objects/movement makes the small VLM hallucinate and repeat; objects/text/colors are
  // answered on demand (the user asks), which is accurate. So we gate strictly on a fresh "person".
  let personNew = false;
  try {
    const preds = await detector.detect(cam, 5);
    const hasPerson = preds.some((p) => p.class === "person" && p.score >= 0.6);
    if (hasPerson) {
      const last = seenClasses.get("person");
      personNew = last === undefined || tickN - last > PERSON_GONE; // absent a while -> a real arrival
      seenClasses.set("person", tickN);
    }
  } catch (e) {
    // detection failed on this frame; the next tick tries again
    return;
  }
  if (!personNew) return;
  // Detection is asynchronous: live mode may have been switched off while it ran.
  // Without this check an alert could still be spoken after the user turned live mode off.
  if (!liveOn) return;
  suppressLiveUntil = performance.now() + LIVE_COOLDOWN; // one alert, then a quiet gap to ask
  const frame = await grabFrame();
  if (frame && liveOn) sendWatch(frame, "person");
}
// word-set similarity (accent-insensitive) — the model often re-emits a near-identical
// line, so we drop it client-side instead of trusting the "do not repeat" prompt.
/**
 * Split text into lowercase, accent-free words longer than two characters.
 *
 * The combining-mark range is written with \u escapes rather than raw
 * combining characters, so the pattern survives editors, copy/paste and
 * re-encoding unchanged.
 *
 * @param {string|null|undefined} s - text to tokenize; falsy values give [].
 * @returns {string[]} words, in order of appearance (duplicates kept).
 */
function words(s) {
  return (s || "")
    .toLowerCase()
    .normalize("NFD") // split accented letters into base letter + combining mark
    .replace(/[\u0300-\u036f]/g, "") // drop the combining diacritical marks
    .replace(/[^a-z0-9 ]/g, " ")
    .split(/\s+/)
    .filter((w) => w.length > 2);
}
/**
 * Decide whether two sentences are near-duplicates, using the Jaccard
 * similarity of their word sets as produced by words().
 * @param {string} a - first sentence.
 * @param {string} b - second sentence.
 * @returns {boolean} true when the similarity is at least 0.5; false when
 *   either sentence yields no words.
 */
function tooSimilar(a, b) {
  const left = new Set(words(a));
  const right = new Set(words(b));
  if (left.size === 0 || right.size === 0) return false;
  const shared = [...left].filter((w) => right.has(w)).length;
  const union = left.size + right.size - shared;
  // Jaccard >= 0.5 -> basically the same alert
  return shared / union >= 0.5;
}
/**
 * Send a frame to the "/watch" endpoint (live mode) and speak the alert if the
 * backend asks for one and it is not a near-duplicate of a recent alert.
 *
 * Does nothing while another request is in flight. The last five spoken alerts
 * are kept in `history` and sent along as `prev`.
 *
 * @param {Blob} frame - JPEG frame to analyze.
 * @param {string} [hint=""] - what triggered the check (e.g. "person").
 */
async function sendWatch(frame, hint = "") {
  if (busy) return;
  busy = true;
  try {
    const result = await client.predict("/watch", { image: handle_file(frame), prev: history.join(" · "), lang, hint });
    const out = Array.isArray(result.data) ? result.data[0] : result.data;
    const worthSaying = out && out.speak && out.answer
      && !history.some((h) => tooSimilar(h, out.answer));
    if (!worthSaying) return;
    history.push(out.answer);
    if (history.length > 5) history.shift();
    answerEl.textContent = out.answer;
    setState("speaking", "");
    const a = out.audio;
    let url = a && a.url;
    if (!url && a && a.path) url = window.location.origin + "/gradio_api/file=" + a.path;
    // No audio to play: leave the "speaking" state, as send() does in the same situation.
    if (!url) { resetSoon(); return; }
    pauseSR();
    player.src = url;
    try {
      await player.play();
    } catch (e) {
      console.error("play:", e);
      // Playback never started, so "ended" will never fire: recover here or the
      // UI stays in "speaking" and speech recognition stays paused.
      resetSoon();
    }
  } catch (e) {
    console.error("watch:", e);
  } finally {
    busy = false;
  }
}
// ---- interaction ----
// Press on the stage. During onboarding it (re)triggers the welcome; otherwise it
// arms a press: after HOLD_MS of holding, the microphone starts recording a question.
stage.addEventListener("pointerdown", async () => {
  if (mode === "onboarding") {
    unlockAudio();
    onboard(); // speak welcome in the browser language + offer to switch
    return;
  }
  if (liveOn) { speechSynthesis.cancel(); player.pause(); } // a touch silences Iris in live mode
  if (busy) return;
  unlockAudio();
  holding = true;
  stage.classList.add("armed");
  clearTimeout(holdTimer); // a second pointer must not leave an orphaned hold timer behind
  holdTimer = setTimeout(async () => {
    const started = await startRec();
    if (!holding) {
      // The press ended while the microphone was still starting (e.g. during the
      // permission prompt). endPress() has already handled it as a tap, so a
      // recording begun now would never be stopped: discard it.
      if (started) await stopRec();
      return;
    }
    recording = started;
    if (recording) setState("listening", t("listening"));
  }, HOLD_MS);
});
let lastTapTime = 0;
let tapTimer = null;
// double-tap window
const DOUBLE_MS = 320;

/**
 * Finish a press on the stage.
 *  - held long enough to record: send the frame with the recorded question;
 *  - second quick tap within DOUBLE_MS: toggle live mode;
 *  - single quick tap: describe the scene once the double-tap window has passed.
 */
async function endPress() {
  // onboarding is handled on pointerdown
  if (mode === "onboarding" || !holding) return;
  holding = false;
  stage.classList.remove("armed");
  clearTimeout(holdTimer);

  // held and spoke = a question
  if (recording) {
    recording = false;
    const shot = await grabFrame();
    const question = await stopRec();
    if (shot) {
      send(shot, question);
    } else {
      setState("", t("camErr"));
    }
    return;
  }

  // quick tap: double-tap toggles live mode; single tap describes
  const now = performance.now();
  const isDoubleTap = now - lastTapTime < DOUBLE_MS;
  if (tapTimer) {
    clearTimeout(tapTimer);
    tapTimer = null;
  }
  if (isDoubleTap) {
    lastTapTime = 0;
    setLive(!liveOn);
    return;
  }
  lastTapTime = now;
  tapTimer = setTimeout(async () => {
    tapTimer = null;
    const shot = await grabFrame();
    if (shot) {
      send(shot, null);
    } else {
      setState("", t("camErr"));
    }
  }, DOUBLE_MS);
}

stage.addEventListener("pointerup", endPress);
stage.addEventListener("pointercancel", endPress);
// ---- explicit buttons (low vision / keyboard / screen reader) ----
// Describe button: send the current frame with no question.
btnDescribe.addEventListener("click", async (e) => {
  e.stopPropagation();
  const blocked = mode !== "normal" || busy;
  if (blocked) return;
  unlockAudio();
  vibrate(10);
  const shot = await grabFrame();
  if (!shot) {
    setState("", t("camErr"));
    return;
  }
  send(shot, null);
});
// True while a recording started from the Ask button is running.
let askRecBtn = false;
// True from pointerdown on the Ask button until the press ends. Lets the
// pointerdown handler notice a release that happened while the microphone was
// still starting (startRec() is asynchronous).
let askPressed = false;

// Ask button pressed: start recording the spoken question.
btnAsk.addEventListener("pointerdown", async (e) => {
  e.stopPropagation();
  if (mode !== "normal" || busy) return;
  unlockAudio(); vibrate(10);
  askPressed = true;
  const started = await startRec();
  if (!askPressed) {
    // Released before the microphone was ready: btnAskEnd() has already run and
    // found nothing to stop, so a recording begun now would never end. Discard it.
    if (started) await stopRec();
    return;
  }
  askRecBtn = started;
  if (askRecBtn) setState("listening", t("listening"));
});

/**
 * Ask button released, left or cancelled: stop recording and send the question
 * together with the current frame.
 * @param {Event} [e] - the pointer event, when called as a listener.
 */
async function btnAskEnd(e) {
  if (e) e.stopPropagation();
  askPressed = false;
  if (!askRecBtn) return;
  askRecBtn = false;
  const frame = await grabFrame();
  const audio = await stopRec();
  if (frame) send(frame, audio); else setState("", t("camErr"));
}
btnAsk.addEventListener("pointerup", btnAskEnd);
btnAsk.addEventListener("pointerleave", btnAskEnd);
btnAsk.addEventListener("pointercancel", btnAskEnd);
// Live button: toggle live mode (only once onboarding is done).
btnLive.addEventListener("click", (e) => {
  e.stopPropagation();
  if (mode === "normal") {
    unlockAudio();
    setLive(!liveOn);
  }
});
// ---- accessibility: max contrast + larger text (persisted) ----
let boost = false;
try {
  const storedBoost = localStorage.getItem("iris_a11y");
  boost = storedBoost === "1";
} catch (e) {
  // storage unavailable: start with the boost off
}

// Reflect the current `boost` value on <body> and on the toggle button.
const applyBoost = () => {
  document.body.classList.toggle("a11y-boost", boost);
  a11yBtn.setAttribute("aria-pressed", boost ? "true" : "false");
};
applyBoost();

// Toggle the boost, persist the choice ("1"/"0") and confirm with a short vibration.
a11yBtn.addEventListener("click", (e) => {
  e.stopPropagation();
  boost = !boost;
  applyBoost();
  try {
    localStorage.setItem("iris_a11y", boost ? "1" : "0");
  } catch (e) {
    // persistence is optional
  }
  vibrate(10);
});
// button labels per language
/** Write the current-language captions into the ".ctl-lbl" element of each control button. */
function updateLabels() {
  const captions = [
    [btnAsk, "bAsk"],
    [btnDescribe, "bDescribe"],
    [btnLive, "bLive"],
  ];
  for (const [button, key] of captions) {
    button.querySelector(".ctl-lbl").textContent = t(key);
  }
}
// ---- boot ----
// Initialize labels and status, try the welcome, start the camera, preload the
// detector and connect to the backend at this page's origin.
(async () => {
  document.documentElement.lang = lang;
  langBtn.textContent = lang.toUpperCase();
  updateLabels();
  if (onboarded) {
    hintEl.textContent = t("hint");
    setState("", t("idle"));
  } else {
    hintEl.textContent = "";
    setState("", t("startTap"));
    // try to speak the welcome on load (works where the browser allows it)
    onboard();
  }
  await startCamera();
  // preload the object detector in the background (non-blocking)
  loadDetector();
  try {
    client = await Client.connect(window.location.origin);
    console.log("Iris connected");
  } catch (e) {
    console.error("connect:", e);
    setState("", t("err"));
  }
})();