import { Client, handle_file } from "https://esm.sh/@gradio/client";

/** Shorthand for `document.getElementById`. */
const $ = (id) => document.getElementById(id);

// DOM handles used throughout the file (looked up once at module load).
const cam = $("cam");
const canvas = $("canvas");
const statusEl = $("status");
const answerEl = $("answer");
const hintEl = $("hint");
const langBtn = $("lang");
const player = $("player");
const stage = $("stage");
const btnAsk = $("btn-ask");
const btnDescribe = $("btn-describe");
const btnLive = $("btn-live");
const a11yBtn = $("a11y");

/**
 * Best-effort haptic feedback.
 * @param {number|number[]} ms - value passed straight to `navigator.vibrate`.
 */
function vibrate(ms) {
  try {
    navigator.vibrate && navigator.vibrate(ms);
  } catch (e) {
    // Deliberately ignored: haptics are optional.
  }
}

// ---- language (auto-detected from the browser; switchable by voice or button) ----
let lang = (navigator.language || "en").toLowerCase().startsWith("pt") ? "pt" : "en";

// UI strings per language; both tables carry the same keys.
const T = {
  en: {
    idle: "Iris",
    listening: "Listening…",
    thinking: "Thinking…",
    hint: "Tap: describe · Hold: ask · Double-tap: live",
    camErr: "Camera blocked — allow access",
    err: "Something went wrong",
    langListen: "Listening… say your language",
    welcome: "Welcome to Iris. Tap the screen to describe what is in front of you. Hold to ask a question. Double-tap to turn live mode on or off, which announces new things around you.",
    liveOn: "Live mode on.",
    liveOff: "Live mode off.",
    confLang: "English selected.",
    startTap: "Iris. Tap the screen to begin.",
    welcomeAsk: "Welcome to Iris, in English. Tap the screen to describe what is in front of you, hold to ask a question, double-tap for live mode. To use Portuguese instead, say 'português' now.",
    bAsk: "Ask",
    bDescribe: "Describe",
    bLive: "Live"
  },
  pt: {
    idle: "Iris",
    listening: "Ouvindo…",
    thinking: "Pensando…",
    hint: "Toque: descrever · Segurar: perguntar · Toque duplo: ao vivo",
    camErr: "Câmera bloqueada — permita o acesso",
    err: "Algo deu errado",
    langListen: "Ouvindo… diga seu idioma",
    welcome: "Bem-vindo ao Iris. Toque na tela para descrever o que está à sua frente. Segure para fazer uma pergunta. Toque duas vezes para ligar ou desligar o modo ao vivo, que avisa o que aparece de novo à sua volta.",
    liveOn: "Modo ao vivo ligado.",
    liveOff: "Modo ao vivo desligado.",
    confLang: "Português selecionado.",
    startTap: "Iris. Toque na tela para começar.",
    welcomeAsk: "Bem-vindo ao Iris, em português. Toque na tela para descrever o que está à sua frente, segure para perguntar, e toque duas vezes para o modo ao vivo. Para usar em inglês, diga 'inglês' agora.",
    bAsk: "Perguntar",
    bDescribe: "Descrever",
    bLive: "Ao vivo"
  },
};

/** Look up UI string `k` in the currently selected language. */
const t = (k) => T[lang][k];

const LANG_PROMPT = "Segure e diga seu idioma · Hold and say your language";

/**
 * Update the visual state of the page.
 * @param {string} s - value written to `document.body.dataset.state` ("" when falsy).
 * @param {string} [msg] - status text; left untouched when `undefined`.
 *   A "● " prefix is added while live mode is on.
 */
function setState(s, msg) {
  document.body.dataset.state = s || "";
  if (msg !== undefined) statusEl.textContent = (liveOn ? "● " : "") + msg;
}

/**
 * UI speech via the browser voice (instructions/confirmations).
 * Cancels any queued browser speech first; failures are logged, not thrown.
 * @param {string} text
 */
function speak(text) {
  try {
    speechSynthesis.cancel();
    const u = new SpeechSynthesisUtterance(text);
    u.lang = lang === "pt" ? "pt-BR" : "en-US";
    speechSynthesis.speak(u);
  } catch (e) {
    console.error("speak:", e);
  }
}

// ---- state ----
let onboarded = false;
try {
  onboarded = localStorage.getItem("iris_onboarded") === "1";
} catch (e) {
  // localStorage may be unavailable; treat as "not onboarded".
}
let mode = onboarded ? "normal" : "onboarding";
let busy = false; // true while a backend request is in flight
let liveOn = false, liveTimer = null;
let holding = false, recording = false, holdTimer = null;
const HOLD_MS = 350;

// Language button: toggle en <-> pt and refresh every language-dependent label.
langBtn.onclick = (e) => {
  e.stopPropagation();
  lang = lang === "en" ? "pt" : "en";
  document.documentElement.lang = lang;
  langBtn.textContent = lang.toUpperCase();
  hintEl.textContent = onboarded ? t("hint") : "";
  updateLabels();
  if (!busy) setState("", onboarded ? t("idle") : t("startTap"));
};

// ---- unlock audio (autoplay) on the first tap ----
const SILENT = "data:audio/wav;base64,UklGRiQAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQAAAAA=";
let audioUnlocked = false;

/** Play a silent clip once, from a user gesture, so later `player.play()` calls are allowed. */
function unlockAudio() {
  if (audioUnlocked) return;
  audioUnlocked = true;
  player.src = SILENT;
  player.play().catch(() => {});
}

// ---- live camera ----
/**
 * Attach a camera stream to `cam`, preferring the rear ("environment") camera and
 * falling back to any video device. Shows the camera-error status if both fail.
 */
async function startCamera() {
  const attempts = [
    { video: { facingMode: { ideal: "environment" } }, audio: false },
    { video: true, audio: false },
  ];
  for (const c of attempts) {
    try {
      cam.srcObject = await navigator.mediaDevices.getUserMedia(c);
      return;
    } catch (e) {
      // Try the next, less specific constraint set.
    }
  }
  setState("", t("camErr"));
}

/**
 * Capture the current video frame as a JPEG blob.
 * @returns {Promise<Blob|null>} resolves to `null` when the video has no dimensions yet.
 */
function grabFrame() {
  const w = cam.videoWidth, h = cam.videoHeight;
  if (!w || !h) return Promise.resolve(null);
  canvas.width = w;
  canvas.height = h;
  canvas.getContext("2d").drawImage(cam, 0, 0, w, h);
  return new Promise((res) => canvas.toBlob(res, "image/jpeg", 0.85));
}

// ---- microphone ----
let micStream = null, chunks = [], recorder = null;

/**
 * Start recording from the microphone (the stream is requested once and reused).
 * @returns {Promise<boolean>} `true` when recording started; `false` when the
 *   microphone is unavailable or the recorder could not be created/started.
 */
async function startRec() {
  try {
    if (!micStream) micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (e) {
    return false;
  }
  chunks = [];
  try {
    recorder = new MediaRecorder(micStream);
    recorder.ondataavailable = (e) => {
      if (e.data.size) chunks.push(e.data);
    };
    recorder.start();
  } catch (e) {
    // MediaRecorder missing or refusing the stream: report "not recording"
    // instead of letting the rejection escape from the caller's timer/handler.
    console.error("rec:", e);
    recorder = null;
    return false;
  }
  return true;
}

/**
 * Stop the active recording.
 * @returns {Promise<Blob|null>} the recorded audio, or `null` when nothing was
 *   recording or no data was captured.
 */
function stopRec() {
  return new Promise((res) => {
    if (!recorder || recorder.state === "inactive") return res(null);
    recorder.onstop = () =>
      res(chunks.length ? new Blob(chunks, { type: recorder.mimeType || "audio/webm" }) : null);
    recorder.stop();
  });
}

// ---- backend ----
let client = null;

/**
 * Send a frame (plus an optional spoken or typed question) to the "/describe"
 * endpoint, show the answer and play the returned audio.
 * No-op while another request is in flight.
 * @param {Blob} frame - camera frame to describe.
 * @param {Blob|null} audio - recorded question, if any.
 * @param {string} [qtext=""] - text question, if any.
 */
async function send(frame, audio, qtext = "") {
  if (busy) return;
  busy = true;
  answerEl.textContent = "";
  setState("thinking", t("thinking"));
  try {
    const payload = { image: handle_file(frame), lang };
    if (audio) payload.audio = handle_file(audio);
    if (qtext) payload.qtext = qtext;
    const result = await client.predict("/describe", payload);
    const out = Array.isArray(result.data) ? result.data[0] : result.data;
    console.log("Iris result:", out);
    if (out && out.command) {
      // The backend recognised a voice command instead of a question.
      busy = false;
      handleCommand(out.command);
      return;
    }
    answerEl.textContent = (out && out.answer) || "";
    setState("speaking", "");
    const a = out && out.audio;
    let url = a && a.url;
    if (!url && a && a.path) url = window.location.origin + "/gradio_api/file=" + a.path;
    if (url) {
      pauseSR(); // never transcribe our own voice
      player.src = url;
      try {
        await player.play();
      } catch (err) {
        console.error("play:", err);
        // Playback never started, so "ended" will not fire: restore the idle
        // state and resume recognition here instead of staying stuck.
        resetSoon();
      }
    } else {
      resetSoon();
    }
  } catch (e) {
    console.error("describe:", e);
    setState("", t("err"));
    resetSoon();
  } finally {
    busy = false;
  }
}

/** After a short delay, resume speech recognition and return to idle (unless busy). */
function resetSoon() {
  setTimeout(() => {
    resumeSR();
    if (!busy) setState("", t("idle"));
  }, 600);
}

// When the answer audio finishes: go idle now, resume recognition shortly after.
player.addEventListener("ended", () => {
  setTimeout(resumeSR, 700);
  if (!busy) setState("", t("idle"));
});

// ---- first-run onboarding: default to the browser language, speak it, offer to switch ----

/**
 * Mark onboarding as done (persisted best-effort) and switch the UI to normal mode.
 * @param {string} state - page state to show alongside the idle status text.
 */
function completeOnboarding(state) {
  onboarded = true;
  mode = "normal";
  try {
    localStorage.setItem("iris_onboarded", "1");
  } catch (e) {
    // Persistence is optional.
  }
  hintEl.textContent = t("hint");
  updateLabels();
  setState(state, t("idle"));
}

/**
 * Speak the welcome message and, once it ends, listen for a language switch.
 * Without speech synthesis support the welcome is skipped and onboarding completes
 * immediately, so the app does not stay locked in onboarding mode.
 */
function onboard() {
  if (onboarded) return;
  if (typeof SpeechSynthesisUtterance === "undefined" || typeof speechSynthesis === "undefined") {
    completeOnboarding("");
    return;
  }
  const u = new SpeechSynthesisUtterance(t("welcomeAsk"));
  u.lang = lang === "pt" ? "pt-BR" : "en-US";
  // mark onboarded only when speech actually STARTS — so if the browser blocks
  // autoplay on load, the mode stays "onboarding" and the first tap triggers it.
  u.onstart = () => completeOnboarding("speaking");
  u.onend = listenForLangChoice; // after the welcome, listen for a language switch
  try {
    speechSynthesis.cancel();
    speechSynthesis.speak(u);
  } catch (e) {
    /* will retry on the first tap */
  }
}

/**
 * Listen once (up to 5 s) for a spoken language name and switch to it.
 * Returns to idle when recognition is unsupported, errors, ends or times out.
 */
function listenForLangChoice() {
  const SR = window.SpeechRecognition || window.webkitSpeechRecognition;
  if (!SR) {
    setState("", t("idle"));
    return;
  }
  const r = new SR();
  r.lang = lang === "pt" ? "pt-BR" : "en-US";
  r.continuous = false;
  r.interimResults = false;
  let handled = false;
  const finish = () => {
    if (!handled) {
      handled = true;
      setState("", t("idle"));
    }
  };
  const to = setTimeout(() => {
    try {
      r.stop();
    } catch (e) {}
    finish();
  }, 5000);
  r.onresult = (e) => {
    handled = true;
    clearTimeout(to);
    const txt = (e.results[0][0].transcript || "").toLowerCase();
    if (txt.includes("ingl") || txt.includes("english")) switchLang("en");
    else if (txt.includes("portug") || txt.includes("brasil")) switchLang("pt");
    else setState("", t("idle"));
    try {
      r.stop();
    } catch (e) {}
  };
  r.onerror = () => {
    clearTimeout(to);
    finish();
  };
  // Recognition can end with neither a result nor an error; leave the
  // "listening" state right away rather than waiting for the timeout.
  r.onend = () => {
    clearTimeout(to);
    finish();
  };
  setState("listening", t("listening"));
  try {
    r.start();
  } catch (e) {
    clearTimeout(to);
    finish();
  }
}

/**
 * Switch the UI to `newLang` (if different) and speak the confirmation plus welcome.
 * @param {"en"|"pt"} newLang
 */
function switchLang(newLang) {
  if (newLang !== lang) {
    lang = newLang;
    document.documentElement.lang = lang;
    langBtn.textContent = lang.toUpperCase();
    hintEl.textContent = t("hint");
    updateLabels();
  }
  setState("", t("idle"));
  speak(t("confLang") + " " + t("welcome"));
}

// ---- live mode (toggled by voice command or double-tap) ----
/** Apply a command returned by the backend ("live_on" / "live_off"); others are ignored. */
function handleCommand(cmd) {
  if (cmd === "live_on") setLive(true);
  else if (cmd === "live_off") setLive(false);
}

const LIVE_INTERVAL = 3500; // check the scene every 3.5s
const LIVE_THRESHOLD = 15; // mean per-pixel change (0-255) to trigger the VLM
const LIVE_COOLDOWN = 5000; // quiet window after an auto-alert -> gap for the user to ask
let lastSig = null, history = []; // recent descriptions (anti-repetition)
let suppressLiveUntil = 0; // pause auto-alerts after an alert OR a user question (their question wins)

// in-browser object detection gates the live mode (semantic change, not pixels).
// NOTE: liveTick returns early while no detector is loaded, so there is currently
// no pixel-diff fallback even though the signature helpers still exist.
let detector = null, detectorTried = false;
let seenClasses = new Map(); // class name -> last tick index it was seen
let tickN = 0;
const CLASS_TTL = 4; // ticks a class is remembered before it can re-trigger
const PERSON_GONE = 4; // ticks a person must be absent before a re-entry counts as new (~14s)

/**
 * Load the in-browser object detector once. Does nothing when it is already
 * loaded, a load was already attempted, or `window.cocoSsd` is not present.
 */
async function loadDetector() {
  if (detector || detectorTried || !window.cocoSsd) return;
  detectorTried = true;
  try {
    detector = await window.cocoSsd.load({ base: "lite_mobilenet_v2" });
    console.log("Iris: object detector ready");
  } catch (e) {
    console.warn("detector load failed, using pixel-diff:", e);
  }
}

/**
 * Turn live mode on or off: updates the button, gives haptic/spoken feedback,
 * resets the live-mode bookkeeping and starts/stops the timer and listening.
 * @param {boolean} on
 */
function setLive(on) {
  liveOn = on;
  btnLive.setAttribute("aria-pressed", on ? "true" : "false");
  btnLive.classList.toggle("on", on);
  vibrate(on ? [20, 40, 20] : 20);
  speak(on ? t("liveOn") : t("liveOff"));
  setState("", t("idle"));
  clearInterval(liveTimer);
  liveTimer = null;
  lastSig = null;
  history = [];
  seenClasses = new Map();
  if (on) {
    loadDetector();
    liveTimer = setInterval(liveTick, LIVE_INTERVAL);
    startListening();
    // baseline: describe the scene once shortly after turning on
    setTimeout(async () => {
      if (liveOn && !busy && player.paused) {
        const f = await grabFrame();
        if (f) sendWatch(f);
      }
    }, 900);
  } else {
    stopListening();
  }
}

// ---- hands-free continuous listening (live mode) via Web Speech API ----
let recog = null, listening = false, srPaused = false;

/**
 * Create the continuous recognizer used in live mode.
 * @returns {object|null} the recognizer, or `null` when the browser lacks the API.
 */
function setupRecognition() {
  const SR = window.SpeechRecognition || window.webkitSpeechRecognition;
  if (!SR) {
    console.warn("SpeechRecognition not supported in this browser");
    return null;
  }
  const r = new SR();
  r.continuous = true;
  r.interimResults = false;
  r.onresult = (e) => {
    if (srPaused || busy) return;
    const txt = e.results[e.results.length - 1][0].transcript.trim();
    if (txt) onVoice(txt);
  };
  // keep alive: restart whenever the engine stops on its own
  r.onend = () => {
    if (listening && !srPaused) {
      try {
        r.lang = lang === "pt" ? "pt-BR" : "en-US"; // follow language switches
        r.start();
      } catch (e) {}
    }
  };
  r.onerror = (e) => {
    console.log("SR:", e.error);
  };
  return r;
}

/** Start continuous listening in the current language (no-op if unsupported or already on). */
function startListening() {
  if (!recog) recog = setupRecognition();
  if (!recog || listening) return;
  listening = true;
  recog.lang = lang === "pt" ? "pt-BR" : "en-US";
  try {
    recog.start();
  } catch (e) {}
}

/** Stop continuous listening. */
function stopListening() {
  listening = false;
  if (recog) {
    try {
      recog.stop();
    } catch (e) {}
  }
}

// stop the recognizer WHILE Iris speaks (so it never transcribes its own voice)
function pauseSR() {
  srPaused = true;
  if (recog) {
    try {
      recog.stop();
    } catch (e) {}
  }
}

/** Lift the pause set by pauseSR and restart the recognizer if live listening is on. */
function resumeSR() {
  srPaused = false;
  if (listening && recog) {
    try {
      // Pick up any language switch made since the recognizer was last started.
      recog.lang = lang === "pt" ? "pt-BR" : "en-US";
      recog.start();
    } catch (e) {
      // start() throws when the recognizer is already running; that is fine.
    }
  }
}

/**
 * Handle a transcribed utterance from live listening: send it as a text question
 * together with the current frame. Retries up to 3 times while a request is in flight.
 * @param {string} txt - transcript.
 * @param {number} [tries=0] - retry counter (internal).
 */
async function onVoice(txt, tries = 0) {
  suppressLiveUntil = performance.now() + 12000; // the user's question wins: mute auto-alerts for a while
  if (busy) {
    // retry if a watch is in flight
    if (tries < 3) setTimeout(() => onVoice(txt, tries + 1), 700);
    return;
  }
  const frame = await grabFrame();
  if (frame) send(frame, null, txt); // text question (from browser speech recognition)
}

// cheap visual signature (32x32 gray) to detect change without calling the model
function frameSignature() {
  if (!cam.videoWidth) return null;
  const c = document.createElement("canvas");
  c.width = 32;
  c.height = 32;
  const ctx = c.getContext("2d");
  ctx.drawImage(cam, 0, 0, 32, 32);
  const d = ctx.getImageData(0, 0, 32, 32).data;
  const g = new Uint8Array(1024);
  for (let i = 0; i < 1024; i++) g[i] = (d[i * 4] + d[i * 4 + 1] + d[i * 4 + 2]) / 3;
  return g;
}

/**
 * Mean absolute difference between two signatures.
 * @returns {number} 999 when either signature is missing.
 */
function changeAmount(a, b) {
  if (!a || !b) return 999;
  let s = 0;
  for (let i = 0; i < a.length; i++) s += Math.abs(a[i] - b[i]);
  return s / a.length;
}

/**
 * Periodic live-mode check. Skipped while busy, not in normal mode, live mode is
 * off, audio is playing, the quiet window is active, or no detector/video exists.
 */
async function liveTick() {
  if (busy || mode !== "normal" || !liveOn || !player.paused) return;
  if (performance.now() < suppressLiveUntil) return; // user just asked / we just spoke -> stay quiet, let them talk
  if (!detector || !cam.videoWidth) return; // alerts are person-arrival only -> need the detector
  tickN++;
  // Live alerts fire ONLY when a NEW person enters (per the spec: rare ambient alerts, not narration).
  // Re-describing objects/movement makes the small VLM hallucinate and repeat; objects/text/colors are
  // answered on demand (the user asks), which is accurate. So we gate strictly on a fresh "person".
  let personNew = false;
  try {
    const preds = await detector.detect(cam, 5);
    let hasPerson = false;
    for (const p of preds) {
      if (p.class === "person" && p.score >= 0.6) hasPerson = true;
    }
    if (hasPerson) {
      const last = seenClasses.get("person");
      if (last === undefined || tickN - last > PERSON_GONE) personNew = true; // absent a while -> a real arrival
      seenClasses.set("person", tickN);
    }
  } catch (e) {
    return;
  }
  if (personNew) {
    suppressLiveUntil = performance.now() + LIVE_COOLDOWN; // one alert, then a quiet gap to ask
    const frame = await grabFrame();
    if (frame) sendWatch(frame, "person");
  }
}

// word-set similarity (accent-insensitive) — the model often re-emits a near-identical
// line, so we drop it client-side instead of trusting the "do not repeat" prompt.
function words(s) {
  return (s || "")
    .toLowerCase()
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "") // NFD + strip combining diacritics
    .replace(/[^a-z0-9 ]/g, " ")
    .split(/\s+/)
    .filter((w) => w.length > 2);
}

/** @returns {boolean} true when the word sets of `a` and `b` have Jaccard similarity >= 0.5. */
function tooSimilar(a, b) {
  const A = new Set(words(a)), B = new Set(words(b));
  if (!A.size || !B.size) return false;
  let inter = 0;
  for (const w of A) if (B.has(w)) inter++;
  return inter / (A.size + B.size - inter) >= 0.5; // Jaccard >= 0.5 -> basically the same alert
}

/**
 * Send a frame to the "/watch" endpoint and speak the result, unless the backend
 * says not to speak or the answer is too similar to a recent one.
 * No-op while another request is in flight.
 * @param {Blob} frame
 * @param {string} [hint=""] - forwarded to the backend as `hint`.
 */
async function sendWatch(frame, hint = "") {
  if (busy) return;
  busy = true;
  try {
    const result = await client.predict("/watch", {
      image: handle_file(frame),
      prev: history.join(" · "),
      lang,
      hint,
    });
    const out = Array.isArray(result.data) ? result.data[0] : result.data;
    if (out && out.speak && out.answer && !history.some((h) => tooSimilar(h, out.answer))) {
      history.push(out.answer);
      if (history.length > 5) history.shift();
      answerEl.textContent = out.answer;
      setState("speaking", "");
      const a = out.audio;
      let url = a && a.url;
      if (!url && a && a.path) url = window.location.origin + "/gradio_api/file=" + a.path;
      if (url) {
        pauseSR();
        player.src = url;
        try {
          await player.play();
        } catch (e) {
          console.error("play:", e);
          // "ended" will not fire for audio that never played: recover here.
          resetSoon();
        }
      } else {
        // Nothing to play: do not stay in the "speaking" state.
        resetSoon();
      }
    }
  } catch (e) {
    console.error("watch:", e);
  } finally {
    busy = false;
  }
}

// ---- interaction ----
stage.addEventListener("pointerdown", async () => {
  if (mode === "onboarding") {
    unlockAudio();
    onboard(); // speak welcome in the browser language + offer to switch
    return;
  }
  if (liveOn) {
    // A touch interrupts whatever Iris is saying.
    speechSynthesis.cancel();
    const wasPlaying = !player.paused;
    player.pause();
    // Pausing means "ended" never fires, so recognition (paused while Iris
    // spoke) would otherwise stay off; bring it back now.
    if (wasPlaying) resumeSR();
  }
  if (busy) return;
  unlockAudio();
  holding = true;
  stage.classList.add("armed");
  holdTimer = setTimeout(async () => {
    const started = await startRec();
    if (!holding) {
      // Released while the microphone was still starting (e.g. permission
      // prompt): the press was already handled as a tap, so drop the recording.
      if (started) await stopRec();
      return;
    }
    recording = started;
    if (recording) setState("listening", t("listening"));
  }, HOLD_MS);
});

let lastTapTime = 0, tapTimer = null;
const DOUBLE_MS = 320; // double-tap window

/**
 * End of a press on the stage: a hold that recorded becomes a question, a quick
 * double-tap toggles live mode, a single tap describes the scene.
 */
async function endPress() {
  if (mode === "onboarding") return; // onboarding is handled on pointerdown
  if (!holding) return;
  holding = false;
  stage.classList.remove("armed");
  clearTimeout(holdTimer);
  if (recording) {
    // held and spoke = a question
    recording = false;
    const frame = await grabFrame();
    const audio = await stopRec();
    if (frame) send(frame, audio);
    else setState("", t("camErr"));
    return;
  }
  // quick tap: double-tap toggles live mode; single tap describes
  const now = performance.now();
  if (now - lastTapTime < DOUBLE_MS) {
    lastTapTime = 0;
    if (tapTimer) {
      clearTimeout(tapTimer);
      tapTimer = null;
    }
    setLive(!liveOn);
  } else {
    lastTapTime = now;
    if (tapTimer) clearTimeout(tapTimer);
    // Wait out the double-tap window before treating this as a single tap.
    tapTimer = setTimeout(async () => {
      tapTimer = null;
      const frame = await grabFrame();
      if (frame) send(frame, null);
      else setState("", t("camErr"));
    }, DOUBLE_MS);
  }
}
stage.addEventListener("pointerup", endPress);
stage.addEventListener("pointercancel", endPress);

// ---- explicit buttons (low vision / keyboard / screen reader) ----
// Describe: capture the current frame and send it without a question.
btnDescribe.addEventListener("click", async (e) => {
  e.stopPropagation();
  if (mode !== "normal" || busy) return;
  unlockAudio();
  vibrate(10);
  const frame = await grabFrame();
  if (frame) send(frame, null);
  else setState("", t("camErr"));
});

let askRecBtn = false; // true while the Ask button owns an active recording
let askPressed = false; // true between pointerdown and release on the Ask button

// Ask: press and hold to record a question; release to send it.
btnAsk.addEventListener("pointerdown", async (e) => {
  e.stopPropagation();
  if (mode !== "normal" || busy) return;
  unlockAudio();
  vibrate(10);
  askPressed = true;
  const started = await startRec();
  if (!askPressed) {
    // Released before the microphone was ready: btnAskEnd already ran and saw
    // no recording, so stop the recorder here instead of leaving it running.
    if (started) await stopRec();
    return;
  }
  askRecBtn = started;
  if (askRecBtn) setState("listening", t("listening"));
});

/**
 * End of a press on the Ask button: stop recording and send frame + audio.
 * Safe to call repeatedly (pointerup, pointerleave and pointercancel all route here).
 * @param {Event} [e]
 */
async function btnAskEnd(e) {
  if (e) e.stopPropagation();
  askPressed = false;
  if (!askRecBtn) return;
  askRecBtn = false;
  const frame = await grabFrame();
  const audio = await stopRec();
  if (frame) send(frame, audio);
  else setState("", t("camErr"));
}
btnAsk.addEventListener("pointerup", btnAskEnd);
btnAsk.addEventListener("pointerleave", btnAskEnd);
btnAsk.addEventListener("pointercancel", btnAskEnd);

// Live: toggle live mode.
btnLive.addEventListener("click", (e) => {
  e.stopPropagation();
  if (mode !== "normal") return;
  unlockAudio();
  setLive(!liveOn);
});

// ---- accessibility: max contrast + larger text (persisted) ----
let boost = false;
try {
  boost = localStorage.getItem("iris_a11y") === "1";
} catch (e) {
  // localStorage may be unavailable; start with the boost off.
}
document.body.classList.toggle("a11y-boost", boost);
a11yBtn.setAttribute("aria-pressed", boost ? "true" : "false");
a11yBtn.addEventListener("click", (e) => {
  e.stopPropagation();
  boost = !boost;
  document.body.classList.toggle("a11y-boost", boost);
  a11yBtn.setAttribute("aria-pressed", boost ? "true" : "false");
  try {
    localStorage.setItem("iris_a11y", boost ? "1" : "0");
  } catch (e) {
    // Persistence is optional.
  }
  vibrate(10);
});

// button labels per language
function updateLabels() {
  btnAsk.querySelector(".ctl-lbl").textContent = t("bAsk");
  btnDescribe.querySelector(".ctl-lbl").textContent = t("bDescribe");
  btnLive.querySelector(".ctl-lbl").textContent = t("bLive");
}

// ---- boot ----
(async () => {
  document.documentElement.lang = lang;
  langBtn.textContent = lang.toUpperCase();
  updateLabels();
  hintEl.textContent = onboarded ? t("hint") : "";
  setState("", onboarded ? t("idle") : t("startTap"));
  if (!onboarded) {
    // try to speak the welcome on load (works where the browser allows it).
    // A failure here must not stop the camera and backend from starting.
    try {
      onboard();
    } catch (e) {
      console.error("onboard:", e);
    }
  }
  await startCamera();
  loadDetector(); // preload the object detector in the background (non-blocking)
  try {
    client = await Client.connect(window.location.origin);
    console.log("Iris connected");
  } catch (e) {
    console.error("connect:", e);
    setState("", t("err"));
  }
})();