// iris / frontend / app.js
// Commit 19657cc (nextmarte): live mode: stop self-hearing, prioritize user questions, kill repetition/hallucination
import { Client, handle_file } from "https://esm.sh/@gradio/client";
// Shorthand for document.getElementById.
const $ = (id) => document.getElementById(id);

// Element references used throughout the script, looked up once at load time.
const cam = $("cam");
const canvas = $("canvas");
const statusEl = $("status");
const answerEl = $("answer");
const hintEl = $("hint");
const langBtn = $("lang");
const player = $("player");
const stage = $("stage");
const btnAsk = $("btn-ask");
const btnDescribe = $("btn-describe");
const btnLive = $("btn-live");
const a11yBtn = $("a11y");
/**
 * Best-effort haptic pulse.
 * @param {number|number[]} ms - duration or pattern passed straight to navigator.vibrate.
 * Any failure (missing API, thrown error) is swallowed on purpose: haptics are optional.
 */
function vibrate(ms) {
  try {
    if (navigator.vibrate) navigator.vibrate(ms);
  } catch (e) {
    // intentionally ignored
  }
}
// ---- language (auto-detected from the browser; switchable by voice or button) ----
// Current UI language: "pt" when the browser locale starts with "pt", otherwise "en".
const browserLocale = (navigator.language || "en").toLowerCase();
let lang = browserLocale.startsWith("pt") ? "pt" : "en";
// UI strings keyed by language code, then by message id.
const T = {
  en: {
    idle: "Iris",
    listening: "Listening…",
    thinking: "Thinking…",
    hint: "Tap: describe · Hold: ask · Double-tap: live",
    camErr: "Camera blocked — allow access",
    err: "Something went wrong",
    langListen: "Listening… say your language",
    welcome: "Welcome to Iris. Tap the screen to describe what is in front of you. Hold to ask a question. Double-tap to turn live mode on or off, which announces new things around you.",
    liveOn: "Live mode on.",
    liveOff: "Live mode off.",
    confLang: "English selected.",
    startTap: "Iris. Tap the screen to begin.",
    welcomeAsk: "Welcome to Iris, in English. Tap the screen to describe what is in front of you, hold to ask a question, double-tap for live mode. To use Portuguese instead, say 'português' now.",
    bAsk: "Ask",
    bDescribe: "Describe",
    bLive: "Live",
  },
  pt: {
    idle: "Iris",
    listening: "Ouvindo…",
    thinking: "Pensando…",
    hint: "Toque: descrever · Segurar: perguntar · Toque duplo: ao vivo",
    camErr: "Câmera bloqueada — permita o acesso",
    err: "Algo deu errado",
    langListen: "Ouvindo… diga seu idioma",
    welcome: "Bem-vindo ao Iris. Toque na tela para descrever o que está à sua frente. Segure para fazer uma pergunta. Toque duas vezes para ligar ou desligar o modo ao vivo, que avisa o que aparece de novo à sua volta.",
    liveOn: "Modo ao vivo ligado.",
    liveOff: "Modo ao vivo desligado.",
    confLang: "Português selecionado.",
    startTap: "Iris. Toque na tela para começar.",
    welcomeAsk: "Bem-vindo ao Iris, em português. Toque na tela para descrever o que está à sua frente, segure para perguntar, e toque duas vezes para o modo ao vivo. Para usar em inglês, diga 'inglês' agora.",
    bAsk: "Perguntar",
    bDescribe: "Descrever",
    bLive: "Ao vivo",
  },
};

// Looks up message id `k` in the table of the currently selected language.
const t = (k) => {
  const table = T[lang];
  return table[k];
};

// Bilingual prompt shown before a language has been chosen.
const LANG_PROMPT = "Segure e diga seu idioma · Hold and say your language";
/**
 * Publishes the UI state on <body data-state> and optionally updates the status line.
 * @param {string} s - state name; any falsy value is stored as "".
 * @param {string} [msg] - status text; when omitted the status line is left alone.
 *   While live mode is on the text is prefixed with "● ".
 */
function setState(s, msg) {
  document.body.dataset.state = s ? s : "";
  if (msg === undefined) return;
  const prefix = liveOn ? "● " : "";
  statusEl.textContent = prefix + msg;
}
// UI speech via the browser voice (instructions/confirmations)
/**
 * Speaks `text` with the browser's speech synthesis, replacing anything queued.
 * The voice locale follows the current UI language. Failures are logged, never thrown.
 */
function speak(text) {
  try {
    speechSynthesis.cancel();
    const utterance = new SpeechSynthesisUtterance(text);
    if (lang === "pt") utterance.lang = "pt-BR";
    else utterance.lang = "en-US";
    speechSynthesis.speak(utterance);
  } catch (err) {
    console.error("speak:", err);
  }
}
// ---- state ----
// Whether the first-run welcome has already been delivered (persisted in localStorage).
let onboarded = false;
try {
  onboarded = localStorage.getItem("iris_onboarded") === "1";
} catch (e) {
  // storage unavailable: treat as a first run
}
// "onboarding" until the welcome has started, then "normal".
let mode = onboarded ? "normal" : "onboarding";
// True while a backend request is in flight.
let busy = false;
// Live-mode flag and its interval handle.
let liveOn = false;
let liveTimer = null;
// Press-and-hold tracking for the main stage.
let holding = false;
let recording = false;
let holdTimer = null;
// How long a press must last before recording starts.
const HOLD_MS = 350;
// Language button: flips between "en" and "pt" and refreshes every visible label.
langBtn.onclick = (evt) => {
  evt.stopPropagation();
  const next = lang === "en" ? "pt" : "en";
  lang = next;
  document.documentElement.lang = next;
  langBtn.textContent = next.toUpperCase();
  hintEl.textContent = onboarded ? t("hint") : "";
  updateLabels();
  // Leave the status line alone while a request is running.
  if (busy) return;
  setState("", onboarded ? t("idle") : t("startTap"));
};
// ---- unlock audio (autoplay) on the first tap ----
// Data-URI WAV with an empty data chunk, played once to unlock audio playback.
const SILENT = "data:audio/wav;base64,UklGRiQAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQAAAAA=";
let audioUnlocked = false;

/** Plays the silent clip on the shared player the first time it is called; later calls do nothing. */
function unlockAudio() {
  if (!audioUnlocked) {
    audioUnlocked = true;
    player.src = SILENT;
    player.play().catch(() => {});
  }
}
// ---- live camera ----
/**
 * Attaches a camera stream to the video element.
 * Tries the environment-facing camera first, then any camera; if both
 * requests fail the camera-error message is shown.
 */
async function startCamera() {
  const attempts = [
    { video: { facingMode: { ideal: "environment" } }, audio: false },
    { video: true, audio: false },
  ];
  for (let i = 0; i < attempts.length; i += 1) {
    try {
      cam.srcObject = await navigator.mediaDevices.getUserMedia(attempts[i]);
      return;
    } catch (e) {
      // fall through to the next, less specific constraint set
    }
  }
  setState("", t("camErr"));
}
/**
 * Captures the current video frame as a JPEG (quality 0.85).
 * @returns {Promise<Blob|null>} resolves to null when the video has no dimensions yet;
 *   otherwise resolves with whatever canvas.toBlob delivers.
 */
function grabFrame() {
  const { videoWidth, videoHeight } = cam;
  if (!videoWidth || !videoHeight) return Promise.resolve(null);
  canvas.width = videoWidth;
  canvas.height = videoHeight;
  const ctx = canvas.getContext("2d");
  ctx.drawImage(cam, 0, 0, videoWidth, videoHeight);
  return new Promise((resolve) => {
    canvas.toBlob(resolve, "image/jpeg", 0.85);
  });
}
// ---- microphone ----
// Microphone stream (requested once and reused), the chunks of the recording
// in progress, and the active MediaRecorder.
let micStream = null;
let chunks = [];
let recorder = null;
/**
 * Starts recording from the microphone.
 * The stream is requested on first use and then reused for later recordings.
 * @returns {Promise<boolean>} true when recording started; false when the
 *   microphone is unavailable or the recorder could not be created/started.
 */
async function startRec() {
  try {
    if (!micStream) micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (e) {
    return false;
  }
  try {
    chunks = [];
    recorder = new MediaRecorder(micStream);
    recorder.ondataavailable = (e) => { if (e.data.size) chunks.push(e.data); };
    recorder.start();
    return true;
  } catch (err) {
    // The constructor and start() can throw (e.g. unsupported format, inactive stream).
    // Report "not recording" instead of rejecting, which callers do not handle.
    console.error("startRec:", err);
    recorder = null;
    return false;
  }
}
/**
 * Stops the active recorder and hands back what was captured.
 * @returns {Promise<Blob|null>} null when there is no active recorder or nothing
 *   was captured; otherwise a Blob typed with the recorder's mimeType
 *   (falling back to "audio/webm").
 */
function stopRec() {
  const nothingToStop = !recorder || recorder.state === "inactive";
  if (nothingToStop) return Promise.resolve(null);
  return new Promise((resolve) => {
    recorder.onstop = () => {
      if (!chunks.length) {
        resolve(null);
        return;
      }
      const type = recorder.mimeType || "audio/webm";
      resolve(new Blob(chunks, { type }));
    };
    recorder.stop();
  });
}
// ---- backend ----
// Backend client; stays null until the boot sequence has connected.
let client = null;

/**
 * Sends a frame (plus an optional spoken or typed question) to "/describe",
 * shows the answer and plays the returned audio.
 *
 * @param {Blob} frame - camera frame to describe.
 * @param {Blob|null} audio - recorded question, if any.
 * @param {string} [qtext=""] - question text, if any.
 * @returns {Promise<void>} resolves once playback has started (or the request
 *   failed); does nothing when another request is already in flight.
 */
async function send(frame, audio, qtext = "") {
  if (busy) return;
  busy = true;
  answerEl.textContent = "";
  setState("thinking", t("thinking"));
  try {
    const payload = { image: handle_file(frame), lang };
    if (audio) payload.audio = handle_file(audio);
    if (qtext) payload.qtext = qtext;
    const result = await client.predict("/describe", payload);
    const out = Array.isArray(result.data) ? result.data[0] : result.data;
    console.log("Iris result:", out);
    // A command reply is handed to the command handler instead of being spoken.
    // busy is cleared first so the handler runs with the app already idle.
    if (out && out.command) { busy = false; handleCommand(out.command); return; }
    answerEl.textContent = (out && out.answer) || "";
    setState("speaking", "");
    const a = out && out.audio;
    let url = a && a.url;
    if (!url && a && a.path) url = window.location.origin + "/gradio_api/file=" + a.path;
    if (!url) { resetSoon(); return; }
    // Stop the recognizer before playback so it does not transcribe Iris itself.
    pauseSR();
    player.src = url;
    try {
      await player.play();
    } catch (err) {
      console.error("play:", err);
      // Playback never started, so "ended" will not fire: restore the idle
      // state and resume speech recognition here instead.
      resetSoon();
    }
  } catch (e) {
    console.error("describe:", e);
    setState("", t("err"));
    resetSoon();
  } finally {
    busy = false;
  }
}
/** After 600 ms: resumes speech recognition and, unless a request is running, shows the idle status. */
function resetSoon() {
  setTimeout(() => {
    resumeSR();
    if (busy) return;
    setState("", t("idle"));
  }, 600);
}
// When an answer finishes playing: resume speech recognition after 700 ms and,
// unless a request is running, show the idle status right away.
player.addEventListener("ended", () => {
  setTimeout(resumeSR, 700);
  if (busy) return;
  setState("", t("idle"));
});
// ---- first-run onboarding: default to the browser language, speak it, offer to switch ----
/**
 * First-run welcome: speaks the greeting in the current language and then
 * listens for a spoken language switch. Does nothing once onboarded.
 */
function onboard() {
  if (onboarded) return;
  const welcome = new SpeechSynthesisUtterance(t("welcomeAsk"));
  welcome.lang = lang === "pt" ? "pt-BR" : "en-US";
  // Onboarding is only marked done when speech actually STARTS: if the browser
  // blocks speech on load, the mode stays "onboarding" and the first tap retries.
  const markOnboarded = () => {
    onboarded = true;
    mode = "normal";
    try {
      localStorage.setItem("iris_onboarded", "1");
    } catch (e) {
      // persistence is best-effort
    }
    hintEl.textContent = t("hint");
    updateLabels();
    setState("speaking", t("idle"));
  };
  welcome.onstart = markOnboarded;
  // After the welcome, listen for a language switch.
  welcome.onend = listenForLangChoice;
  try {
    speechSynthesis.cancel();
    speechSynthesis.speak(welcome);
  } catch (e) {
    // will retry on the first tap
  }
}
/**
 * One-shot speech recognition after the welcome: switches language when the
 * user says the name of a language, otherwise returns to idle. Gives up after
 * 5 seconds. Without a SpeechRecognition implementation it goes straight to idle.
 */
function listenForLangChoice() {
  const Recognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  if (!Recognition) {
    setState("", t("idle"));
    return;
  }
  const rec = new Recognition();
  rec.lang = lang === "pt" ? "pt-BR" : "en-US";
  rec.continuous = false;
  rec.interimResults = false;

  let settled = false;
  // Returns to idle at most once (timeout and error paths share it).
  const backToIdle = () => {
    if (settled) return;
    settled = true;
    setState("", t("idle"));
  };
  const stopQuietly = () => {
    try { rec.stop(); } catch (e) {}
  };
  const giveUp = setTimeout(() => {
    stopQuietly();
    backToIdle();
  }, 5000);

  rec.onresult = (evt) => {
    settled = true;
    clearTimeout(giveUp);
    const heard = (evt.results[0][0].transcript || "").toLowerCase();
    const wantsEnglish = heard.includes("ingl") || heard.includes("english");
    const wantsPortuguese = heard.includes("portug") || heard.includes("brasil");
    if (wantsEnglish) switchLang("en");
    else if (wantsPortuguese) switchLang("pt");
    else setState("", t("idle"));
    stopQuietly();
  };
  rec.onerror = () => {
    clearTimeout(giveUp);
    backToIdle();
  };

  setState("listening", t("listening"));
  try {
    rec.start();
  } catch (e) {
    backToIdle();
  }
}
/**
 * Applies a spoken language choice, then confirms it aloud together with the welcome text.
 * @param {string} newLang - "en" or "pt"; labels are only refreshed when it differs
 *   from the current language, the confirmation is spoken either way.
 */
function switchLang(newLang) {
  const changed = newLang !== lang;
  if (changed) {
    lang = newLang;
    document.documentElement.lang = lang;
    langBtn.textContent = lang.toUpperCase();
    hintEl.textContent = t("hint");
    updateLabels();
  }
  setState("", t("idle"));
  speak(`${t("confLang")} ${t("welcome")}`);
}
// ---- live mode (toggled by voice command or double-tap) ----
/**
 * Executes a command returned by the backend in place of a spoken answer.
 * @param {string} cmd - "live_on" or "live_off" toggle live mode. Any other
 *   value is logged and the status is returned to idle, because the sender has
 *   already set the status to "thinking" and nothing else would clear it.
 */
function handleCommand(cmd) {
  if (cmd === "live_on") setLive(true);
  else if (cmd === "live_off") setLive(false);
  else {
    console.warn("unknown command:", cmd);
    setState("", t("idle"));
  }
}
// How often live mode checks the scene (ms).
const LIVE_INTERVAL = 3500;
// Mean per-pixel change (0-255) meant to trigger the VLM in the pixel-diff approach.
const LIVE_THRESHOLD = 15;
// Quiet window after an automatic alert, leaving a gap for the user to ask (ms).
const LIVE_COOLDOWN = 5000;

// Last frame signature, and the recent spoken descriptions kept for anti-repetition.
let lastSig = null;
let history = [];
// Automatic alerts are muted until this performance.now() timestamp
// (set after an alert OR a user question, so the question wins).
let suppressLiveUntil = 0;

// In-browser object detector that gates live mode (semantic change, not pixels).
let detector = null;
let detectorTried = false;
// Class name -> index of the last tick on which it was seen.
let seenClasses = new Map();
let tickN = 0;
// Ticks a class is remembered before it can re-trigger.
const CLASS_TTL = 4;
// Ticks a person must be absent before a re-entry counts as new (~14s at the interval above).
const PERSON_GONE = 4;
/**
 * Loads the COCO-SSD object detector once, when the library is present on window.
 * A failed load is logged and not retried; the detector then stays null.
 */
async function loadDetector() {
  if (detector || detectorTried) return;
  if (!window.cocoSsd) return;
  detectorTried = true;
  try {
    detector = await window.cocoSsd.load({ base: "lite_mobilenet_v2" });
    console.log("Iris: object detector ready");
  } catch (err) {
    console.warn("detector load failed, using pixel-diff:", err);
  }
}
/**
 * Turns live mode on or off: updates the Live button, gives haptic and spoken
 * feedback, resets the live-mode memory and (re)starts or stops the periodic
 * check and hands-free listening.
 * @param {boolean} on - desired live-mode state.
 */
function setLive(on) {
  liveOn = on;
  btnLive.setAttribute("aria-pressed", on ? "true" : "false");
  btnLive.classList.toggle("on", on);
  if (on) {
    vibrate([20, 40, 20]);
    speak(t("liveOn"));
  } else {
    vibrate(20);
    speak(t("liveOff"));
  }
  setState("", t("idle"));

  // Always start from a clean slate, whichever way the toggle goes.
  clearInterval(liveTimer);
  liveTimer = null;
  lastSig = null;
  history = [];
  seenClasses = new Map();

  if (!on) {
    stopListening();
    return;
  }
  loadDetector();
  liveTimer = setInterval(liveTick, LIVE_INTERVAL);
  startListening();
  // Baseline: describe the scene once shortly after turning on.
  setTimeout(async () => {
    if (!liveOn || busy || !player.paused) return;
    const baseline = await grabFrame();
    if (baseline) sendWatch(baseline);
  }, 900);
}
// ---- hands-free continuous listening (live mode) via Web Speech API ----
// Shared recognizer, whether hands-free listening is wanted, and whether it is
// temporarily paused.
let recog = null;
let listening = false;
let srPaused = false;

/**
 * Builds the continuous speech recognizer used in live mode.
 * @returns {object|null} the configured recognizer, or null when the browser
 *   offers no SpeechRecognition implementation.
 */
function setupRecognition() {
  const Recognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  if (!Recognition) {
    console.warn("SpeechRecognition not supported in this browser");
    return null;
  }
  const rec = new Recognition();
  rec.continuous = true;
  rec.interimResults = false;
  rec.onresult = (evt) => {
    // Results that arrive while paused or while a request is running are dropped.
    if (srPaused || busy) return;
    const latest = evt.results[evt.results.length - 1];
    const phrase = latest[0].transcript.trim();
    if (phrase) onVoice(phrase);
  };
  // Keep alive: restart whenever the session ends while listening is still wanted.
  rec.onend = () => {
    if (!listening || srPaused) return;
    try { rec.start(); } catch (e) {}
  };
  rec.onerror = (evt) => {
    console.log("SR:", evt.error);
  };
  return rec;
}
/**
 * Starts hands-free listening in the current language, creating the recognizer
 * on first use. Does nothing when recognition is unsupported or already active.
 */
function startListening() {
  recog = recog || setupRecognition();
  const canStart = Boolean(recog) && !listening;
  if (!canStart) return;
  listening = true;
  recog.lang = lang === "pt" ? "pt-BR" : "en-US";
  try {
    recog.start();
  } catch (e) {
    // start() throws if a session is already running; safe to ignore
  }
}
/** Ends hands-free listening; clearing the flag first keeps the keep-alive handler from restarting it. */
function stopListening() {
  listening = false;
  if (!recog) return;
  try {
    recog.stop();
  } catch (e) {
    // already stopped
  }
}
// stop the recognizer WHILE Iris speaks (so it never transcribes its own voice)
/** Pauses recognition (used while Iris is speaking); stop() failures are ignored. */
function pauseSR() {
  srPaused = true;
  if (!recog) return;
  try { recog.stop(); } catch (e) {}
}

/** Lifts the pause and restarts recognition if hands-free listening is still wanted. */
function resumeSR() {
  srPaused = false;
  if (!listening || !recog) return;
  try { recog.start(); } catch (e) {}
}
/**
 * Handles a phrase heard in live mode: mutes automatic alerts for 12 s and
 * sends the phrase as a text question with a fresh frame.
 * @param {string} txt - recognized phrase.
 * @param {number} [tries=0] - retry counter; while a request is in flight the
 *   call is retried every 700 ms, up to three times, then dropped.
 */
async function onVoice(txt, tries = 0) {
  // The user's question wins: mute auto-alerts for a while.
  suppressLiveUntil = performance.now() + 12000;
  if (busy) {
    if (tries < 3) setTimeout(() => onVoice(txt, tries + 1), 700);
    return;
  }
  const frame = await grabFrame();
  if (!frame) return;
  send(frame, null, txt);
}
// cheap visual signature (32x32 gray) to detect change without calling the model
/**
 * Builds a small grayscale thumbnail of the current video frame.
 * @returns {Uint8Array|null} 1024 values (32x32, mean of R, G and B per pixel),
 *   or null when the video has no width yet.
 */
function frameSignature() {
  if (!cam.videoWidth) return null;
  const SIDE = 32;
  const thumb = document.createElement("canvas");
  thumb.width = SIDE;
  thumb.height = SIDE;
  const ctx = thumb.getContext("2d");
  ctx.drawImage(cam, 0, 0, SIDE, SIDE);
  const { data } = ctx.getImageData(0, 0, SIDE, SIDE);
  // One byte per pixel; the alpha channel is ignored.
  return Uint8Array.from({ length: SIDE * SIDE }, (_, i) => {
    const o = i * 4;
    return (data[o] + data[o + 1] + data[o + 2]) / 3;
  });
}
/**
 * Mean absolute difference between two signatures.
 * @returns {number} 999 when either signature is missing, otherwise the
 *   average of |a[i] - b[i]| over the length of `a`.
 */
function changeAmount(a, b) {
  if (!(a && b)) return 999;
  const count = a.length;
  let total = 0;
  for (let idx = 0; idx < count; idx += 1) {
    total += Math.abs(a[idx] - b[idx]);
  }
  return total / count;
}
/**
 * Periodic live-mode check. Speaks only when a person newly appears:
 * a detection with class "person" and score >= 0.6 that was either never seen
 * or last seen more than PERSON_GONE ticks ago. Skipped while the app is busy,
 * audio is playing, alerts are suppressed, or the detector/video is not ready.
 */
async function liveTick() {
  const ready = !busy && mode === "normal" && liveOn && player.paused;
  if (!ready) return;
  // The user just asked / we just spoke -> stay quiet, let them talk.
  if (performance.now() < suppressLiveUntil) return;
  // Alerts are person-arrival only, so the detector is required.
  if (!detector || !cam.videoWidth) return;
  tickN += 1;

  // Objects, text and colors are answered on demand; only a fresh "person"
  // triggers an automatic alert, which keeps alerts rare.
  let personNew = false;
  try {
    const preds = await detector.detect(cam, 5);
    let hasPerson = false;
    for (const { class: label, score } of preds) {
      if (label === "person" && score >= 0.6) hasPerson = true;
    }
    if (hasPerson) {
      const lastSeen = seenClasses.get("person");
      // Absent for a while (or never seen) -> a real arrival.
      if (lastSeen === undefined || tickN - lastSeen > PERSON_GONE) personNew = true;
      seenClasses.set("person", tickN);
    }
  } catch (e) {
    return;
  }
  if (!personNew) return;

  // One alert, then a quiet gap so the user can ask.
  suppressLiveUntil = performance.now() + LIVE_COOLDOWN;
  const frame = await grabFrame();
  if (frame) sendWatch(frame, "person");
}
// word-set similarity (accent-insensitive) — the model often re-emits a near-identical
// line, so we drop it client-side instead of trusting the "do not repeat" prompt.
/**
 * Splits text into comparable words: lower-cased, accents removed, punctuation
 * turned into spaces, and words of one or two characters dropped.
 * @param {string} s - text to tokenize; null/undefined/empty yields [].
 * @returns {string[]} the remaining words, in order.
 */
function words(s) {
  return (s || "")
    .toLowerCase()
    // NFD splits accented letters into base letter + combining mark; the marks
    // (U+0300-U+036F) are then removed. Written as escapes so the range stays
    // visible and survives editors and Unicode normalization of the source.
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "")
    .replace(/[^a-z0-9 ]/g, " ")
    .split(/\s+/)
    .filter((w) => w.length > 2);
}
/**
 * Decides whether two sentences say essentially the same thing.
 * @returns {boolean} true when the Jaccard similarity of their word sets is
 *   at least 0.5; false when either sentence has no usable words.
 */
function tooSimilar(a, b) {
  const left = new Set(words(a));
  const right = new Set(words(b));
  if (left.size === 0 || right.size === 0) return false;
  const shared = [...left].filter((w) => right.has(w)).length;
  const union = left.size + right.size - shared;
  return shared / union >= 0.5;
}
/**
 * Live-mode request: sends a frame to "/watch" and speaks the reply only when
 * the backend asks for it and it is not a near-repeat of a recent alert.
 *
 * @param {Blob} frame - camera frame to analyse.
 * @param {string} [hint=""] - what triggered the check (e.g. "person").
 * @returns {Promise<void>} resolves once playback has started or the reply was
 *   dropped; does nothing when another request is already in flight.
 */
async function sendWatch(frame, hint = "") {
  if (busy) return;
  busy = true;
  try {
    const result = await client.predict("/watch", { image: handle_file(frame), prev: history.join(" · "), lang, hint });
    const out = Array.isArray(result.data) ? result.data[0] : result.data;
    const worthSaying = out && out.speak && out.answer && !history.some((h) => tooSimilar(h, out.answer));
    if (!worthSaying) return;
    // Remember the last five alerts for the repetition check.
    history.push(out.answer);
    if (history.length > 5) history.shift();
    answerEl.textContent = out.answer;
    setState("speaking", "");
    const a = out.audio;
    let url = a && a.url;
    if (!url && a && a.path) url = window.location.origin + "/gradio_api/file=" + a.path;
    if (!url) {
      // Nothing to play, so "ended" will never fire: leave the speaking state here.
      resetSoon();
      return;
    }
    // Stop the recognizer before playback so it does not transcribe Iris itself.
    pauseSR();
    player.src = url;
    try {
      await player.play();
    } catch (e) {
      console.error("play:", e);
      // Playback never started: restore idle and resume speech recognition.
      resetSoon();
    }
  } catch (e) {
    console.error("watch:", e);
  } finally {
    busy = false;
  }
}
// ---- interaction ----
// Press on the main stage. During onboarding it (re)triggers the welcome;
// otherwise it arms a hold timer that starts recording after HOLD_MS.
stage.addEventListener("pointerdown", async () => {
  if (mode === "onboarding") {
    unlockAudio();
    onboard(); // speak welcome in the browser language + offer to switch
    return;
  }
  // In live mode a press interrupts whatever Iris is saying.
  if (liveOn) { speechSynthesis.cancel(); player.pause(); }
  if (busy) return;
  unlockAudio();
  holding = true;
  stage.classList.add("armed");
  // Never leave an earlier hold timer running next to the new one.
  clearTimeout(holdTimer);
  holdTimer = setTimeout(async () => {
    const started = await startRec();
    if (!holding) {
      // Released while the microphone was still starting (e.g. during the
      // permission prompt): the release has already been handled as a tap, so
      // stop the recorder instead of leaving it running with recording = true.
      if (started) stopRec().catch((err) => console.error("stopRec:", err));
      return;
    }
    recording = started;
    if (recording) setState("listening", t("listening"));
  }, HOLD_MS);
});
// Double-tap tracking: time of the previous quick tap and the pending single-tap timer.
let lastTapTime = 0;
let tapTimer = null;
// Double-tap window (ms).
const DOUBLE_MS = 320;

/**
 * Handles the end of a press on the stage:
 * - held long enough to record -> send the frame with the recorded question;
 * - second quick tap inside DOUBLE_MS -> toggle live mode;
 * - single quick tap -> describe the scene once the double-tap window has passed.
 */
async function endPress() {
  // Onboarding is handled on pointerdown; a release without a tracked press is ignored.
  if (mode === "onboarding" || !holding) return;
  holding = false;
  stage.classList.remove("armed");
  clearTimeout(holdTimer);

  if (recording) {
    // Held and spoke = a question.
    recording = false;
    const shot = await grabFrame();
    const clip = await stopRec();
    if (shot) send(shot, clip);
    else setState("", t("camErr"));
    return;
  }

  const now = performance.now();
  const isSecondTap = now - lastTapTime < DOUBLE_MS;
  // Either way, a pending single-tap action is cancelled.
  if (tapTimer) {
    clearTimeout(tapTimer);
    tapTimer = null;
  }
  if (isSecondTap) {
    lastTapTime = 0;
    setLive(!liveOn);
    return;
  }
  lastTapTime = now;
  tapTimer = setTimeout(async () => {
    tapTimer = null;
    const shot = await grabFrame();
    if (shot) send(shot, null);
    else setState("", t("camErr"));
  }, DOUBLE_MS);
}
// A release and a cancelled pointer both end the press.
for (const evtName of ["pointerup", "pointercancel"]) {
  stage.addEventListener(evtName, endPress);
}
// ---- explicit buttons (low vision / keyboard / screen reader) ----
// "Describe" button: same action as a single tap on the stage.
btnDescribe.addEventListener("click", async (evt) => {
  evt.stopPropagation();
  const ready = mode === "normal" && !busy;
  if (!ready) return;
  unlockAudio();
  vibrate(10);
  const shot = await grabFrame();
  if (!shot) {
    setState("", t("camErr"));
    return;
  }
  send(shot, null);
});
// True while the Ask button is recording a question.
let askRecBtn = false;
// True from press until release, so a release that arrives while the
// microphone is still starting is not lost.
let askPressed = false;

// "Ask" button: press and hold to record a question.
btnAsk.addEventListener("pointerdown", async (e) => {
  e.stopPropagation();
  if (mode !== "normal" || busy) return;
  unlockAudio(); vibrate(10);
  askPressed = true;
  const started = await startRec();
  if (!askPressed) {
    // Released before recording could start (e.g. during the permission
    // prompt): stop the recorder instead of leaving it running unattended.
    if (started) stopRec().catch((err) => console.error("stopRec:", err));
    return;
  }
  askRecBtn = started;
  if (askRecBtn) setState("listening", t("listening"));
});

/**
 * Ends an Ask-button recording and sends it with a fresh frame.
 * Does nothing when no recording is in progress.
 * @param {Event} [e] - the pointer event, if any; its propagation is stopped.
 */
async function btnAskEnd(e) {
  if (e) e.stopPropagation();
  askPressed = false;
  if (!askRecBtn) return;
  askRecBtn = false;
  const frame = await grabFrame();
  const audio = await stopRec();
  if (frame) send(frame, audio); else setState("", t("camErr"));
}
btnAsk.addEventListener("pointerup", btnAskEnd);
btnAsk.addEventListener("pointerleave", btnAskEnd);
btnAsk.addEventListener("pointercancel", btnAskEnd);
// "Live" button: same action as a double tap on the stage.
btnLive.addEventListener("click", (evt) => {
  evt.stopPropagation();
  if (mode === "normal") {
    unlockAudio();
    setLive(!liveOn);
  }
});
// ---- accessibility: max contrast + larger text (persisted) ----
// High-contrast / larger-text preference, persisted under "iris_a11y".
let boost = false;
try {
  boost = localStorage.getItem("iris_a11y") === "1";
} catch (e) {
  // storage unavailable: start with the boost off
}

// Mirrors the current preference onto the body class and the button's pressed state.
const applyBoost = () => {
  document.body.classList.toggle("a11y-boost", boost);
  a11yBtn.setAttribute("aria-pressed", boost ? "true" : "false");
};
applyBoost();

a11yBtn.addEventListener("click", (evt) => {
  evt.stopPropagation();
  boost = !boost;
  applyBoost();
  try {
    localStorage.setItem("iris_a11y", boost ? "1" : "0");
  } catch (e) {
    // persistence is best-effort
  }
  vibrate(10);
});
// button labels per language
/** Rewrites the visible label of each control button in the current language. */
function updateLabels() {
  const labelled = [
    [btnAsk, "bAsk"],
    [btnDescribe, "bDescribe"],
    [btnLive, "bLive"],
  ];
  for (const [button, key] of labelled) {
    button.querySelector(".ctl-lbl").textContent = t(key);
  }
}
// ---- boot ----
/**
 * Start-up sequence: paints the initial labels, attempts the spoken welcome on
 * a first run, starts the camera, preloads the detector and connects to the backend.
 */
async function boot() {
  document.documentElement.lang = lang;
  langBtn.textContent = lang.toUpperCase();
  updateLabels();
  hintEl.textContent = onboarded ? t("hint") : "";
  setState("", onboarded ? t("idle") : t("startTap"));
  // Try to speak the welcome on load (works where the browser allows it).
  if (!onboarded) onboard();
  await startCamera();
  // Preload the object detector in the background (not awaited).
  loadDetector();
  try {
    client = await Client.connect(window.location.origin);
    console.log("Iris connected");
  } catch (err) {
    console.error("connect:", err);
    setState("", t("err"));
  }
}
boot();