Marcus Ramalho Claude Opus 4.8 committed on
Commit
2bdcfe3
·
1 Parent(s): 137241f

live mode: in-browser object-detection gate + browser-language onboarding

Browse files

- COCO-SSD (TF.js) gates the live mode: the VLM only wakes when a NEW object/person
class appears (semantic change, not pixels); per-class TTL avoids re-announcing;
graceful fallback to the pixel-diff if the detector can't load; scene baseline on start
- onboarding: default to the browser language, speak the welcome on first tap and offer
to switch by voice (listens 5s); 'tap to begin' prompt (screen-reader announced)
- detector model runs in the browser (~5MB) — does not count toward <=32B / Tiny Titan

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (2) hide show
  1. frontend/app.js +103 -31
  2. frontend/index.html +4 -0
frontend/app.js CHANGED
@@ -19,6 +19,8 @@ const T = {
19
  langListen: "Listening… say your language",
20
  welcome: "Welcome to Iris. Tap the screen to describe what is in front of you. Hold to ask a question. Double-tap to turn live mode on or off, which announces new things around you.",
21
  liveOn: "Live mode on.", liveOff: "Live mode off.", confLang: "English selected.",
 
 
22
  bAsk: "Ask", bDescribe: "Describe", bLive: "Live" },
23
  pt: { idle: "Iris", listening: "Ouvindo…", thinking: "Pensando…",
24
  hint: "Toque: descrever · Segurar: perguntar · Toque duplo: ao vivo",
@@ -26,6 +28,8 @@ const T = {
26
  langListen: "Ouvindo… diga seu idioma",
27
  welcome: "Bem-vindo ao Iris. Toque na tela para descrever o que está à sua frente. Segure para fazer uma pergunta. Toque duas vezes para ligar ou desligar o modo ao vivo, que avisa o que aparece de novo à sua volta.",
28
  liveOn: "Modo ao vivo ligado.", liveOff: "Modo ao vivo desligado.", confLang: "Português selecionado.",
 
 
29
  bAsk: "Perguntar", bDescribe: "Descrever", bLive: "Ao vivo" },
30
  };
31
  const t = (k) => T[lang][k];
@@ -61,7 +65,7 @@ langBtn.onclick = (e) => {
61
  langBtn.textContent = lang.toUpperCase();
62
  hintEl.textContent = onboarded ? t("hint") : "";
63
  updateLabels();
64
- if (!busy) setState("", onboarded ? t("idle") : LANG_PROMPT);
65
  };
66
 
67
  // ---- unlock audio (autoplay) on the first tap ----
@@ -146,25 +150,52 @@ async function send(frame, audio, qtext = "") {
146
  function resetSoon() { setTimeout(() => { srMuted = false; if (!busy) setState("", t("idle")); }, 600); }
147
  player.addEventListener("ended", () => { srMuted = false; if (!busy) setState("", t("idle")); });
148
 
149
- // ---- choose language by voice (onboarding) ----
150
- async function chooseLang(audio) {
151
- if (!audio || !client) { setState("", LANG_PROMPT); return; }
152
- busy = true; setState("thinking", t("thinking"));
153
- try {
154
- const result = await client.predict("/detect_lang", { audio: handle_file(audio) });
155
- const out = Array.isArray(result.data) ? result.data[0] : result.data;
156
- lang = (out && out.lang === "en") ? "en" : "pt";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  document.documentElement.lang = lang;
158
  langBtn.textContent = lang.toUpperCase();
159
- onboarded = true; mode = "normal";
160
- try { localStorage.setItem("iris_onboarded", "1"); } catch (e) {}
161
  hintEl.textContent = t("hint");
162
- setState("", t("idle"));
163
- speak(t("confLang") + " " + t("welcome"));
164
- } catch (e) {
165
- console.error("detect_lang:", e);
166
- setState("", LANG_PROMPT);
167
- } finally { busy = false; }
168
  }
169
 
170
  // ---- live mode (toggled by voice command or double-tap) ----
@@ -176,6 +207,22 @@ const LIVE_INTERVAL = 2500; // check the scene every 2.5s
176
  const LIVE_THRESHOLD = 14; // mean per-pixel change (0-255) to trigger the VLM
177
  let lastSig = null, history = []; // recent descriptions (anti-repetition)
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  function setLive(on) {
180
  liveOn = on;
181
  btnLive.setAttribute("aria-pressed", on ? "true" : "false");
@@ -184,9 +231,16 @@ function setLive(on) {
184
  speak(on ? t("liveOn") : t("liveOff"));
185
  setState("", t("idle"));
186
  clearInterval(liveTimer); liveTimer = null;
187
- lastSig = null; history = [];
188
- if (on) { liveTimer = setInterval(liveTick, LIVE_INTERVAL); startListening(); }
189
- else stopListening();
 
 
 
 
 
 
 
190
  }
191
 
192
  // ---- hands-free continuous listening (live mode) via Web Speech API ----
@@ -243,6 +297,30 @@ function changeAmount(a, b) {
243
 
244
  async function liveTick() {
245
  if (busy || mode !== "normal" || !liveOn || !player.paused) return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  const sig = frameSignature();
247
  if (!sig) return;
248
  if (lastSig && changeAmount(lastSig, sig) < LIVE_THRESHOLD) return; // static scene -> stay quiet
@@ -273,10 +351,8 @@ async function sendWatch(frame) {
273
  // ---- interaction ----
274
  stage.addEventListener("pointerdown", async () => {
275
  if (mode === "onboarding") {
276
- if (busy) return;
277
  unlockAudio();
278
- recording = await startRec();
279
- if (recording) setState("listening", t("langListen"));
280
  return;
281
  }
282
  if (liveOn) { speechSynthesis.cancel(); player.pause(); }
@@ -294,12 +370,7 @@ let lastTapTime = 0, tapTimer = null;
294
  const DOUBLE_MS = 320; // double-tap window
295
 
296
  async function endPress() {
297
- if (mode === "onboarding") {
298
- if (!recording) return;
299
- recording = false;
300
- const audio = await stopRec();
301
- return chooseLang(audio);
302
- }
303
  if (!holding) return;
304
  holding = false;
305
  stage.classList.remove("armed");
@@ -394,8 +465,9 @@ function updateLabels() {
394
  langBtn.textContent = lang.toUpperCase();
395
  updateLabels();
396
  hintEl.textContent = onboarded ? t("hint") : "";
397
- setState("", onboarded ? t("idle") : LANG_PROMPT);
398
  await startCamera();
399
- try { client = await Client.connect(window.location.origin); console.log("Iris conectado"); }
 
400
  catch (e) { console.error("connect:", e); setState("", t("err")); }
401
  })();
 
19
  langListen: "Listening… say your language",
20
  welcome: "Welcome to Iris. Tap the screen to describe what is in front of you. Hold to ask a question. Double-tap to turn live mode on or off, which announces new things around you.",
21
  liveOn: "Live mode on.", liveOff: "Live mode off.", confLang: "English selected.",
22
+ startTap: "Iris. Tap the screen to begin.",
23
+ welcomeAsk: "Welcome to Iris, in English. Tap the screen to describe what is in front of you, hold to ask a question, double-tap for live mode. To use Portuguese instead, say 'português' now.",
24
  bAsk: "Ask", bDescribe: "Describe", bLive: "Live" },
25
  pt: { idle: "Iris", listening: "Ouvindo…", thinking: "Pensando…",
26
  hint: "Toque: descrever · Segurar: perguntar · Toque duplo: ao vivo",
 
28
  langListen: "Ouvindo… diga seu idioma",
29
  welcome: "Bem-vindo ao Iris. Toque na tela para descrever o que está à sua frente. Segure para fazer uma pergunta. Toque duas vezes para ligar ou desligar o modo ao vivo, que avisa o que aparece de novo à sua volta.",
30
  liveOn: "Modo ao vivo ligado.", liveOff: "Modo ao vivo desligado.", confLang: "Português selecionado.",
31
+ startTap: "Iris. Toque na tela para começar.",
32
+ welcomeAsk: "Bem-vindo ao Iris, em português. Toque na tela para descrever o que está à sua frente, segure para perguntar, e toque duas vezes para o modo ao vivo. Para usar em inglês, diga 'inglês' agora.",
33
  bAsk: "Perguntar", bDescribe: "Descrever", bLive: "Ao vivo" },
34
  };
35
  const t = (k) => T[lang][k];
 
65
  langBtn.textContent = lang.toUpperCase();
66
  hintEl.textContent = onboarded ? t("hint") : "";
67
  updateLabels();
68
+ if (!busy) setState("", onboarded ? t("idle") : t("startTap"));
69
  };
70
 
71
  // ---- unlock audio (autoplay) on the first tap ----
 
150
  function resetSoon() { setTimeout(() => { srMuted = false; if (!busy) setState("", t("idle")); }, 600); }
151
  player.addEventListener("ended", () => { srMuted = false; if (!busy) setState("", t("idle")); });
152
 
153
+ // ---- first-run onboarding: default to the browser language, speak it, offer to switch ----
154
/**
 * First-run onboarding, triggered by the first tap.
 *
 * Marks the app as onboarded, switches to normal mode, refreshes the hint and
 * button labels, then speaks the `welcomeAsk` text in the current language.
 * When the welcome finishes, `listenForLangChoice()` is started so the user
 * can switch language by voice.
 *
 * Failure handling:
 *  - If speech synthesis is missing or throws, go straight to listening.
 *  - If the utterance reports an error (in which case `end` may never fire),
 *    still move on, so the UI is not left in the "speaking" state.
 *  - If the utterance was interrupted/canceled (something else took over the
 *    speech queue), do not start listening; just drop back to idle when the
 *    app is not busy.
 */
function onboard() {
  onboarded = true;
  mode = "normal";
  // Best effort: storage access can throw (e.g. when it is blocked).
  try { localStorage.setItem("iris_onboarded", "1"); } catch (e) {}
  hintEl.textContent = t("hint");
  updateLabels();

  // The language-choice listener must start at most once, whichever of
  // end / error / synchronous failure happens first.
  let movedOn = false;
  const startListeningForChoice = () => {
    if (movedOn) return;
    movedOn = true;
    listenForLangChoice();
  };

  setState("speaking", t("idle"));
  try {
    const u = new SpeechSynthesisUtterance(t("welcomeAsk"));
    u.lang = lang === "pt" ? "pt-BR" : "en-US";
    u.onend = startListeningForChoice; // after the welcome, listen for a language switch
    u.onerror = (ev) => {
      const reason = ev && ev.error;
      if (reason === "interrupted" || reason === "canceled") {
        // Speech was deliberately cut off: do not open the microphone.
        movedOn = true;
        if (!busy) setState("", t("idle"));
        return;
      }
      startListeningForChoice();
    };
    speechSynthesis.cancel();
    speechSynthesis.speak(u);
  } catch (e) {
    console.warn("onboard: speech synthesis unavailable:", e);
    startListeningForChoice();
  }
}
166
+
167
/**
 * Listen once, for at most 5 seconds, for a spoken language choice.
 *
 * Uses the Web Speech recognizer (standard or webkit-prefixed) configured for
 * the current language. A transcript containing "ingl"/"english" switches to
 * English; "portug"/"brasil" switches to Portuguese; anything else returns
 * the UI to idle. Without recognizer support, on error, on a failed start, or
 * on timeout, the UI simply returns to idle.
 */
function listenForLangChoice() {
  const Recognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  if (!Recognition) {
    setState("", t("idle"));
    return;
  }

  const recognizer = new Recognition();
  recognizer.lang = lang === "pt" ? "pt-BR" : "en-US";
  recognizer.continuous = false;
  recognizer.interimResults = false;

  // `settled` ensures the idle reset runs at most once across timeout/error paths.
  let settled = false;
  const stopQuietly = () => {
    try { recognizer.stop(); } catch (e) {}
  };
  const backToIdle = () => {
    if (settled) return;
    settled = true;
    setState("", t("idle"));
  };

  // Give up after 5 seconds.
  const giveUpTimer = setTimeout(() => {
    stopQuietly();
    backToIdle();
  }, 5000);

  recognizer.onresult = (event) => {
    settled = true;
    clearTimeout(giveUpTimer);
    const heard = (event.results[0][0].transcript || "").toLowerCase();
    const mentions = (...needles) => needles.some((needle) => heard.includes(needle));
    if (mentions("ingl", "english")) {
      switchLang("en");
    } else if (mentions("portug", "brasil")) {
      switchLang("pt");
    } else {
      setState("", t("idle"));
    }
    stopQuietly();
  };

  recognizer.onerror = () => {
    clearTimeout(giveUpTimer);
    backToIdle();
  };

  setState("listening", t("listening"));
  try {
    recognizer.start();
  } catch (e) {
    backToIdle();
  }
}
188
+
189
/**
 * Switch the UI language (if it differs from the current one) and confirm aloud.
 *
 * When `newLang` differs from `lang`, updates the module language, the
 * document's `lang` attribute, the language button, the hint text and the
 * button labels. In every case the state is reset to idle and the
 * confirmation followed by the welcome text is spoken.
 *
 * @param {string} newLang - language key to switch to (callers pass "en" or "pt").
 */
function switchLang(newLang) {
  const changed = newLang !== lang;
  if (changed) {
    lang = newLang;
    document.documentElement.lang = lang;
    langBtn.textContent = lang.toUpperCase();
    hintEl.textContent = t("hint");
    updateLabels();
  }

  setState("", t("idle"));
  const confirmation = t("confLang") + " " + t("welcome");
  speak(confirmation);
}
200
 
201
  // ---- live mode (toggled by voice command or double-tap) ----
 
207
  const LIVE_THRESHOLD = 14; // mean per-pixel change (0-255) to trigger the VLM
208
  let lastSig = null, history = []; // recent descriptions (anti-repetition)
209
 
210
+ // in-browser object detection gates the live mode (semantic change, not pixels).
211
+ // Falls back to the cheap pixel-diff if the detector can't load.
212
+ let detector = null, detectorTried = false;
213
+ let seenClasses = new Map(); // class name -> last tick index it was seen
214
+ let tickN = 0;
215
+ const CLASS_TTL = 4; // ticks a class is remembered before it can re-trigger
216
+
217
/**
 * Load the in-browser COCO-SSD object detector, at most once.
 *
 * Does nothing when a detector is already loaded, when a load was already
 * attempted, or when the `cocoSsd` global is not present on `window`. On
 * success the model is stored in `detector`; on failure a warning is logged
 * and `detector` keeps its previous value, so callers fall back to the
 * pixel-diff gate.
 *
 * @returns {Promise<void>}
 */
async function loadDetector() {
  if (detector) return;
  if (detectorTried) return;
  if (!window.cocoSsd) return;

  // Mark the attempt before awaiting so concurrent callers do not start a second load.
  detectorTried = true;
  try {
    const model = await window.cocoSsd.load({ base: "lite_mobilenet_v2" });
    detector = model;
    console.log("Iris: object detector ready");
  } catch (e) {
    console.warn("detector load failed, using pixel-diff:", e);
  }
}
225
+
226
  function setLive(on) {
227
  liveOn = on;
228
  btnLive.setAttribute("aria-pressed", on ? "true" : "false");
 
231
  speak(on ? t("liveOn") : t("liveOff"));
232
  setState("", t("idle"));
233
  clearInterval(liveTimer); liveTimer = null;
234
+ lastSig = null; history = []; seenClasses = new Map();
235
+ if (on) {
236
+ loadDetector();
237
+ liveTimer = setInterval(liveTick, LIVE_INTERVAL);
238
+ startListening();
239
+ // baseline: describe the scene once shortly after turning on
240
+ setTimeout(async () => {
241
+ if (liveOn && !busy && player.paused) { const f = await grabFrame(); if (f) sendWatch(f); }
242
+ }, 900);
243
+ } else stopListening();
244
  }
245
 
246
  // ---- hands-free continuous listening (live mode) via Web Speech API ----
 
297
 
298
  async function liveTick() {
299
  if (busy || mode !== "normal" || !liveOn || !player.paused) return;
300
+ tickN++;
301
+
302
+ // semantic gate: only wake the VLM when a NEW object/person class appears
303
+ if (detector && cam.videoWidth) {
304
+ let preds = null;
305
+ try { preds = await detector.detect(cam, 10); } catch (e) {}
306
+ if (preds) {
307
+ let novel = false;
308
+ for (const p of preds) {
309
+ if (p.score < 0.55) continue;
310
+ const last = seenClasses.get(p.class);
311
+ if (last === undefined || tickN - last > CLASS_TTL) novel = true;
312
+ seenClasses.set(p.class, tickN);
313
+ }
314
+ for (const [c, n] of seenClasses) if (tickN - n > CLASS_TTL) seenClasses.delete(c);
315
+ if (novel) {
316
+ const frame = await grabFrame();
317
+ if (frame) sendWatch(frame);
318
+ }
319
+ return;
320
+ }
321
+ }
322
+
323
+ // fallback: cheap pixel-diff gate
324
  const sig = frameSignature();
325
  if (!sig) return;
326
  if (lastSig && changeAmount(lastSig, sig) < LIVE_THRESHOLD) return; // static scene -> stay quiet
 
351
  // ---- interaction ----
352
  stage.addEventListener("pointerdown", async () => {
353
  if (mode === "onboarding") {
 
354
  unlockAudio();
355
+ onboard(); // speak welcome in the browser language + offer to switch
 
356
  return;
357
  }
358
  if (liveOn) { speechSynthesis.cancel(); player.pause(); }
 
370
  const DOUBLE_MS = 320; // double-tap window
371
 
372
  async function endPress() {
373
+ if (mode === "onboarding") return; // onboarding is handled on pointerdown
 
 
 
 
 
374
  if (!holding) return;
375
  holding = false;
376
  stage.classList.remove("armed");
 
465
  langBtn.textContent = lang.toUpperCase();
466
  updateLabels();
467
  hintEl.textContent = onboarded ? t("hint") : "";
468
+ setState("", onboarded ? t("idle") : t("startTap"));
469
  await startCamera();
470
+ loadDetector(); // preload the object detector in the background (non-blocking)
471
+ try { client = await Client.connect(window.location.origin); console.log("Iris connected"); }
472
  catch (e) { console.error("connect:", e); setState("", t("err")); }
473
  })();
frontend/index.html CHANGED
@@ -54,6 +54,10 @@
54
  </nav>
55
 
56
  <audio id="player" playsinline></audio>
 
 
 
 
57
  <script type="module" src="/static/app.js"></script>
58
  </body>
59
  </html>
 
54
  </nav>
55
 
56
  <audio id="player" playsinline></audio>
57
+
58
+ <!-- in-browser object detection (gates live mode); falls back to pixel-diff if unavailable -->
59
+ <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@4"></script>
60
+ <script src="https://cdn.jsdelivr.net/npm/@tensorflow-models/coco-ssd@2"></script>
61
  <script type="module" src="/static/app.js"></script>
62
  </body>
63
  </html>