import { Client, handle_file } from "https://esm.sh/@gradio/client";

// Shorthand for document.getElementById.
const $ = (id) => document.getElementById(id);

// DOM handles used throughout this module, looked up once by element id.
const cam = $("cam");
const canvas = $("canvas");
const statusEl = $("status");
const answerEl = $("answer");
const hintEl = $("hint");
const langBtn = $("lang");
const player = $("player");
const stage = $("stage");
const btnAsk = $("btn-ask");
const btnDescribe = $("btn-describe");
const btnLive = $("btn-live");
const a11yBtn = $("a11y");

// Best-effort haptic feedback: calls navigator.vibrate(ms) when it exists and
// ignores any error, so callers never need to guard it.
function vibrate(ms) {
  try {
    if (navigator.vibrate) navigator.vibrate(ms);
  } catch (e) {
    // haptics are optional — nothing to do
  }
}

// ---- language (auto-detected from the browser; switchable by voice or button) ----
let lang = (navigator.language || "en").toLowerCase().startsWith("pt") ? "pt" : "en";

// UI strings per language code. Both tables carry the same keys; values are
// shown on screen and/or spoken, so they must stay exactly as written.
const T = {
  en: {
    idle: "Iris",
    listening: "Listening…",
    thinking: "Thinking…",
    hint: "Tap: describe · Hold: ask · Double-tap: live",
    camErr: "Camera blocked — allow access",
    err: "Something went wrong",
    langListen: "Listening… say your language",
    welcome: "Welcome to Iris. Tap the screen to describe what is in front of you. Hold to ask a question. Double-tap to turn live mode on or off, which announces new things around you.",
    liveOn: "Live mode on.",
    liveOff: "Live mode off.",
    confLang: "English selected.",
    startTap: "Iris. Tap the screen to begin.",
    welcomeAsk: "Welcome to Iris, in English. Tap the screen to describe what is in front of you, hold to ask a question, double-tap for live mode. To use Portuguese instead, say 'português' now.",
    bAsk: "Ask",
    bDescribe: "Describe",
    bLive: "Live",
  },
  pt: {
    idle: "Iris",
    listening: "Ouvindo…",
    thinking: "Pensando…",
    hint: "Toque: descrever · Segurar: perguntar · Toque duplo: ao vivo",
    camErr: "Câmera bloqueada — permita o acesso",
    err: "Algo deu errado",
    langListen: "Ouvindo… diga seu idioma",
    welcome: "Bem-vindo ao Iris. Toque na tela para descrever o que está à sua frente. Segure para fazer uma pergunta. Toque duas vezes para ligar ou desligar o modo ao vivo, que avisa o que aparece de novo à sua volta.",
    liveOn: "Modo ao vivo ligado.",
    liveOff: "Modo ao vivo desligado.",
    confLang: "Português selecionado.",
    startTap: "Iris. Toque na tela para começar.",
    welcomeAsk: "Bem-vindo ao Iris, em português. Toque na tela para descrever o que está à sua frente, segure para perguntar, e toque duas vezes para o modo ao vivo. Para usar em inglês, diga 'inglês' agora.",
    bAsk: "Perguntar",
    bDescribe: "Descrever",
    bLive: "Ao vivo",
  },
};
// Look up UI string `k` in the table of the currently selected language.
const t = (k) => {
  const strings = T[lang];
  return strings[k];
};
// Bilingual language prompt; not referenced anywhere else in the visible code.
const LANG_PROMPT = "Segure e diga seu idioma · Hold and say your language";

// Set the visual state and, optionally, the status line.
//   s   — written to <body data-state>; a falsy value clears it to "".
//   msg — when not undefined, becomes the status text, prefixed with "● " while live mode is on.
function setState(s, msg) {
  document.body.dataset.state = s ? s : "";
  if (msg === undefined) return;
  const livePrefix = liveOn ? "● " : "";
  statusEl.textContent = livePrefix + msg;
}
// UI speech via the browser voice (instructions/confirmations).
// Cancels whatever is queued, then speaks `text` in the current language
// ("pt-BR" or "en-US"). Failures are logged, never thrown.
function speak(text) {
  try {
    speechSynthesis.cancel();
    const utterance = Object.assign(new SpeechSynthesisUtterance(text), {
      lang: lang === "pt" ? "pt-BR" : "en-US",
    });
    speechSynthesis.speak(utterance);
  } catch (e) {
    console.error("speak:", e);
  }
}

// ---- state ----
const HOLD_MS = 350;   // delay (ms) before a press starts recording as a "hold"

// First-run flag persisted in localStorage; if storage cannot be read we treat it as a first run.
let onboarded = false;
try {
  onboarded = localStorage.getItem("iris_onboarded") === "1";
} catch (e) {
  // storage unavailable — keep the default
}
let mode = onboarded ? "normal" : "onboarding";

let busy = false;        // true while a backend request is in flight
let liveOn = false;      // live mode flag
let liveTimer = null;    // interval handle for the live-mode tick
let holding = false;     // a press on the stage is in progress
let recording = false;   // the current press started a microphone recording
let holdTimer = null;    // timeout handle that turns a press into a hold

// Language button: flip between English and Portuguese and refresh every
// language-dependent piece of UI. The click is kept from bubbling further.
langBtn.onclick = (e) => {
  e.stopPropagation();
  lang = lang === "en" ? "pt" : "en";
  document.documentElement.lang = lang;
  langBtn.textContent = lang.toUpperCase();
  hintEl.textContent = onboarded ? t("hint") : "";
  updateLabels();
  // Keep the hands-free recognizer in step with the UI language. `lang` is only
  // read when recognition starts, so bounce a running session: its onend
  // handler restarts it and picks up the new value.
  if (recog) {
    recog.lang = lang === "pt" ? "pt-BR" : "en-US";
    if (listening && !srPaused) {
      try { recog.stop(); } catch (err) { /* not running — nothing to restart */ }
    }
  }
  if (!busy) setState("", onboarded ? t("idle") : t("startTap"));
};

// ---- unlock audio (autoplay) on the first tap ----
// Silent WAV clip played once from a user gesture so later playback on the same element is allowed.
const SILENT = "data:audio/wav;base64,UklGRiQAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQAAAAA=";
let audioUnlocked = false;

// Runs at most once; a rejected play() is ignored.
function unlockAudio() {
  if (!audioUnlocked) {
    audioUnlocked = true;
    player.src = SILENT;
    player.play().catch(() => {});
  }
}

// ---- live camera ----
// Attach a camera stream to the <video>: prefer the rear ("environment") camera,
// then accept any camera. If every attempt fails, show the camErr status.
async function startCamera() {
  const attempts = [
    { video: { facingMode: { ideal: "environment" } }, audio: false },
    { video: true, audio: false },
  ];
  for (const constraints of attempts) {
    try {
      cam.srcObject = await navigator.mediaDevices.getUserMedia(constraints);
      return;
    } catch (e) {
      // this constraint set failed — try the next one
    }
  }
  setState("", t("camErr"));
}
// Draw the current video frame onto the shared canvas and encode it as JPEG (quality 0.85).
// Returns a Promise of whatever canvas.toBlob delivers, or of null while the
// video reports no width/height.
function grabFrame() {
  const { videoWidth: width, videoHeight: height } = cam;
  if (!width || !height) return Promise.resolve(null);
  canvas.width = width;
  canvas.height = height;
  const ctx = canvas.getContext("2d");
  ctx.drawImage(cam, 0, 0, width, height);
  return new Promise((resolve) => {
    canvas.toBlob(resolve, "image/jpeg", 0.85);
  });
}

// ---- microphone ----
let micStream = null, chunks = [], recorder = null;

// Start recording from the microphone (the stream is requested once and reused).
// Resolves to true when recording has started, false when the mic could not be
// opened OR the recorder could not be created/started — it never rejects, so
// callers can rely on the boolean alone.
async function startRec() {
  try {
    if (!micStream) micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
    chunks = [];
    // The MediaRecorder constructor and start() can both throw (unsupported
    // format, inactive stream), so they live inside the same try.
    recorder = new MediaRecorder(micStream);
    recorder.ondataavailable = (e) => { if (e.data.size) chunks.push(e.data); };
    recorder.start();
    return true;
  } catch (e) {
    console.error("startRec:", e);
    return false;
  }
}
// Stop the current recording. Resolves with one Blob built from the collected
// chunks (typed with the recorder's mimeType, or "audio/webm" if that is empty),
// or with null when there is no active recorder or no data was collected.
function stopRec() {
  if (!recorder || recorder.state === "inactive") return Promise.resolve(null);
  return new Promise((resolve) => {
    recorder.onstop = () => {
      if (!chunks.length) {
        resolve(null);
        return;
      }
      resolve(new Blob(chunks, { type: recorder.mimeType || "audio/webm" }));
    };
    recorder.stop();
  });
}

// ---- backend ----
let client = null;

// Send one frame (plus an optional recorded question or typed question text)
// to the "/describe" endpoint, then show and play the answer.
//   frame — image blob to describe
//   audio — optional recorded question blob
//   qtext — optional question text
// Does nothing while another request is in flight. Errors are logged and shown
// as the generic "err" status; the function never throws.
async function send(frame, audio, qtext = "") {
  if (busy) return;
  busy = true;
  answerEl.textContent = "";
  setState("thinking", t("thinking"));
  try {
    const payload = { image: handle_file(frame), lang };
    if (audio) payload.audio = handle_file(audio);
    if (qtext) payload.qtext = qtext;
    const result = await client.predict("/describe", payload);
    const out = Array.isArray(result.data) ? result.data[0] : result.data;
    console.log("Iris result:", out);
    if (out && out.command) {
      busy = false;
      // Leave "thinking" first, so a command handleCommand does not recognise
      // cannot strand the UI in that state.
      setState("", t("idle"));
      handleCommand(out.command);
      return;
    }
    answerEl.textContent = (out && out.answer) || "";
    setState("speaking", "");
    const a = out && out.audio;
    let url = a && a.url;
    if (!url && a && a.path) url = window.location.origin + "/gradio_api/file=" + a.path;
    if (url) {
      pauseSR();
      player.src = url;
      try {
        await player.play();
      } catch (err) {
        // Playback never started, so "ended" will not fire: undo the
        // "speaking" state and the recognition pause ourselves.
        console.error("play:", err);
        resetSoon();
      }
    } else { resetSoon(); }
  } catch (e) {
    console.error("describe:", e);
    setState("", t("err"));
    resetSoon();
  } finally {
    busy = false;
  }
}
// After 600 ms: resume speech recognition and, unless a request is running, show the idle status.
function resetSoon() {
  setTimeout(() => {
    resumeSR();
    if (!busy) setState("", t("idle"));
  }, 600);
}

// When the audio element finishes playing: show idle immediately (unless busy)
// and resume speech recognition 700 ms later.
player.addEventListener("ended", () => {
  setTimeout(resumeSR, 700);
  if (!busy) setState("", t("idle"));
});

// ---- first-run onboarding: default to the browser language, speak it, offer to switch ----
// Speaks the welcome message; no-op once onboarding has completed.
function onboard() {
  if (onboarded) return;

  // Runs when speech actually STARTS. Onboarding is only marked done here, so if
  // the browser blocks speech on load the mode stays "onboarding" and the first
  // tap triggers the welcome again.
  const markOnboarded = () => {
    onboarded = true;
    mode = "normal";
    try {
      localStorage.setItem("iris_onboarded", "1");
    } catch (e) {
      // persistence is best-effort
    }
    hintEl.textContent = t("hint");
    updateLabels();
    setState("speaking", t("idle"));
  };

  const welcome = new SpeechSynthesisUtterance(t("welcomeAsk"));
  welcome.lang = lang === "pt" ? "pt-BR" : "en-US";
  welcome.onstart = markOnboarded;
  welcome.onend = listenForLangChoice;   // after the welcome, listen for a language switch

  try {
    speechSynthesis.cancel();
    speechSynthesis.speak(welcome);
  } catch (e) {
    // will retry on the first tap
  }
}

// One-shot speech recognition after the welcome message: if the user names a
// language ("ingl"/"english" or "portug"/"brasil" in the transcript) switch to
// it; otherwise return to idle. Gives up after 5 seconds or on a recognition
// error, and goes straight to idle when recognition is unsupported.
function listenForLangChoice() {
  const Recognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  if (!Recognition) {
    setState("", t("idle"));
    return;
  }

  const rec = new Recognition();
  rec.lang = lang === "pt" ? "pt-BR" : "en-US";
  rec.continuous = false;
  rec.interimResults = false;

  let handled = false;
  const stopQuietly = () => {
    try { rec.stop(); } catch (e) { /* already stopped */ }
  };
  // Return to idle exactly once, unless a result was already handled.
  const finish = () => {
    if (handled) return;
    handled = true;
    setState("", t("idle"));
  };
  const timeout = setTimeout(() => {
    stopQuietly();
    finish();
  }, 5000);

  rec.onresult = (e) => {
    handled = true;
    clearTimeout(timeout);
    const heard = (e.results[0][0].transcript || "").toLowerCase();
    if (heard.includes("ingl") || heard.includes("english")) {
      switchLang("en");
    } else if (heard.includes("portug") || heard.includes("brasil")) {
      switchLang("pt");
    } else {
      setState("", t("idle"));
    }
    stopQuietly();
  };
  rec.onerror = () => {
    clearTimeout(timeout);
    finish();
  };

  setState("listening", t("listening"));
  try {
    rec.start();
  } catch (e) {
    finish();
  }
}

// Select `newLang` ("en" or "pt"), refreshing the language-dependent UI only when
// it actually changes, then always confirm aloud and repeat the welcome instructions.
function switchLang(newLang) {
  const changed = newLang !== lang;
  if (changed) {
    lang = newLang;
    document.documentElement.lang = lang;
    langBtn.textContent = lang.toUpperCase();
    hintEl.textContent = t("hint");
    updateLabels();
  }
  setState("", t("idle"));
  speak(`${t("confLang")} ${t("welcome")}`);
}

// ---- live mode (toggled by voice command or double-tap) ----
// Apply a command returned by the backend: "live_on" / "live_off" switch live
// mode; any other value is ignored.
function handleCommand(cmd) {
  switch (cmd) {
    case "live_on":
      setLive(true);
      break;
    case "live_off":
      setLive(false);
      break;
    default:
      break;
  }
}
const LIVE_INTERVAL = 3500;   // check the scene every 3.5s
const LIVE_THRESHOLD = 15;    // mean per-pixel change (0-255) to trigger the VLM
const LIVE_COOLDOWN = 5000;   // quiet window after an auto-alert -> gap for the user to ask

let lastSig = null;           // cleared whenever live mode is toggled
let history = [];             // recent descriptions (anti-repetition)
let suppressLiveUntil = 0;    // pause auto-alerts after an alert OR a user question (their question wins)

// in-browser object detection gates the live mode (semantic change, not pixels).
// Falls back to the cheap pixel-diff if the detector can't load.
let detector = null;
let detectorTried = false;
let seenClasses = new Map();   // class name -> last tick index it was seen
let tickN = 0;
const CLASS_TTL = 4;           // ticks a class is remembered before it can re-trigger
const PERSON_GONE = 4;         // ticks a person must be absent before a re-entry counts as new (~14s)

// Load the coco-ssd object detector once. Skipped when a detector already
// exists, when a load was already attempted, or when window.cocoSsd is not
// present. A failed load is logged and leaves `detector` unset.
async function loadDetector() {
  const alreadyHandled = detector || detectorTried;
  if (alreadyHandled || !window.cocoSsd) return;
  detectorTried = true;
  try {
    detector = await window.cocoSsd.load({ base: "lite_mobilenet_v2" });
    console.log("Iris: object detector ready");
  } catch (e) {
    console.warn("detector load failed, using pixel-diff:", e);
  }
}

// Turn live mode on or off: update the Live button, give haptic and spoken
// confirmation, reset the live-mode bookkeeping, and start or stop the
// periodic tick and hands-free listening.
function setLive(on) {
  liveOn = on;
  btnLive.setAttribute("aria-pressed", on ? "true" : "false");
  btnLive.classList.toggle("on", on);
  vibrate(on ? [20, 40, 20] : 20);
  speak(t(on ? "liveOn" : "liveOff"));
  setState("", t("idle"));

  // start from a clean slate in both directions
  clearInterval(liveTimer);
  liveTimer = null;
  lastSig = null;
  history = [];
  seenClasses = new Map();

  if (!on) {
    stopListening();
    return;
  }

  loadDetector();
  liveTimer = setInterval(liveTick, LIVE_INTERVAL);
  startListening();
  // baseline: describe the scene once shortly after turning on
  setTimeout(async () => {
    if (!liveOn || busy || !player.paused) return;
    const frame = await grabFrame();
    if (frame) sendWatch(frame);
  }, 900);
}

// ---- hands-free continuous listening (live mode) via Web Speech API ----
let recog = null;        // recognizer instance, created on first use
let listening = false;   // recognition is wanted (live mode on)
let srPaused = false;    // temporarily stopped while audio plays

// Build a continuous, final-results-only recognizer wired to onVoice().
// Returns null (with a console warning) when the browser has no SpeechRecognition.
function setupRecognition() {
  const Recognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  if (!Recognition) {
    console.warn("SpeechRecognition not supported in this browser");
    return null;
  }
  const r = new Recognition();
  r.continuous = true;
  r.interimResults = false;
  r.onresult = (e) => {
    if (srPaused || busy) return;
    const latest = e.results[e.results.length - 1];
    const txt = latest[0].transcript.trim();
    if (txt) onVoice(txt);
  };
  // keep alive: restart whenever the session ends while listening is still wanted
  r.onend = () => {
    if (!listening || srPaused) return;
    try { r.start(); } catch (e) {}
  };
  r.onerror = (e) => {
    console.log("SR:", e.error);
  };
  return r;
}
// Begin hands-free listening in the current language. Creates the recognizer on
// first use; does nothing if recognition is unsupported or already listening.
function startListening() {
  if (!recog) recog = setupRecognition();
  const unavailable = !recog;
  if (unavailable || listening) return;
  listening = true;
  recog.lang = lang === "pt" ? "pt-BR" : "en-US";
  try {
    recog.start();
  } catch (e) {
    // start() failures are ignored
  }
}
// Stop hands-free listening; clearing `listening` first keeps the recognizer's
// onend handler from restarting it.
function stopListening() {
  listening = false;
  if (!recog) return;
  try {
    recog.stop();
  } catch (e) {
    // stop() failures are ignored
  }
}
// stop the recognizer WHILE Iris speaks (so it never transcribes its own voice)
function pauseSR() {
  srPaused = true;
  if (!recog) return;
  try { recog.stop(); } catch (e) {}
}

// Lift the pause and restart the recognizer if hands-free listening is still wanted.
function resumeSR() {
  srPaused = false;
  if (!listening || !recog) return;
  try { recog.start(); } catch (e) {}
}
// Handle a spoken question recognised in live mode.
//   txt   — the transcript
//   tries — retry counter (callers normally omit it)
// Mutes auto-alerts for 12 s (the user's question wins). If a request is in
// flight, retries every 700 ms, up to 3 retries; otherwise sends the current
// frame together with the text question.
async function onVoice(txt, tries = 0) {
  suppressLiveUntil = performance.now() + 12000;
  if (busy) {
    const canRetry = tries < 3;
    if (canRetry) setTimeout(() => onVoice(txt, tries + 1), 700);
    return;
  }
  const frame = await grabFrame();
  if (frame) send(frame, null, txt);   // text question (from browser speech recognition)
}

// cheap visual signature (32x32 gray) to detect change without calling the model
// Returns a 1024-entry Uint8Array (mean of R, G, B per pixel), or null while the
// video has no width yet.
function frameSignature() {
  if (!cam.videoWidth) return null;
  const SIZE = 32;
  const thumb = document.createElement("canvas");
  thumb.width = SIZE;
  thumb.height = SIZE;
  const ctx = thumb.getContext("2d");
  ctx.drawImage(cam, 0, 0, SIZE, SIZE);
  const rgba = ctx.getImageData(0, 0, SIZE, SIZE).data;
  const gray = new Uint8Array(SIZE * SIZE);
  // rgba is laid out 4 bytes per pixel; the alpha byte is skipped
  for (let px = 0, off = 0; px < gray.length; px++, off += 4) {
    gray[px] = (rgba[off] + rgba[off + 1] + rgba[off + 2]) / 3;
  }
  return gray;
}
// Mean absolute difference between two equal-length signatures.
// Returns 999 when either signature is missing.
function changeAmount(a, b) {
  if (!a || !b) return 999;
  const diffs = Array.from(a, (value, i) => Math.abs(value - b[i]));
  const total = diffs.reduce((sum, d) => sum + d, 0);
  return total / a.length;
}

// One live-mode tick. Runs only when idle: not busy, normal mode, live mode on,
// audio not playing, outside the suppression window, and with a loaded detector
// and an active video.
async function liveTick() {
  const idle = !busy && mode === "normal" && liveOn && player.paused;
  if (!idle) return;
  if (performance.now() < suppressLiveUntil) return;   // user just asked / we just spoke -> stay quiet, let them talk
  if (!detector || !cam.videoWidth) return;            // alerts are person-arrival only -> need the detector
  tickN += 1;

  // Live alerts fire ONLY when a NEW person enters (per the spec: rare ambient alerts, not narration).
  // Re-describing objects/movement makes the small VLM hallucinate and repeat; objects/text/colors are
  // answered on demand (the user asks), which is accurate. So we gate strictly on a fresh "person".
  let personNew = false;
  try {
    const preds = await detector.detect(cam, 5);
    const hasPerson = preds.some((p) => p.class === "person" && p.score >= 0.6);
    if (hasPerson) {
      const last = seenClasses.get("person");
      // never seen, or absent for more than PERSON_GONE ticks -> a real arrival
      personNew = last === undefined || tickN - last > PERSON_GONE;
      seenClasses.set("person", tickN);
    }
  } catch (e) {
    return;
  }
  if (!personNew) return;

  suppressLiveUntil = performance.now() + LIVE_COOLDOWN;   // one alert, then a quiet gap to ask
  const frame = await grabFrame();
  if (frame) sendWatch(frame, "person");
}

// word-set similarity (accent-insensitive) — the model often re-emits a near-identical
// line, so we drop it client-side instead of trusting the "do not repeat" prompt.
//
// Split `s` into comparable words: lower-cased, diacritics removed, everything
// except a-z, 0-9 and space treated as a separator, words of 1-2 characters
// dropped. A falsy `s` yields [].
function words(s) {
  return (s || "")
    .toLowerCase()
    .normalize("NFD")
    // U+0300–U+036F is the combining-diacritics block left behind by NFD. It is
    // written as escapes because the raw characters are invisible in an editor
    // and easy to corrupt.
    .replace(/[\u0300-\u036f]/g, "")
    .replace(/[^a-z0-9 ]/g, " ")
    .split(/\s+/)
    .filter((w) => w.length > 2);
}
// True when the word sets of `a` and `b` overlap with a Jaccard index of at
// least 0.5 (basically the same alert). Returns false if either side has no words.
function tooSimilar(a, b) {
  const setA = new Set(words(a));
  const setB = new Set(words(b));
  if (setA.size === 0 || setB.size === 0) return false;
  const shared = [...setA].filter((w) => setB.has(w)).length;
  const union = setA.size + setB.size - shared;
  return shared / union >= 0.5;
}

// Ask the "/watch" endpoint whether the frame holds something worth announcing
// and, if so, show and play it.
//   frame — image blob to inspect
//   hint  — optional hint passed through to the backend (e.g. "person")
// Skipped while another request is in flight. Replies that are not marked
// `speak`, have no answer, or are too similar to a recent alert are dropped.
// Errors are logged; the function never throws.
async function sendWatch(frame, hint = "") {
  if (busy) return;
  busy = true;
  try {
    const result = await client.predict("/watch", { image: handle_file(frame), prev: history.join(" · "), lang, hint });
    const out = Array.isArray(result.data) ? result.data[0] : result.data;
    // Live mode may have been switched off while the request was in flight;
    // an ambient alert is no longer wanted then.
    if (!liveOn) return;
    if (!out || !out.speak || !out.answer) return;
    if (history.some((h) => tooSimilar(h, out.answer))) return;

    history.push(out.answer);
    if (history.length > 5) history.shift();
    answerEl.textContent = out.answer;
    setState("speaking", "");

    const a = out.audio;
    let url = a && a.url;
    if (!url && a && a.path) url = window.location.origin + "/gradio_api/file=" + a.path;
    if (!url) {
      // Nothing to play, so "ended" never fires — leave "speaking" ourselves.
      resetSoon();
      return;
    }
    pauseSR();
    player.src = url;
    try {
      await player.play();
    } catch (e) {
      // Playback never started: undo the "speaking" state and the recognition pause.
      console.error("play:", e);
      resetSoon();
    }
  } catch (e) {
    console.error("watch:", e);
  } finally {
    busy = false;
  }
}

// ---- interaction ----
// Press on the stage. During onboarding it (re)starts the spoken welcome.
// Otherwise it arms a hold timer: if the press lasts HOLD_MS the microphone
// starts recording; the release handler decides between tap, double-tap and question.
stage.addEventListener("pointerdown", async () => {
  if (mode === "onboarding") {
    unlockAudio();
    onboard();             // speak welcome in the browser language + offer to switch
    return;
  }
  if (liveOn) { speechSynthesis.cancel(); player.pause(); }
  if (busy) return;
  unlockAudio();
  holding = true;
  stage.classList.add("armed");
  clearTimeout(holdTimer);   // a second pointer must not leave an earlier hold timer running
  const timerId = setTimeout(async () => {
    const started = await startRec();
    // startRec can take a while (permission prompt, device start-up). If the
    // press ended meanwhile, or a newer press took over, nobody would ever stop
    // this recording — discard it instead of flagging `recording`.
    if (!holding || holdTimer !== timerId) {
      if (started) stopRec().catch(() => {});
      return;
    }
    recording = started;
    if (recording) setState("listening", t("listening"));
  }, HOLD_MS);
  holdTimer = timerId;
});

let lastTapTime = 0;     // performance.now() of the previous quick tap (0 = none pending)
let tapTimer = null;     // pending single-tap "describe" timer
const DOUBLE_MS = 320;   // double-tap window

// End of a press on the stage:
//   - held long enough to record -> send frame + recorded question
//   - second quick tap within DOUBLE_MS -> toggle live mode
//   - single quick tap -> describe the scene once the double-tap window has passed
async function endPress() {
  // onboarding is handled on pointerdown; ignore releases without a tracked press
  if (mode === "onboarding" || !holding) return;
  holding = false;
  stage.classList.remove("armed");
  clearTimeout(holdTimer);

  if (recording) {                       // held and spoke = a question
    recording = false;
    const frame = await grabFrame();
    const audio = await stopRec();
    if (frame) {
      send(frame, audio);
    } else {
      setState("", t("camErr"));
    }
    return;
  }

  // quick tap: a pending single-tap action is cancelled either way
  const now = performance.now();
  const isDoubleTap = now - lastTapTime < DOUBLE_MS;
  if (tapTimer) {
    clearTimeout(tapTimer);
    tapTimer = null;
  }
  if (isDoubleTap) {
    lastTapTime = 0;
    setLive(!liveOn);
    return;
  }
  lastTapTime = now;
  tapTimer = setTimeout(async () => {
    tapTimer = null;
    const frame = await grabFrame();
    if (frame) {
      send(frame, null);
    } else {
      setState("", t("camErr"));
    }
  }, DOUBLE_MS);
}
stage.addEventListener("pointerup", endPress);
stage.addEventListener("pointercancel", endPress);

// ---- explicit buttons (low vision / keyboard / screen reader) ----
// Describe button: send the current frame with no question. Ignored outside
// normal mode or while a request is running.
btnDescribe.addEventListener("click", async (e) => {
  e.stopPropagation();
  const ready = mode === "normal" && !busy;
  if (!ready) return;
  unlockAudio();
  vibrate(10);
  const frame = await grabFrame();
  if (!frame) {
    setState("", t("camErr"));
    return;
  }
  send(frame, null);
});

let askRecBtn = false;    // the Ask button currently owns an active recording
let askPressed = false;   // the Ask button is physically held down

// Ask button, press: start recording a question (normal mode, not busy).
btnAsk.addEventListener("pointerdown", async (e) => {
  e.stopPropagation();
  if (mode !== "normal" || busy) return;
  unlockAudio(); vibrate(10);
  askPressed = true;
  const started = await startRec();
  // startRec can take a while (permission prompt, device start-up). If the
  // button was released meanwhile, btnAskEnd has already run and would never
  // stop this recording — discard it instead.
  if (!askPressed) {
    if (started) stopRec().catch(() => {});
    return;
  }
  askRecBtn = started;
  if (askRecBtn) setState("listening", t("listening"));
});

// Ask button, release / leave / cancel: stop recording and send frame + question.
async function btnAskEnd(e) {
  if (e) e.stopPropagation();
  askPressed = false;
  if (!askRecBtn) return;
  askRecBtn = false;
  const frame = await grabFrame();
  const audio = await stopRec();
  if (frame) send(frame, audio); else setState("", t("camErr"));
}
btnAsk.addEventListener("pointerup", btnAskEnd);
btnAsk.addEventListener("pointerleave", btnAskEnd);
btnAsk.addEventListener("pointercancel", btnAskEnd);

// Live button: toggle live mode (only in normal mode).
btnLive.addEventListener("click", (e) => {
  e.stopPropagation();
  if (mode === "normal") {
    unlockAudio();
    setLive(!liveOn);
  }
});

// ---- accessibility: max contrast + larger text (persisted) ----
let boost = false;
try {
  boost = localStorage.getItem("iris_a11y") === "1";
} catch (e) {
  // storage unavailable — start with the boost off
}

// Reflect the current `boost` flag on <body> and on the toggle button.
function applyBoost() {
  document.body.classList.toggle("a11y-boost", boost);
  a11yBtn.setAttribute("aria-pressed", boost ? "true" : "false");
}
applyBoost();

a11yBtn.addEventListener("click", (e) => {
  e.stopPropagation();
  boost = !boost;
  applyBoost();
  try {
    localStorage.setItem("iris_a11y", boost ? "1" : "0");
  } catch (e) {
    // persistence is best-effort
  }
  vibrate(10);
});

// button labels per language
// Writes the translated caption into each control's ".ctl-lbl" element.
function updateLabels() {
  const captions = [
    [btnAsk, "bAsk"],
    [btnDescribe, "bDescribe"],
    [btnLive, "bLive"],
  ];
  for (const [button, key] of captions) {
    button.querySelector(".ctl-lbl").textContent = t(key);
  }
}

// ---- boot ----
// Initialise the UI text, try the spoken welcome on a first run, start the
// camera, preload the detector, and connect to the backend.
(async function boot() {
  document.documentElement.lang = lang;
  langBtn.textContent = lang.toUpperCase();
  updateLabels();
  if (onboarded) {
    hintEl.textContent = t("hint");
    setState("", t("idle"));
  } else {
    hintEl.textContent = "";
    setState("", t("startTap"));
    onboard();   // try to speak the welcome on load (works where the browser allows it)
  }
  await startCamera();
  loadDetector();   // preload the object detector in the background (non-blocking)
  try {
    client = await Client.connect(window.location.origin);
    console.log("Iris connected");
  } catch (e) {
    console.error("connect:", e);
    setState("", t("err"));
  }
})();