Spaces:
Running
Running
| /** | |
| * Minimal Conversation - embedded app entry point. | |
| * | |
| * Mounted by the dispatcher when the URL has `?embedded=1`, i.e. we're | |
| * inside the host's iframe. The host has already: | |
| * - signed the user in (HF OAuth), | |
| * - let them pick a robot, | |
| * - established the WebRTC session, | |
| * - run the wake-up trajectory. | |
| * | |
| * `connectToHost()` resolves with a live SDK handle past all that. | |
| * From there this module owns: | |
| * - kicking off the OpenAI Realtime session (ephemeral key minted | |
| * from the visitor's HF token, see `ephemeral-key.ts`), | |
| * - routing audio (robot mic β OpenAI β robot speakers), | |
| * - the conversation FSM (listening / user-speaking / processing / | |
| * ai-speaking) and its visual orb, | |
| * - tool calls (head poses + body-language move catalog), | |
| * - the in-app side controls (mute, end conversation), | |
| * - the settings modal (only the model `instructions` is user- | |
| * editable; model + voice are locked to the server defaults). | |
| * | |
| * Sign-in, picking, top-bar avatar, end-session button are NOT this | |
| * app's concern - they live in `@pollen-robotics/reachy-mini-sdk/host`. | |
| */ | |
| import "./style.css"; | |
| import { connectToHost, type ConnectedHandle } from "@pollen-robotics/reachy-mini-sdk/host/embed"; | |
| import type { ReachyMiniInstance } from "@pollen-robotics/reachy-mini-sdk"; | |
| import { OpenaiRealtimeClient, type RealtimeTool } from "./openai-realtime"; | |
| import { | |
| EphemeralKeyError, | |
| invalidateEphemeralKey, | |
| mintEphemeralKey, | |
| } from "./ephemeral-key"; | |
| import { HeadWobbler } from "./head-wobbler"; | |
| import { AntennasOscillator } from "./antennas"; | |
| import { applyAudioStartupConfig } from "./audio-startup-config"; | |
| import { | |
| MovePlayer, | |
| MOVE_CATALOG, | |
| MOVE_IDS, | |
| type MoveId, | |
| } from "./move-player"; | |
| // βββ Settings & defaults ββββββββββββββββββββββββββββββββββββββββββββββββ | |
| // Locked server-side defaults. Model + voice used to be user-editable | |
| // but the ephemeral-key flow (see `ephemeral-key.ts`) provisions a | |
| // session tied to a specific model: the Pollen mint endpoint | |
| // (`/api/openai/ephemeral` on `pollen-robotics-reachy-mini`) currently | |
| // mints for `gpt-realtime-2`. Sending a different model name in the | |
| // SDP handshake's session config makes OpenAI reject the call with | |
| // `400 invalid_model "Model X does not match the realtime token | |
| // model."`, so we MUST pass back the exact same model string the mint | |
| // endpoint defaulted to. If the upstream default ever changes, bump | |
| // this constant in sync; until then, keep these two values aligned | |
| // with `reachy_mini_mobile_app/.../settings.ts:DEFAULT_MODEL`. | |
| const DEFAULT_MODEL = "gpt-realtime-2"; | |
| const DEFAULT_VOICE = "cedar"; | |
| const DEFAULT_INSTRUCTIONS = | |
| "You are Reachy Mini, a small friendly robot companion. " + | |
| "Keep replies short, warm, and spoken. Avoid long monologues. " + | |
| "You control a small robot body. Two tools are available:\n" + | |
| " - `move_head`: point the head in a named direction (up, down, left, " + | |
| "right, tilt_left, tilt_right, center). Instant, use for subtle gestures " + | |
| "that accompany a sentence.\n" + | |
| " - `play_move`: trigger a short pre-recorded choreography (1-4s). The " + | |
| "catalog mixes `dance` entries (rhythmic, playful) and `emotion` entries " + | |
| "(reactive body language). Pick a dance when the moment calls for " + | |
| "theatricality (hi, joke, groove) and an emotion when reacting to " + | |
| "something the user just said (surprise, curiosity, praise, bad news).\n" + | |
| "Use tools sparingly, never more than once per reply."; | |
| // `instructions` is the only persisted setting now that the API key | |
| // comes from the HF-token ephemeral mint and model/voice are locked | |
| // server-side. The legacy `reachyMini.openai.{apiKey,model,voice}` | |
| // localStorage entries are intentionally left in place for any | |
| // returning visitor - they're simply ignored on read. | |
| const STORAGE_KEYS = { | |
| instructions: "reachyMini.openai.instructions", | |
| } as const; | |
| interface Settings { | |
| instructions: string; | |
| } | |
| function loadSettings(): Settings { | |
| return { | |
| instructions: | |
| localStorage.getItem(STORAGE_KEYS.instructions) ?? DEFAULT_INSTRUCTIONS, | |
| }; | |
| } | |
| function saveSettings(s: Settings): void { | |
| localStorage.setItem(STORAGE_KEYS.instructions, s.instructions); | |
| } | |
| // βββ Robot tools exposed to the OpenAI model ββββββββββββββββββββββββββββ | |
| const HEAD_POSES = { | |
| center: { roll: 0, pitch: 0, yaw: 0 }, | |
| up: { roll: 0, pitch: -18, yaw: 0 }, | |
| down: { roll: 0, pitch: 18, yaw: 0 }, | |
| left: { roll: 0, pitch: 0, yaw: 25 }, | |
| right: { roll: 0, pitch: 0, yaw: -25 }, | |
| tilt_left: { roll: -15, pitch: 0, yaw: 0 }, | |
| tilt_right: { roll: 15, pitch: 0, yaw: 0 }, | |
| } as const; | |
| type HeadPoseName = keyof typeof HEAD_POSES; | |
| const ROBOT_TOOLS: RealtimeTool[] = [ | |
| { | |
| name: "move_head", | |
| description: | |
| "Point the robot's head in a named direction. Use this to accompany " + | |
| "your speech with a tiny, legible gesture.", | |
| parameters: { | |
| type: "object", | |
| properties: { | |
| direction: { | |
| type: "string", | |
| enum: Object.keys(HEAD_POSES), | |
| description: "Named head pose to assume.", | |
| }, | |
| }, | |
| required: ["direction"], | |
| }, | |
| }, | |
| { | |
| name: "play_move", | |
| description: | |
| "Trigger a short pre-recorded body-language move from the Reachy " + | |
| "dances + emotions library. Catalog (id | kind | when to pick it):\n" + | |
| MOVE_CATALOG.map( | |
| (m) => ` - ${m.id} | ${m.kind} | ${m.description}`, | |
| ).join("\n"), | |
| parameters: { | |
| type: "object", | |
| properties: { | |
| name: { | |
| type: "string", | |
| enum: [...MOVE_IDS], | |
| description: "Catalog id to play.", | |
| }, | |
| }, | |
| required: ["name"], | |
| }, | |
| }, | |
| ]; | |
| // βββ App state machine ββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| // | |
| // The host owns all pre-session state (sign-in / picker / connecting). | |
| // This app's FSM starts at `idle` once the host handed us a live SDK, | |
| // and only transitions between conversation-level states. | |
| type AppState = | |
| | "idle" | |
| | "starting" | |
| | "listening" | |
| | "user-speaking" | |
| | "processing" | |
| | "ai-speaking" | |
| | "error"; | |
| interface StateView { | |
| caption: string; | |
| disabled: boolean; | |
| } | |
| const STATE_VIEWS: Record<AppState, StateView> = { | |
| idle: { caption: "Tap to start", disabled: false }, | |
| starting: { caption: "Starting", disabled: true }, | |
| listening: { caption: "", disabled: false }, | |
| "user-speaking": { caption: "", disabled: false }, | |
| processing: { caption: "", disabled: false }, | |
| "ai-speaking": { caption: "", disabled: false }, | |
| error: { caption: "Tap to retry", disabled: false }, | |
| }; | |
| const STATE_CLASS: Record<AppState, string> = { | |
| idle: "state-authenticated", | |
| starting: "state-starting", | |
| listening: "state-listening", | |
| "user-speaking": "state-user-speaking", | |
| processing: "state-processing", | |
| "ai-speaking": "state-ai-speaking", | |
| error: "state-error", | |
| }; | |
| const LIVE_STATES: ReadonlySet<AppState> = new Set([ | |
| "listening", | |
| "user-speaking", | |
| "processing", | |
| "ai-speaking", | |
| "starting", | |
| ]); | |
| /** Map our FSM to the embed protocol's coarse `AppPhase`. | |
| * | |
| * We deliberately collapse every intra-session state to `"live"` - | |
| * including `"starting"`. The host treats `"connecting"` as "the | |
| * embedded app is not interactive, reveal my ConnectingView overlay | |
| * on top", which made sense during the initial wake-up handshake | |
| * (owned by `connectToHost()` and surfaced from outside this FSM) | |
| * but is wrong for the post-boot transitions. Every time we re-enter | |
| * `"starting"` (user tapped the orb to begin a conversation, or the | |
| * silent reconnect kicked in) the app is already mounted and has its | |
| * own visual feedback - the `.state-starting .ind-spinner` CSS | |
| * showing a spinner inside the central circle. Reporting | |
| * `"connecting"` would re-paint the host's full-bleed overlay over | |
| * our spinner, which both flashes the user back to an | |
| * "establishing-session" screen they already cleared, and breaks | |
| * the silent-reconnect contract for ICE blips. | |
| */ | |
| function mapAppStateToHostPhase( | |
| state: AppState, | |
| ): "boot" | "connecting" | "live" | "leaving" | "error" { | |
| if (state === "error") return "error"; | |
| return "live"; | |
| } | |
| // βββ DOM refs βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| const $ = <T extends HTMLElement>(selector: string): T => { | |
| const el = document.querySelector<T>(selector); | |
| if (!el) throw new Error(`Missing element: ${selector}`); | |
| return el; | |
| }; | |
| const appRoot = $<HTMLElement>("#app"); | |
| const circleBtn = $<HTMLButtonElement>("#main-circle"); | |
| const circleCaption = $<HTMLParagraphElement>("#circle-caption"); | |
| const toolToast = $<HTMLElement>("#tool-toast"); | |
| const toolToastText = toolToast.querySelector<HTMLSpanElement>( | |
| ".tool-toast-text", | |
| )!; | |
| const micBtn = $<HTMLButtonElement>("#mic-btn"); | |
| const stopBtn = $<HTMLButtonElement>("#stop-btn"); | |
| const settingsBtn = $<HTMLButtonElement>("#settings-btn"); | |
| const settingsModal = $<HTMLDialogElement>("#settings-modal"); | |
| const inputInstructions = $<HTMLTextAreaElement>("#openai-instructions"); | |
| const restartBtn = $<HTMLButtonElement>("#restart-conversation"); | |
| const restartHint = $<HTMLElement>("#restart-hint"); | |
| const settingsForm = settingsModal.querySelector<HTMLFormElement>("form")!; | |
| // βββ Runtime state ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| let currentState: AppState = "idle"; | |
| let settings: Settings = loadSettings(); | |
| let robot: ReachyMiniInstance | null = null; | |
| let hostHandle: ConnectedHandle | null = null; | |
| let openai: OpenaiRealtimeClient | null = null; | |
| let openaiSink: HTMLAudioElement | null = null; | |
| let wobbler: HeadWobbler | null = null; | |
| let antennas: AntennasOscillator | null = null; | |
| let micLevel: MicLevelMonitor | null = null; | |
| let aiLevel: AiLevelMonitor | null = null; | |
| let movePlayer: MovePlayer | null = null; | |
| let movePlaying = false; | |
| let toolPoseRestoreTimer: number | null = null; | |
| let openaiReconnecting = false; | |
| let openaiReconnectAttempts = 0; | |
| let wakeLock: { release(): Promise<void> } | null = null; | |
| let wakeLockUnavailable = false; | |
| let micMuted = false; | |
| // βββ UI rendering βββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| function setState(next: AppState): void { | |
| currentState = next; | |
| hostHandle?.setAppState({ phase: mapAppStateToHostPhase(next) }); | |
| const view = STATE_VIEWS[next]; | |
| circleBtn.disabled = view.disabled; | |
| circleBtn.className = `circle ${STATE_CLASS[next]}`; | |
| if (next !== "error") setCaption(view.caption); | |
| const live = LIVE_STATES.has(next); | |
| document | |
| .querySelector(".orb-wrap") | |
| ?.classList.toggle("live", live); | |
| micBtn.setAttribute("aria-hidden", live ? "false" : "true"); | |
| stopBtn.setAttribute("aria-hidden", live ? "false" : "true"); | |
| micBtn.tabIndex = live ? 0 : -1; | |
| stopBtn.tabIndex = live ? 0 : -1; | |
| updateRestartAvailability(); | |
| } | |
| function updateRestartAvailability(): void { | |
| const live = LIVE_STATES.has(currentState); | |
| restartBtn.disabled = !live; | |
| restartHint.hidden = live; | |
| } | |
| function setCaption(text: string, kind: "" | "error" | "muted" = ""): void { | |
| const trimmed = text.trim(); | |
| circleCaption.textContent = trimmed; | |
| circleCaption.className = `circle-caption${kind ? ` ${kind}` : ""}${trimmed ? "" : " empty"}`; | |
| } | |
| // βββ Settings modal βββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| function openSettings(): void { | |
| inputInstructions.value = settings.instructions; | |
| updateRestartAvailability(); | |
| settingsModal.showModal(); | |
| } | |
| settingsBtn.addEventListener("click", () => openSettings()); | |
| settingsForm.addEventListener("submit", (event) => { | |
| const submitter = (event as SubmitEvent).submitter as HTMLButtonElement | null; | |
| if (submitter?.value !== "save") return; | |
| settings = { | |
| instructions: inputInstructions.value.trim() || DEFAULT_INSTRUCTIONS, | |
| }; | |
| saveSettings(settings); | |
| }); | |
| restartBtn.addEventListener("click", async () => { | |
| if (!LIVE_STATES.has(currentState)) return; | |
| settings = { | |
| instructions: inputInstructions.value.trim() || DEFAULT_INSTRUCTIONS, | |
| }; | |
| saveSettings(settings); | |
| settingsModal.close(); | |
| try { | |
| await teardownConversation(); | |
| await startConversation(); | |
| } catch (err) { | |
| onFatalError(err); | |
| } | |
| }); | |
| // βββ Click handler for the central circle ββββββββββββββββββββββββββββββ | |
| circleBtn.addEventListener("click", async () => { | |
| try { | |
| if (currentState === "idle") { | |
| await startConversation(); | |
| return; | |
| } | |
| if (currentState === "error") { | |
| circleCaption.removeAttribute("title"); | |
| setState("idle"); | |
| return; | |
| } | |
| } catch (err) { | |
| onFatalError(err); | |
| } | |
| }); | |
| // βββ Side controls (mic mute + stop) ββββββββββββββββββββββββββββββββββββ | |
| micBtn.addEventListener("click", () => { | |
| if (!robot) return; | |
| micMuted = !micMuted; | |
| robot.setMicMuted(micMuted); | |
| micBtn.classList.toggle("muted", micMuted); | |
| micBtn.setAttribute("aria-label", micMuted ? "Unmute" : "Mute"); | |
| micBtn.title = micMuted ? "Unmute" : "Mute"; | |
| }); | |
| stopBtn.addEventListener("click", async () => { | |
| await teardownConversation(); | |
| micMuted = false; | |
| micBtn.classList.remove("muted"); | |
| setState("idle"); | |
| }); | |
| // βββ High-level flow steps ββββββββββββββββββββββββββββββββββββββββββββββ | |
| async function startConversation(): Promise<void> { | |
| if (!robot) return; | |
| setState("starting"); | |
| const robotMicTrack = getRobotMicTrack(robot); | |
| if (!robotMicTrack) { | |
| onFatalError(new Error("Could not find the robot's microphone track")); | |
| return; | |
| } | |
| startMicLevelMonitor(robotMicTrack); | |
| startAntennas(); | |
| void acquireWakeLock(); | |
| openaiReconnectAttempts = 0; | |
| try { | |
| await connectOpenai(robotMicTrack); | |
| } catch (err) { | |
| onFatalError(err); | |
| return; | |
| } | |
| // Make sure the robot's outbound audio path is open so OpenAI's voice | |
| // reaches the speakers. | |
| robot.setMicMuted(false); | |
| } | |
| async function connectOpenai(robotMicTrack: MediaStreamTrack): Promise<void> { | |
| // Mint an OpenAI Realtime ephemeral key from the visitor's HF token | |
| // (seeded into `sessionStorage.hf_token` by the host shell). The | |
| // returned `ek_β¦` value is used as the Bearer for the SDP handshake. | |
| // Failures bubble back to `startConversation()` as a fatal error | |
| // with a user-friendly caption. | |
| let ephemeralKey: string; | |
| try { | |
| ephemeralKey = await mintEphemeralKey(); | |
| } catch (err) { | |
| if (err instanceof EphemeralKeyError) { | |
| throw new Error(captionForEphemeralError(err)); | |
| } | |
| throw err; | |
| } | |
| const client = new OpenaiRealtimeClient({ | |
| apiKey: ephemeralKey, | |
| model: DEFAULT_MODEL, | |
| voice: DEFAULT_VOICE, | |
| instructions: settings.instructions, | |
| inputTrack: robotMicTrack, | |
| tools: ROBOT_TOOLS, | |
| }); | |
| client.on("outputTrack", ({ track }) => { | |
| routeOpenaiToRobot(track); | |
| startWobbler(track); | |
| startAiLevelMonitor(track); | |
| }); | |
| client.on("status", ({ status }) => { | |
| switch (status) { | |
| case "connected": | |
| if (currentState === "ai-speaking" && aiLevel) { | |
| aiLevel.waitForSilence(900, () => { | |
| if (currentState === "ai-speaking") { | |
| setState("listening"); | |
| antennas?.resume(); | |
| } | |
| }); | |
| } else { | |
| setState("listening"); | |
| antennas?.resume(); | |
| } | |
| openaiReconnectAttempts = 0; | |
| break; | |
| case "user-speaking": | |
| aiLevel?.cancelSilenceWait(); | |
| setState("user-speaking"); | |
| wobbler?.reset(); | |
| antennas?.freeze(); | |
| break; | |
| case "processing": | |
| setState("processing"); | |
| antennas?.resume(); | |
| break; | |
| case "ai-speaking": | |
| aiLevel?.cancelSilenceWait(); | |
| setState("ai-speaking"); | |
| antennas?.resume(); | |
| break; | |
| case "error": | |
| if (openaiReconnecting) return; | |
| void tryReconnectOpenai( | |
| robotMicTrack, | |
| new Error("OpenAI connection lost"), | |
| ); | |
| break; | |
| default: | |
| break; | |
| } | |
| }); | |
| client.on("toolCall", (call) => handleToolCall(call)); | |
| client.on("error", ({ error }) => console.error("[openai]", error)); | |
| openai = client; | |
| await client.connect(); | |
| } | |
| async function tryReconnectOpenai( | |
| robotMicTrack: MediaStreamTrack, | |
| cause: Error, | |
| ): Promise<void> { | |
| if (openaiReconnecting) return; | |
| if (openaiReconnectAttempts >= 1) { | |
| onFatalError(cause); | |
| return; | |
| } | |
| openaiReconnecting = true; | |
| openaiReconnectAttempts += 1; | |
| console.warn("[openai] connection lost, attempting silent reconnectβ¦", cause); | |
| setState("starting"); | |
| setCaption("Reconnecting", "muted"); | |
| stopWobbler(); | |
| antennas?.freeze(); | |
| try { | |
| await openai?.close(); | |
| } catch (err) { | |
| console.warn("[openai] close during reconnect failed:", err); | |
| } | |
| openai = null; | |
| // Drop the cached ephemeral key before the reconnect attempt. The | |
| // mint-cache is sized for back-to-back handshakes in normal | |
| // conditions, but a real disconnect implies the key may have been | |
| // server-revoked (clock skew, master key rotation, rate-limit). The | |
| // extra ~200 ms round-trip to re-mint is cheap compared to the | |
| // alternative: an OpenAI 401 that fails the silent reconnect and | |
| // forces the user back to the central circle. | |
| invalidateEphemeralKey(); | |
| await new Promise((resolve) => setTimeout(resolve, 500)); | |
| try { | |
| await connectOpenai(robotMicTrack); | |
| } catch (err) { | |
| openaiReconnecting = false; | |
| onFatalError(err instanceof Error ? err : new Error(String(err))); | |
| return; | |
| } | |
| openaiReconnecting = false; | |
| } | |
| /** | |
| * User-facing caption for an `EphemeralKeyError`. Kept terse because | |
| * the central circle's caption only has room for a single short line; | |
| * the full error message goes to the `circleCaption.title` tooltip | |
| * via `onFatalError()`. | |
| */ | |
| function captionForEphemeralError(err: EphemeralKeyError): string { | |
| if (err.reason === "hf_token_missing") { | |
| return "Sign in to Hugging Face to start a conversation"; | |
| } | |
| return `Could not reach the OpenAI key service (HTTP ${err.status ?? "?"})`; | |
| } | |
| function getRobotMicTrack( | |
| robotInstance: ReachyMiniInstance, | |
| ): MediaStreamTrack | null { | |
| const pc = robotInstance._pc; | |
| if (!pc) return null; | |
| for (const receiver of pc.getReceivers()) { | |
| if (receiver.track && receiver.track.kind === "audio") { | |
| return receiver.track; | |
| } | |
| } | |
| return null; | |
| } | |
| function routeOpenaiToRobot(track: MediaStreamTrack): void { | |
| if (!robot) return; | |
| const pc = robot._pc; | |
| if (!pc) return; | |
| const transceivers = pc.getTransceivers(); | |
| const audioTransceiver = transceivers.find( | |
| (t) => | |
| t.receiver.track?.kind === "audio" || t.sender.track?.kind === "audio", | |
| ); | |
| const audioSender = audioTransceiver?.sender ?? null; | |
| if (audioSender) { | |
| if ( | |
| audioTransceiver && | |
| audioTransceiver.direction !== "sendrecv" && | |
| audioTransceiver.direction !== "sendonly" | |
| ) { | |
| try { | |
| audioTransceiver.direction = "sendrecv"; | |
| } catch (err) { | |
| console.warn("[main] could not bump transceiver direction:", err); | |
| } | |
| } | |
| audioSender.replaceTrack(track).catch((err) => { | |
| console.error("[main] replaceTrack failed", err); | |
| }); | |
| } else { | |
| console.warn( | |
| "[main] no audio transceiver on the robot peer β bidirectional audio unavailable", | |
| ); | |
| } | |
| if (!openaiSink) { | |
| openaiSink = document.createElement("audio"); | |
| openaiSink.autoplay = true; | |
| openaiSink.muted = true; | |
| document.body.appendChild(openaiSink); | |
| } | |
| openaiSink.srcObject = new MediaStream([track]); | |
| } | |
| // βββ Head motion agent ββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| function startWobbler(assistantTrack: MediaStreamTrack): void { | |
| if (!robot) return; | |
| wobbler?.stop(); | |
| wobbler = new HeadWobbler({ | |
| track: assistantTrack, | |
| onOffsets: ({ roll, pitch, yaw }) => { | |
| if (toolPoseRestoreTimer !== null) return; | |
| if (movePlaying) return; | |
| const ok = robot?.setHeadRpyDeg(roll, pitch, yaw) ?? false; | |
| recordSend(ok, "wobbler"); | |
| }, | |
| }); | |
| wobbler.start(); | |
| } | |
| function stopWobbler(): void { | |
| wobbler?.stop(); | |
| wobbler = null; | |
| robot?.setHeadRpyDeg(0, 0, 0); | |
| } | |
| // βββ Antennas oscillator ββββββββββββββββββββββββββββββββββββββββββββββββ | |
| function startAntennas(): void { | |
| if (!robot) return; | |
| antennas?.stop(); | |
| antennas = new AntennasOscillator({ | |
| onAntennas: (right, left) => { | |
| if (movePlaying) return; | |
| const ok = robot?.setAntennasDeg(right, left) ?? false; | |
| recordSend(ok, "antennas"); | |
| }, | |
| }); | |
| antennas.start(); | |
| } | |
| function stopAntennas(): void { | |
| antennas?.stop(); | |
| antennas = null; | |
| robot?.setAntennasDeg(0, 0); | |
| } | |
| // βββ Tool-call handler βββββββββββββββββββββββββββββββββββββββββββββββββ | |
| let toolToastTimer: number | null = null; | |
| function showToolToast(text: string, durationMs = 2800): void { | |
| if (toolToastTimer !== null) { | |
| clearTimeout(toolToastTimer); | |
| toolToastTimer = null; | |
| } | |
| toolToastText.textContent = text; | |
| toolToast.classList.add("visible"); | |
| toolToast.setAttribute("aria-hidden", "false"); | |
| toolToastTimer = window.setTimeout(() => { | |
| toolToast.classList.remove("visible"); | |
| toolToast.setAttribute("aria-hidden", "true"); | |
| toolToastTimer = null; | |
| }, durationMs); | |
| } | |
| function describeToolCall(name: string, args: Record<string, unknown>): string { | |
| switch (name) { | |
| case "move_head": { | |
| const direction = String(args.direction ?? "").toLowerCase(); | |
| const labels: Record<string, string> = { | |
| up: "Looking up", | |
| down: "Looking down", | |
| left: "Looking left", | |
| right: "Looking right", | |
| center: "Looking forward", | |
| neutral: "Looking forward", | |
| }; | |
| return labels[direction] ?? `Moving head: ${direction || "?"}`; | |
| } | |
| case "play_move": { | |
| const move = String(args.name ?? ""); | |
| return move ? `Playing ${move}` : "Playing move"; | |
| } | |
| default: | |
| return `Tool: ${name}`; | |
| } | |
| } | |
| async function handleToolCall({ | |
| callId, | |
| name, | |
| arguments: args, | |
| }: { | |
| callId: string; | |
| name: string; | |
| arguments: Record<string, unknown>; | |
| }): Promise<void> { | |
| if (!robot || !openai) return; | |
| showToolToast(describeToolCall(name, args)); | |
| let result: { ok: boolean; message: string }; | |
| switch (name) { | |
| case "move_head": { | |
| const direction = String(args.direction ?? ""); | |
| if (direction in HEAD_POSES) { | |
| const pose = HEAD_POSES[direction as HeadPoseName]; | |
| applyToolHeadPose(pose); | |
| result = { ok: true, message: `head moved to ${direction}` }; | |
| } else { | |
| result = { | |
| ok: false, | |
| message: `unknown direction '${direction}'. Valid: ${Object.keys(HEAD_POSES).join(", ")}`, | |
| }; | |
| } | |
| break; | |
| } | |
| case "play_move": { | |
| const moveName = String(args.name ?? ""); | |
| if ((MOVE_IDS as readonly string[]).includes(moveName)) { | |
| try { | |
| await playMove(moveName as MoveId); | |
| result = { ok: true, message: `played move '${moveName}'` }; | |
| } catch (err) { | |
| result = { | |
| ok: false, | |
| message: `failed to play '${moveName}': ${err instanceof Error ? err.message : String(err)}`, | |
| }; | |
| } | |
| } else { | |
| result = { | |
| ok: false, | |
| message: `unknown move '${moveName}'. Valid: ${MOVE_IDS.join(", ")}`, | |
| }; | |
| } | |
| break; | |
| } | |
| default: | |
| result = { ok: false, message: `unknown tool '${name}'` }; | |
| } | |
| openai.sendToolResponse(callId, result); | |
| } | |
| async function playMove(name: MoveId): Promise<void> { | |
| if (!robot) return; | |
| movePlayer ??= new MovePlayer(robot); | |
| movePlaying = true; | |
| try { | |
| await movePlayer.play(name); | |
| } finally { | |
| movePlaying = false; | |
| robot.setAntennasDeg(0, 0); | |
| } | |
| } | |
| function applyToolHeadPose(pose: { | |
| roll: number; | |
| pitch: number; | |
| yaw: number; | |
| }): void { | |
| if (!robot) return; | |
| robot.setHeadRpyDeg(pose.roll, pose.pitch, pose.yaw); | |
| if (toolPoseRestoreTimer !== null) clearTimeout(toolPoseRestoreTimer); | |
| toolPoseRestoreTimer = window.setTimeout(() => { | |
| toolPoseRestoreTimer = null; | |
| }, 1200); | |
| } | |
| // βββ Mic-level monitor (circle audio-reactivity) ββββββββββββββββββββββββ | |
| class MicLevelMonitor { | |
| private ctx: AudioContext | null = null; | |
| private analyser: AnalyserNode | null = null; | |
| private source: MediaStreamAudioSourceNode | null = null; | |
| private raf = 0; | |
| private timeBuf: Float32Array<ArrayBuffer> | null = null; | |
| private freqBuf: Uint8Array<ArrayBuffer> | null = null; | |
| private level = 0; | |
| private bands = [0, 0, 0, 0, 0]; | |
| private static readonly BAND_EDGES = [4, 8, 16, 32, 64, 128]; | |
| private static readonly LOG1P_10 = Math.log1p(10); | |
| private static compress(v: number): number { | |
| return Math.log1p(v * 10) / MicLevelMonitor.LOG1P_10; | |
| } | |
| start(track: MediaStreamTrack): void { | |
| this.stop(); | |
| const ctx = new AudioContext(); | |
| const src = ctx.createMediaStreamSource(new MediaStream([track])); | |
| const analyser = ctx.createAnalyser(); | |
| analyser.fftSize = 1024; | |
| analyser.smoothingTimeConstant = 0.75; | |
| src.connect(analyser); | |
| this.ctx = ctx; | |
| this.source = src; | |
| this.analyser = analyser; | |
| this.timeBuf = new Float32Array(new ArrayBuffer(analyser.fftSize * 4)); | |
| this.freqBuf = new Uint8Array( | |
| new ArrayBuffer(analyser.frequencyBinCount), | |
| ); | |
| const rootStyle = document.documentElement.style; | |
| const tick = (): void => { | |
| const an = this.analyser; | |
| const tbuf = this.timeBuf; | |
| const fbuf = this.freqBuf; | |
| if (!an || !tbuf || !fbuf) return; | |
| an.getFloatTimeDomainData(tbuf); | |
| let sum = 0; | |
| for (let i = 0; i < tbuf.length; i++) sum += tbuf[i]! * tbuf[i]!; | |
| const rms = Math.sqrt(sum / tbuf.length); | |
| const boosted = Math.min(1, Math.pow(rms * 6, 0.7)); | |
| const levelAttack = boosted > this.level ? 0.55 : 0.12; | |
| this.level += (boosted - this.level) * levelAttack; | |
| rootStyle.setProperty("--audio-level", this.level.toFixed(3)); | |
| an.getByteFrequencyData(fbuf); | |
| const edges = MicLevelMonitor.BAND_EDGES; | |
| for (let b = 0; b < 5; b++) { | |
| const lo = edges[b]!; | |
| const hi = edges[b + 1]!; | |
| let bandSum = 0; | |
| for (let j = lo; j < hi; j++) bandSum += fbuf[j]!; | |
| const raw = MicLevelMonitor.compress(bandSum / (hi - lo) / 255); | |
| const bandAttack = raw > this.bands[b]! ? 0.35 : 0.12; | |
| this.bands[b]! += (raw - this.bands[b]!) * bandAttack; | |
| rootStyle.setProperty( | |
| `--bar${b}`, | |
| Math.min(1, this.bands[b]!).toFixed(3), | |
| ); | |
| } | |
| this.raf = requestAnimationFrame(tick); | |
| }; | |
| this.raf = requestAnimationFrame(tick); | |
| } | |
| stop(): void { | |
| cancelAnimationFrame(this.raf); | |
| this.raf = 0; | |
| try { | |
| this.source?.disconnect(); | |
| this.analyser?.disconnect(); | |
| this.ctx?.close(); | |
| } catch { | |
| /* swallow */ | |
| } | |
| this.ctx = null; | |
| this.source = null; | |
| this.analyser = null; | |
| this.timeBuf = null; | |
| this.freqBuf = null; | |
| this.level = 0; | |
| this.bands = [0, 0, 0, 0, 0]; | |
| const rootStyle = document.documentElement.style; | |
| rootStyle.setProperty("--audio-level", "0"); | |
| for (let b = 0; b < 5; b++) rootStyle.setProperty(`--bar${b}`, "0"); | |
| } | |
| resumeAudio(): void { | |
| const ctx = this.ctx; | |
| if (!ctx || ctx.state !== "suspended") return; | |
| ctx.resume().catch((err) => { | |
| console.warn("[mic-level] audioCtx resume failed:", err); | |
| }); | |
| } | |
| } | |
| function startMicLevelMonitor(track: MediaStreamTrack): void { | |
| micLevel ??= new MicLevelMonitor(); | |
| micLevel.start(track); | |
| } | |
| function stopMicLevelMonitor(): void { | |
| micLevel?.stop(); | |
| } | |
| class AiLevelMonitor { | |
| private ctx: AudioContext | null = null; | |
| private analyser: AnalyserNode | null = null; | |
| private source: MediaStreamAudioSourceNode | null = null; | |
| private raf = 0; | |
| private timeBuf: Float32Array<ArrayBuffer> | null = null; | |
| private level = 0; | |
| private lastActiveTs = 0; | |
| private silenceWait: { | |
| quietMs: number; | |
| cb: () => void; | |
| maxWaitTimer: number | null; | |
| } | null = null; | |
| private static readonly SILENCE_THRESHOLD = 0.006; | |
| start(track: MediaStreamTrack): void { | |
| this.stop(); | |
| const ctx = new AudioContext(); | |
| const src = ctx.createMediaStreamSource(new MediaStream([track])); | |
| const analyser = ctx.createAnalyser(); | |
| analyser.fftSize = 1024; | |
| analyser.smoothingTimeConstant = 0.75; | |
| src.connect(analyser); | |
| this.ctx = ctx; | |
| this.source = src; | |
| this.analyser = analyser; | |
| this.timeBuf = new Float32Array(new ArrayBuffer(analyser.fftSize * 4)); | |
| this.lastActiveTs = performance.now(); | |
| const rootStyle = document.documentElement.style; | |
| const tick = (): void => { | |
| const an = this.analyser; | |
| const buf = this.timeBuf; | |
| if (!an || !buf) return; | |
| an.getFloatTimeDomainData(buf); | |
| let sum = 0; | |
| for (let i = 0; i < buf.length; i++) sum += buf[i]! * buf[i]!; | |
| const rms = Math.sqrt(sum / buf.length); | |
| const boosted = Math.min(1, Math.pow(rms * 6, 0.7)); | |
| const levelAttack = boosted > this.level ? 0.55 : 0.12; | |
| this.level += (boosted - this.level) * levelAttack; | |
| rootStyle.setProperty("--ai-audio-level", this.level.toFixed(3)); | |
| const now = performance.now(); | |
| if (rms > AiLevelMonitor.SILENCE_THRESHOLD) { | |
| this.lastActiveTs = now; | |
| } else if (this.silenceWait) { | |
| const quietFor = now - this.lastActiveTs; | |
| if (quietFor >= this.silenceWait.quietMs) { | |
| const { cb, maxWaitTimer } = this.silenceWait; | |
| this.silenceWait = null; | |
| if (maxWaitTimer !== null) clearTimeout(maxWaitTimer); | |
| try { | |
| cb(); | |
| } catch (err) { | |
| console.warn("[ai-level] silence callback threw:", err); | |
| } | |
| } | |
| } | |
| this.raf = requestAnimationFrame(tick); | |
| }; | |
| this.raf = requestAnimationFrame(tick); | |
| } | |
| stop(): void { | |
| cancelAnimationFrame(this.raf); | |
| this.raf = 0; | |
| this.cancelSilenceWait(); | |
| try { | |
| this.source?.disconnect(); | |
| this.analyser?.disconnect(); | |
| this.ctx?.close(); | |
| } catch { | |
| /* swallow */ | |
| } | |
| this.ctx = null; | |
| this.source = null; | |
| this.analyser = null; | |
| this.timeBuf = null; | |
| this.level = 0; | |
| document.documentElement.style.setProperty("--ai-audio-level", "0"); | |
| } | |
| resumeAudio(): void { | |
| const ctx = this.ctx; | |
| if (!ctx || ctx.state !== "suspended") return; | |
| ctx.resume().catch((err) => { | |
| console.warn("[ai-level] audioCtx resume failed:", err); | |
| }); | |
| } | |
| waitForSilence(quietMs: number, cb: () => void, maxWaitMs = 8000): void { | |
| this.cancelSilenceWait(); | |
| const maxWaitTimer = window.setTimeout(() => { | |
| if (this.silenceWait?.cb === cb) { | |
| this.silenceWait = null; | |
| try { | |
| cb(); | |
| } catch (err) { | |
| console.warn("[ai-level] max-wait callback threw:", err); | |
| } | |
| } | |
| }, maxWaitMs); | |
| this.silenceWait = { quietMs, cb, maxWaitTimer }; | |
| } | |
| cancelSilenceWait(): void { | |
| if (!this.silenceWait) return; | |
| if (this.silenceWait.maxWaitTimer !== null) { | |
| clearTimeout(this.silenceWait.maxWaitTimer); | |
| } | |
| this.silenceWait = null; | |
| } | |
| } | |
| function startAiLevelMonitor(track: MediaStreamTrack): void { | |
| aiLevel ??= new AiLevelMonitor(); | |
| aiLevel.start(track); | |
| } | |
| function stopAiLevelMonitor(): void { | |
| aiLevel?.stop(); | |
| } | |
| // βββ Background-tab resilience ββββββββββββββββββββββββββββββββββββββββββ | |
| async function acquireWakeLock(): Promise<void> { | |
| if (wakeLockUnavailable) return; | |
| const anyNav = navigator as Navigator & { | |
| wakeLock?: { | |
| request(type: "screen"): Promise<{ release(): Promise<void> }>; | |
| }; | |
| }; | |
| if (!anyNav.wakeLock) { | |
| wakeLockUnavailable = true; | |
| return; | |
| } | |
| if (wakeLock) return; | |
| try { | |
| wakeLock = await anyNav.wakeLock.request("screen"); | |
| } catch (err) { | |
| const name = (err as { name?: string } | null)?.name; | |
| if (name === "NotAllowedError" || name === "SecurityError") { | |
| wakeLockUnavailable = true; | |
| } else { | |
| console.warn("[main] wakeLock.request failed:", err); | |
| } | |
| wakeLock = null; | |
| } | |
| } | |
| async function releaseWakeLock(): Promise<void> { | |
| try { | |
| await wakeLock?.release(); | |
| } catch { | |
| /* swallow */ | |
| } | |
| wakeLock = null; | |
| } | |
| function resumeAudioContexts(): void { | |
| wobbler?.resumeAudio(); | |
| micLevel?.resumeAudio(); | |
| aiLevel?.resumeAudio(); | |
| } | |
| document.addEventListener("visibilitychange", () => { | |
| if (document.hidden) return; | |
| if (LIVE_STATES.has(currentState)) { | |
| void acquireWakeLock(); | |
| resumeAudioContexts(); | |
| void probeRobotLink(); | |
| } | |
| }); | |
| // βββ Robot data-channel health βββββββββββββββββββββββββββββββββββββββββ | |
| let consecutiveSendFailures = 0; | |
| function recordSend(ok: boolean, where: string): void { | |
| if (ok) { | |
| consecutiveSendFailures = 0; | |
| return; | |
| } | |
| consecutiveSendFailures += 1; | |
| if (consecutiveSendFailures === 1 || consecutiveSendFailures % 20 === 0) { | |
| console.warn( | |
| `[main] robot send failed (${where}), ${consecutiveSendFailures} consecutive failures`, | |
| ); | |
| } | |
| if (consecutiveSendFailures >= 40) { | |
| onFatalError( | |
| new Error( | |
| "Lost the robot data channel (no commands acknowledged). Tap the circle to reconnect.", | |
| ), | |
| ); | |
| } | |
| } | |
| async function probeRobotLink(): Promise<void> { | |
| if (!robot) return; | |
| const ok = robot.setAntennasDeg(0, 0); | |
| if (!ok) { | |
| onFatalError( | |
| new Error( | |
| "Lost the robot data channel while the tab was hidden. Tap the circle to reconnect.", | |
| ), | |
| ); | |
| } | |
| } | |
| async function teardownConversation(): Promise<void> { | |
| if (toolPoseRestoreTimer !== null) { | |
| clearTimeout(toolPoseRestoreTimer); | |
| toolPoseRestoreTimer = null; | |
| } | |
| movePlayer?.stop(); | |
| movePlaying = false; | |
| openaiReconnecting = false; | |
| openaiReconnectAttempts = 0; | |
| try { | |
| await openai?.close(); | |
| } catch { | |
| /* swallow */ | |
| } | |
| openai = null; | |
| stopWobbler(); | |
| stopAntennas(); | |
| stopMicLevelMonitor(); | |
| stopAiLevelMonitor(); | |
| void releaseWakeLock(); | |
| if (openaiSink) { | |
| openaiSink.srcObject = null; | |
| openaiSink.remove(); | |
| openaiSink = null; | |
| } | |
| } | |
| async function onFatalError(err: unknown): Promise<void> { | |
| const message = err instanceof Error ? err.message : String(err); | |
| console.error("[main] error:", err); | |
| setState("error"); | |
| circleCaption.title = message; | |
| await teardownConversation(); | |
| } | |
| // βββ Boot βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| async function boot(): Promise<void> { | |
| hostHandle = await connectToHost(); | |
| robot = hostHandle.reachy; | |
| // Tune the XVF3800 audio board for conversation. Mirrors the Python | |
| // `apply_audio_startup_config()` call right after `start_recording()` | |
| // / `start_playing()` in `reachy_mini_conversation_app/console.py`: | |
| // by the time `connectToHost()` resolves the host's bridge has | |
| // already negotiated WebRTC audio in both directions and the | |
| // daemon's audio pipeline is hot, so the batched parameter write | |
| // lands on a settled board. | |
| // | |
| // Fire-and-forget: a failure (older SDK without `applyAudioConfig`, | |
| // missing audio board on a Lite running off-robot, DataChannel | |
| // burp) is non-fatal - the helper logs and we keep going with the | |
| // daemon's default tuning. The `void` makes the lack of await | |
| // explicit so we don't block the UI reveal on a multi-parameter | |
| // verify roundtrip (~100 ms Γ N). | |
| void applyAudioStartupConfig(robot); | |
| // Reveal the UI now that the SDK is ready. | |
| appRoot.classList.remove("hidden"); | |
| document.body.classList.remove("booting"); | |
| hostHandle.onLeave(async () => { | |
| await teardownConversation(); | |
| micMuted = false; | |
| micBtn.classList.remove("muted"); | |
| }); | |
| setState("idle"); | |
| } | |
| void boot().catch((err) => { | |
| console.error("[minimal-conversation/embed] bootstrap failed", err); | |
| try { | |
| window.parent.postMessage( | |
| { | |
| source: "reachy-mini", | |
| type: "embed:error", | |
| version: 1, | |
| message: err instanceof Error ? err.message : String(err), | |
| fatal: true, | |
| }, | |
| window.location.origin, | |
| ); | |
| } catch { | |
| /* swallow */ | |
| } | |
| }); | |