// Browser client for the voice-assistant demo.
// Captures microphone audio via an AudioWorklet, streams PCM frames over a
// WebSocket, and renders transcript / assistant / activity logs while playing
// back 24 kHz PCM16 audio the server sends as binary frames.

// --- DOM references ---------------------------------------------------------
const connectButton = document.getElementById("connectButton");
const socketState = document.getElementById("socketState");
const micLevel = document.getElementById("micLevel");
const speechState = document.getElementById("speechState");
const transcriptLog = document.getElementById("transcriptLog");
const assistantLog = document.getElementById("assistantLog");
const activityLog = document.getElementById("activityLog");
const voicePromptPathInput = document.getElementById("voicePromptPath");

// --- Audio / socket state ---------------------------------------------------
let audioContext;
let mediaStream;
let sourceNode;
let workletNode;
let playbackNode;
let socket;

// Buffer-source nodes currently playing assistant audio.
const activePlaybackNodes = new Set();

// --- Transcript / assistant turn state --------------------------------------
// Newest-first lines shown in the assistant pane.
const assistantLines = [];
// Index into assistantLines of the line updated by streaming assistant text,
// or null when no turn is being tracked.
let currentAssistantLineIndex = null;
// Per-turn indices for replaceable status/memory lines (see addAssistantActivityLine).
let currentTurn = null;
// Last finalized user transcript, used to suppress assistant echo.
let lastTranscriptText = "";
// Newest-first finalized transcript lines.
const transcriptLines = [];
// In-flight partial transcript, rendered above the finalized lines.
let currentPartialTranscript = "";
// True when assistant.done arrived while audio was still playing; the turn is
// finished once playback drains.
let assistantDonePending = false;
// Most recent mic level, already coerced to a finite number.
let lastMicLevel = 0;

/** Prepend a line to a <pre>-style log element (newest first). */
function appendLog(element, line) {
  element.textContent = `${line}\n${element.textContent}`.trim();
}

/** Re-render the transcript pane: partial line (if any) above finalized lines. */
function renderTranscriptLog() {
  const lines = [];
  if (currentPartialTranscript.trim()) {
    lines.push(`You: ${currentPartialTranscript.trim()} ...`);
  }
  lines.push(...transcriptLines);
  transcriptLog.textContent = lines.join("\n").trim();
}

/** Add a finalized transcript line at the top and re-render. */
function prependTranscriptLine(line) {
  transcriptLines.unshift(line);
  renderTranscriptLog();
}

function renderAssistantLog() {
  assistantLog.textContent = assistantLines.join("\n").trim();
}

/**
 * Prepend a line to the assistant pane. When trackTurn is set, the new line
 * (index 0) becomes the target that streaming assistant text overwrites.
 */
function prependAssistantLine(line, { trackTurn = false } = {}) {
  if (currentAssistantLineIndex !== null) {
    // Existing tracked line shifts down by one when we unshift.
    currentAssistantLineIndex += 1;
  }
  assistantLines.unshift(line);
  currentAssistantLineIndex = trackTurn ? 0 : currentAssistantLineIndex;
  renderAssistantLog();
}

/** Overwrite the tracked assistant line with streamed text, creating it if needed. */
function updateAssistantTurn(text) {
  const line = text.trim();
  if (!line) {
    return;
  }
  if (currentAssistantLineIndex === null || currentAssistantLineIndex >= assistantLines.length) {
    prependAssistantLine(line, { trackTurn: true });
    return;
  }
  assistantLines[currentAssistantLineIndex] = line;
  renderAssistantLog();
}

/** Reset all per-turn tracking state. */
function finishAssistantTurn() {
  currentAssistantLineIndex = null;
  currentTurn = null;
  assistantDonePending = false;
}

/** Start a new assistant turn anchored by the user's prompt text. */
function beginAssistantTurn(text) {
  finishAssistantTurn();
  prependAssistantLine(`› ${text.trim()}`);
  currentTurn = {
    promptIndex: 0,
    statusIndex: null,
    memoryIndex: null,
  };
}

/**
 * Add an activity line to the assistant pane. With replaceStatus/replaceMemory,
 * the turn's existing status/memory line is updated in place instead of
 * prepending a new one.
 */
function addAssistantActivityLine(line, { replaceStatus = false, replaceMemory = false } = {}) {
  if (!line.trim()) {
    return;
  }
  if (!currentTurn) {
    currentTurn = {
      promptIndex: null,
      statusIndex: null,
      memoryIndex: null,
    };
  }
  if (replaceStatus && currentTurn.statusIndex !== null && currentTurn.statusIndex < assistantLines.length) {
    assistantLines[currentTurn.statusIndex] = line;
    renderAssistantLog();
    return;
  }
  if (replaceMemory && currentTurn.memoryIndex !== null && currentTurn.memoryIndex < assistantLines.length) {
    assistantLines[currentTurn.memoryIndex] = line;
    renderAssistantLog();
    return;
  }
  prependAssistantLine(line);
  if (replaceStatus) {
    currentTurn.statusIndex = 0;
  }
  if (replaceMemory) {
    currentTurn.memoryIndex = 0;
  }
}

function setSocketState(value) {
  socketState.textContent = value;
}

/** Prepend to the activity log, capping it at 40 lines. */
function appendActivityLine(line) {
  appendLog(activityLog, line);
  const lines = activityLog.textContent.split("\n").filter(Boolean);
  if (lines.length > 40) {
    activityLog.textContent = lines.slice(0, 40).join("\n");
  }
}

/** Human-readable summary of an activity message (tool name + input, or text). */
function activityText(message) {
  if (message.kind === "tool_use") {
    const name = (message.toolName || "tool").trim();
    const detail = formatToolInput(message.toolInput);
    return detail ? `${name} ${detail}` : name;
  }
  return message.text || "";
}

/** Prefix an activity message with its category tag for the activity log. */
function formatActivityLine(message) {
  if (message.kind === "tool_use") {
    return `[tool] ${activityText(message)}`;
  }
  if (message.kind === "tool_result") {
    return message.isError ? `[tool error] ${activityText(message)}` : `[result] ${activityText(message)}`;
  }
  if (message.kind === "error") {
    return `[error] ${activityText(message)}`;
  }
  return `[status] ${activityText(message)}`;
}

/**
 * Compact one-line preview of a tool's input object: the first well-known
 * string field if present, otherwise up to two key=value pairs.
 */
function formatToolInput(toolInput) {
  if (!toolInput || typeof toolInput !== "object" || Array.isArray(toolInput)) {
    return "";
  }
  for (const key of ["path", "command", "url", "pattern", "query", "name", "tool_name"]) {
    const value = toolInput[key];
    if (typeof value === "string" && value.trim()) {
      return truncate(value.trim(), 140);
    }
  }
  const preview = Object.entries(toolInput)
    .slice(0, 2)
    .map(([key, value]) => `${key}=${stringifyValue(value)}`)
    .join(" ");
  return truncate(preview, 140);
}

/** Short, log-safe rendering of an arbitrary JSON-ish value. */
function stringifyValue(value) {
  if (typeof value === "string") {
    return JSON.stringify(value);
  }
  if (typeof value === "number" || typeof value === "boolean") {
    return String(value);
  }
  if (Array.isArray(value)) {
    return `[${value.length} items]`;
  }
  if (value && typeof value === "object") {
    return "{...}";
  }
  return "null";
}

/** Truncate value to limit characters, ending with "..." when cut. */
function truncate(value, limit) {
  return value.length <= limit ? value : `${value.slice(0, limit - 3)}...`;
}

/** Lowercase, strip speaker prefixes and punctuation, collapse whitespace. */
function normalizeForCompare(text) {
  return (text || "")
    .trim()
    .replace(/^(you|user|agent|assistant)\s*:\s*/i, "")
    .replace(/[^\p{L}\p{N}\s]/gu, " ")
    .replace(/\s+/g, " ")
    .toLowerCase();
}

/**
 * Heuristic: does the assistant's text merely repeat the user's last
 * transcript (directly, or as "you said ..." / "i heard ...")?
 */
function looksLikeTranscriptEcho(text) {
  const assistantText = normalizeForCompare(text);
  const transcriptText = normalizeForCompare(lastTranscriptText);
  if (!assistantText || !transcriptText) {
    return false;
  }
  const transcriptWords = transcriptText.split(" ").filter(Boolean);
  if (assistantText === transcriptText || assistantText.startsWith(transcriptText)) {
    return true;
  }
  if (assistantText.startsWith(`you said ${transcriptText}`)) {
    return true;
  }
  if (assistantText.startsWith(`i heard ${transcriptText}`)) {
    return true;
  }
  // Require a few words before treating containment alone as an echo.
  if (transcriptWords.length >= 4 && assistantText.includes(transcriptText)) {
    return true;
  }
  return false;
}

/** Boilerplate status strings that add no information to the log. */
function isGenericStatus(text) {
  const normalized = (text || "").trim().toLowerCase().replace(/[.]+$/, "");
  return [
    "working on it",
    "thinking",
    "building a plan",
    "starting multi-step work",
    "complex task detected; switching to orchestrate mode",
    "simple task detected; using tools",
  ].includes(normalized);
}

/** Internal memory-injection notices that should never reach the UI. */
function isInternalStatus(text) {
  const normalized = (text || "").trim().toLowerCase();
  return (
    normalized.includes("injected relevant context from memory") ||
    normalized.includes("relevant context from memory") ||
    normalized.includes("context from memory")
  );
}

/**
 * Format an assistant activity message for inline display, or "" to drop it.
 * NOTE(review): currently unused in this file — presumably kept for callers
 * elsewhere or future use; confirm before removing.
 */
function formatAssistantActivity(message) {
  const text = (message.text || "").trim();
  if (!text) {
    return "";
  }
  if (isInternalStatus(text)) {
    return "";
  }
  if (message.kind === "status" && isGenericStatus(text)) {
    return ` ${text}`;
  }
  if (message.kind === "tool_use" || message.kind === "tool_result") {
    return ` ${text}`;
  }
  if (message.kind === "error") {
    return ` error ${text}`;
  }
  return "";
}

/**
 * Toggle the session: close an active socket, or start mic capture and open
 * a new WebSocket to the server.
 */
async function startSession() {
  // FIX: also treat a still-CONNECTING socket as "active"; previously a second
  // click during connection opened a parallel socket and leaked the first.
  if (socket?.readyState === WebSocket.OPEN || socket?.readyState === WebSocket.CONNECTING) {
    socket.close();
    return;
  }
  setSocketState("connecting");
  try {
    await startAudioCapture();
  } catch (error) {
    stopAudioCapture();
    setSocketState("error");
    appendActivityLine(`[error] mic startup failed: ${error?.message || error}`);
    return;
  }
  socket = new WebSocket(`${location.protocol === "https:" ? "wss" : "ws"}://${location.host}/ws`);
  socket.binaryType = "arraybuffer";
  socket.addEventListener("open", () => {
    setSocketState("open");
    connectButton.textContent = "Stop Session";
    socket.send(JSON.stringify({
      type: "session.configure",
      voicePromptPath: voicePromptPathInput.value.trim() || null,
      clientSampleRate: audioContext?.sampleRate || null,
    }));
  });
  socket.addEventListener("close", () => {
    setSocketState("closed");
    connectButton.textContent = "Start Session";
    stopAudioCapture();
  });
  socket.addEventListener("error", () => {
    appendActivityLine("[error] websocket failed");
  });
  socket.addEventListener("message", (event) => {
    // Binary frames are raw PCM16 assistant audio; strings are JSON events.
    if (typeof event.data !== "string") {
      queuePlayback(new Int16Array(event.data));
      return;
    }
    const message = JSON.parse(event.data);
    handleServerMessage(message);
  });
}

/** Dispatch a parsed JSON event from the server to the appropriate UI update. */
function handleServerMessage(message) {
  switch (message.type) {
    case "session.ready":
      appendActivityLine(
        `[session] ${message.sessionId} · ${message.assistantBackend} · ${message.assistantModel} · ${message.sampleRate} Hz`,
      );
      break;
    case "session.configured":
      appendActivityLine(
        `[config] voice=${message.voicePromptPath || "none"} · mic ${message.clientSampleRate} Hz → ${message.serverSampleRate} Hz`,
      );
      break;
    case "input.level":
      lastMicLevel = Number(message.value) || 0;
      // FIX: render the coerced number; calling .toFixed on the raw payload
      // threw a TypeError whenever the server sent a non-number value.
      micLevel.textContent = lastMicLevel.toFixed(5);
      break;
    case "input.speech_start":
      speechState.textContent = "yes";
      // Barge-in: user speech interrupts any assistant audio still playing.
      if (activePlaybackNodes.size > 0) {
        stopPlayback();
      }
      break;
    case "input.speech_end":
      speechState.textContent = "no";
      break;
    case "transcript.partial":
      currentPartialTranscript = message.text || "";
      renderTranscriptLog();
      break;
    case "transcript.final":
      currentPartialTranscript = "";
      if (message.text?.trim()) {
        lastTranscriptText = message.text.trim();
        prependTranscriptLine(`You: ${lastTranscriptText}`);
        beginAssistantTurn(lastTranscriptText);
      } else {
        renderTranscriptLog();
      }
      break;
    case "assistant.text":
      if (message.text?.trim()) {
        const assistantText = message.text.trim();
        if (!looksLikeTranscriptEcho(assistantText)) {
          updateAssistantTurn(assistantText);
        }
      }
      break;
    case "assistant.status":
      if (message.text?.trim() && !isGenericStatus(message.text) && !isInternalStatus(message.text)) {
        appendActivityLine(`[status] ${message.text}`);
      }
      break;
    case "assistant.activity":
      if (
        message.kind !== "message" &&
        (
          (message.text?.trim() &&
            !(message.kind === "status" && isGenericStatus(message.text)) &&
            !isInternalStatus(message.text)) ||
          message.kind === "tool_use"
        )
      ) {
        appendActivityLine(formatActivityLine(message));
      }
      break;
    case "assistant.notification":
      if (message.text?.trim()) {
        appendActivityLine(`[notice] ${message.text}`);
      }
      break;
    case "assistant.backchannel":
      if (message.text?.trim()) {
        appendActivityLine(`[backchannel] ${message.text}`);
      }
      break;
    case "assistant.done":
      // Defer finishing the turn until queued audio has drained.
      if (activePlaybackNodes.size > 0) {
        assistantDonePending = true;
      } else {
        finishAssistantTurn();
      }
      break;
    case "assistant.interrupted":
      currentPartialTranscript = "";
      renderTranscriptLog();
      finishAssistantTurn();
      stopPlayback();
      appendActivityLine("[interrupted]");
      break;
    default:
      break;
  }
}

/**
 * Set up the audio graph: mic → AudioWorklet recorder (PCM frames → socket),
 * plus a gain node for assistant playback.
 * @throws if mic permission is denied or the worklet module fails to load.
 */
async function startAudioCapture() {
  audioContext = new AudioContext();
  if (audioContext.state === "suspended") {
    await audioContext.resume();
  }
  playbackNode = audioContext.createGain();
  playbackNode.connect(audioContext.destination);
  await audioContext.audioWorklet.addModule("/static/pcm-worklet.js?v=20260503c");
  mediaStream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
      sampleRate: { ideal: 48000 },
    },
  });
  sourceNode = audioContext.createMediaStreamSource(mediaStream);
  workletNode = new AudioWorkletNode(audioContext, "pcm-recorder");
  workletNode.port.onmessage = (event) => {
    if (socket?.readyState === WebSocket.OPEN) {
      socket.send(event.data);
    }
  };
  sourceNode.connect(workletNode);
}

/** Tear down the audio graph and release the microphone. */
function stopAudioCapture() {
  workletNode?.disconnect();
  sourceNode?.disconnect();
  mediaStream?.getTracks().forEach((track) => track.stop());
  stopPlayback();
  // FIX: close() returns a promise that rejects if the context is already
  // closed; swallow it so it doesn't surface as an unhandled rejection.
  audioContext?.close().catch(() => {});
  workletNode = null;
  sourceNode = null;
  mediaStream = null;
  audioContext = null;
}

// Scheduled start time of the next playback buffer (AudioContext time).
let playbackClock = 0;

/** Convert a PCM16 chunk to float, wrap it in a buffer, and schedule it gaplessly. */
function queuePlayback(pcm16) {
  if (!audioContext) {
    return;
  }
  const float32 = new Float32Array(pcm16.length);
  for (let i = 0; i < pcm16.length; i += 1) {
    float32[i] = pcm16[i] / 32768;
  }
  // Server audio is 24 kHz mono; the context resamples on playback.
  const buffer = audioContext.createBuffer(1, float32.length, 24000);
  buffer.copyToChannel(float32, 0);
  const node = audioContext.createBufferSource();
  node.buffer = buffer;
  node.connect(playbackNode);
  activePlaybackNodes.add(node);
  node.onended = () => {
    activePlaybackNodes.delete(node);
    node.disconnect();
    // Last chunk drained after assistant.done → close out the turn.
    if (activePlaybackNodes.size === 0 && assistantDonePending) {
      finishAssistantTurn();
    }
  };
  const now = audioContext.currentTime;
  // Keep the clock slightly ahead of "now" so back-to-back chunks are gapless.
  playbackClock = Math.max(playbackClock, now + 0.005);
  node.start(playbackClock);
  playbackClock += buffer.duration;
}

/** Stop all queued assistant audio immediately (e.g. on barge-in). */
function stopPlayback() {
  activePlaybackNodes.forEach((node) => {
    try {
      node.stop();
    } catch (error) {
      // Ignore nodes that have already ended.
    }
    node.disconnect();
  });
  activePlaybackNodes.clear();
  playbackClock = 0;
  if (assistantDonePending) {
    finishAssistantTurn();
  }
}

connectButton.addEventListener("click", startSession);