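// Browser client for the voice session demo: captures microphone audio via an
// AudioWorklet, streams raw PCM over a WebSocket, plays synthesized replies,
// and renders the transcript, assistant, and activity panes.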
const connectButton = document.getElementById("connectButton");
const socketState = document.getElementById("socketState");
const micLevel = document.getElementById("micLevel");
const speechState = document.getElementById("speechState");
const transcriptLog = document.getElementById("transcriptLog");
const assistantLog = document.getElementById("assistantLog");
const activityLog = document.getElementById("activityLog");
const voicePromptPathInput = document.getElementById("voicePromptPath");

let audioContext;
let mediaStream;
let sourceNode;
let workletNode;
let playbackNode;
let socket;

const activePlaybackNodes = new Set();
const assistantLines = [];
let currentAssistantLineIndex = null;
let currentTurn = null;
let lastTranscriptText = "";
const transcriptLines = [];
let currentPartialTranscript = "";
let assistantDonePending = false;
let lastMicLevel = 0;
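// All three log panes render newest-first: the helpers prepend lines so the
// latest entry stays at the top.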
function appendLog(element, line) {
  element.textContent = `${line}\n${element.textContent}`.trim();
}
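// The transcript pane shows the in-progress partial (suffixed with "...")
// above the finalized lines.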
function renderTranscriptLog() {
  const lines = [];
  if (currentPartialTranscript.trim()) {
    lines.push(`You: ${currentPartialTranscript.trim()} ...`);
  }
  lines.push(...transcriptLines);
  transcriptLog.textContent = lines.join("\n").trim();
}

function prependTranscriptLine(line) {
  transcriptLines.unshift(line);
  renderTranscriptLog();
}

function renderAssistantLog() {
  assistantLog.textContent = assistantLines.join("\n").trim();
}
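// currentAssistantLineIndex tracks the line being streamed for the active
// turn. Because new lines are prepended, the tracked index must shift by one
// whenever another line is added above it.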
function prependAssistantLine(line, { trackTurn = false } = {}) {
  if (currentAssistantLineIndex !== null) {
    currentAssistantLineIndex += 1;
  }
  assistantLines.unshift(line);
  if (trackTurn) {
    currentAssistantLineIndex = 0;
  }
  renderAssistantLog();
}
function updateAssistantTurn(text) {
  const line = text.trim();
  if (!line) {
    return;
  }
  if (currentAssistantLineIndex === null || currentAssistantLineIndex >= assistantLines.length) {
    prependAssistantLine(line, { trackTurn: true });
    return;
  }
  assistantLines[currentAssistantLineIndex] = line;
  renderAssistantLog();
}
function finishAssistantTurn() {
  currentAssistantLineIndex = null;
  currentTurn = null;
  assistantDonePending = false;
}
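// A turn starts when a final transcript arrives: the user's prompt is echoed
// into the assistant pane and the per-turn line indices are reset.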
function beginAssistantTurn(text) {
  finishAssistantTurn();
  prependAssistantLine(`› ${text.trim()}`);
  currentTurn = {
    promptIndex: 0,
    statusIndex: null,
    memoryIndex: null,
  };
}
function addAssistantActivityLine(line, { replaceStatus = false, replaceMemory = false } = {}) {
  if (!line.trim()) {
    return;
  }
  if (!currentTurn) {
    currentTurn = {
      promptIndex: null,
      statusIndex: null,
      memoryIndex: null,
    };
  }
  if (replaceStatus && currentTurn.statusIndex !== null && currentTurn.statusIndex < assistantLines.length) {
    assistantLines[currentTurn.statusIndex] = line;
    renderAssistantLog();
    return;
  }
  if (replaceMemory && currentTurn.memoryIndex !== null && currentTurn.memoryIndex < assistantLines.length) {
    assistantLines[currentTurn.memoryIndex] = line;
    renderAssistantLog();
    return;
  }
  prependAssistantLine(line);
  // Prepending pushed every previously tracked line down one slot, so shift
  // the stored indices before recording the new line's position.
  for (const key of ["promptIndex", "statusIndex", "memoryIndex"]) {
    if (currentTurn[key] !== null) {
      currentTurn[key] += 1;
    }
  }
  if (replaceStatus) {
    currentTurn.statusIndex = 0;
  }
  if (replaceMemory) {
    currentTurn.memoryIndex = 0;
  }
}
function setSocketState(value) {
  socketState.textContent = value;
}
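// The activity pane is capped at the 40 most recent lines.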
function appendActivityLine(line) {
  appendLog(activityLog, line);
  const lines = activityLog.textContent.split("\n").filter(Boolean);
  if (lines.length > 40) {
    activityLog.textContent = lines.slice(0, 40).join("\n");
  }
}
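// Activity messages render as "[kind] text"; tool_use lines also carry a
// short summary of the tool's input.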
function activityText(message) {
  if (message.kind === "tool_use") {
    const name = (message.toolName || "tool").trim();
    const detail = formatToolInput(message.toolInput);
    return detail ? `${name} ${detail}` : name;
  }
  return message.text || "";
}

function formatActivityLine(message) {
  if (message.kind === "tool_use") {
    return `[tool] ${activityText(message)}`;
  }
  if (message.kind === "tool_result") {
    return message.isError ? `[tool error] ${activityText(message)}` : `[result] ${activityText(message)}`;
  }
  if (message.kind === "error") {
    return `[error] ${activityText(message)}`;
  }
  return `[status] ${activityText(message)}`;
}
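// Summarize a tool's input object: prefer the first well-known string field,
// otherwise fall back to a key=value preview of the first two entries.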
function formatToolInput(toolInput) {
  if (!toolInput || typeof toolInput !== "object" || Array.isArray(toolInput)) {
    return "";
  }
  for (const key of ["path", "command", "url", "pattern", "query", "name", "tool_name"]) {
    const value = toolInput[key];
    if (typeof value === "string" && value.trim()) {
      return truncate(value.trim(), 140);
    }
  }
  const preview = Object.entries(toolInput)
    .slice(0, 2)
    .map(([key, value]) => `${key}=${stringifyValue(value)}`)
    .join(" ");
  return truncate(preview, 140);
}

function stringifyValue(value) {
  if (typeof value === "string") {
    return JSON.stringify(value);
  }
  if (typeof value === "number" || typeof value === "boolean") {
    return String(value);
  }
  if (Array.isArray(value)) {
    return `[${value.length} items]`;
  }
  if (value && typeof value === "object") {
    return "{...}";
  }
  return "null";
}

function truncate(value, limit) {
  return value.length <= limit ? value : `${value.slice(0, limit - 3)}...`;
}
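// Echo suppression: the backend sometimes repeats the recognized utterance
// back before answering. Both strings are normalized (speaker prefixes,
// punctuation, and case stripped) so assistant text that merely restates the
// last transcript can be dropped.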
function normalizeForCompare(text) {
  return (text || "")
    .trim()
    .replace(/^(you|user|agent|assistant)\s*:\s*/i, "")
    .replace(/[^\p{L}\p{N}\s]/gu, " ")
    .replace(/\s+/g, " ")
    .toLowerCase();
}

function looksLikeTranscriptEcho(text) {
  const assistantText = normalizeForCompare(text);
  const transcriptText = normalizeForCompare(lastTranscriptText);
  if (!assistantText || !transcriptText) {
    return false;
  }
  const transcriptWords = transcriptText.split(" ").filter(Boolean);
  if (assistantText === transcriptText || assistantText.startsWith(transcriptText)) {
    return true;
  }
  if (assistantText.startsWith(`you said ${transcriptText}`)) {
    return true;
  }
  if (assistantText.startsWith(`i heard ${transcriptText}`)) {
    return true;
  }
  if (transcriptWords.length >= 4 && assistantText.includes(transcriptText)) {
    return true;
  }
  return false;
}
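// Filters for noisy backend chatter: generic progress phrases and internal
// memory-injection notices are kept out of the visible logs.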
function isGenericStatus(text) {
  const normalized = (text || "").trim().toLowerCase().replace(/[.]+$/, "");
  return [
    "working on it",
    "thinking",
    "building a plan",
    "starting multi-step work",
    "complex task detected; switching to orchestrate mode",
    "simple task detected; using tools",
  ].includes(normalized);
}

function isInternalStatus(text) {
  const normalized = (text || "").trim().toLowerCase();
  return (
    normalized.includes("injected relevant context from memory") ||
    normalized.includes("relevant context from memory") ||
    normalized.includes("context from memory")
  );
}
function formatAssistantActivity(message) {
  const text = (message.text || "").trim();
  if (!text) {
    return "";
  }
  if (isInternalStatus(text)) {
    return "";
  }
  if (message.kind === "status" && isGenericStatus(text)) {
    return ` ${text}`;
  }
  if (message.kind === "tool_use" || message.kind === "tool_result") {
    return ` ${text}`;
  }
  if (message.kind === "error") {
    return ` error ${text}`;
  }
  return "";
}
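// The connect button toggles a single session: if a socket is open, clicking
// closes it; otherwise the mic pipeline is started first (so permission
// failures surface before the socket opens), then the WebSocket is dialed and
// configured with the voice prompt path and the client's actual sample rate.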
async function startSession() {
  if (socket?.readyState === WebSocket.OPEN) {
    socket.close();
    return;
  }
  setSocketState("connecting");
  try {
    await startAudioCapture();
  } catch (error) {
    stopAudioCapture();
    setSocketState("error");
    appendActivityLine(`[error] mic startup failed: ${error?.message || error}`);
    return;
  }
  socket = new WebSocket(`${location.protocol === "https:" ? "wss" : "ws"}://${location.host}/ws`);
  socket.binaryType = "arraybuffer";
  socket.addEventListener("open", () => {
    setSocketState("open");
    connectButton.textContent = "Stop Session";
    socket.send(JSON.stringify({
      type: "session.configure",
      voicePromptPath: voicePromptPathInput.value.trim() || null,
      clientSampleRate: audioContext?.sampleRate || null,
    }));
  });
  socket.addEventListener("close", () => {
    setSocketState("closed");
    connectButton.textContent = "Start Session";
    stopAudioCapture();
  });
  socket.addEventListener("error", () => {
    appendActivityLine("[error] websocket failed");
  });
  socket.addEventListener("message", (event) => {
    if (typeof event.data !== "string") {
      queuePlayback(new Int16Array(event.data));
      return;
    }
    const message = JSON.parse(event.data);
    handleServerMessage(message);
  });
}
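// Binary frames from the server are 16-bit PCM audio; everything else is a
// JSON control message dispatched here by its "type" field.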
function handleServerMessage(message) {
  switch (message.type) {
    case "session.ready":
      appendActivityLine(
        `[session] ${message.sessionId} · ${message.assistantBackend} · ${message.assistantModel} · ${message.sampleRate} Hz`,
      );
      break;
    case "session.configured":
      appendActivityLine(
        `[config] voice=${message.voicePromptPath || "none"} · mic ${message.clientSampleRate} Hz → ${message.serverSampleRate} Hz`,
      );
      break;
    case "input.level":
      lastMicLevel = Number(message.value) || 0;
      micLevel.textContent = lastMicLevel.toFixed(5);
      break;
| case "input.speech_start": | |
| speechState.textContent = "yes"; | |
| if (activePlaybackNodes.size > 0) { | |
| stopPlayback(); | |
| } | |
| break; | |
| case "input.speech_end": | |
| speechState.textContent = "no"; | |
| break; | |
| case "transcript.partial": | |
| currentPartialTranscript = message.text || ""; | |
| renderTranscriptLog(); | |
| break; | |
| case "transcript.final": | |
| currentPartialTranscript = ""; | |
| if (message.text?.trim()) { | |
| lastTranscriptText = message.text.trim(); | |
| prependTranscriptLine(`You: ${lastTranscriptText}`); | |
| beginAssistantTurn(lastTranscriptText); | |
| } else { | |
| renderTranscriptLog(); | |
| } | |
| break; | |
| case "assistant.text": | |
| if (message.text?.trim()) { | |
| const assistantText = message.text.trim(); | |
| if (!looksLikeTranscriptEcho(assistantText)) { | |
| updateAssistantTurn(assistantText); | |
| } | |
| } | |
| break; | |
| case "assistant.status": | |
| if (message.text?.trim() && !isGenericStatus(message.text) && !isInternalStatus(message.text)) { | |
| appendActivityLine(`[status] ${message.text}`); | |
| } | |
| break; | |
| case "assistant.activity": | |
| if ( | |
| message.kind !== "message" && | |
| ( | |
| (message.text?.trim() && | |
| !(message.kind === "status" && isGenericStatus(message.text)) && | |
| !isInternalStatus(message.text)) || | |
| message.kind === "tool_use" | |
| ) | |
| ) { | |
| appendActivityLine(formatActivityLine(message)); | |
| } | |
| break; | |
| case "assistant.notification": | |
| if (message.text?.trim()) { | |
| appendActivityLine(`[notice] ${message.text}`); | |
| } | |
| break; | |
| case "assistant.backchannel": | |
| if (message.text?.trim()) { | |
| appendActivityLine(`[backchannel] ${message.text}`); | |
| } | |
| break; | |
| case "assistant.done": | |
| if (activePlaybackNodes.size > 0) { | |
| assistantDonePending = true; | |
| } else { | |
| finishAssistantTurn(); | |
| } | |
| break; | |
| case "assistant.interrupted": | |
| currentPartialTranscript = ""; | |
| renderTranscriptLog(); | |
| finishAssistantTurn(); | |
| stopPlayback(); | |
| appendActivityLine("[interrupted]"); | |
| break; | |
| default: | |
| break; | |
| } | |
| } | |
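// Capture pipeline: an AudioContext feeds getUserMedia audio through a
// MediaStreamSource into the "pcm-recorder" AudioWorklet, which posts raw PCM
// chunks back on its port; those chunks are forwarded verbatim to the server.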
async function startAudioCapture() {
  audioContext = new AudioContext();
  if (audioContext.state === "suspended") {
    await audioContext.resume();
  }
  playbackNode = audioContext.createGain();
  playbackNode.connect(audioContext.destination);
  await audioContext.audioWorklet.addModule("/static/pcm-worklet.js?v=20260503c");
  mediaStream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
      sampleRate: { ideal: 48000 },
    },
  });
  sourceNode = audioContext.createMediaStreamSource(mediaStream);
  workletNode = new AudioWorkletNode(audioContext, "pcm-recorder");
  workletNode.port.onmessage = (event) => {
    if (socket?.readyState === WebSocket.OPEN) {
      socket.send(event.data);
    }
  };
  sourceNode.connect(workletNode);
}
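// The recorder processor lives in /static/pcm-worklet.js, which is not part
// of this file. A minimal sketch of what such a worklet could look like,
// assuming it converts Float32 samples to 16-bit PCM and posts each block as
// a transferable buffer (the real module may differ):
//
//   // pcm-worklet.js (hypothetical sketch)
//   class PcmRecorder extends AudioWorkletProcessor {
//     process(inputs) {
//       const channel = inputs[0]?.[0];
//       if (channel) {
//         const pcm16 = new Int16Array(channel.length);
//         for (let i = 0; i < channel.length; i += 1) {
//           // Clamp to [-1, 1] and scale to the signed 16-bit range.
//           const s = Math.max(-1, Math.min(1, channel[i]));
//           pcm16[i] = s < 0 ? s * 32768 : s * 32767;
//         }
//         this.port.postMessage(pcm16.buffer, [pcm16.buffer]);
//       }
//       return true; // Keep the processor alive.
//     }
//   }
//   registerProcessor("pcm-recorder", PcmRecorder);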
function stopAudioCapture() {
  workletNode?.disconnect();
  sourceNode?.disconnect();
  mediaStream?.getTracks().forEach((track) => track.stop());
  stopPlayback();
  audioContext?.close();
  workletNode = null;
  sourceNode = null;
  mediaStream = null;
  audioContext = null;
}
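// Playback: each binary frame is decoded as mono 16-bit PCM at 24 kHz,
// wrapped in an AudioBuffer (the context resamples on playback if its rate
// differs), and scheduled gaplessly via playbackClock. When the last node
// ends and the server already sent assistant.done, the turn is closed.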
let playbackClock = 0;

function queuePlayback(pcm16) {
  if (!audioContext) {
    return;
  }
  const float32 = new Float32Array(pcm16.length);
  for (let i = 0; i < pcm16.length; i += 1) {
    float32[i] = pcm16[i] / 32768;
  }
  const buffer = audioContext.createBuffer(1, float32.length, 24000);
  buffer.copyToChannel(float32, 0);
  const node = audioContext.createBufferSource();
  node.buffer = buffer;
  node.connect(playbackNode);
  activePlaybackNodes.add(node);
  node.onended = () => {
    activePlaybackNodes.delete(node);
    node.disconnect();
    if (activePlaybackNodes.size === 0 && assistantDonePending) {
      finishAssistantTurn();
    }
  };
  const now = audioContext.currentTime;
  playbackClock = Math.max(playbackClock, now + 0.005);
  node.start(playbackClock);
  playbackClock += buffer.duration;
}
function stopPlayback() {
  activePlaybackNodes.forEach((node) => {
    try {
      node.stop();
    } catch (error) {
      // Ignore nodes that have already ended.
    }
    node.disconnect();
  });
  activePlaybackNodes.clear();
  playbackClock = 0;
  if (assistantDonePending) {
    finishAssistantTurn();
  }
}

connectButton.addEventListener("click", startSession);