// voice-agent/static/app.js — browser client for the voice agent
// (Hugging Face Space deployment, commit a8bcb70)
// --- DOM references (resolved once at load) ---
const connectButton = document.getElementById("connectButton");
const socketState = document.getElementById("socketState");
const micLevel = document.getElementById("micLevel");
const speechState = document.getElementById("speechState");
const transcriptLog = document.getElementById("transcriptLog");
const assistantLog = document.getElementById("assistantLog");
const activityLog = document.getElementById("activityLog");
const voicePromptPathInput = document.getElementById("voicePromptPath");
// --- Audio capture/playback + socket session state ---
let audioContext;
let mediaStream;
let sourceNode;
let workletNode;
let playbackNode;
let socket;
// Buffer-source nodes currently queued/playing (cleared on barge-in/interrupt).
const activePlaybackNodes = new Set();
// Assistant log lines, newest first.
const assistantLines = [];
// Index into assistantLines of the in-progress turn's reply line (null = none).
let currentAssistantLineIndex = null;
// Per-turn tracked activity-line indices ({ promptIndex, statusIndex, memoryIndex }).
let currentTurn = null;
// Last finalized user utterance; used to suppress assistant echoes of it.
let lastTranscriptText = "";
// Finalized transcript lines, newest first.
const transcriptLines = [];
// In-flight partial transcript (rendered at the top with a trailing "...").
let currentPartialTranscript = "";
// Set when assistant.done arrives while audio is still draining.
let assistantDonePending = false;
// Most recent mic level reported by the server (written here; not read elsewhere in this file).
let lastMicLevel = 0;
// Prepends `line` to a log element so newest entries sit on top; the trim
// avoids a dangling newline when the log was previously empty.
function appendLog(element, line) {
  const combined = `${line}\n${element.textContent}`;
  element.textContent = combined.trim();
}
// Re-renders the transcript pane: the in-flight partial (if any) appears
// first with a trailing "...", followed by finalized lines (newest first).
function renderTranscriptLog() {
  const rows = [];
  const partial = currentPartialTranscript.trim();
  if (partial) {
    rows.push(`You: ${partial} ...`);
  }
  for (const entry of transcriptLines) {
    rows.push(entry);
  }
  transcriptLog.textContent = rows.join("\n").trim();
}
// Pushes a finalized transcript line to the top of the log and re-renders.
function prependTranscriptLine(line) {
transcriptLines.unshift(line);
renderTranscriptLog();
}
// Re-renders the assistant pane from assistantLines (stored newest first).
function renderAssistantLog() {
  const text = assistantLines.join("\n");
  assistantLog.textContent = text.trim();
}
// Inserts a line at the top of the assistant log. Because lines are stored
// newest-first, prepending shifts the tracked in-progress turn line down by
// one; trackTurn=true instead marks the new line itself as the turn line.
function prependAssistantLine(line, { trackTurn = false } = {}) {
if (currentAssistantLineIndex !== null) {
currentAssistantLineIndex += 1;
}
assistantLines.unshift(line);
currentAssistantLineIndex = trackTurn ? 0 : currentAssistantLineIndex;
renderAssistantLog();
}
// Updates the in-progress turn's reply line with the latest assistant text,
// or starts tracking a new line if no valid one is being tracked.
function updateAssistantTurn(text) {
  // `line` is already trimmed here; the original trimmed it a second time
  // in the guard below, which was redundant.
  const line = text.trim();
  if (!line) {
    return;
  }
  // No tracked line, or the tracked index fell out of range: start fresh.
  if (currentAssistantLineIndex === null || currentAssistantLineIndex >= assistantLines.length) {
    prependAssistantLine(line, { trackTurn: true });
    return;
  }
  // Replace the turn's line in place and repaint.
  assistantLines[currentAssistantLineIndex] = line;
  renderAssistantLog();
}
// Clears all per-turn tracking once a turn fully completes (including a
// "done" that was deferred behind in-flight audio playback).
function finishAssistantTurn() {
currentAssistantLineIndex = null;
currentTurn = null;
assistantDonePending = false;
}
// Starts a fresh assistant turn after a finalized user utterance: closes any
// prior turn, echoes the prompt ("› ...") into the assistant log, and seeds
// per-turn activity-line index tracking.
function beginAssistantTurn(text) {
finishAssistantTurn();
prependAssistantLine(`› ${text.trim()}`);
currentTurn = {
promptIndex: 0,
statusIndex: null,
memoryIndex: null,
};
}
// Routes an assistant activity line into the assistant log.
// replaceStatus / replaceMemory update the turn's existing status/memory line
// in place (if one is tracked) instead of prepending a new line.
function addAssistantActivityLine(line, { replaceStatus = false, replaceMemory = false } = {}) {
  if (!line.trim()) {
    return;
  }
  // Activity can arrive before a turn is opened; track indices anyway.
  if (!currentTurn) {
    currentTurn = {
      promptIndex: null,
      statusIndex: null,
      memoryIndex: null,
    };
  }
  // In-place update of a previously tracked line.
  if (replaceStatus && currentTurn.statusIndex !== null && currentTurn.statusIndex < assistantLines.length) {
    assistantLines[currentTurn.statusIndex] = line;
    renderAssistantLog();
    return;
  }
  if (replaceMemory && currentTurn.memoryIndex !== null && currentTurn.memoryIndex < assistantLines.length) {
    assistantLines[currentTurn.memoryIndex] = line;
    renderAssistantLog();
    return;
  }
  prependAssistantLine(line);
  // Bug fix: prepending shifts every existing line down by one, so previously
  // tracked indices must shift with them or later in-place replacements would
  // overwrite the wrong line.
  if (currentTurn.promptIndex !== null) {
    currentTurn.promptIndex += 1;
  }
  if (currentTurn.statusIndex !== null) {
    currentTurn.statusIndex += 1;
  }
  if (currentTurn.memoryIndex !== null) {
    currentTurn.memoryIndex += 1;
  }
  // The freshly prepended line sits at index 0.
  if (replaceStatus) {
    currentTurn.statusIndex = 0;
  }
  if (replaceMemory) {
    currentTurn.memoryIndex = 0;
  }
}
// Reflects the websocket lifecycle ("connecting"/"open"/"closed"/"error") in the UI.
function setSocketState(value) {
socketState.textContent = value;
}
// Prepends a line to the activity feed and caps it at the 40 most recent
// entries (newest first).
function appendActivityLine(line) {
  appendLog(activityLog, line);
  const entries = activityLog.textContent.split("\n").filter(Boolean);
  if (entries.length <= 40) {
    return;
  }
  activityLog.textContent = entries.slice(0, 40).join("\n");
}
// Produces the human-readable payload for an activity message: tool_use
// messages render as "name detail"; everything else uses its text field.
function activityText(message) {
  if (message.kind !== "tool_use") {
    return message.text || "";
  }
  const name = (message.toolName || "tool").trim();
  const detail = formatToolInput(message.toolInput);
  return detail ? `${name} ${detail}` : name;
}
// Prefixes an activity message with a bracketed tag based on its kind.
function formatActivityLine(message) {
  switch (message.kind) {
    case "tool_use":
      return `[tool] ${activityText(message)}`;
    case "tool_result": {
      const tag = message.isError ? "[tool error]" : "[result]";
      return `${tag} ${activityText(message)}`;
    }
    case "error":
      return `[error] ${activityText(message)}`;
    default:
      return `[status] ${activityText(message)}`;
  }
}
// Summarizes a tool-input object for the activity feed: a single well-known
// string field when present, otherwise a two-entry key=value preview.
function formatToolInput(toolInput) {
  // Only plain objects carry a useful summary.
  if (!toolInput || typeof toolInput !== "object" || Array.isArray(toolInput)) {
    return "";
  }
  const preferredKeys = ["path", "command", "url", "pattern", "query", "name", "tool_name"];
  for (const key of preferredKeys) {
    const value = toolInput[key];
    if (typeof value === "string" && value.trim()) {
      return truncate(value.trim(), 140);
    }
  }
  const parts = [];
  for (const [key, value] of Object.entries(toolInput).slice(0, 2)) {
    parts.push(`${key}=${stringifyValue(value)}`);
  }
  return truncate(parts.join(" "), 140);
}
// Renders one tool-input value compactly: strings JSON-quoted, primitives
// as-is, collections summarized, anything else as "null".
function stringifyValue(value) {
  if (Array.isArray(value)) {
    return `[${value.length} items]`;
  }
  switch (typeof value) {
    case "string":
      return JSON.stringify(value);
    case "number":
    case "boolean":
      return String(value);
    case "object":
      // typeof null === "object"; render it as the literal "null".
      return value === null ? "null" : "{...}";
    default:
      return "null";
  }
}
// Caps a string at `limit` characters, replacing the tail with "..." so the
// result never exceeds the limit.
function truncate(value, limit) {
  if (value.length <= limit) {
    return value;
  }
  return `${value.slice(0, limit - 3)}...`;
}
// Normalizes text for echo comparison: strips a leading speaker label
// ("You:", "Assistant:", ...), replaces punctuation with spaces, collapses
// whitespace runs, and lowercases. Null/undefined input normalizes to "".
function normalizeForCompare(text) {
  const stripped = (text || "").trim();
  const withoutSpeaker = stripped.replace(/^(you|user|agent|assistant)\s*:\s*/i, "");
  return withoutSpeaker
    .replace(/[^\p{L}\p{N}\s]/gu, " ")
    .replace(/\s+/g, " ")
    .toLowerCase();
}
// Heuristically detects an assistant reply that merely parrots the user's
// last transcript (exact repeat, "you said ..."/"i heard ..." prefixes, or a
// long transcript embedded anywhere in the reply).
function looksLikeTranscriptEcho(text) {
  const candidate = normalizeForCompare(text);
  const reference = normalizeForCompare(lastTranscriptText);
  if (!candidate || !reference) {
    return false;
  }
  const echoPrefixes = [reference, `you said ${reference}`, `i heard ${reference}`];
  if (echoPrefixes.some((prefix) => candidate.startsWith(prefix))) {
    return true;
  }
  // Longer transcripts (4+ words) embedded anywhere also count as an echo.
  const wordCount = reference.split(" ").filter(Boolean).length;
  return wordCount >= 4 && candidate.includes(reference);
}
// True when the text is one of the boilerplate progress phrases (compared
// case-insensitively, ignoring trailing periods/ellipses).
function isGenericStatus(text) {
  const boilerplate = new Set([
    "working on it",
    "thinking",
    "building a plan",
    "starting multi-step work",
    "complex task detected; switching to orchestrate mode",
    "simple task detected; using tools",
  ]);
  const key = (text || "").trim().toLowerCase().replace(/[.]+$/, "");
  return boilerplate.has(key);
}
// True for internal memory-injection notices. The original checked three
// phrasings, but all of them contain "context from memory", so one substring
// test covers every variant with identical behavior.
function isInternalStatus(text) {
  const normalized = (text || "").trim().toLowerCase();
  return normalized.includes("context from memory");
}
// Maps an activity message to a space-prefixed display string, or "" when it
// should be hidden (empty, internal notices, or non-generic status text).
function formatAssistantActivity(message) {
  const text = (message.text || "").trim();
  if (!text || isInternalStatus(text)) {
    return "";
  }
  switch (message.kind) {
    case "status":
      // Only the generic boilerplate statuses are surfaced here.
      return isGenericStatus(text) ? ` ${text}` : "";
    case "tool_use":
    case "tool_result":
      return ` ${text}`;
    case "error":
      return ` error ${text}`;
    default:
      return "";
  }
}
// Toggles the voice session: closes the socket if one is open; otherwise
// starts mic capture and connects the websocket to /ws on the same host.
async function startSession() {
if (socket?.readyState === WebSocket.OPEN) {
socket.close();
return;
}
setSocketState("connecting");
// Start the mic first so a permission/device failure aborts before the
// socket is opened.
try {
await startAudioCapture();
} catch (error) {
stopAudioCapture();
setSocketState("error");
appendActivityLine(`[error] mic startup failed: ${error?.message || error}`);
return;
}
socket = new WebSocket(`${location.protocol === "https:" ? "wss" : "ws"}://${location.host}/ws`);
socket.binaryType = "arraybuffer";
socket.addEventListener("open", () => {
setSocketState("open");
connectButton.textContent = "Stop Session";
// Announce client config: optional voice prompt path and the actual
// mic sample rate so the server can resample.
socket.send(JSON.stringify({
type: "session.configure",
voicePromptPath: voicePromptPathInput.value.trim() || null,
clientSampleRate: audioContext?.sampleRate || null,
}));
});
socket.addEventListener("close", () => {
setSocketState("closed");
connectButton.textContent = "Start Session";
stopAudioCapture();
});
socket.addEventListener("error", () => {
appendActivityLine("[error] websocket failed");
});
socket.addEventListener("message", async (event) => {
// Binary frames carry PCM16 audio; string frames are JSON control messages.
if (typeof event.data !== "string") {
queuePlayback(new Int16Array(event.data));
return;
}
const message = JSON.parse(event.data);
handleServerMessage(message);
});
}
// Dispatches one JSON control message from the server to the UI and to the
// transcript/assistant-turn state machines.
function handleServerMessage(message) {
switch (message.type) {
case "session.ready":
appendActivityLine(
`[session] ${message.sessionId} · ${message.assistantBackend} · ${message.assistantModel} · ${message.sampleRate} Hz`,
);
break;
case "session.configured":
appendActivityLine(
`[config] voice=${message.voicePromptPath || "none"} · mic ${message.clientSampleRate} Hz → ${message.serverSampleRate} Hz`,
);
break;
case "input.level":
lastMicLevel = Number(message.value) || 0;
// NOTE(review): assumes message.value is numeric — toFixed would throw otherwise.
micLevel.textContent = message.value.toFixed(5);
break;
case "input.speech_start":
speechState.textContent = "yes";
// Barge-in: the user started speaking over assistant audio, so cut playback.
if (activePlaybackNodes.size > 0) {
stopPlayback();
}
break;
case "input.speech_end":
speechState.textContent = "no";
break;
case "transcript.partial":
currentPartialTranscript = message.text || "";
renderTranscriptLog();
break;
case "transcript.final":
currentPartialTranscript = "";
if (message.text?.trim()) {
lastTranscriptText = message.text.trim();
prependTranscriptLine(`You: ${lastTranscriptText}`);
// A finalized utterance opens a new assistant turn.
beginAssistantTurn(lastTranscriptText);
} else {
// Empty final: just clear the partial line from the pane.
renderTranscriptLog();
}
break;
case "assistant.text":
if (message.text?.trim()) {
const assistantText = message.text.trim();
// Suppress replies that merely repeat the user's transcript back.
if (!looksLikeTranscriptEcho(assistantText)) {
updateAssistantTurn(assistantText);
}
}
break;
case "assistant.status":
// Drop boilerplate progress phrases and internal memory notices.
if (message.text?.trim() && !isGenericStatus(message.text) && !isInternalStatus(message.text)) {
appendActivityLine(`[status] ${message.text}`);
}
break;
case "assistant.activity":
// tool_use always shows; other non-"message" kinds show only when they
// carry meaningful (non-boilerplate, non-internal) text.
if (
message.kind !== "message" &&
(
(message.text?.trim() &&
!(message.kind === "status" && isGenericStatus(message.text)) &&
!isInternalStatus(message.text)) ||
message.kind === "tool_use"
)
) {
appendActivityLine(formatActivityLine(message));
}
break;
case "assistant.notification":
if (message.text?.trim()) {
appendActivityLine(`[notice] ${message.text}`);
}
break;
case "assistant.backchannel":
if (message.text?.trim()) {
appendActivityLine(`[backchannel] ${message.text}`);
}
break;
case "assistant.done":
// If audio is still draining, defer teardown to the last node's onended.
if (activePlaybackNodes.size > 0) {
assistantDonePending = true;
} else {
finishAssistantTurn();
}
break;
case "assistant.interrupted":
currentPartialTranscript = "";
renderTranscriptLog();
finishAssistantTurn();
stopPlayback();
appendActivityLine("[interrupted]");
break;
default:
break;
}
}
// Acquires the microphone and wires it through the "pcm-recorder"
// AudioWorklet, whose posted frames are forwarded over the websocket.
// Also creates the gain node that playback buffers connect to.
// Throws if mic permission is denied or the worklet module fails to load.
async function startAudioCapture() {
audioContext = new AudioContext();
// Autoplay policy can leave a fresh context suspended until resumed.
if (audioContext.state === "suspended") {
await audioContext.resume();
}
playbackNode = audioContext.createGain();
playbackNode.connect(audioContext.destination);
// The query param busts the browser cache when the worklet file changes.
await audioContext.audioWorklet.addModule("/static/pcm-worklet.js?v=20260503c");
mediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
sampleRate: { ideal: 48000 },
},
});
sourceNode = audioContext.createMediaStreamSource(mediaStream);
workletNode = new AudioWorkletNode(audioContext, "pcm-recorder");
// Relay worklet-produced frames only while the socket is open.
workletNode.port.onmessage = (event) => {
if (socket?.readyState === WebSocket.OPEN) {
socket.send(event.data);
}
};
sourceNode.connect(workletNode);
}
// Tears down mic capture and playback and releases the audio device.
// Safe to call repeatedly or before startAudioCapture ran (all optional-chained).
function stopAudioCapture() {
  workletNode?.disconnect();
  sourceNode?.disconnect();
  mediaStream?.getTracks().forEach((track) => track.stop());
  stopPlayback();
  // Fix: close() returns a promise and rejects if the context is already
  // closed; swallow it so teardown never surfaces an unhandled rejection.
  audioContext?.close().catch(() => {});
  workletNode = null;
  sourceNode = null;
  mediaStream = null;
  audioContext = null;
}
// Absolute AudioContext time up to which playback has been scheduled.
let playbackClock = 0;
// Converts a PCM16 chunk to float samples and schedules it to play
// back-to-back after any previously queued audio.
function queuePlayback(pcm16) {
if (!audioContext) {
return;
}
// Int16 → Float32 in [-1, 1).
const float32 = new Float32Array(pcm16.length);
for (let i = 0; i < pcm16.length; i += 1) {
float32[i] = pcm16[i] / 32768;
}
// 24000 Hz is hard-coded — presumably the server's output rate; confirm
// against the sampleRate announced in session.ready.
const buffer = audioContext.createBuffer(1, float32.length, 24000);
buffer.copyToChannel(float32, 0);
const node = audioContext.createBufferSource();
node.buffer = buffer;
node.connect(playbackNode);
activePlaybackNodes.add(node);
node.onended = () => {
activePlaybackNodes.delete(node);
node.disconnect();
// Last chunk finished and the server already said "done": close the turn.
if (activePlaybackNodes.size === 0 && assistantDonePending) {
finishAssistantTurn();
}
};
const now = audioContext.currentTime;
// Never schedule in the past; 5 ms of headroom avoids an immediate-start glitch.
playbackClock = Math.max(playbackClock, now + 0.005);
node.start(playbackClock);
playbackClock += buffer.duration;
}
// Immediately halts all queued assistant audio (used for barge-in and
// interrupts) and resets the playback schedule.
function stopPlayback() {
activePlaybackNodes.forEach((node) => {
try {
node.stop();
} catch (error) {
// Ignore nodes that have already ended.
}
node.disconnect();
});
activePlaybackNodes.clear();
playbackClock = 0;
// onended may not fire synchronously after stop(), so settle a deferred
// "done" here as well.
if (assistantDonePending) {
finishAssistantTurn();
}
}
// Single button toggles the session on/off (see startSession).
connectButton.addEventListener("click", startSession);