Spaces:

DataEyond
/

Demo-Frontend-Voice-Agent

Sleeping

App Files Files Community

ishaq101 committed 30 days ago

Commit

b919616

1 Parent(s): 8172156

Feat: tap to speak, speaker button in bubble chat

Browse files

Files changed (10) hide show

.gitignore +1 -0
server.js +1 -1
src/app/components/Main.tsx +65 -62
src/app/components/chat/FeedbackWidget.tsx +3 -1
src/app/components/chat/MessageBubble.tsx +1 -0
src/app/components/chat/VoiceMicButton.tsx +2 -9
src/app/components/chat/VoiceStatusBar.tsx +0 -2
src/app/components/chat/types.ts +1 -0
src/hooks/useVoiceSession.ts +58 -231
src/services/voiceApi.ts +56 -0

.gitignore CHANGED Viewed

@@ -40,6 +40,7 @@ API_CONTRACT_CHATBOT.md
 API_CONTRACT_VOICE.md
 STYLE.md
 HIGHLIGHT_VOICE.md
 # Database logos (served via CDN)
 public/databases/

 API_CONTRACT_VOICE.md
 STYLE.md
 HIGHLIGHT_VOICE.md
+HIGHLIGHT_STT_TTS.md
 # Database logos (served via CDN)
 public/databases/

server.js CHANGED Viewed

@@ -26,7 +26,7 @@ const MIME = {
 };
 console.log(`Starting server on port ${PORT}`);
-console.log(`Backend URL: ${BACKEND_URL || "(not set)"}`);
 const server = http.createServer((req, res) => {
   const parsed = url.parse(req.url);

 };
 console.log(`Starting server on port ${PORT}`);
+// console.log(`Backend URL: ${BACKEND_URL || "(not set)"}`);
 const server = http.createServer((req, res) => {
   const parsed = url.parse(req.url);

src/app/components/Main.tsx CHANGED Viewed

@@ -11,7 +11,7 @@ import {
   streamChat,
   type ChatSource,
 } from "../../services/api";
-import { textToSpeech } from "../../services/voiceApi";
 import { AudioPlayer } from "../../audio/AudioPlayer";
 import ChatLayout from "./chat/ChatLayout";
 import Sidebar from "./chat/Sidebar";
@@ -19,6 +19,7 @@ import ChatWindow from "./chat/ChatWindow";
 import ChatInput from "./chat/ChatInput";
 import VoiceStatusBar from "./chat/VoiceStatusBar";
 import { useVoiceSession } from "../../hooks/useVoiceSession";
 import type { Message, ChatSession, StoredUser } from "./chat/types";
 interface ChatRoom {
@@ -41,8 +42,8 @@ export default function Main() {
   const [knowledgeOpen, setKnowledgeOpen] = useState(false);
   const [mobileSidebarOpen, setMobileSidebarOpen] = useState(false);
   const abortControllerRef = useRef<AbortController | null>(null);
-  const ttsPlayerRef = useRef<AudioPlayer | null>(null);
   const isVoiceActiveRef = useRef(false);
   // Stable refs so voice callbacks always see the latest values
   const currentChatIdRef = useRef<string | null>(null);
@@ -167,10 +168,18 @@ export default function Main() {
   };
   const handleSend = useCallback(async (text: string) => {
-    if (!user) return;
     let roomId = await ensureRoom(text.slice(0, 50));
-    if (!roomId) return;
     const userMessage: Message = {
       id: crypto.randomUUID(),
@@ -217,10 +226,7 @@ export default function Main() {
     abortControllerRef.current = new AbortController();
-    // TTS state — only used when voice is active
-    const audioChunks: ArrayBuffer[] = [];
-    let audioSampleRate = 24000;
-    let ttsChain = Promise.resolve();
     try {
       const response = await streamChat(user.user_id, roomId, text);
@@ -287,23 +293,8 @@ export default function Main() {
                     : chat
                 )
               );
-            } else if (currentEvent === "audio" && data && isVoiceActiveRef.current) {
-              // TTS: only during active voice session
-              const sentence = data;
-              ttsChain = ttsChain.then(async () => {
-                try {
-                  const { pcm, sampleRate } = await textToSpeech(sentence);
-                  audioChunks.push(pcm);
-                  audioSampleRate = sampleRate;
-                  if (!ttsPlayerRef.current) {
-                    ttsPlayerRef.current = new AudioPlayer();
-                  }
-                  ttsPlayerRef.current.init(sampleRate);
-                  ttsPlayerRef.current.enqueue(pcm);
-                } catch {
-                  // TTS failure is non-fatal; chat continues
-                }
-              });
             } else if (currentEvent === "done") {
               break;
             }
@@ -311,28 +302,7 @@ export default function Main() {
         }
       }
-      // Wait for all queued TTS calls to finish
-      await ttsChain;
-      // Reload messages from server (may reassign IDs)
       await loadRoomMessages(roomId);
-      // Re-attach audio chunks to the last assistant message after server sync
-      if (audioChunks.length > 0) {
-        setChats((prev) =>
-          prev.map((chat) => {
-            if (chat.id !== roomId) return chat;
-            const msgs = [...chat.messages];
-            for (let i = msgs.length - 1; i >= 0; i--) {
-              if (msgs[i].role === "assistant") {
-                msgs[i] = { ...msgs[i], audioChunks, audioSampleRate };
-                break;
-              }
-            }
-            return { ...chat, messages: msgs };
-          })
-        );
-      }
     } catch (err: unknown) {
       if ((err as Error).name !== "AbortError") {
         setChats((prev) =>
@@ -352,35 +322,68 @@ export default function Main() {
               : chat
           )
         );
       }
     } finally {
       setIsStreaming(false);
       setStreamingMsgId(null);
       abortControllerRef.current = null;
-      ttsPlayerRef.current = null;
     }
   }, [user, ensureRoom, loadRoomMessages]);
-  // Voice callbacks
-  const handleVoiceTranscript = useCallback(async (text: string) => {
-    // Route the STT transcript through the chatbot (same as typing)
-    await handleSend(text);
-  }, [handleSend]);
-  const { voiceState, start, stop, isActive: isVoiceActive } = useVoiceSession({
-    onTranscript: handleVoiceTranscript,
-    onReply: () => { /* WebSocket reply ignored — chatbot SSE handles response */ },
-    bypassWsTts: true,
-    sessionParams: user ? { userId: user.user_id, fullname: user.name } : undefined,
   });
-  // Keep isVoiceActiveRef in sync so handleSend can read it synchronously
   useEffect(() => { isVoiceActiveRef.current = isVoiceActive; }, [isVoiceActive]);
   const handleVoiceToggle = useCallback(() => {
-    if (isVoiceActive) stop();
-    else start();
-  }, [isVoiceActive, start, stop]);
   const sessions: ChatSession[] = chats.map((c) => ({
     id: c.id,

   streamChat,
   type ChatSource,
 } from "../../services/api";
+import { textToSpeechStreaming } from "../../services/voiceApi";
 import { AudioPlayer } from "../../audio/AudioPlayer";
 import ChatLayout from "./chat/ChatLayout";
 import Sidebar from "./chat/Sidebar";
 import ChatInput from "./chat/ChatInput";
 import VoiceStatusBar from "./chat/VoiceStatusBar";
 import { useVoiceSession } from "../../hooks/useVoiceSession";
+import type { VoiceState } from "../../hooks/useVoiceSession";
 import type { Message, ChatSession, StoredUser } from "./chat/types";
 interface ChatRoom {
   const [knowledgeOpen, setKnowledgeOpen] = useState(false);
   const [mobileSidebarOpen, setMobileSidebarOpen] = useState(false);
   const abortControllerRef = useRef<AbortController | null>(null);
   const isVoiceActiveRef = useRef(false);
+  const setVoiceStateRef = useRef<((s: VoiceState) => void) | null>(null);
   // Stable refs so voice callbacks always see the latest values
   const currentChatIdRef = useRef<string | null>(null);
   };
   const handleSend = useCallback(async (text: string) => {
+    console.log("[handleSend] called, user:", user?.user_id ?? "null", "text:", text.slice(0, 40));
+    if (!user) {
+      console.warn("[handleSend] early return: no user");
+      return;
+    }
     let roomId = await ensureRoom(text.slice(0, 50));
+    console.log("[handleSend] roomId:", roomId);
+    if (!roomId) {
+      console.warn("[handleSend] early return: no roomId");
+      return;
+    }
     const userMessage: Message = {
       id: crypto.randomUUID(),
     abortControllerRef.current = new AbortController();
+    let audioText = "";
     try {
       const response = await streamChat(user.user_id, roomId, text);
                     : chat
                 )
               );
+            } else if (currentEvent === "audio_text" && data) {
+              audioText = data;
             } else if (currentEvent === "done") {
               break;
             }
         }
       }
       await loadRoomMessages(roomId);
     } catch (err: unknown) {
       if ((err as Error).name !== "AbortError") {
         setChats((prev) =>
               : chat
           )
         );
+        audioText = "";
       }
     } finally {
       setIsStreaming(false);
       setStreamingMsgId(null);
       abortControllerRef.current = null;
     }
+    return audioText;
   }, [user, ensureRoom, loadRoomMessages]);
+  /**
+   * Streams synthesized speech for `ttsText` and plays it chunk by chunk.
+   *
+   * Resolves when the estimated playback time has elapsed, when the voice
+   * session is no longer active, or when the request fails. Failures are
+   * swallowed on purpose: a missing voice reply must not break the chat.
+   *
+   * The player lives only inside this call, so the hook's `stop()` cannot
+   * reach it. Playback therefore watches `isVoiceActiveRef` and shuts itself
+   * down when the user ends the session.
+   */
+  const playTtsAudio = useCallback(async (ttsText: string) => {
+    let player: AudioPlayer | null = null;
+    let reader: ReadableStreamDefaultReader<Uint8Array> | null = null;
+    try {
+      const { sampleRate, stream } = await textToSpeechStreaming(ttsText);
+      player = new AudioPlayer();
+      player.init(sampleRate);
+      reader = stream.getReader();
+      let totalBytes = 0;
+      // Stop pulling audio as soon as the voice session is ended.
+      while (isVoiceActiveRef.current) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        if (value && value.byteLength > 0) {
+          // Copy exactly this chunk's bytes; `value` may be a view into a larger buffer.
+          const pcm = value.buffer.slice(value.byteOffset, value.byteOffset + value.byteLength) as ArrayBuffer;
+          player.enqueue(pcm);
+          totalBytes += value.byteLength;
+        }
+      }
+      // Playback estimate: 2 bytes per sample (assumes 16-bit mono PCM — TODO confirm
+      // against the backend), plus a 300 ms tail so the last chunk is not clipped.
+      const deadline = Date.now() + (totalBytes / 2 / sampleRate) * 1000 + 300;
+      // Wait in short slices so a cancelled session is noticed quickly.
+      while (isVoiceActiveRef.current && Date.now() < deadline) {
+        const sliceMs = Math.max(0, Math.min(100, deadline - Date.now()));
+        await new Promise<void>((resolve) => setTimeout(resolve, sliceMs));
+      }
+    } catch {
+      // TTS failure is non-fatal
+    } finally {
+      // Always clean up, including after a mid-stream error or a cancellation.
+      void reader?.cancel().catch(() => {});
+      player?.stopImmediately();
+    }
+  }, []);
+  const { voiceState, start, stop, stopRecording, setStateExternal, isActive: isVoiceActive } = useVoiceSession({
+    // Sends the transcript through the same chat pipeline as typed input and
+    // then speaks the reply. The hook calls this without awaiting it, so any
+    // failure must be handled here and the voice state must always be
+    // released; otherwise a rejected send leaves the session in PROCESSING.
+    onTranscript: async (text: string) => {
+      console.log("[onTranscript] received:", text);
+      try {
+        const audioText = await handleSend(text);
+        console.log("[onTranscript] handleSend done, audioText:", audioText ? audioText.slice(0, 40) : "(empty)");
+        if (audioText && isVoiceActiveRef.current) {
+          setVoiceStateRef.current?.("SPEAKING");
+          await playTtsAudio(audioText);
+        }
+      } catch (err: unknown) {
+        // handleSend can reject before its own try block (e.g. while creating the room).
+        console.error("[onTranscript] failed:", err);
+      } finally {
+        // Only reset when the session is still running; a stopped session is already IDLE.
+        if (isVoiceActiveRef.current) setVoiceStateRef.current?.("IDLE");
+      }
+    },
+    sessionParams: {},
   });
+  // Keep refs in sync with latest values
   useEffect(() => { isVoiceActiveRef.current = isVoiceActive; }, [isVoiceActive]);
+  useEffect(() => { setVoiceStateRef.current = setStateExternal; }, [setStateExternal]);
   const handleVoiceToggle = useCallback(() => {
+    // Mic button behaviour: the first tap starts a session, a tap while
+    // listening submits the recording, and a tap in any other active state
+    // ends the session.
+    if (isVoiceActive && voiceState === "LISTENING") {
+      stopRecording();
+      return;
+    }
+    if (isVoiceActive) {
+      stop();
+      return;
+    }
+    start();
+  }, [isVoiceActive, voiceState, start, stop, stopRecording]);
   const sessions: ChatSession[] = chats.map((c) => ({
     id: c.id,

src/app/components/chat/FeedbackWidget.tsx CHANGED Viewed

@@ -6,12 +6,14 @@ import { textToSpeech } from "../../../services/voiceApi";
 interface FeedbackWidgetProps {
   messageId: string;
   content: string;
   audioChunks?: ArrayBuffer[];
   audioSampleRate?: number;
 }
 export default function FeedbackWidget({
   content,
   audioChunks,
   audioSampleRate,
 }: FeedbackWidgetProps) {
@@ -69,7 +71,7 @@ export default function FeedbackWidget({
       // Text mode: request TTS now
       setSpeakerState("loading");
       try {
-        const { pcm, sampleRate } = await textToSpeech(content);
         setSpeakerState("playing");
         const cancel = replayAudio([pcm], sampleRate);
         cancelPlayRef.current = cancel;

 interface FeedbackWidgetProps {
   messageId: string;
   content: string;
+  audioText: string;
   audioChunks?: ArrayBuffer[];
   audioSampleRate?: number;
 }
 export default function FeedbackWidget({
   content,
+  audioText,
   audioChunks,
   audioSampleRate,
 }: FeedbackWidgetProps) {
       // Text mode: request TTS now
       setSpeakerState("loading");
       try {
+        const { pcm, sampleRate } = await textToSpeech(audioText);
         setSpeakerState("playing");
         const cancel = replayAudio([pcm], sampleRate);
         cancelPlayRef.current = cancel;

src/app/components/chat/MessageBubble.tsx CHANGED Viewed

@@ -69,6 +69,7 @@ export default function MessageBubble({ message, isStreamingPlaceholder }: Messa
         <FeedbackWidget
           messageId={message.id}
           content={message.content}
           audioChunks={message.audioChunks}
           audioSampleRate={message.audioSampleRate}
         />

         <FeedbackWidget
           messageId={message.id}
           content={message.content}
+          audioText={message.audioText}
           audioChunks={message.audioChunks}
           audioSampleRate={message.audioSampleRate}
         />

src/app/components/chat/VoiceMicButton.tsx CHANGED Viewed

@@ -1,5 +1,5 @@
 import { motion } from "motion/react";
-import { Loader2, Mic, MicOff, Volume2, WifiOff } from "lucide-react";
 import type { VoiceState } from "../../../hooks/useVoiceSession";
 interface VoiceMicButtonProps {
@@ -9,7 +9,7 @@ interface VoiceMicButtonProps {
 }
 export default function VoiceMicButton({ voiceState, onToggle, disabled }: VoiceMicButtonProps) {
-  const isDisabled = disabled || voiceState === "RECONNECTING";
   const stateConfig: Record<
     VoiceState,
@@ -50,13 +50,6 @@ export default function VoiceMicButton({ voiceState, onToggle, disabled }: Voice
       pulse: false,
       scalePulse: true,
     },
-    RECONNECTING: {
-      icon: <WifiOff className="h-4 w-4" />,
-      className: "bg-neutral-200 text-neutral-400 cursor-not-allowed",
-      title: "Reconnecting...",
-      pulse: false,
-      scalePulse: false,
-    },
     ERROR: {
       icon: <MicOff className="h-4 w-4" />,
       className: "bg-red-100 text-red-400 hover:bg-red-200",

 import { motion } from "motion/react";
+import { Loader2, Mic, MicOff, Volume2 } from "lucide-react";
 import type { VoiceState } from "../../../hooks/useVoiceSession";
 interface VoiceMicButtonProps {
 }
 export default function VoiceMicButton({ voiceState, onToggle, disabled }: VoiceMicButtonProps) {
+  const isDisabled = disabled ?? false;
   const stateConfig: Record<
     VoiceState,
       pulse: false,
       scalePulse: true,
     },
     ERROR: {
       icon: <MicOff className="h-4 w-4" />,
       className: "bg-red-100 text-red-400 hover:bg-red-200",

src/app/components/chat/VoiceStatusBar.tsx CHANGED Viewed

@@ -13,7 +13,6 @@ const STATE_LABELS: Record<VoiceState, string> = {
   LISTENING: "Listening...",
   PROCESSING: "Processing...",
   SPEAKING: "Agent is speaking",
-  RECONNECTING: "Reconnecting...",
   ERROR: "Connection error",
 };
@@ -23,7 +22,6 @@ const STATE_COLORS: Record<VoiceState, string> = {
   LISTENING: "bg-brand-green/10 text-brand-green border-brand-green/20",
   PROCESSING: "bg-brand-amber/10 text-brand-amber border-brand-amber/20",
   SPEAKING: "bg-brand-cyan/10 text-brand-cyan border-brand-cyan/20",
-  RECONNECTING: "bg-neutral-100 text-neutral-500 border-neutral-200",
   ERROR: "bg-red-50 text-red-500 border-red-200",
 };

   LISTENING: "Listening...",
   PROCESSING: "Processing...",
   SPEAKING: "Agent is speaking",
   ERROR: "Connection error",
 };
   LISTENING: "bg-brand-green/10 text-brand-green border-brand-green/20",
   PROCESSING: "bg-brand-amber/10 text-brand-amber border-brand-amber/20",
   SPEAKING: "bg-brand-cyan/10 text-brand-cyan border-brand-cyan/20",
   ERROR: "bg-red-50 text-red-500 border-red-200",
 };

src/app/components/chat/types.ts CHANGED Viewed

@@ -4,6 +4,7 @@ export interface Message {
   id: string;
   role: "user" | "assistant";
   content: string;
   timestamp: number;
   sources?: ChatSource[];
   /** PCM audio chunks from TTS — only populated when sent via voice mode */

   id: string;
   role: "user" | "assistant";
   content: string;
+  audioText: string;
   timestamp: number;
   sources?: ChatSource[];
   /** PCM audio chunks from TTS — only populated when sent via voice mode */

src/hooks/useVoiceSession.ts CHANGED Viewed

@@ -1,6 +1,7 @@
 import { useState, useRef, useEffect, useCallback } from "react";
 import { AudioRecorder } from "../audio/AudioRecorder";
 import { AudioPlayer } from "../audio/AudioPlayer";
 export type VoiceState =
   | "IDLE"
@@ -8,28 +9,16 @@ export type VoiceState =
   | "LISTENING"
   | "PROCESSING"
   | "SPEAKING"
-  | "RECONNECTING"
   | "ERROR";
 export interface VoiceSessionParams {
-  userId?: string;
-  fullname?: string;
-  company?: string;
-  function?: string;
-  site?: string;
-  role?: string;
-  agent?: string;
   sttProvider?: string;
   ttsProvider?: string;
-  wakeWordEnabled?: boolean;
 }
 interface UseVoiceSessionOptions {
   onTranscript: (text: string) => void;
-  onReply: (text: string) => void;
   onError?: (code: string, message: string) => void;
-  /** When true, binary TTS audio frames from the WebSocket are ignored. */
-  bypassWsTts?: boolean;
   sessionParams?: VoiceSessionParams;
 }
@@ -37,68 +26,39 @@ export interface UseVoiceSessionReturn {
   voiceState: VoiceState;
   start: () => Promise<void>;
   stop: () => void;
   isActive: boolean;
 }
-const BARGE_IN_THRESHOLD = 500;
-const MAX_RECONNECT_ATTEMPTS = 10;
-const HEARTBEAT_INTERVAL_MS = 20_000;
-const PONG_TIMEOUT_MS = 5_000;
 const BUFFER_SOUNDS = [
   "/sounds/01_Pertanyaan_bagus_mohon_ditunggu_sebentar.wav",
   "/sounds/02_Oke_menararik_banget_Sebentar_ya_saya_se.wav",
   "/sounds/03_Sip_aku_sudah_dengar_pertanyaanmu_Tunggu.wav",
 ];
 function getVoiceHttpBaseUrl(): string {
   return (import.meta as unknown as { env: Record<string, string> }).env
     .VITE_API_BASE_VOICE_URL ?? "http://localhost:7861";
 }
-function buildWsUrl(params: VoiceSessionParams): string {
-  const base = (import.meta as unknown as { env: Record<string, string> }).env
-    .VITE_API_BASE_VOICE_WS_URL ?? "ws://localhost:7861";
-  const p = new URLSearchParams({
-    user_id:           params.userId          ?? "anonymous",
-    fullname:          params.fullname         ?? "",
-    company:           params.company          ?? "",
-    function:          params.function         ?? "",
-    site:              params.site             ?? "HO",
-    role:              params.role             ?? "engineer",
-    agent:             params.agent            ?? "analysis",
-    stt_provider:      params.sttProvider      ?? "gemini",
-    tts_provider:      params.ttsProvider      ?? "gemini",
-    wake_word_enabled: String(params.wakeWordEnabled ?? false),
-  });
-  return `${base}/ws/voice?${p}`;
-}
 export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionReturn {
   const [voiceState, setVoiceState] = useState<VoiceState>("IDLE");
   const stateRef = useRef<VoiceState>("IDLE");
-  const wsRef = useRef<WebSocket | null>(null);
   const recorderRef = useRef<AudioRecorder | null>(null);
   const playerRef = useRef<AudioPlayer | null>(null);
-  const heartbeatTimerRef = useRef<ReturnType<typeof setInterval> | null>(null);
-  const pongTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null);
-  const rafRef = useRef<number | null>(null);
-  const reconnectAttemptRef = useRef(0);
   const bufferAudioRef = useRef<HTMLAudioElement | null>(null);
   const lastBufferIndexRef = useRef<number>(-1);
-  // Keep opts in a ref so callbacks never go stale
   const optsRef = useRef(opts);
   useEffect(() => { optsRef.current = opts; });
-  const setState = useCallback((s: VoiceState) => {
-    stateRef.current = s;
-    setVoiceState(s);
-  }, []);
   const stopBufferSound = useCallback(() => {
     if (bufferAudioRef.current) {
       bufferAudioRef.current.pause();
@@ -107,6 +67,15 @@ export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionRe
     }
   }, []);
   const playBufferSound = useCallback(() => {
     stopBufferSound();
     let idx: number;
@@ -119,198 +88,57 @@ export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionRe
     audio.play().catch(() => {});
   }, [stopBufferSound]);
-  const clearHeartbeat = useCallback(() => {
-    if (heartbeatTimerRef.current) clearInterval(heartbeatTimerRef.current);
-    if (pongTimeoutRef.current) clearTimeout(pongTimeoutRef.current);
-    heartbeatTimerRef.current = null;
-    pongTimeoutRef.current = null;
-  }, []);
-  const stopBargeInLoop = useCallback(() => {
-    if (rafRef.current !== null) {
-      cancelAnimationFrame(rafRef.current);
-      rafRef.current = null;
-    }
-  }, []);
-  const closeWs = useCallback(() => {
-    if (wsRef.current) {
-      wsRef.current.onopen = null;
-      wsRef.current.onmessage = null;
-      wsRef.current.onerror = null;
-      wsRef.current.onclose = null;
-      wsRef.current.close();
-      wsRef.current = null;
-    }
-  }, []);
   const stopSession = useCallback(() => {
-    stopBargeInLoop();
-    clearHeartbeat();
-    stopBufferSound();
-    if (wsRef.current?.readyState === WebSocket.OPEN) {
-      wsRef.current.send(JSON.stringify({ action: "stop" }));
-    }
-    closeWs();
     recorderRef.current?.stop();
     playerRef.current?.stopImmediately();
-    reconnectAttemptRef.current = 0;
-    setState("IDLE");
-  }, [clearHeartbeat, closeWs, setState, stopBargeInLoop, stopBufferSound]);
-  const startBargeInLoop = useCallback(() => {
-    const check = () => {
-      if (stateRef.current !== "SPEAKING") {
-        rafRef.current = null;
-        return;
-      }
-      const level = recorderRef.current?.micLevel ?? 0;
-      if (level > BARGE_IN_THRESHOLD && wsRef.current?.readyState === WebSocket.OPEN) {
-        wsRef.current.send(JSON.stringify({ action: "interrupt" }));
-        playerRef.current?.stopImmediately();
-        playerRef.current?.init();
-      }
-      rafRef.current = requestAnimationFrame(check);
-    };
-    rafRef.current = requestAnimationFrame(check);
-  }, []);
-  const startHeartbeat = useCallback(() => {
-    clearHeartbeat();
-    heartbeatTimerRef.current = setInterval(() => {
-      if (wsRef.current?.readyState === WebSocket.OPEN) {
-        wsRef.current.send(JSON.stringify({ action: "ping" }));
-        pongTimeoutRef.current = setTimeout(() => {
-          // No pong received — reconnect
-          wsRef.current?.close();
-        }, PONG_TIMEOUT_MS);
-      }
-    }, HEARTBEAT_INTERVAL_MS);
-  }, [clearHeartbeat]);
-  const openWebSocket = useCallback(() => {
-    closeWs();
-    const ws = new WebSocket(buildWsUrl(optsRef.current.sessionParams ?? {}));
-    ws.binaryType = "arraybuffer";
-    wsRef.current = ws;
-    ws.onopen = () => {
-      reconnectAttemptRef.current = 0;
-      // Stay in CONNECTING until tts_config is received
-      startHeartbeat();
-    };
-    ws.onmessage = (event) => {
-      if (event.data instanceof ArrayBuffer) {
-        // Binary frame = TTS audio chunk from voice backend
-        if (!optsRef.current.bypassWsTts) {
-          if (stateRef.current === "SPEAKING" || stateRef.current === "PROCESSING") {
-            if (stateRef.current === "PROCESSING") {
-              stopBufferSound();
-              setState("SPEAKING");
-              startBargeInLoop();
-            }
-            playerRef.current?.enqueue(event.data);
-          }
-        }
-        return;
-      }
       try {
-        const msg = JSON.parse(event.data as string);
-        switch (msg.event) {
-          case "tts_config": {
-            const sampleRate = (msg.sample_rate as number) ?? 16000;
-            playerRef.current?.stopImmediately();
-            playerRef.current?.init(sampleRate);
-            setState("LISTENING");
-            break;
-          }
-          case "transcript":
-            if (!msg.is_partial) {
-              setState("PROCESSING");
-              stopBargeInLoop();
-              playBufferSound();
-              optsRef.current.onTranscript(msg.text as string);
-            }
-            break;
-          case "reply":
-            optsRef.current.onReply(msg.text as string);
-            break;
-          case "tts_end":
-            playerRef.current?.drain();
-            setState("LISTENING");
-            stopBargeInLoop();
-            break;
-          case "interrupted":
-            stopBufferSound();
-            playerRef.current?.stopImmediately();
-            setState("LISTENING");
-            stopBargeInLoop();
-            break;
-          case "pong":
-            if (pongTimeoutRef.current) clearTimeout(pongTimeoutRef.current);
-            break;
-          case "error":
-            optsRef.current.onError?.(msg.code as string, msg.message as string);
-            break;
         }
-      } catch {
-        // non-JSON frame — ignore
-      }
-    };
-    ws.onerror = () => {
-      // onclose will fire next and handle reconnect
-    };
-    ws.onclose = () => {
-      clearHeartbeat();
-      stopBargeInLoop();
-      recorderRef.current?.stop();
-      if (stateRef.current === "IDLE") return; // intentional stop
-      // Reconnect with exponential backoff
-      if (reconnectAttemptRef.current >= MAX_RECONNECT_ATTEMPTS) {
-        setState("ERROR");
-        return;
-      }
-      setState("RECONNECTING");
-      const delay = Math.min(Math.pow(2, reconnectAttemptRef.current), 8) * 1000;
-      reconnectAttemptRef.current++;
-      setTimeout(async () => {
-        if (stateRef.current !== "RECONNECTING") return;
-        try {
-          if (!recorderRef.current) recorderRef.current = new AudioRecorder();
-          await recorderRef.current.start((chunk) => {
-            if (wsRef.current?.readyState === WebSocket.OPEN) {
-              wsRef.current.send(chunk);
-            }
-          });
-          setState("CONNECTING");
-          openWebSocket();
-        } catch {
-          setState("ERROR");
         }
-      }, delay);
-    };
-  }, [clearHeartbeat, closeWs, playBufferSound, setState, startBargeInLoop, startHeartbeat, stopBargeInLoop, stopBufferSound]);
   const start = useCallback(async () => {
     if (stateRef.current !== "IDLE" && stateRef.current !== "ERROR") return;
     setState("CONNECTING");
-    // Health check — best-effort: don't block connect if endpoint unreachable
     try {
       const res = await fetch(`${getVoiceHttpBaseUrl()}/health`);
       if (res.ok) {
@@ -326,26 +154,23 @@ export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionRe
     }
     try {
       if (!recorderRef.current) recorderRef.current = new AudioRecorder();
       if (!playerRef.current) playerRef.current = new AudioPlayer();
-      await recorderRef.current.start((chunk) => {
-        if (wsRef.current?.readyState === WebSocket.OPEN) {
-          wsRef.current.send(chunk);
         }
       });
-      // Init player inside user-gesture chain to satisfy autoplay policy
-      playerRef.current.init();
-      openWebSocket();
     } catch {
       recorderRef.current?.stop();
       setState("ERROR");
     }
-  }, [openWebSocket, setState]);
-  // Cleanup on unmount
   useEffect(() => {
     return () => {
       stopSession();
@@ -357,6 +182,8 @@ export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionRe
     voiceState,
     start,
     stop: stopSession,
     isActive: voiceState !== "IDLE" && voiceState !== "ERROR",
   };
 }

 import { useState, useRef, useEffect, useCallback } from "react";
 import { AudioRecorder } from "../audio/AudioRecorder";
 import { AudioPlayer } from "../audio/AudioPlayer";
+import { createWavBlob, speechToText } from "../services/voiceApi";
 export type VoiceState =
   | "IDLE"
   | "LISTENING"
   | "PROCESSING"
   | "SPEAKING"
   | "ERROR";
 export interface VoiceSessionParams {
   sttProvider?: string;
   ttsProvider?: string;
 }
 interface UseVoiceSessionOptions {
   onTranscript: (text: string) => void;
   onError?: (code: string, message: string) => void;
   sessionParams?: VoiceSessionParams;
 }
   voiceState: VoiceState;
   start: () => Promise<void>;
   stop: () => void;
+  stopRecording: () => void;
+  setStateExternal: (s: VoiceState) => void;
   isActive: boolean;
 }
 const BUFFER_SOUNDS = [
   "/sounds/01_Pertanyaan_bagus_mohon_ditunggu_sebentar.wav",
   "/sounds/02_Oke_menararik_banget_Sebentar_ya_saya_se.wav",
   "/sounds/03_Sip_aku_sudah_dengar_pertanyaanmu_Tunggu.wav",
 ];
+const RECORDER_SAMPLE_RATE = 16000;
 function getVoiceHttpBaseUrl(): string {
   return (import.meta as unknown as { env: Record<string, string> }).env
     .VITE_API_BASE_VOICE_URL ?? "http://localhost:7861";
 }
 export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionReturn {
   const [voiceState, setVoiceState] = useState<VoiceState>("IDLE");
   const stateRef = useRef<VoiceState>("IDLE");
   const recorderRef = useRef<AudioRecorder | null>(null);
   const playerRef = useRef<AudioPlayer | null>(null);
+  const chunksRef = useRef<ArrayBuffer[]>([]);
   const bufferAudioRef = useRef<HTMLAudioElement | null>(null);
   const lastBufferIndexRef = useRef<number>(-1);
   const optsRef = useRef(opts);
   useEffect(() => { optsRef.current = opts; });
+  // Defined before setState so setState can call it without circular deps.
   const stopBufferSound = useCallback(() => {
     if (bufferAudioRef.current) {
       bufferAudioRef.current.pause();
     }
   }, []);
+  // Auto-stops the buffer audio when the waiting phase ends (TTS about to start, or session ends).
+  const setState = useCallback((s: VoiceState) => {
+    if (s === "SPEAKING" || s === "IDLE" || s === "ERROR") {
+      stopBufferSound();
+    }
+    stateRef.current = s;
+    setVoiceState(s);
+  }, [stopBufferSound]);
   const playBufferSound = useCallback(() => {
     stopBufferSound();
     let idx: number;
     audio.play().catch(() => {});
   }, [stopBufferSound]);
   const stopSession = useCallback(() => {
     recorderRef.current?.stop();
     playerRef.current?.stopImmediately();
+    chunksRef.current = [];
+    setState("IDLE"); // setState("IDLE") calls stopBufferSound internally
+  }, [setState]);
+  const stopRecording = useCallback(() => {
+    if (stateRef.current !== "LISTENING") return;
+    setState("PROCESSING");
+    recorderRef.current?.stop();
+    // Play buffer audio — it keeps playing through STT and chatbot processing.
+    // It stops automatically when setState("SPEAKING"), setState("IDLE"), or setState("ERROR") is called.
+    playBufferSound();
+    const chunks = chunksRef.current;
+    chunksRef.current = [];
+    void (async () => {
       try {
+        if (chunks.length === 0) {
+          setState("IDLE");
+          return;
         }
+        const wav = createWavBlob(chunks, RECORDER_SAMPLE_RATE);
+        const { text } = await speechToText(wav, optsRef.current.sessionParams?.sttProvider ?? "chirp3");
+        // Guard: session may have been cancelled while STT was in flight.
+        if (stateRef.current !== "PROCESSING") return;
+        if (text.trim()) {
+          console.log("[Voice] STT transcript →", text);
+          // Buffer audio continues to play while Main.tsx calls the chatbot API.
+          // It will stop when setStateExternal("SPEAKING") or setStateExternal("IDLE") is called.
+          optsRef.current.onTranscript(text);
+        } else {
+          setState("IDLE");
         }
+      } catch (err) {
+        console.error("[STT] Request failed:", err);
+        optsRef.current.onError?.("STT_ERROR", (err as Error).message);
+        setState("ERROR"); // setState("ERROR") calls stopBufferSound internally
+      }
+    })();
+  }, [playBufferSound, setState]);
   const start = useCallback(async () => {
     if (stateRef.current !== "IDLE" && stateRef.current !== "ERROR") return;
     setState("CONNECTING");
     try {
       const res = await fetch(`${getVoiceHttpBaseUrl()}/health`);
       if (res.ok) {
     }
     try {
+      chunksRef.current = [];
       if (!recorderRef.current) recorderRef.current = new AudioRecorder();
       if (!playerRef.current) playerRef.current = new AudioPlayer();
+      await recorderRef.current.start((chunk: ArrayBuffer) => {
+        if (stateRef.current === "LISTENING") {
+          chunksRef.current.push(chunk);
         }
       });
+      setState("LISTENING");
     } catch {
       recorderRef.current?.stop();
       setState("ERROR");
     }
+  }, [setState]);
   useEffect(() => {
     return () => {
       stopSession();
     voiceState,
     start,
     stop: stopSession,
+    stopRecording,
+    setStateExternal: setState,
     isActive: voiceState !== "IDLE" && voiceState !== "ERROR",
   };
 }

src/services/voiceApi.ts CHANGED Viewed

@@ -2,6 +2,62 @@ const VOICE_BASE_URL =
   (import.meta as unknown as { env: Record<string, string> }).env
     .VITE_API_BASE_VOICE_URL ?? "http://localhost:7861";
 export async function textToSpeech(
   text: string,
   provider = "gemini"

   (import.meta as unknown as { env: Record<string, string> }).env
     .VITE_API_BASE_VOICE_URL ?? "http://localhost:7861";
+function writeString(view: DataView, offset: number, str: string): void { // Writes `str` into `view` one byte per character, starting at `offset`.
+  for (let i = 0; i < str.length; i++) view.setUint8(offset + i, str.charCodeAt(i)); // one byte per char; the visible callers only pass 4-character ASCII tags
+}
+export function createWavBlob(chunks: ArrayBuffer[], sampleRate: number): Blob { // Prepends a 44-byte RIFF/WAVE header to the concatenated chunks and returns an "audio/wav" Blob.
+  const pcmByteLength = chunks.reduce((sum, c) => sum + c.byteLength, 0); // total payload size in bytes
+  const buffer = new ArrayBuffer(44 + pcmByteLength); // header followed by the payload
+  const view = new DataView(buffer);
+  writeString(view, 0, "RIFF");
+  view.setUint32(4, 36 + pcmByteLength, true); // RIFF chunk size: everything after this field (little-endian)
+  writeString(view, 8, "WAVE");
+  writeString(view, 12, "fmt ");
+  view.setUint32(16, 16, true); // size of the "fmt " sub-chunk
+  view.setUint16(20, 1, true);             // audio format 1 = PCM
+  view.setUint16(22, 1, true);             // channel count: mono
+  view.setUint32(24, sampleRate, true); // sample rate in Hz
+  view.setUint32(28, sampleRate * 2, true); // byte rate: sampleRate x 1 channel x 2 bytes per sample
+  view.setUint16(32, 2, true); // block align: bytes per sample frame
+  view.setUint16(34, 16, true); // bits per sample; chunks are not checked against this declared format
+  writeString(view, 36, "data");
+  view.setUint32(40, pcmByteLength, true); // payload size in bytes
+  let offset = 44; // first byte after the header
+  for (const chunk of chunks) {
+    new Uint8Array(buffer, offset, chunk.byteLength).set(new Uint8Array(chunk)); // copy the chunk verbatim, in order
+    offset += chunk.byteLength;
+  }
+  return new Blob([buffer], { type: "audio/wav" });
+}
+/**
+ * Uploads a WAV recording to the voice backend's `/stt` endpoint.
+ *
+ * @param wavBlob  Recording to transcribe; sent as the `audio` form field.
+ * @param provider Value of the `provider` form field.
+ * @returns The decoded JSON reply.
+ * @throws Error when the response status is not OK, or when the reply does
+ *   not contain a string `text` field.
+ */
+export async function speechToText(
+  wavBlob: Blob,
+  provider = "chirp3"
+): Promise<{ text: string; language: string; duration: number | null }> {
+  const form = new FormData();
+  form.append("audio", wavBlob, "recording.wav");
+  form.append("provider", provider);
+  const res = await fetch(`${VOICE_BASE_URL}/stt`, { method: "POST", body: form });
+  if (!res.ok) throw new Error(`STT error: ${res.status}`);
+  // Callers use `text` as a string right away, so reject malformed replies
+  // here with a clear message instead of failing later on `undefined`.
+  const payload: unknown = await res.json();
+  if (
+    typeof payload !== "object" ||
+    payload === null ||
+    typeof (payload as { text?: unknown }).text !== "string"
+  ) {
+    throw new Error("STT error: malformed response");
+  }
+  return payload as { text: string; language: string; duration: number | null };
+}
+/**
+ * Requests speech synthesis from the voice backend's `/tts` endpoint and
+ * exposes the response body as a stream, so playback can start before the
+ * download has finished.
+ *
+ * @param text     Text to synthesize.
+ * @param provider Value of the `provider` field in the request body.
+ * @returns The sample rate read from the `X-Sample-Rate` response header
+ *   (24000 when the header is missing or not a positive number) and the raw
+ *   response byte stream.
+ * @throws Error when the response status is not OK or the response has no body.
+ */
+export async function textToSpeechStreaming(
+  text: string,
+  provider = "gemini"
+): Promise<{ sampleRate: number; stream: ReadableStream<Uint8Array> }> {
+  const res = await fetch(`${VOICE_BASE_URL}/tts`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({ text, provider }),
+  });
+  if (!res.ok) throw new Error(`TTS error: ${res.status}`);
+  if (!res.body) throw new Error("TTS response has no body");
+  // A malformed header would otherwise yield NaN, which callers feed straight
+  // into the audio player and their duration arithmetic.
+  const headerRate = Number.parseInt(res.headers.get("X-Sample-Rate") ?? "", 10);
+  const sampleRate = Number.isFinite(headerRate) && headerRate > 0 ? headerRate : 24000;
+  return { sampleRate, stream: res.body };
+}
 export async function textToSpeech(
   text: string,
   provider = "gemini"