ishaq101 committed on
Commit
b919616
·
1 Parent(s): 8172156

Feat: tap to speak, speaker button in bubble chat

Browse files
.gitignore CHANGED
@@ -40,6 +40,7 @@ API_CONTRACT_CHATBOT.md
40
  API_CONTRACT_VOICE.md
41
  STYLE.md
42
  HIGHLIGHT_VOICE.md
 
43
 
44
  # Database logos (served via CDN)
45
  public/databases/
 
40
  API_CONTRACT_VOICE.md
41
  STYLE.md
42
  HIGHLIGHT_VOICE.md
43
+ HIGHLIGHT_STT_TTS.md
44
 
45
  # Database logos (served via CDN)
46
  public/databases/
server.js CHANGED
@@ -26,7 +26,7 @@ const MIME = {
26
  };
27
 
28
  console.log(`Starting server on port ${PORT}`);
29
- console.log(`Backend URL: ${BACKEND_URL || "(not set)"}`);
30
 
31
  const server = http.createServer((req, res) => {
32
  const parsed = url.parse(req.url);
 
26
  };
27
 
28
  console.log(`Starting server on port ${PORT}`);
29
+ // console.log(`Backend URL: ${BACKEND_URL || "(not set)"}`);
30
 
31
  const server = http.createServer((req, res) => {
32
  const parsed = url.parse(req.url);
src/app/components/Main.tsx CHANGED
@@ -11,7 +11,7 @@ import {
11
  streamChat,
12
  type ChatSource,
13
  } from "../../services/api";
14
- import { textToSpeech } from "../../services/voiceApi";
15
  import { AudioPlayer } from "../../audio/AudioPlayer";
16
  import ChatLayout from "./chat/ChatLayout";
17
  import Sidebar from "./chat/Sidebar";
@@ -19,6 +19,7 @@ import ChatWindow from "./chat/ChatWindow";
19
  import ChatInput from "./chat/ChatInput";
20
  import VoiceStatusBar from "./chat/VoiceStatusBar";
21
  import { useVoiceSession } from "../../hooks/useVoiceSession";
 
22
  import type { Message, ChatSession, StoredUser } from "./chat/types";
23
 
24
  interface ChatRoom {
@@ -41,8 +42,8 @@ export default function Main() {
41
  const [knowledgeOpen, setKnowledgeOpen] = useState(false);
42
  const [mobileSidebarOpen, setMobileSidebarOpen] = useState(false);
43
  const abortControllerRef = useRef<AbortController | null>(null);
44
- const ttsPlayerRef = useRef<AudioPlayer | null>(null);
45
  const isVoiceActiveRef = useRef(false);
 
46
 
47
  // Stable refs so voice callbacks always see the latest values
48
  const currentChatIdRef = useRef<string | null>(null);
@@ -167,10 +168,18 @@ export default function Main() {
167
  };
168
 
169
  const handleSend = useCallback(async (text: string) => {
170
- if (!user) return;
 
 
 
 
171
 
172
  let roomId = await ensureRoom(text.slice(0, 50));
173
- if (!roomId) return;
 
 
 
 
174
 
175
  const userMessage: Message = {
176
  id: crypto.randomUUID(),
@@ -217,10 +226,7 @@ export default function Main() {
217
 
218
  abortControllerRef.current = new AbortController();
219
 
220
- // TTS state — only used when voice is active
221
- const audioChunks: ArrayBuffer[] = [];
222
- let audioSampleRate = 24000;
223
- let ttsChain = Promise.resolve();
224
 
225
  try {
226
  const response = await streamChat(user.user_id, roomId, text);
@@ -287,23 +293,8 @@ export default function Main() {
287
  : chat
288
  )
289
  );
290
- } else if (currentEvent === "audio" && data && isVoiceActiveRef.current) {
291
- // TTS: only during active voice session
292
- const sentence = data;
293
- ttsChain = ttsChain.then(async () => {
294
- try {
295
- const { pcm, sampleRate } = await textToSpeech(sentence);
296
- audioChunks.push(pcm);
297
- audioSampleRate = sampleRate;
298
- if (!ttsPlayerRef.current) {
299
- ttsPlayerRef.current = new AudioPlayer();
300
- }
301
- ttsPlayerRef.current.init(sampleRate);
302
- ttsPlayerRef.current.enqueue(pcm);
303
- } catch {
304
- // TTS failure is non-fatal; chat continues
305
- }
306
- });
307
  } else if (currentEvent === "done") {
308
  break;
309
  }
@@ -311,28 +302,7 @@ export default function Main() {
311
  }
312
  }
313
 
314
- // Wait for all queued TTS calls to finish
315
- await ttsChain;
316
-
317
- // Reload messages from server (may reassign IDs)
318
  await loadRoomMessages(roomId);
319
-
320
- // Re-attach audio chunks to the last assistant message after server sync
321
- if (audioChunks.length > 0) {
322
- setChats((prev) =>
323
- prev.map((chat) => {
324
- if (chat.id !== roomId) return chat;
325
- const msgs = [...chat.messages];
326
- for (let i = msgs.length - 1; i >= 0; i--) {
327
- if (msgs[i].role === "assistant") {
328
- msgs[i] = { ...msgs[i], audioChunks, audioSampleRate };
329
- break;
330
- }
331
- }
332
- return { ...chat, messages: msgs };
333
- })
334
- );
335
- }
336
  } catch (err: unknown) {
337
  if ((err as Error).name !== "AbortError") {
338
  setChats((prev) =>
@@ -352,35 +322,68 @@ export default function Main() {
352
  : chat
353
  )
354
  );
 
355
  }
356
  } finally {
357
  setIsStreaming(false);
358
  setStreamingMsgId(null);
359
  abortControllerRef.current = null;
360
- ttsPlayerRef.current = null;
361
  }
 
 
362
  }, [user, ensureRoom, loadRoomMessages]);
363
 
364
- // Voice callbacks
365
- const handleVoiceTranscript = useCallback(async (text: string) => {
366
- // Route the STT transcript through the chatbot (same as typing)
367
- await handleSend(text);
368
- }, [handleSend]);
369
-
370
- const { voiceState, start, stop, isActive: isVoiceActive } = useVoiceSession({
371
- onTranscript: handleVoiceTranscript,
372
- onReply: () => { /* WebSocket reply ignored chatbot SSE handles response */ },
373
- bypassWsTts: true,
374
- sessionParams: user ? { userId: user.user_id, fullname: user.name } : undefined,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  });
376
 
377
- // Keep isVoiceActiveRef in sync so handleSend can read it synchronously
378
  useEffect(() => { isVoiceActiveRef.current = isVoiceActive; }, [isVoiceActive]);
 
379
 
380
  const handleVoiceToggle = useCallback(() => {
381
- if (isVoiceActive) stop();
382
- else start();
383
- }, [isVoiceActive, start, stop]);
 
 
 
 
 
384
 
385
  const sessions: ChatSession[] = chats.map((c) => ({
386
  id: c.id,
 
11
  streamChat,
12
  type ChatSource,
13
  } from "../../services/api";
14
+ import { textToSpeechStreaming } from "../../services/voiceApi";
15
  import { AudioPlayer } from "../../audio/AudioPlayer";
16
  import ChatLayout from "./chat/ChatLayout";
17
  import Sidebar from "./chat/Sidebar";
 
19
  import ChatInput from "./chat/ChatInput";
20
  import VoiceStatusBar from "./chat/VoiceStatusBar";
21
  import { useVoiceSession } from "../../hooks/useVoiceSession";
22
+ import type { VoiceState } from "../../hooks/useVoiceSession";
23
  import type { Message, ChatSession, StoredUser } from "./chat/types";
24
 
25
  interface ChatRoom {
 
42
  const [knowledgeOpen, setKnowledgeOpen] = useState(false);
43
  const [mobileSidebarOpen, setMobileSidebarOpen] = useState(false);
44
  const abortControllerRef = useRef<AbortController | null>(null);
 
45
  const isVoiceActiveRef = useRef(false);
46
+ const setVoiceStateRef = useRef<((s: VoiceState) => void) | null>(null);
47
 
48
  // Stable refs so voice callbacks always see the latest values
49
  const currentChatIdRef = useRef<string | null>(null);
 
168
  };
169
 
170
  const handleSend = useCallback(async (text: string) => {
171
+ console.log("[handleSend] called, user:", user?.user_id ?? "null", "text:", text.slice(0, 40));
172
+ if (!user) {
173
+ console.warn("[handleSend] early return: no user");
174
+ return;
175
+ }
176
 
177
  let roomId = await ensureRoom(text.slice(0, 50));
178
+ console.log("[handleSend] roomId:", roomId);
179
+ if (!roomId) {
180
+ console.warn("[handleSend] early return: no roomId");
181
+ return;
182
+ }
183
 
184
  const userMessage: Message = {
185
  id: crypto.randomUUID(),
 
226
 
227
  abortControllerRef.current = new AbortController();
228
 
229
+ let audioText = "";
 
 
 
230
 
231
  try {
232
  const response = await streamChat(user.user_id, roomId, text);
 
293
  : chat
294
  )
295
  );
296
+ } else if (currentEvent === "audio_text" && data) {
297
+ audioText = data;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  } else if (currentEvent === "done") {
299
  break;
300
  }
 
302
  }
303
  }
304
 
 
 
 
 
305
  await loadRoomMessages(roomId);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  } catch (err: unknown) {
307
  if ((err as Error).name !== "AbortError") {
308
  setChats((prev) =>
 
322
  : chat
323
  )
324
  );
325
+ audioText = "";
326
  }
327
  } finally {
328
  setIsStreaming(false);
329
  setStreamingMsgId(null);
330
  abortControllerRef.current = null;
 
331
  }
332
+
333
+ return audioText;
334
  }, [user, ensureRoom, loadRoomMessages]);
335
 
336
/**
 * Streams synthesized speech for `ttsText` and plays it until it is assumed finished.
 *
 * Requests a PCM stream through `textToSpeechStreaming`, enqueues every non-empty chunk
 * on a fresh `AudioPlayer`, waits for the estimated playback time plus a 300 ms margin,
 * and then stops the player. Failures are swallowed on purpose: speech is an optional
 * extra on top of the text reply.
 *
 * @param ttsText Text to synthesize.
 * @returns A promise that settles once playback is assumed done or TTS has failed.
 */
const playTtsAudio = useCallback(async (ttsText: string) => {
  let player: AudioPlayer | null = null;
  let reader: ReadableStreamDefaultReader<Uint8Array> | null = null;
  try {
    const { sampleRate, stream } = await textToSpeechStreaming(ttsText);
    player = new AudioPlayer();
    player.init(sampleRate);
    reader = stream.getReader();
    let totalBytes = 0;
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      if (value && value.byteLength > 0) {
        // Copy exactly this chunk's bytes; `value` may be a view into a larger buffer.
        const pcm = value.buffer.slice(value.byteOffset, value.byteOffset + value.byteLength) as ArrayBuffer;
        player.enqueue(pcm);
        totalBytes += value.byteLength;
      }
    }
    // The divisor treats the stream as 2 bytes per sample — presumably 16-bit mono PCM;
    // TODO confirm against the TTS backend.
    const durationMs = (totalBytes / 2 / sampleRate) * 1000;
    await new Promise<void>((resolve) => setTimeout(resolve, durationMs + 300));
  } catch {
    // TTS failure is non-fatal. Drop the rest of the response so the connection is
    // not left open behind a reader nobody will drain.
    void reader?.cancel().catch(() => {});
  } finally {
    // Stop the player on every path, including a failure part-way through the stream.
    try {
      player?.stopImmediately();
    } catch {
      // Best-effort teardown; nothing useful can be done if stopping fails.
    }
  }
}, []);
359
+
360
// Voice session wiring. Each final transcript is sent through `handleSend`; when that
// yields speech text and the session is still active, the state moves to SPEAKING while
// the reply is played. State changes go through `setVoiceStateRef` because the setter is
// itself a result of this hook call and is not yet available when the callback is created.
const { voiceState, start, stop, stopRecording, setStateExternal, isActive: isVoiceActive } = useVoiceSession({
  onTranscript: async (text: string) => {
    console.log("[onTranscript] received:", text);
    try {
      const audioText = await handleSend(text);
      console.log("[onTranscript] handleSend done, audioText:", audioText ? audioText.slice(0, 40) : "(empty)");
      if (audioText && isVoiceActiveRef.current) {
        setVoiceStateRef.current?.("SPEAKING");
        await playTtsAudio(audioText);
      }
    } catch (err: unknown) {
      // The hook does not await this callback, so a rejection would otherwise go unhandled.
      console.error("[onTranscript] failed:", err);
    } finally {
      // Always leave PROCESSING/SPEAKING, even when sending or playback failed.
      if (isVoiceActiveRef.current) setVoiceStateRef.current?.("IDLE");
    }
  },
  sessionParams: {},
});
373
 
374
+ // Keep refs in sync with latest values
375
  useEffect(() => { isVoiceActiveRef.current = isVoiceActive; }, [isVoiceActive]);
376
+ useEffect(() => { setVoiceStateRef.current = setStateExternal; }, [setStateExternal]);
377
 
378
  const handleVoiceToggle = useCallback(() => {
379
+ if (!isVoiceActive) {
380
+ start();
381
+ } else if (voiceState === "LISTENING") {
382
+ stopRecording();
383
+ } else {
384
+ stop();
385
+ }
386
+ }, [isVoiceActive, voiceState, start, stop, stopRecording]);
387
 
388
  const sessions: ChatSession[] = chats.map((c) => ({
389
  id: c.id,
src/app/components/chat/FeedbackWidget.tsx CHANGED
@@ -6,12 +6,14 @@ import { textToSpeech } from "../../../services/voiceApi";
6
  interface FeedbackWidgetProps {
7
  messageId: string;
8
  content: string;
 
9
  audioChunks?: ArrayBuffer[];
10
  audioSampleRate?: number;
11
  }
12
 
13
  export default function FeedbackWidget({
14
  content,
 
15
  audioChunks,
16
  audioSampleRate,
17
  }: FeedbackWidgetProps) {
@@ -69,7 +71,7 @@ export default function FeedbackWidget({
69
  // Text mode: request TTS now
70
  setSpeakerState("loading");
71
  try {
72
- const { pcm, sampleRate } = await textToSpeech(content);
73
  setSpeakerState("playing");
74
  const cancel = replayAudio([pcm], sampleRate);
75
  cancelPlayRef.current = cancel;
 
6
  interface FeedbackWidgetProps {
7
  messageId: string;
8
  content: string;
9
+ audioText: string;
10
  audioChunks?: ArrayBuffer[];
11
  audioSampleRate?: number;
12
  }
13
 
14
  export default function FeedbackWidget({
15
  content,
16
+ audioText,
17
  audioChunks,
18
  audioSampleRate,
19
  }: FeedbackWidgetProps) {
 
71
  // Text mode: request TTS now
72
  setSpeakerState("loading");
73
  try {
74
+ const { pcm, sampleRate } = await textToSpeech(audioText);
75
  setSpeakerState("playing");
76
  const cancel = replayAudio([pcm], sampleRate);
77
  cancelPlayRef.current = cancel;
src/app/components/chat/MessageBubble.tsx CHANGED
@@ -69,6 +69,7 @@ export default function MessageBubble({ message, isStreamingPlaceholder }: Messa
69
  <FeedbackWidget
70
  messageId={message.id}
71
  content={message.content}
 
72
  audioChunks={message.audioChunks}
73
  audioSampleRate={message.audioSampleRate}
74
  />
 
69
  <FeedbackWidget
70
  messageId={message.id}
71
  content={message.content}
72
+ audioText={message.audioText}
73
  audioChunks={message.audioChunks}
74
  audioSampleRate={message.audioSampleRate}
75
  />
src/app/components/chat/VoiceMicButton.tsx CHANGED
@@ -1,5 +1,5 @@
1
  import { motion } from "motion/react";
2
- import { Loader2, Mic, MicOff, Volume2, WifiOff } from "lucide-react";
3
  import type { VoiceState } from "../../../hooks/useVoiceSession";
4
 
5
  interface VoiceMicButtonProps {
@@ -9,7 +9,7 @@ interface VoiceMicButtonProps {
9
  }
10
 
11
  export default function VoiceMicButton({ voiceState, onToggle, disabled }: VoiceMicButtonProps) {
12
- const isDisabled = disabled || voiceState === "RECONNECTING";
13
 
14
  const stateConfig: Record<
15
  VoiceState,
@@ -50,13 +50,6 @@ export default function VoiceMicButton({ voiceState, onToggle, disabled }: Voice
50
  pulse: false,
51
  scalePulse: true,
52
  },
53
- RECONNECTING: {
54
- icon: <WifiOff className="h-4 w-4" />,
55
- className: "bg-neutral-200 text-neutral-400 cursor-not-allowed",
56
- title: "Reconnecting...",
57
- pulse: false,
58
- scalePulse: false,
59
- },
60
  ERROR: {
61
  icon: <MicOff className="h-4 w-4" />,
62
  className: "bg-red-100 text-red-400 hover:bg-red-200",
 
1
  import { motion } from "motion/react";
2
+ import { Loader2, Mic, MicOff, Volume2 } from "lucide-react";
3
  import type { VoiceState } from "../../../hooks/useVoiceSession";
4
 
5
  interface VoiceMicButtonProps {
 
9
  }
10
 
11
  export default function VoiceMicButton({ voiceState, onToggle, disabled }: VoiceMicButtonProps) {
12
+ const isDisabled = disabled ?? false;
13
 
14
  const stateConfig: Record<
15
  VoiceState,
 
50
  pulse: false,
51
  scalePulse: true,
52
  },
 
 
 
 
 
 
 
53
  ERROR: {
54
  icon: <MicOff className="h-4 w-4" />,
55
  className: "bg-red-100 text-red-400 hover:bg-red-200",
src/app/components/chat/VoiceStatusBar.tsx CHANGED
@@ -13,7 +13,6 @@ const STATE_LABELS: Record<VoiceState, string> = {
13
  LISTENING: "Listening...",
14
  PROCESSING: "Processing...",
15
  SPEAKING: "Agent is speaking",
16
- RECONNECTING: "Reconnecting...",
17
  ERROR: "Connection error",
18
  };
19
 
@@ -23,7 +22,6 @@ const STATE_COLORS: Record<VoiceState, string> = {
23
  LISTENING: "bg-brand-green/10 text-brand-green border-brand-green/20",
24
  PROCESSING: "bg-brand-amber/10 text-brand-amber border-brand-amber/20",
25
  SPEAKING: "bg-brand-cyan/10 text-brand-cyan border-brand-cyan/20",
26
- RECONNECTING: "bg-neutral-100 text-neutral-500 border-neutral-200",
27
  ERROR: "bg-red-50 text-red-500 border-red-200",
28
  };
29
 
 
13
  LISTENING: "Listening...",
14
  PROCESSING: "Processing...",
15
  SPEAKING: "Agent is speaking",
 
16
  ERROR: "Connection error",
17
  };
18
 
 
22
  LISTENING: "bg-brand-green/10 text-brand-green border-brand-green/20",
23
  PROCESSING: "bg-brand-amber/10 text-brand-amber border-brand-amber/20",
24
  SPEAKING: "bg-brand-cyan/10 text-brand-cyan border-brand-cyan/20",
 
25
  ERROR: "bg-red-50 text-red-500 border-red-200",
26
  };
27
 
src/app/components/chat/types.ts CHANGED
@@ -4,6 +4,7 @@ export interface Message {
4
  id: string;
5
  role: "user" | "assistant";
6
  content: string;
 
7
  timestamp: number;
8
  sources?: ChatSource[];
9
  /** PCM audio chunks from TTS — only populated when sent via voice mode */
 
4
  id: string;
5
  role: "user" | "assistant";
6
  content: string;
7
+ audioText: string;
8
  timestamp: number;
9
  sources?: ChatSource[];
10
  /** PCM audio chunks from TTS — only populated when sent via voice mode */
src/hooks/useVoiceSession.ts CHANGED
@@ -1,6 +1,7 @@
1
  import { useState, useRef, useEffect, useCallback } from "react";
2
  import { AudioRecorder } from "../audio/AudioRecorder";
3
  import { AudioPlayer } from "../audio/AudioPlayer";
 
4
 
5
  export type VoiceState =
6
  | "IDLE"
@@ -8,28 +9,16 @@ export type VoiceState =
8
  | "LISTENING"
9
  | "PROCESSING"
10
  | "SPEAKING"
11
- | "RECONNECTING"
12
  | "ERROR";
13
 
14
  export interface VoiceSessionParams {
15
- userId?: string;
16
- fullname?: string;
17
- company?: string;
18
- function?: string;
19
- site?: string;
20
- role?: string;
21
- agent?: string;
22
  sttProvider?: string;
23
  ttsProvider?: string;
24
- wakeWordEnabled?: boolean;
25
  }
26
 
27
  interface UseVoiceSessionOptions {
28
  onTranscript: (text: string) => void;
29
- onReply: (text: string) => void;
30
  onError?: (code: string, message: string) => void;
31
- /** When true, binary TTS audio frames from the WebSocket are ignored. */
32
- bypassWsTts?: boolean;
33
  sessionParams?: VoiceSessionParams;
34
  }
35
 
@@ -37,68 +26,39 @@ export interface UseVoiceSessionReturn {
37
  voiceState: VoiceState;
38
  start: () => Promise<void>;
39
  stop: () => void;
 
 
40
  isActive: boolean;
41
  }
42
 
43
- const BARGE_IN_THRESHOLD = 500;
44
- const MAX_RECONNECT_ATTEMPTS = 10;
45
- const HEARTBEAT_INTERVAL_MS = 20_000;
46
- const PONG_TIMEOUT_MS = 5_000;
47
-
48
  const BUFFER_SOUNDS = [
49
  "/sounds/01_Pertanyaan_bagus_mohon_ditunggu_sebentar.wav",
50
  "/sounds/02_Oke_menararik_banget_Sebentar_ya_saya_se.wav",
51
  "/sounds/03_Sip_aku_sudah_dengar_pertanyaanmu_Tunggu.wav",
52
  ];
53
 
 
 
54
  function getVoiceHttpBaseUrl(): string {
55
  return (import.meta as unknown as { env: Record<string, string> }).env
56
  .VITE_API_BASE_VOICE_URL ?? "http://localhost:7861";
57
  }
58
 
59
- function buildWsUrl(params: VoiceSessionParams): string {
60
- const base = (import.meta as unknown as { env: Record<string, string> }).env
61
- .VITE_API_BASE_VOICE_WS_URL ?? "ws://localhost:7861";
62
- const p = new URLSearchParams({
63
- user_id: params.userId ?? "anonymous",
64
- fullname: params.fullname ?? "",
65
- company: params.company ?? "",
66
- function: params.function ?? "",
67
- site: params.site ?? "HO",
68
- role: params.role ?? "engineer",
69
- agent: params.agent ?? "analysis",
70
- stt_provider: params.sttProvider ?? "gemini",
71
- tts_provider: params.ttsProvider ?? "gemini",
72
- wake_word_enabled: String(params.wakeWordEnabled ?? false),
73
- });
74
- return `${base}/ws/voice?${p}`;
75
- }
76
-
77
  export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionReturn {
78
  const [voiceState, setVoiceState] = useState<VoiceState>("IDLE");
79
  const stateRef = useRef<VoiceState>("IDLE");
80
 
81
- const wsRef = useRef<WebSocket | null>(null);
82
  const recorderRef = useRef<AudioRecorder | null>(null);
83
  const playerRef = useRef<AudioPlayer | null>(null);
84
-
85
- const heartbeatTimerRef = useRef<ReturnType<typeof setInterval> | null>(null);
86
- const pongTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null);
87
- const rafRef = useRef<number | null>(null);
88
- const reconnectAttemptRef = useRef(0);
89
 
90
  const bufferAudioRef = useRef<HTMLAudioElement | null>(null);
91
  const lastBufferIndexRef = useRef<number>(-1);
92
 
93
- // Keep opts in a ref so callbacks never go stale
94
  const optsRef = useRef(opts);
95
  useEffect(() => { optsRef.current = opts; });
96
 
97
- const setState = useCallback((s: VoiceState) => {
98
- stateRef.current = s;
99
- setVoiceState(s);
100
- }, []);
101
-
102
  const stopBufferSound = useCallback(() => {
103
  if (bufferAudioRef.current) {
104
  bufferAudioRef.current.pause();
@@ -107,6 +67,15 @@ export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionRe
107
  }
108
  }, []);
109
 
 
 
 
 
 
 
 
 
 
110
  const playBufferSound = useCallback(() => {
111
  stopBufferSound();
112
  let idx: number;
@@ -119,198 +88,57 @@ export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionRe
119
  audio.play().catch(() => {});
120
  }, [stopBufferSound]);
121
 
122
- const clearHeartbeat = useCallback(() => {
123
- if (heartbeatTimerRef.current) clearInterval(heartbeatTimerRef.current);
124
- if (pongTimeoutRef.current) clearTimeout(pongTimeoutRef.current);
125
- heartbeatTimerRef.current = null;
126
- pongTimeoutRef.current = null;
127
- }, []);
128
-
129
- const stopBargeInLoop = useCallback(() => {
130
- if (rafRef.current !== null) {
131
- cancelAnimationFrame(rafRef.current);
132
- rafRef.current = null;
133
- }
134
- }, []);
135
-
136
- const closeWs = useCallback(() => {
137
- if (wsRef.current) {
138
- wsRef.current.onopen = null;
139
- wsRef.current.onmessage = null;
140
- wsRef.current.onerror = null;
141
- wsRef.current.onclose = null;
142
- wsRef.current.close();
143
- wsRef.current = null;
144
- }
145
- }, []);
146
-
147
  const stopSession = useCallback(() => {
148
- stopBargeInLoop();
149
- clearHeartbeat();
150
- stopBufferSound();
151
- if (wsRef.current?.readyState === WebSocket.OPEN) {
152
- wsRef.current.send(JSON.stringify({ action: "stop" }));
153
- }
154
- closeWs();
155
  recorderRef.current?.stop();
156
  playerRef.current?.stopImmediately();
157
- reconnectAttemptRef.current = 0;
158
- setState("IDLE");
159
- }, [clearHeartbeat, closeWs, setState, stopBargeInLoop, stopBufferSound]);
160
-
161
- const startBargeInLoop = useCallback(() => {
162
- const check = () => {
163
- if (stateRef.current !== "SPEAKING") {
164
- rafRef.current = null;
165
- return;
166
- }
167
- const level = recorderRef.current?.micLevel ?? 0;
168
- if (level > BARGE_IN_THRESHOLD && wsRef.current?.readyState === WebSocket.OPEN) {
169
- wsRef.current.send(JSON.stringify({ action: "interrupt" }));
170
- playerRef.current?.stopImmediately();
171
- playerRef.current?.init();
172
- }
173
- rafRef.current = requestAnimationFrame(check);
174
- };
175
- rafRef.current = requestAnimationFrame(check);
176
- }, []);
177
-
178
- const startHeartbeat = useCallback(() => {
179
- clearHeartbeat();
180
- heartbeatTimerRef.current = setInterval(() => {
181
- if (wsRef.current?.readyState === WebSocket.OPEN) {
182
- wsRef.current.send(JSON.stringify({ action: "ping" }));
183
- pongTimeoutRef.current = setTimeout(() => {
184
- // No pong received — reconnect
185
- wsRef.current?.close();
186
- }, PONG_TIMEOUT_MS);
187
- }
188
- }, HEARTBEAT_INTERVAL_MS);
189
- }, [clearHeartbeat]);
190
 
191
- const openWebSocket = useCallback(() => {
192
- closeWs();
193
-
194
- const ws = new WebSocket(buildWsUrl(optsRef.current.sessionParams ?? {}));
195
- ws.binaryType = "arraybuffer";
196
- wsRef.current = ws;
 
197
 
198
- ws.onopen = () => {
199
- reconnectAttemptRef.current = 0;
200
- // Stay in CONNECTING until tts_config is received
201
- startHeartbeat();
202
- };
203
-
204
- ws.onmessage = (event) => {
205
- if (event.data instanceof ArrayBuffer) {
206
- // Binary frame = TTS audio chunk from voice backend
207
- if (!optsRef.current.bypassWsTts) {
208
- if (stateRef.current === "SPEAKING" || stateRef.current === "PROCESSING") {
209
- if (stateRef.current === "PROCESSING") {
210
- stopBufferSound();
211
- setState("SPEAKING");
212
- startBargeInLoop();
213
- }
214
- playerRef.current?.enqueue(event.data);
215
- }
216
- }
217
- return;
218
- }
219
 
 
220
  try {
221
- const msg = JSON.parse(event.data as string);
222
-
223
- switch (msg.event) {
224
- case "tts_config": {
225
- const sampleRate = (msg.sample_rate as number) ?? 16000;
226
- playerRef.current?.stopImmediately();
227
- playerRef.current?.init(sampleRate);
228
- setState("LISTENING");
229
- break;
230
- }
231
-
232
- case "transcript":
233
- if (!msg.is_partial) {
234
- setState("PROCESSING");
235
- stopBargeInLoop();
236
- playBufferSound();
237
- optsRef.current.onTranscript(msg.text as string);
238
- }
239
- break;
240
-
241
- case "reply":
242
- optsRef.current.onReply(msg.text as string);
243
- break;
244
-
245
- case "tts_end":
246
- playerRef.current?.drain();
247
- setState("LISTENING");
248
- stopBargeInLoop();
249
- break;
250
-
251
- case "interrupted":
252
- stopBufferSound();
253
- playerRef.current?.stopImmediately();
254
- setState("LISTENING");
255
- stopBargeInLoop();
256
- break;
257
-
258
- case "pong":
259
- if (pongTimeoutRef.current) clearTimeout(pongTimeoutRef.current);
260
- break;
261
-
262
- case "error":
263
- optsRef.current.onError?.(msg.code as string, msg.message as string);
264
- break;
265
  }
266
- } catch {
267
- // non-JSON frame — ignore
268
- }
269
- };
270
-
271
- ws.onerror = () => {
272
- // onclose will fire next and handle reconnect
273
- };
274
 
275
- ws.onclose = () => {
276
- clearHeartbeat();
277
- stopBargeInLoop();
278
- recorderRef.current?.stop();
279
-
280
- if (stateRef.current === "IDLE") return; // intentional stop
281
 
282
- // Reconnect with exponential backoff
283
- if (reconnectAttemptRef.current >= MAX_RECONNECT_ATTEMPTS) {
284
- setState("ERROR");
285
- return;
286
- }
287
- setState("RECONNECTING");
288
- const delay = Math.min(Math.pow(2, reconnectAttemptRef.current), 8) * 1000;
289
- reconnectAttemptRef.current++;
290
 
291
- setTimeout(async () => {
292
- if (stateRef.current !== "RECONNECTING") return;
293
- try {
294
- if (!recorderRef.current) recorderRef.current = new AudioRecorder();
295
- await recorderRef.current.start((chunk) => {
296
- if (wsRef.current?.readyState === WebSocket.OPEN) {
297
- wsRef.current.send(chunk);
298
- }
299
- });
300
- setState("CONNECTING");
301
- openWebSocket();
302
- } catch {
303
- setState("ERROR");
304
  }
305
- }, delay);
306
- };
307
- }, [clearHeartbeat, closeWs, playBufferSound, setState, startBargeInLoop, startHeartbeat, stopBargeInLoop, stopBufferSound]);
 
 
 
 
308
 
309
  const start = useCallback(async () => {
310
  if (stateRef.current !== "IDLE" && stateRef.current !== "ERROR") return;
311
  setState("CONNECTING");
312
 
313
- // Health check — best-effort: don't block connect if endpoint unreachable
314
  try {
315
  const res = await fetch(`${getVoiceHttpBaseUrl()}/health`);
316
  if (res.ok) {
@@ -326,26 +154,23 @@ export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionRe
326
  }
327
 
328
  try {
 
329
  if (!recorderRef.current) recorderRef.current = new AudioRecorder();
330
  if (!playerRef.current) playerRef.current = new AudioPlayer();
331
 
332
- await recorderRef.current.start((chunk) => {
333
- if (wsRef.current?.readyState === WebSocket.OPEN) {
334
- wsRef.current.send(chunk);
335
  }
336
  });
337
 
338
- // Init player inside user-gesture chain to satisfy autoplay policy
339
- playerRef.current.init();
340
-
341
- openWebSocket();
342
  } catch {
343
  recorderRef.current?.stop();
344
  setState("ERROR");
345
  }
346
- }, [openWebSocket, setState]);
347
 
348
- // Cleanup on unmount
349
  useEffect(() => {
350
  return () => {
351
  stopSession();
@@ -357,6 +182,8 @@ export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionRe
357
  voiceState,
358
  start,
359
  stop: stopSession,
 
 
360
  isActive: voiceState !== "IDLE" && voiceState !== "ERROR",
361
  };
362
  }
 
1
  import { useState, useRef, useEffect, useCallback } from "react";
2
  import { AudioRecorder } from "../audio/AudioRecorder";
3
  import { AudioPlayer } from "../audio/AudioPlayer";
4
+ import { createWavBlob, speechToText } from "../services/voiceApi";
5
 
6
  export type VoiceState =
7
  | "IDLE"
 
9
  | "LISTENING"
10
  | "PROCESSING"
11
  | "SPEAKING"
 
12
  | "ERROR";
13
 
14
  export interface VoiceSessionParams {
 
 
 
 
 
 
 
15
  sttProvider?: string;
16
  ttsProvider?: string;
 
17
  }
18
 
19
  interface UseVoiceSessionOptions {
20
  onTranscript: (text: string) => void;
 
21
  onError?: (code: string, message: string) => void;
 
 
22
  sessionParams?: VoiceSessionParams;
23
  }
24
 
 
26
  voiceState: VoiceState;
27
  start: () => Promise<void>;
28
  stop: () => void;
29
+ stopRecording: () => void;
30
+ setStateExternal: (s: VoiceState) => void;
31
  isActive: boolean;
32
  }
33
 
 
 
 
 
 
34
  const BUFFER_SOUNDS = [
35
  "/sounds/01_Pertanyaan_bagus_mohon_ditunggu_sebentar.wav",
36
  "/sounds/02_Oke_menararik_banget_Sebentar_ya_saya_se.wav",
37
  "/sounds/03_Sip_aku_sudah_dengar_pertanyaanmu_Tunggu.wav",
38
  ];
39
 
40
+ const RECORDER_SAMPLE_RATE = 16000;
41
+
42
  function getVoiceHttpBaseUrl(): string {
43
  return (import.meta as unknown as { env: Record<string, string> }).env
44
  .VITE_API_BASE_VOICE_URL ?? "http://localhost:7861";
45
  }
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  export function useVoiceSession(opts: UseVoiceSessionOptions): UseVoiceSessionReturn {
48
  const [voiceState, setVoiceState] = useState<VoiceState>("IDLE");
49
  const stateRef = useRef<VoiceState>("IDLE");
50
 
 
51
  const recorderRef = useRef<AudioRecorder | null>(null);
52
  const playerRef = useRef<AudioPlayer | null>(null);
53
+ const chunksRef = useRef<ArrayBuffer[]>([]);
 
 
 
 
54
 
55
  const bufferAudioRef = useRef<HTMLAudioElement | null>(null);
56
  const lastBufferIndexRef = useRef<number>(-1);
57
 
 
58
  const optsRef = useRef(opts);
59
  useEffect(() => { optsRef.current = opts; });
60
 
61
+ // Defined before setState so setState can call it without circular deps.
 
 
 
 
62
  const stopBufferSound = useCallback(() => {
63
  if (bufferAudioRef.current) {
64
  bufferAudioRef.current.pause();
 
67
  }
68
  }, []);
69
 
70
// Single entry point for state changes: records the new state in both the ref (read
// synchronously by callbacks) and React state (drives rendering). Entering SPEAKING,
// IDLE or ERROR also silences the buffer audio first.
const setState = useCallback((s: VoiceState) => {
  const endsWaitingPhase = s === "SPEAKING" || s === "IDLE" || s === "ERROR";
  if (endsWaitingPhase) stopBufferSound();
  stateRef.current = s;
  setVoiceState(s);
}, [stopBufferSound]);
78
+
79
  const playBufferSound = useCallback(() => {
80
  stopBufferSound();
81
  let idx: number;
 
88
  audio.play().catch(() => {});
89
  }, [stopBufferSound]);
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  const stopSession = useCallback(() => {
 
 
 
 
 
 
 
92
  recorderRef.current?.stop();
93
  playerRef.current?.stopImmediately();
94
+ chunksRef.current = [];
95
+ setState("IDLE"); // setState("IDLE") calls stopBufferSound internally
96
+ }, [setState]);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
/**
 * Finishes the current recording and runs the captured audio through speech-to-text.
 *
 * Does nothing unless the session is LISTENING. Otherwise it moves to PROCESSING, stops
 * the recorder, starts the buffer sound and transcribes the recorded chunks in the
 * background. A non-empty transcript is handed to `onTranscript`; an empty recording or
 * empty transcript returns to IDLE; an STT failure reports `STT_ERROR` and moves to ERROR.
 * If the session left PROCESSING while STT was in flight, the result or failure is dropped.
 */
const stopRecording = useCallback(() => {
  if (stateRef.current !== "LISTENING") return;
  setState("PROCESSING");
  recorderRef.current?.stop();
  // Play buffer audio — it keeps playing through STT and chatbot processing.
  // It stops automatically when setState("SPEAKING"), setState("IDLE"), or setState("ERROR") is called.
  playBufferSound();

  // Take the recorded chunks and reset the shared list before the async work starts.
  const chunks = chunksRef.current;
  chunksRef.current = [];

  void (async () => {
    try {
      if (chunks.length === 0) {
        setState("IDLE");
        return;
      }

      const wav = createWavBlob(chunks, RECORDER_SAMPLE_RATE);
      const { text } = await speechToText(wav, optsRef.current.sessionParams?.sttProvider ?? "chirp3");

      // Guard: session may have been cancelled while STT was in flight.
      if (stateRef.current !== "PROCESSING") return;

      if (text.trim()) {
        console.log("[Voice] STT transcript →", text);
        // Buffer audio continues to play while Main.tsx calls the chatbot API.
        // It will stop when setStateExternal("SPEAKING") or setStateExternal("IDLE") is called.
        optsRef.current.onTranscript(text);
      } else {
        setState("IDLE");
      }
    } catch (err: unknown) {
      console.error("[STT] Request failed:", err);
      // Same guard as the success path: a failure that arrives after the session was
      // cancelled must not push an idle session into ERROR.
      if (stateRef.current !== "PROCESSING") return;
      const message = err instanceof Error ? err.message : String(err);
      optsRef.current.onError?.("STT_ERROR", message);
      setState("ERROR"); // setState("ERROR") calls stopBufferSound internally
    }
  })();
}, [playBufferSound, setState]);
137
 
138
  const start = useCallback(async () => {
139
  if (stateRef.current !== "IDLE" && stateRef.current !== "ERROR") return;
140
  setState("CONNECTING");
141
 
 
142
  try {
143
  const res = await fetch(`${getVoiceHttpBaseUrl()}/health`);
144
  if (res.ok) {
 
154
  }
155
 
156
  try {
157
+ chunksRef.current = [];
158
  if (!recorderRef.current) recorderRef.current = new AudioRecorder();
159
  if (!playerRef.current) playerRef.current = new AudioPlayer();
160
 
161
+ await recorderRef.current.start((chunk: ArrayBuffer) => {
162
+ if (stateRef.current === "LISTENING") {
163
+ chunksRef.current.push(chunk);
164
  }
165
  });
166
 
167
+ setState("LISTENING");
 
 
 
168
  } catch {
169
  recorderRef.current?.stop();
170
  setState("ERROR");
171
  }
172
+ }, [setState]);
173
 
 
174
  useEffect(() => {
175
  return () => {
176
  stopSession();
 
182
  voiceState,
183
  start,
184
  stop: stopSession,
185
+ stopRecording,
186
+ setStateExternal: setState,
187
  isActive: voiceState !== "IDLE" && voiceState !== "ERROR",
188
  };
189
  }
src/services/voiceApi.ts CHANGED
@@ -2,6 +2,62 @@ const VOICE_BASE_URL =
2
  (import.meta as unknown as { env: Record<string, string> }).env
3
  .VITE_API_BASE_VOICE_URL ?? "http://localhost:7861";
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  export async function textToSpeech(
6
  text: string,
7
  provider = "gemini"
 
2
  (import.meta as unknown as { env: Record<string, string> }).env
3
  .VITE_API_BASE_VOICE_URL ?? "http://localhost:7861";
4
 
5
/**
 * Writes `str` into `view` starting at byte `offset`, one byte per UTF-16 code unit.
 * `setUint8` stores only the low 8 bits of each code unit.
 */
function writeString(view: DataView, offset: number, str: string): void {
  str.split("").forEach((unit, index) => {
    view.setUint8(offset + index, unit.charCodeAt(0));
  });
}
8
+
9
/**
 * Packs raw PCM chunks into a single WAV file held in a Blob.
 *
 * Writes a 44-byte RIFF/WAVE header describing PCM, mono, 16 bits per sample at
 * `sampleRate`, followed by the chunks concatenated in order.
 *
 * @param chunks PCM data, in playback order.
 * @param sampleRate Sample rate recorded in the header, in Hz.
 * @returns A Blob of type `audio/wav`.
 */
export function createWavBlob(chunks: ArrayBuffer[], sampleRate: number): Blob {
  const HEADER_BYTES = 44;
  let dataBytes = 0;
  for (const part of chunks) dataBytes += part.byteLength;

  const wav = new ArrayBuffer(HEADER_BYTES + dataBytes);
  const header = new DataView(wav);

  // RIFF container header
  writeString(header, 0, "RIFF");
  header.setUint32(4, 36 + dataBytes, true);
  writeString(header, 8, "WAVE");

  // "fmt " sub-chunk
  writeString(header, 12, "fmt ");
  header.setUint32(16, 16, true); // sub-chunk size
  header.setUint16(20, 1, true); // PCM
  header.setUint16(22, 1, true); // mono
  header.setUint32(24, sampleRate, true);
  header.setUint32(28, sampleRate * 2, true); // byte rate
  header.setUint16(32, 2, true); // block align
  header.setUint16(34, 16, true); // bits per sample

  // "data" sub-chunk
  writeString(header, 36, "data");
  header.setUint32(40, dataBytes, true);

  const bytes = new Uint8Array(wav);
  let cursor = HEADER_BYTES;
  chunks.forEach((part) => {
    bytes.set(new Uint8Array(part), cursor);
    cursor += part.byteLength;
  });

  return new Blob([wav], { type: "audio/wav" });
}
33
+
34
/**
 * Uploads a WAV recording to the voice service's `/stt` endpoint and returns its
 * JSON response.
 *
 * @param wavBlob Recording to transcribe; sent as the `audio` form field named `recording.wav`.
 * @param provider STT provider name sent as the `provider` form field.
 * @returns The parsed JSON body of the response.
 * @throws Error when the response status is not OK.
 */
export async function speechToText(
  wavBlob: Blob,
  provider = "chirp3"
): Promise<{ text: string; language: string; duration: number | null }> {
  const payload = new FormData();
  payload.append("audio", wavBlob, "recording.wav");
  payload.append("provider", provider);

  const response = await fetch(`${VOICE_BASE_URL}/stt`, { method: "POST", body: payload });
  if (response.ok) return response.json();
  throw new Error(`STT error: ${response.status}`);
}
45
+
46
/**
 * Requests speech for `text` from the voice service's `/tts` endpoint and exposes the
 * response body as a stream, so playback can begin before the download completes.
 *
 * @param text Text to synthesize.
 * @param provider TTS provider name sent in the request body.
 * @returns The sample rate from the `X-Sample-Rate` response header (24000 when the
 *   header is missing or is not a positive number) and the response body stream.
 * @throws Error when the response status is not OK or the response has no body.
 */
export async function textToSpeechStreaming(
  text: string,
  provider = "gemini"
): Promise<{ sampleRate: number; stream: ReadableStream<Uint8Array> }> {
  const res = await fetch(`${VOICE_BASE_URL}/tts`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ text, provider }),
  });
  if (!res.ok) throw new Error(`TTS error: ${res.status}`);
  if (!res.body) throw new Error("TTS response has no body");

  // A malformed header parses to NaN; fall back to the default so callers never
  // receive an unusable rate.
  const parsedRate = Number.parseInt(res.headers.get("X-Sample-Rate") ?? "", 10);
  const sampleRate = Number.isFinite(parsedRate) && parsedRate > 0 ? parsedRate : 24000;
  return { sampleRate, stream: res.body };
}
60
+
61
  export async function textToSpeech(
62
  text: string,
63
  provider = "gemini"