dvc890 committed on
Commit
9df572c
·
verified ·
1 Parent(s): 0e885ae

Update components/LiveAssistant.tsx

Browse files
Files changed (1) hide show
  1. components/LiveAssistant.tsx +319 -263
components/LiveAssistant.tsx CHANGED
@@ -1,11 +1,16 @@
1
 
2
  import React, { useState, useRef, useEffect } from 'react';
3
  import { GoogleGenAI, LiveServerMessage, Modality } from "@google/genai";
4
- import { Mic, X, MessageCircle, Volume2, Power, Play, Square, Loader2, Bot, ChevronDown, RefreshCw } from 'lucide-react';
5
  import { api } from '../services/api';
6
 
7
- // --- Helper Functions for Audio Processing ---
8
- function decode(base64: string) {
 
 
 
 
 
9
  const binaryString = atob(base64);
10
  const len = binaryString.length;
11
  const bytes = new Uint8Array(len);
@@ -15,73 +20,29 @@ function decode(base64: string) {
15
  return bytes;
16
  }
17
 
18
/**
 * Converts raw interleaved 16-bit PCM bytes into a Web Audio `AudioBuffer`.
 *
 * Each sample is read as a little-endian signed 16-bit integer and scaled by
 * 1/32768 into a float. Samples are de-interleaved into one channel each.
 *
 * @param data - Interleaved PCM bytes. May be a view into a larger buffer; a
 *   trailing odd byte or incomplete frame is ignored.
 * @param ctx - Audio context used to allocate the output buffer.
 * @param sampleRate - Sample rate in Hz handed to `createBuffer`.
 * @param numChannels - Number of interleaved channels in `data`.
 * @returns The buffer produced by `ctx.createBuffer`, filled with the samples.
 */
async function decodeAudioData(
  data: Uint8Array,
  ctx: AudioContext,
  sampleRate: number,
  numChannels: number,
): Promise<AudioBuffer> {
  // Respect the view's own offset/length: `data.buffer` may be larger than
  // the bytes this Uint8Array actually exposes.
  const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
  const sampleCount = Math.floor(data.byteLength / 2);
  const frameCount = Math.floor(sampleCount / numChannels);
  const buffer = ctx.createBuffer(numChannels, frameCount, sampleRate);

  for (let channel = 0; channel < numChannels; channel++) {
    const channelData = buffer.getChannelData(channel);
    for (let frame = 0; frame < frameCount; frame++) {
      const byteIndex = (frame * numChannels + channel) * 2;
      channelData[frame] = view.getInt16(byteIndex, true) / 32768.0;
    }
  }
  return buffer;
}
36
-
37
/**
 * Encodes float audio samples as base64 16-bit PCM for upload.
 *
 * Each sample is scaled by 32768 and saturated to the signed 16-bit range, so
 * full-scale or over-range input clips instead of wrapping around to the
 * opposite sign.
 *
 * @param data - Float samples, nominally in [-1, 1].
 * @returns The base64 payload plus the fixed MIME type `audio/pcm;rate=16000`.
 */
function createBlob(data: Float32Array): { data: string; mimeType: string } {
  const int16 = new Int16Array(data.length);
  for (let i = 0; i < data.length; i++) {
    // Saturate before the typed-array store: Int16Array wraps modulo 2^16.
    int16[i] = Math.max(-32768, Math.min(32767, data[i] * 32768));
  }

  // Hand-rolled base64: turn the raw bytes into a binary string for btoa.
  // Chunking keeps the argument count of fromCharCode bounded.
  const bytes = new Uint8Array(int16.buffer);
  const CHUNK = 0x2000;
  let binary = '';
  for (let offset = 0; offset < bytes.byteLength; offset += CHUNK) {
    binary += String.fromCharCode(...bytes.subarray(offset, offset + CHUNK));
  }

  return {
    data: btoa(binary),
    mimeType: 'audio/pcm;rate=16000',
  };
}
58
-
59
  export const LiveAssistant: React.FC = () => {
60
  const [isOpen, setIsOpen] = useState(false);
61
- const [isConnected, setIsConnected] = useState(false);
62
- const [isMicOn, setIsMicOn] = useState(false); // Toggle for "Hold to Talk" simulation
63
- const [isSpeaking, setIsSpeaking] = useState(false); // Model speaking
64
- const [logs, setLogs] = useState<{role: 'user'|'model', text: string}[]>([]);
65
  const [apiKey, setApiKey] = useState('');
66
- const [isInitializing, setIsInitializing] = useState(false);
 
67
 
68
- // Audio Refs
69
  const audioContextRef = useRef<AudioContext | null>(null);
70
- const audioStreamRef = useRef<MediaStream | null>(null);
71
- const inputProcessorRef = useRef<ScriptProcessorNode | null>(null);
72
- const inputSourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
73
- const outputNodeRef = useRef<GainNode | null>(null);
74
- const nextStartTimeRef = useRef<number>(0);
75
- const activeSourcesRef = useRef<Set<AudioBufferSourceNode>>(new Set());
76
 
77
- // Session Ref
78
- const sessionPromiseRef = useRef<Promise<any> | null>(null);
 
 
79
 
80
- // 1. Get Key on Mount (if allowed)
81
  useEffect(() => {
82
- // Only fetch key if user opens the widget to save resources
83
- if (isOpen && !apiKey && !isInitializing) {
84
- setIsInitializing(true);
85
  fetch('/api/ai/live-access', {
86
  headers: {
87
  'x-user-username': api.auth.getCurrentUser()?.username || '',
@@ -91,204 +52,264 @@ export const LiveAssistant: React.FC = () => {
91
  .then(res => res.json())
92
  .then(data => {
93
  if (data.key) setApiKey(data.key);
94
- setIsInitializing(false);
95
  })
96
- .catch(() => setIsInitializing(false));
97
  }
98
  }, [isOpen]);
99
 
100
- const connect = async () => {
101
- if (!apiKey) return;
102
-
103
- try {
104
- setIsInitializing(true);
105
- console.log("Starting Live Connection...");
 
 
 
106
 
107
- // Setup Audio Context
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  // @ts-ignore
109
  const AudioCtor = window.AudioContext || window.webkitAudioContext;
110
- const ctx = new AudioCtor({sampleRate: 24000}); // Output rate usually 24k
111
- audioContextRef.current = ctx;
112
- outputNodeRef.current = ctx.createGain();
113
- outputNodeRef.current.connect(ctx.destination);
 
 
 
114
 
115
- // Setup Input (Mic) - But don't connect processor yet until "Mic On"
116
- const stream = await navigator.mediaDevices.getUserMedia({ audio: {
117
- sampleRate: 16000,
118
- channelCount: 1,
119
- echoCancellation: true,
120
- noiseSuppression: true
121
- }});
122
- audioStreamRef.current = stream;
123
- console.log("Microphone access granted");
124
 
125
- // Initialize Gemini Client
 
 
 
 
 
 
126
  const client = new GoogleGenAI({ apiKey });
127
 
128
- const sessionPromise = client.live.connect({
129
  model: 'gemini-2.5-flash-native-audio-preview-09-2025',
130
- config: {
131
- responseModalities: [Modality.AUDIO],
132
- speechConfig: { voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } } },
133
- systemInstruction: { parts: [{ text: "你是一位乐于助人的校园AI助手。请始终使用中文回答。请简短、自然地进行对话,不要使用 Markdown 格式,不要进行搜索。" }] },
134
- outputAudioTranscription: { model: true } // Enable transcription to show text
135
- },
136
  callbacks: {
137
  onopen: () => {
138
- console.log("Live Session Opened");
139
- setIsConnected(true);
140
- setIsInitializing(false);
141
- setLogs(prev => [...prev, {role: 'model', text: '已连接,请点击麦克风说话。'}]);
142
  },
143
- onmessage: async (msg: LiveServerMessage) => {
144
- // Handle Audio Output
145
- const audioData = msg.serverContent?.modelTurn?.parts?.[0]?.inlineData?.data;
146
- if (audioData && audioContextRef.current && outputNodeRef.current) {
147
- setIsSpeaking(true);
148
- const ctx = audioContextRef.current;
149
- const buffer = await decodeAudioData(decode(audioData), ctx, 24000, 1);
150
-
151
- const source = ctx.createBufferSource();
152
- source.buffer = buffer;
153
- source.connect(outputNodeRef.current);
154
-
155
- // Scheduling
156
- const now = ctx.currentTime;
157
- const startTime = Math.max(now, nextStartTimeRef.current);
158
- source.start(startTime);
159
- nextStartTimeRef.current = startTime + buffer.duration;
160
-
161
- activeSourcesRef.current.add(source);
162
- source.onended = () => {
163
- activeSourcesRef.current.delete(source);
164
- if (activeSourcesRef.current.size === 0) setIsSpeaking(false);
165
- };
166
- }
167
-
168
- // Handle Text Transcription
169
- const transcript = msg.serverContent?.modelTurn?.parts?.[0]?.text;
170
- if (transcript) {
171
- // Update last model log or add new
172
- setLogs(prev => {
173
- const last = prev[prev.length - 1];
174
- const isInitialMessage = last && last.text === '已连接,请点击麦克风说话。';
175
-
176
- // IMPORTANT: Do not append to the initial system message
177
- if (last && last.role === 'model' && !isInitialMessage && !last.text.endsWith('\n')) {
178
- // Append to existing turn (simplified logic)
179
- return [...prev.slice(0, -1), { ...last, text: last.text + transcript }];
180
- }
181
- return [...prev, { role: 'model', text: transcript }];
182
- });
183
- }
184
-
185
- // Handle Transcription of User Input (Echo)
186
- // @ts-ignore - types might be missing in some SDK versions
187
- const userTranscript = msg.serverContent?.outputAudioTranscription?.text || msg.serverContent?.turnComplete && "User input processed";
188
- // Note: Standard API usually doesn't echo user transcript in serverContent easily without config, relying on model turn.
189
  },
190
  onclose: () => {
191
- console.log("Live Session Closed");
192
- setIsConnected(false);
193
- setLogs(prev => [...prev, {role: 'model', text: '连接已断开'}]);
194
  },
195
  onerror: (e) => {
196
- console.error("Live API Error", e);
197
- setIsConnected(false);
 
198
  }
 
 
 
 
 
 
199
  }
200
  });
201
-
202
- sessionPromiseRef.current = sessionPromise;
 
 
203
 
204
  } catch (e) {
205
- console.error("Connection failed", e);
206
- setIsInitializing(false);
 
207
  }
208
  };
209
 
210
- const disconnect = () => {
211
- // Close Session
212
- if (sessionPromiseRef.current) {
213
- sessionPromiseRef.current.then(s => s.close());
214
- sessionPromiseRef.current = null;
215
- }
216
 
217
- // Cleanup Audio
218
- if (audioStreamRef.current) audioStreamRef.current.getTracks().forEach(t => t.stop());
219
- if (inputProcessorRef.current) inputProcessorRef.current.disconnect();
220
- if (inputSourceRef.current) inputSourceRef.current.disconnect();
221
- if (audioContextRef.current) audioContextRef.current.close();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
- setIsConnected(false);
224
- setIsMicOn(false);
225
- setLogs([]);
 
226
  };
227
 
228
- const toggleMic = async () => {
229
- if (!isConnected || !audioContextRef.current || !sessionPromiseRef.current || !audioStreamRef.current) return;
230
 
231
- const newMicState = !isMicOn;
232
- setIsMicOn(newMicState);
233
- console.log("Toggling Mic:", newMicState ? "ON" : "OFF");
 
 
 
234
 
235
- if (newMicState) {
236
- // START SENDING
237
- const ctx = audioContextRef.current;
238
- // Input context sample rate usually needs to match stream, but we resample manually or rely on createScriptProcessor logic
239
- // Simple approach: Use 16k context for input if possible, or downsample.
240
- // Here we assume ctx is created at 24k (output), so input might need resampling or just sending as is if API tolerates.
241
- // Gemini API expects 16k for input usually.
242
 
243
- const inputCtx = new (window.AudioContext || (window as any).webkitAudioContext)({ sampleRate: 16000 });
244
- const source = inputCtx.createMediaStreamSource(audioStreamRef.current);
245
- const processor = inputCtx.createScriptProcessor(4096, 1, 1);
246
 
247
- let chunkCount = 0;
248
  processor.onaudioprocess = (e) => {
249
- if (!newMicState) return; // Guard
250
  const inputData = e.inputBuffer.getChannelData(0);
251
- const blob = createBlob(inputData);
252
 
253
- // Debug log every 20 chunks (~0.5s) to avoid spam but confirm data flow
254
- chunkCount++;
255
- if (chunkCount % 20 === 0) console.log("Sending audio chunk...", chunkCount);
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
- sessionPromiseRef.current?.then(session => {
258
- session.sendRealtimeInput({ media: { mimeType: 'audio/pcm;rate=16000', data: blob.data } });
259
- });
 
 
260
  };
261
-
262
  source.connect(processor);
263
- processor.connect(inputCtx.destination);
264
 
265
- // Store refs to disconnect later
266
- // @ts-ignore
267
- inputProcessorRef.current = processor;
268
- // @ts-ignore
269
- inputSourceRef.current = source;
270
- // Store input context to close? Usually separate from output context to handle diff sample rates easily.
271
-
272
- } else {
273
- // STOP SENDING
274
- console.log("Stopping audio stream...");
275
- if (inputProcessorRef.current) {
276
- inputProcessorRef.current.disconnect();
277
- inputProcessorRef.current = null;
278
- }
279
- if (inputSourceRef.current) {
280
- inputSourceRef.current.disconnect();
281
- inputSourceRef.current = null;
282
- }
283
  }
284
  };
285
 
286
- // Auto-disconnect when closing modal
287
- useEffect(() => {
288
- if (!isOpen && isConnected) disconnect();
289
- }, [isOpen]);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
291
- if (!api.auth.getCurrentUser()) return null; // Safety check
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  return (
294
  <div className="fixed bottom-6 right-6 z-[9999]">
@@ -302,74 +323,109 @@ export const LiveAssistant: React.FC = () => {
302
  </button>
303
  )}
304
 
305
- {/* Expanded Interface */}
306
  {isOpen && (
307
- <div className="bg-white w-80 md:w-96 rounded-2xl shadow-2xl border border-gray-200 overflow-hidden flex flex-col animate-in slide-in-from-bottom-5 fade-in duration-300" style={{maxHeight: '600px', height: '80vh'}}>
308
  {/* Header */}
309
- <div className="bg-gradient-to-r from-indigo-600 to-purple-600 p-4 flex justify-between items-center text-white shrink-0">
310
  <div className="flex items-center gap-2">
311
- <Bot size={20}/>
312
- <span className="font-bold">AI 语音助理</span>
313
  </div>
314
- <div className="flex items-center gap-2">
315
- <button onClick={disconnect} title="重置" className="hover:bg-white/20 p-1.5 rounded-full"><RefreshCw size={16}/></button>
316
- <button onClick={() => setIsOpen(false)} title="最小化" className="hover:bg-white/20 p-1.5 rounded-full"><ChevronDown size={20}/></button>
317
  </div>
318
  </div>
319
 
320
- {/* Content / Logs */}
321
- <div className="flex-1 bg-gray-50 p-4 overflow-y-auto space-y-3 custom-scrollbar">
322
- {logs.length === 0 && isConnected && (
323
- <div className="text-center text-gray-400 mt-10 text-sm">
324
- <p>点击下方麦克风开始说话</p>
325
- <p className="text-xs mt-2 opacity-70">Gemini 2.5 Flash (Native Audio)</p>
326
- </div>
327
- )}
328
- {logs.map((log, i) => (
329
- <div key={i} className={`flex ${log.role === 'user' ? 'justify-end' : 'justify-start'}`}>
330
- <div className={`max-w-[85%] p-3 rounded-2xl text-sm ${log.role === 'user' ? 'bg-indigo-600 text-white rounded-tr-none' : 'bg-white border border-gray-200 text-gray-800 rounded-tl-none shadow-sm'}`}>
331
- {log.text}
332
- </div>
333
- </div>
334
- ))}
335
- {isSpeaking && (
336
- <div className="flex justify-start">
337
- <div className="bg-white border border-gray-200 px-4 py-2 rounded-full shadow-sm flex items-center gap-2">
338
- <span className="flex gap-1 h-3 items-end">
339
- <span className="w-1 bg-indigo-500 animate-[bounce_1s_infinite] h-2"></span>
340
- <span className="w-1 bg-indigo-500 animate-[bounce_1.2s_infinite] h-3"></span>
341
- <span className="w-1 bg-indigo-500 animate-[bounce_0.8s_infinite] h-1"></span>
342
- </span>
343
- <span className="text-xs text-indigo-600 font-bold">正在说话...</span>
344
- </div>
 
 
 
 
 
 
 
 
 
 
345
  </div>
346
- )}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  </div>
348
 
349
  {/* Controls */}
350
- <div className="p-4 bg-white border-t border-gray-100 shrink-0">
351
- {!isConnected ? (
352
  <button
353
- onClick={connect}
354
- disabled={isInitializing || !apiKey}
355
- className="w-full py-3 bg-indigo-600 hover:bg-indigo-700 text-white rounded-xl font-bold flex items-center justify-center gap-2 disabled:opacity-50 disabled:cursor-not-allowed transition-colors"
356
  >
357
- {isInitializing ? <Loader2 className="animate-spin"/> : <Power size={18}/>}
358
- {isInitializing ? '正在连接...' : '开启语音会话'}
359
  </button>
360
  ) : (
361
- <div className="flex flex-col gap-3">
362
- <div className="flex items-center justify-center">
363
  <button
364
- onClick={toggleMic}
365
- className={`w-16 h-16 rounded-full flex items-center justify-center shadow-lg transition-all transform active:scale-95 ${isMicOn ? 'bg-red-500 text-white animate-pulse ring-4 ring-red-100' : 'bg-indigo-100 text-indigo-600 hover:bg-indigo-200'}`}
 
 
 
 
 
 
366
  >
367
- {isMicOn ? <Square fill="currentColor" size={24}/> : <Mic size={28}/>}
368
  </button>
 
 
 
 
 
369
  </div>
370
- <p className="text-center text-xs text-gray-400 font-medium">
371
- {isMicOn ? '正在聆听... 点击停止发送' : '点击麦克风开始说话'}
372
- </p>
373
  </div>
374
  )}
375
  </div>
 
1
 
2
  import React, { useState, useRef, useEffect } from 'react';
3
  import { GoogleGenAI, LiveServerMessage, Modality } from "@google/genai";
4
+ import { Mic, X, Power, Loader2, Bot, Volume2, Radio, Activity, RefreshCw, ChevronDown } from 'lucide-react';
5
  import { api } from '../services/api';
6
 
7
+ // --- Audio Types & Helpers ---
8
+ // 16kHz for Gemini Input
9
+ const INPUT_SAMPLE_RATE = 16000;
10
+ // 24kHz for Gemini Output
11
+ const OUTPUT_SAMPLE_RATE = 24000;
12
+
13
+ function base64ToUint8Array(base64: string) {
14
  const binaryString = atob(base64);
15
  const len = binaryString.length;
16
  const bytes = new Uint8Array(len);
 
20
  return bytes;
21
  }
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  export const LiveAssistant: React.FC = () => {
24
  const [isOpen, setIsOpen] = useState(false);
25
+ const [status, setStatus] = useState<'DISCONNECTED' | 'CONNECTING' | 'CONNECTED' | 'LISTENING' | 'THINKING' | 'SPEAKING'>('DISCONNECTED');
 
 
 
26
  const [apiKey, setApiKey] = useState('');
27
+ const [transcript, setTranscript] = useState(''); // Current subtitle
28
+ const [volumeLevel, setVolumeLevel] = useState(0);
29
 
30
+ // --- Refs for managing Audio & Session Lifecycle ---
31
  const audioContextRef = useRef<AudioContext | null>(null);
32
+ const mediaStreamRef = useRef<MediaStream | null>(null);
33
+ const processorRef = useRef<ScriptProcessorNode | null>(null);
34
+ const sourceNodeRef = useRef<MediaStreamAudioSourceNode | null>(null);
35
+ const gainNodeRef = useRef<GainNode | null>(null);
 
 
36
 
37
+ const sessionRef = useRef<any>(null); // The GenAI Session
38
+ const nextPlayTimeRef = useRef<number>(0);
39
+ const analyserRef = useRef<AnalyserNode | null>(null);
40
+ const volumeIntervalRef = useRef<any>(null);
41
 
42
+ // 1. Fetch Key on Open
43
  useEffect(() => {
44
+ if (isOpen && !apiKey) {
45
+ api.ai.getStats().catch(() => {}); // Warm up
 
46
  fetch('/api/ai/live-access', {
47
  headers: {
48
  'x-user-username': api.auth.getCurrentUser()?.username || '',
 
52
  .then(res => res.json())
53
  .then(data => {
54
  if (data.key) setApiKey(data.key);
 
55
  })
56
+ .catch(err => console.error("Failed to get live key", err));
57
  }
58
  }, [isOpen]);
59
 
60
// 2. Tear the session down whenever the panel is closed, and again from the
//    effect cleanup (which React runs before the next run and on unmount).
useEffect(() => {
  const teardown = () => {
    handleDisconnect();
  };

  if (isOpen === false) {
    teardown();
  }

  return teardown;
}, [isOpen]);
69
 
70
// Volume meter: while a session exists, poll the analyser every 100 ms and
// publish the mean frequency-bin magnitude (scaled by 1.5, capped at 100).
useEffect(() => {
  if (status === 'DISCONNECTED') {
    setVolumeLevel(0);
    return;
  }

  const sampleVolume = () => {
    const analyser = analyserRef.current;
    if (!analyser) return;

    const bins = new Uint8Array(analyser.frequencyBinCount);
    analyser.getByteFrequencyData(bins);
    const mean = bins.reduce((sum, bin) => sum + bin) / bins.length;
    setVolumeLevel(Math.min(100, mean * 1.5));
  };

  volumeIntervalRef.current = setInterval(sampleVolume, 100);

  return () => clearInterval(volumeIntervalRef.current);
}, [status]);
86
+
87
/**
 * Lazily builds the playback graph (analyser -> gain -> destination) on a
 * context running at OUTPUT_SAMPLE_RATE, and resumes the context if the
 * browser has it suspended.
 */
const initAudioContext = () => {
  if (!audioContextRef.current) {
    // Older WebKit builds only expose the prefixed constructor.
    const AudioCtor: typeof AudioContext =
      window.AudioContext || (window as any).webkitAudioContext;
    const ctx = new AudioCtor({ sampleRate: OUTPUT_SAMPLE_RATE });

    const analyser = ctx.createAnalyser();
    analyser.fftSize = 64;

    const gain = ctx.createGain();
    // Wire the static part of the chain once; playback sources only need to
    // connect to the analyser.
    analyser.connect(gain);
    gain.connect(ctx.destination);

    audioContextRef.current = ctx;
    analyserRef.current = analyser;
    gainNodeRef.current = gain;
  }

  const ctx = audioContextRef.current;
  if (ctx.state === 'suspended') {
    // resume() returns a promise; surface failures instead of leaving an
    // unhandled rejection.
    ctx.resume().catch(err => console.error('Failed to resume audio context', err));
  }
};
 
107
 
108
/**
 * Opens a Gemini Live session with the fetched API key.
 *
 * Sets status to CONNECTING while the connection is pending, CONNECTED once
 * the session is stored, and DISCONNECTED if connecting throws. If the
 * component is torn down while the connection is pending, the freshly opened
 * session is closed instead of being stored.
 */
const handleConnect = async () => {
  if (!apiKey) return;
  setStatus('CONNECTING');
  setTranscript('正在建立连接...');

  try {
    initAudioContext();
    const client = new GoogleGenAI({ apiKey });

    const session = await client.live.connect({
      model: 'gemini-2.5-flash-native-audio-preview-09-2025',
      callbacks: {
        onopen: () => {
          console.log('Session Open');
        },
        onmessage: (msg: LiveServerMessage) => {
          // handleServerMessage is async; make sure a decoding failure does
          // not become an unhandled rejection.
          handleServerMessage(msg).catch(err =>
            console.error('Failed to handle server message', err),
          );
        },
        onclose: () => {
          console.log('Session Close');
          handleDisconnect();
        },
        onerror: (e) => {
          console.error('Session Error', e);
          // Disconnect first: handleDisconnect clears the transcript, so the
          // error text has to be written afterwards to remain visible.
          handleDisconnect();
          setTranscript('连接发生错误,请重试');
        }
      },
      config: {
        responseModalities: [Modality.AUDIO],
        speechConfig: { voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } } },
        // Strong instruction to force Chinese
        systemInstruction: "You are a helpful school assistant. You MUST reply in spoken Chinese (Mandarin). Keep answers concise and friendly. Do not use markdown.",
      }
    });

    // handleDisconnect nulls the audio context. If that happened while we
    // were awaiting the connection, the user has already left: drop the
    // session rather than resurrecting a CONNECTED state.
    if (!audioContextRef.current) {
      try {
        session.close();
      } catch (closeErr) {
        console.warn('Failed to close orphaned live session', closeErr);
      }
      return;
    }

    sessionRef.current = session;
    setStatus('CONNECTED');
    setTranscript('连接成功,请按住麦克风说话');

  } catch (e) {
    console.error("Connect failed", e);
    setStatus('DISCONNECTED');
    setTranscript('连接失败');
  }
};
154
 
155
/**
 * Handles one message from the live session: schedules any audio parts for
 * gapless playback, mirrors text parts into the subtitle, and returns the UI
 * to idle when the model's turn is over.
 *
 * Status is only ever changed through functional updates so that a live
 * LISTENING (or DISCONNECTED) state is never overwritten by playback events.
 *
 * @param msg - Message received from the live session.
 */
const handleServerMessage = async (msg: LiveServerMessage) => {
  const serverContent = msg.serverContent;
  const ctx = audioContextRef.current;
  // A turn may carry several parts; inspect all of them, not just the first.
  const parts = serverContent?.modelTurn?.parts ?? [];

  const backToIdle = () =>
    setStatus(prev => (prev === 'SPEAKING' || prev === 'THINKING' ? 'CONNECTED' : prev));

  for (const part of parts) {
    // 1. Audio output: raw 16-bit mono PCM, played at OUTPUT_SAMPLE_RATE.
    const audioData = part.inlineData?.data;
    if (audioData && ctx) {
      const bytes = base64ToUint8Array(audioData);
      // Ignore a dangling odd byte instead of throwing on it.
      const sampleCount = Math.floor(bytes.byteLength / 2);

      if (sampleCount > 0) {
        const view = new DataView(bytes.buffer, bytes.byteOffset, sampleCount * 2);
        const float32 = new Float32Array(sampleCount);
        for (let i = 0; i < sampleCount; i++) {
          float32[i] = view.getInt16(i * 2, true) / 32768.0;
        }

        const buffer = ctx.createBuffer(1, sampleCount, OUTPUT_SAMPLE_RATE);
        buffer.copyToChannel(float32, 0);

        const source = ctx.createBufferSource();
        source.buffer = buffer;

        // Route through the analyser (for the visualizer) when available.
        if (analyserRef.current && gainNodeRef.current) {
          source.connect(analyserRef.current);
          analyserRef.current.connect(gainNodeRef.current);
        } else {
          source.connect(ctx.destination);
        }

        // Queue the chunk right after whatever is already scheduled.
        const startTime = Math.max(ctx.currentTime, nextPlayTimeRef.current);
        source.start(startTime);
        nextPlayTimeRef.current = startTime + buffer.duration;

        setStatus(prev =>
          prev === 'LISTENING' || prev === 'DISCONNECTED' ? prev : 'SPEAKING',
        );

        source.onended = () => {
          // Only the last queued chunk ends close to nextPlayTimeRef.
          if (ctx.currentTime >= nextPlayTimeRef.current - 0.1) {
            backToIdle();
          }
        };
      }
    }

    // 2. Text (subtitle). Skip text that looks like internal notes; the
    //    '**' / 'Finding' filter is a heuristic inherited from the original.
    const text = part.text;
    if (text && !text.startsWith('**') && !text.includes('Finding')) {
      setTranscript(text);
    }
  }

  // 3. turnComplete means the model has finished generating. If audio is
  //    still queued, the last chunk's onended returns to idle; otherwise do
  //    it here so the UI cannot get stuck in THINKING.
  if (serverContent?.turnComplete) {
    const playbackDone = !ctx || ctx.currentTime >= nextPlayTimeRef.current - 0.1;
    if (playbackDone) {
      backToIdle();
    }
  }
};
213
 
214
/**
 * Push-to-talk start: opens the microphone, builds a 16 kHz capture graph and
 * streams each captured block to the live session as base64 16-bit PCM.
 *
 * Does nothing unless status is CONNECTED or SPEAKING, or if a capture graph
 * already exists. If the button is released (or the session goes away) while
 * the microphone request is still pending, the stream is stopped immediately
 * and no recording starts.
 */
const startRecording = async () => {
  if (status !== 'CONNECTED' && status !== 'SPEAKING') return;
  // Already capturing (e.g. duplicate press events): do not stack graphs.
  if (processorRef.current || mediaStreamRef.current) return;

  // getUserMedia can take a while (permission prompt). Track a release that
  // happens in the meantime, since stopRecording has nothing to stop yet.
  let released = false;
  const markReleased = () => { released = true; };
  const releaseEvents = ['mouseup', 'touchend', 'touchcancel'];
  releaseEvents.forEach(name => window.addEventListener(name, markReleased));

  let stream: MediaStream | null = null;
  let ctx: AudioContext | null = null;

  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: { sampleRate: INPUT_SAMPLE_RATE } });

    if (released || !sessionRef.current) {
      stream.getTracks().forEach(t => t.stop());
      return;
    }

    ctx = new (window.AudioContext || (window as any).webkitAudioContext)({ sampleRate: INPUT_SAMPLE_RATE });
    const source = ctx.createMediaStreamSource(stream);
    const processor = ctx.createScriptProcessor(4096, 1, 1);

    processor.onaudioprocess = (e) => {
      const inputData = e.inputBuffer.getChannelData(0);

      // Float32 -> Int16, saturating so full-scale samples clip rather than
      // wrap to the opposite sign.
      const int16Data = new Int16Array(inputData.length);
      for (let i = 0; i < inputData.length; i++) {
        int16Data[i] = Math.max(-32768, Math.min(32767, inputData[i] * 32768));
      }

      // Manual base64 to avoid a library dependency.
      let binary = '';
      const bytes = new Uint8Array(int16Data.buffer);
      for (let i = 0; i < bytes.byteLength; i++) {
        binary += String.fromCharCode(bytes[i]);
      }
      const b64 = btoa(binary);

      if (sessionRef.current) {
        sessionRef.current.sendRealtimeInput({
          media: { mimeType: `audio/pcm;rate=${INPUT_SAMPLE_RATE}`, data: b64 }
        });
      }
    };

    source.connect(processor);
    // The processor is connected to the destination, as in the original.
    processor.connect(ctx.destination);

    // Publish the graph only once it is fully built.
    mediaStreamRef.current = stream;
    sourceNodeRef.current = source;
    processorRef.current = processor;
    setStatus('LISTENING');
    setTranscript('正在聆听...');

  } catch (e) {
    console.error(e);
    // Release anything acquired before the failure.
    if (stream) stream.getTracks().forEach(t => t.stop());
    if (ctx && ctx.state !== 'closed') {
      ctx.close().catch(closeErr => console.warn('Failed to close capture context', closeErr));
    }
    setTranscript('无法访问麦克风');
  } finally {
    releaseEvents.forEach(name => window.removeEventListener(name, markReleased));
  }
};
270
 
271
/**
 * Push-to-talk stop: tears down the capture graph, releases the microphone
 * and moves the UI to THINKING.
 *
 * The guard is based on the capture refs rather than on `status`, because
 * `status` is a render-time snapshot and may be stale when this is invoked
 * from an older closure; the refs always reflect what is actually live.
 */
const stopRecording = () => {
  const processor = processorRef.current;
  const source = sourceNodeRef.current;
  const stream = mediaStreamRef.current;

  // Nothing is being captured: leave status and transcript untouched.
  if (!processor && !source && !stream) return;

  if (processor) {
    processor.onaudioprocess = null;
    processor.disconnect();

    // The capture graph lives on its own AudioContext; close it so repeated
    // presses do not accumulate open contexts.
    const captureCtx = processor.context as AudioContext;
    if (captureCtx !== audioContextRef.current && captureCtx.state !== 'closed') {
      captureCtx.close().catch(err => console.warn('Failed to close capture context', err));
    }
    processorRef.current = null;
  }
  if (source) {
    source.disconnect();
    sourceNodeRef.current = null;
  }
  if (stream) {
    stream.getTracks().forEach(t => t.stop());
    mediaStreamRef.current = null;
  }

  // NOTE(review): the Live API may expect an explicit end-of-audio signal
  // when the stream pauses — confirm against the SDK version in use.
  setStatus('THINKING');
  setTranscript('思考中...');
};
291
 
292
/**
 * Fully resets the assistant: closes the live session, releases the
 * microphone and capture graph, closes the playback context and returns the
 * UI to DISCONNECTED.
 *
 * Safe to call repeatedly. Every resource is detached from its ref before it
 * is released, so a re-entrant call (e.g. from the session's onclose
 * callback) finds nothing left to do beyond resetting state.
 */
const handleDisconnect = () => {
  // --- Session ---
  const session = sessionRef.current;
  sessionRef.current = null;
  if (session && typeof session.close === 'function') {
    try {
      session.close();
    } catch (err) {
      console.warn('Failed to close live session', err);
    }
  }

  // --- Microphone / capture graph ---
  // Released directly from the refs so cleanup does not depend on the
  // render-time `status` value.
  const processor = processorRef.current;
  processorRef.current = null;
  if (processor) {
    processor.onaudioprocess = null;
    processor.disconnect();
    const captureCtx = processor.context as AudioContext;
    if (captureCtx !== audioContextRef.current && captureCtx.state !== 'closed') {
      captureCtx.close().catch(err => console.warn('Failed to close capture context', err));
    }
  }
  const micSource = sourceNodeRef.current;
  sourceNodeRef.current = null;
  if (micSource) micSource.disconnect();

  const stream = mediaStreamRef.current;
  mediaStreamRef.current = null;
  if (stream) stream.getTracks().forEach(t => t.stop());

  // --- Playback graph ---
  const ctx = audioContextRef.current;
  audioContextRef.current = null;
  // These nodes belong to the context being closed; drop them with it.
  analyserRef.current = null;
  gainNodeRef.current = null;
  if (ctx && ctx.state !== 'closed') {
    ctx.close().catch(err => console.warn('Failed to close playback context', err));
  }

  setStatus('DISCONNECTED');
  setTranscript('');
  nextPlayTimeRef.current = 0;
};
311
+
312
+ if (!api.auth.getCurrentUser()) return null;
313
 
314
  return (
315
  <div className="fixed bottom-6 right-6 z-[9999]">
 
323
  </button>
324
  )}
325
 
326
+ {/* Call Interface */}
327
  {isOpen && (
328
+ <div className="bg-slate-900 w-80 md:w-96 rounded-3xl shadow-2xl border border-slate-700 overflow-hidden flex flex-col animate-in slide-in-from-bottom-5 fade-in duration-300 h-[500px]">
329
  {/* Header */}
330
+ <div className="bg-slate-800/50 p-4 flex justify-between items-center text-white shrink-0 backdrop-blur-md">
331
  <div className="flex items-center gap-2">
332
+ <div className={`w-2 h-2 rounded-full ${status === 'DISCONNECTED' ? 'bg-red-500' : 'bg-green-500 animate-pulse'}`}></div>
333
+ <span className="font-bold text-sm">AI 实时通话</span>
334
  </div>
335
+ <div className="flex gap-2">
336
+ <button onClick={handleDisconnect} title="重置" className="hover:bg-white/10 p-1.5 rounded-full text-gray-400 hover:text-white transition-colors"><RefreshCw size={16}/></button>
337
+ <button onClick={() => setIsOpen(false)} title="最小化" className="hover:bg-white/10 p-1.5 rounded-full text-gray-400 hover:text-white transition-colors"><ChevronDown size={20}/></button>
338
  </div>
339
  </div>
340
 
341
+ {/* Main Visual Area */}
342
+ <div className="flex-1 flex flex-col items-center justify-center p-6 relative">
343
+ {/* Visualizer Circle */}
344
+ <div className={`relative w-40 h-40 flex items-center justify-center transition-all duration-500 ${status === 'LISTENING' ? 'scale-110' : 'scale-100'}`}>
345
+ {/* Outer Glow */}
346
+ <div
347
+ className={`absolute inset-0 rounded-full blur-2xl transition-all duration-300 ${
348
+ status === 'SPEAKING' ? 'bg-blue-500/40' :
349
+ status === 'LISTENING' ? 'bg-green-500/40' :
350
+ status === 'THINKING' ? 'bg-purple-500/40' : 'bg-gray-500/10'
351
+ }`}
352
+ style={{ opacity: 0.5 + (volumeLevel / 200) }}
353
+ ></div>
354
+
355
+ {/* Dynamic Rings */}
356
+ <div
357
+ className={`absolute inset-0 rounded-full border-2 border-white/10 transition-all duration-100`}
358
+ style={{ transform: `scale(${1 + volumeLevel/100})` }}
359
+ ></div>
360
+ <div
361
+ className={`absolute inset-0 rounded-full border border-white/20 transition-all duration-100 delay-75`}
362
+ style={{ transform: `scale(${1 + volumeLevel/150})` }}
363
+ ></div>
364
+
365
+ {/* Central Icon */}
366
+ <div className={`z-10 w-24 h-24 rounded-full flex items-center justify-center text-white shadow-xl transition-colors duration-500 ${
367
+ status === 'SPEAKING' ? 'bg-blue-600' :
368
+ status === 'LISTENING' ? 'bg-green-600' :
369
+ status === 'THINKING' ? 'bg-purple-600' :
370
+ status === 'CONNECTED' ? 'bg-slate-700' : 'bg-slate-800'
371
+ }`}>
372
+ {status === 'SPEAKING' ? <Volume2 size={40} className="animate-pulse"/> :
373
+ status === 'LISTENING' ? <Mic size={40} className="animate-bounce"/> :
374
+ status === 'THINKING' ? <Loader2 size={40} className="animate-spin"/> :
375
+ status === 'CONNECTED' ? <Radio size={40}/> : <Power size={40}/>}
376
  </div>
377
+ </div>
378
+
379
+ {/* Status Text */}
380
+ <div className="mt-8 text-center px-4 w-full">
381
+ <p className={`text-sm font-bold uppercase tracking-wider mb-2 ${
382
+ status === 'SPEAKING' ? 'text-blue-400' :
383
+ status === 'LISTENING' ? 'text-green-400' :
384
+ status === 'THINKING' ? 'text-purple-400' : 'text-gray-500'
385
+ }`}>
386
+ {status === 'DISCONNECTED' ? '未连接' :
387
+ status === 'CONNECTING' ? '连接中...' :
388
+ status === 'CONNECTED' ? '准备就绪' :
389
+ status === 'LISTENING' ? '正在聆听...' :
390
+ status === 'THINKING' ? '思考中...' : '正在说话'}
391
+ </p>
392
+ <p className="text-white text-lg font-medium leading-relaxed min-h-[3rem] line-clamp-3 transition-all">
393
+ {transcript}
394
+ </p>
395
+ </div>
396
  </div>
397
 
398
  {/* Controls */}
399
+ <div className="p-6 pb-8 bg-slate-800/50 backdrop-blur-md border-t border-slate-700 flex justify-center">
400
+ {status === 'DISCONNECTED' ? (
401
  <button
402
+ onClick={handleConnect}
403
+ disabled={!apiKey}
404
+ className="w-full py-4 bg-blue-600 hover:bg-blue-500 text-white rounded-2xl font-bold flex items-center justify-center gap-2 transition-all hover:scale-[1.02] active:scale-95 disabled:opacity-50 disabled:cursor-not-allowed"
405
  >
406
+ <Power size={20}/> 开启 AI 语音
 
407
  </button>
408
  ) : (
409
+ <div className="flex items-center gap-4 w-full justify-center">
410
+ <div className="relative group">
411
  <button
412
+ onMouseDown={startRecording}
413
+ onMouseUp={stopRecording}
414
+ onTouchStart={(e) => { e.preventDefault(); startRecording(); }}
415
+ onTouchEnd={(e) => { e.preventDefault(); stopRecording(); }}
416
+ className={`w-20 h-20 rounded-full flex items-center justify-center shadow-lg transition-all transform ${
417
+ status === 'LISTENING' ? 'bg-green-500 scale-110 ring-4 ring-green-500/30' :
418
+ 'bg-white text-slate-900 hover:bg-gray-100'
419
+ }`}
420
  >
421
+ <Mic size={32} fill={status==='LISTENING' ? 'white' : 'currentColor'} className={status==='LISTENING'?'text-white':''}/>
422
  </button>
423
+ {status === 'CONNECTED' && (
424
+ <div className="absolute -bottom-8 left-1/2 -translate-x-1/2 text-xs text-gray-400 whitespace-nowrap opacity-0 group-hover:opacity-100 transition-opacity">
425
+ 按住说话
426
+ </div>
427
+ )}
428
  </div>
 
 
 
429
  </div>
430
  )}
431
  </div>