Update components/LiveAssistant.tsx

components/LiveAssistant.tsx  +319 -263

CHANGED
@@ -1,11 +1,16 @@
 
 import React, { useState, useRef, useEffect } from 'react';
 import { GoogleGenAI, LiveServerMessage, Modality } from "@google/genai";
-import { Mic, X,
+import { Mic, X, Power, Loader2, Bot, Volume2, Radio, Activity, RefreshCw, ChevronDown } from 'lucide-react';
 import { api } from '../services/api';
 
-// ---
-function decode(base64: string) {
+// --- Audio Types & Helpers ---
+// 16kHz for Gemini Input
+const INPUT_SAMPLE_RATE = 16000;
+// 24kHz for Gemini Output
+const OUTPUT_SAMPLE_RATE = 24000;
+
+function base64ToUint8Array(base64: string) {
   const binaryString = atob(base64);
   const len = binaryString.length;
   const bytes = new Uint8Array(len);
@@ -15,73 +20,29 @@ function decode(base64: string) {
   return bytes;
 }
 
-async function decodeAudioData(
-  data: Uint8Array,
-  ctx: AudioContext,
-  sampleRate: number,
-  numChannels: number,
-): Promise<AudioBuffer> {
-  const dataInt16 = new Int16Array(data.buffer);
-  const frameCount = dataInt16.length / numChannels;
-  const buffer = ctx.createBuffer(numChannels, frameCount, sampleRate);
-
-  for (let channel = 0; channel < numChannels; channel++) {
-    const channelData = buffer.getChannelData(channel);
-    for (let i = 0; i < frameCount; i++) {
-      channelData[i] = dataInt16[i * numChannels + channel] / 32768.0;
-    }
-  }
-  return buffer;
-}
-
-function createBlob(data: Float32Array): { data: string; mimeType: string } {
-  const l = data.length;
-  const int16 = new Int16Array(l);
-  for (let i = 0; i < l; i++) {
-    int16[i] = data[i] * 32768;
-  }
-
-  // Custom encode function instead of js-base64
-  let binary = '';
-  const bytes = new Uint8Array(int16.buffer);
-  const len = bytes.byteLength;
-  for (let i = 0; i < len; i++) {
-    binary += String.fromCharCode(bytes[i]);
-  }
-  const base64 = btoa(binary);
-
-  return {
-    data: base64,
-    mimeType: 'audio/pcm;rate=16000',
-  };
-}
-
 export const LiveAssistant: React.FC = () => {
   const [isOpen, setIsOpen] = useState(false);
-  const [
-  const [isMicOn, setIsMicOn] = useState(false); // Toggle for "Hold to Talk" simulation
-  const [isSpeaking, setIsSpeaking] = useState(false); // Model speaking
-  const [logs, setLogs] = useState<{role: 'user'|'model', text: string}[]>([]);
+  const [status, setStatus] = useState<'DISCONNECTED' | 'CONNECTING' | 'CONNECTED' | 'LISTENING' | 'THINKING' | 'SPEAKING'>('DISCONNECTED');
   const [apiKey, setApiKey] = useState('');
-  const [
+  const [transcript, setTranscript] = useState(''); // Current subtitle
+  const [volumeLevel, setVolumeLevel] = useState(0);
 
-  //
+  // --- Refs for managing Audio & Session Lifecycle ---
   const audioContextRef = useRef<AudioContext | null>(null);
-  const
-  const
-  const
-  const
-  const nextStartTimeRef = useRef<number>(0);
-  const activeSourcesRef = useRef<Set<AudioBufferSourceNode>>(new Set());
+  const mediaStreamRef = useRef<MediaStream | null>(null);
+  const processorRef = useRef<ScriptProcessorNode | null>(null);
+  const sourceNodeRef = useRef<MediaStreamAudioSourceNode | null>(null);
+  const gainNodeRef = useRef<GainNode | null>(null);
 
-  //
-  const
+  const sessionRef = useRef<any>(null); // The GenAI Session
+  const nextPlayTimeRef = useRef<number>(0);
+  const analyserRef = useRef<AnalyserNode | null>(null);
+  const volumeIntervalRef = useRef<any>(null);
 
-  // 1.
+  // 1. Fetch Key on Open
   useEffect(() => {
-
-
-    setIsInitializing(true);
+    if (isOpen && !apiKey) {
+      api.ai.getStats().catch(() => {}); // Warm up
     fetch('/api/ai/live-access', {
       headers: {
         'x-user-username': api.auth.getCurrentUser()?.username || '',
@@ -91,204 +52,264 @@ export const LiveAssistant: React.FC = () => {
       .then(res => res.json())
       .then(data => {
         if (data.key) setApiKey(data.key);
-        setIsInitializing(false);
       })
-      .catch(
+      .catch(err => console.error("Failed to get live key", err));
     }
   }, [isOpen]);
 
-
-
-
-
-
-
+  // 2. Clean up on unmount or close
+  useEffect(() => {
+    if (!isOpen) {
+      handleDisconnect();
+    }
+    return () => {
+      handleDisconnect();
+    };
+  }, [isOpen]);
 
-
+  // Visualizer Loop
+  useEffect(() => {
+    if (status === 'DISCONNECTED') {
+      setVolumeLevel(0);
+      return;
+    }
+    volumeIntervalRef.current = setInterval(() => {
+      if (analyserRef.current) {
+        const array = new Uint8Array(analyserRef.current.frequencyBinCount);
+        analyserRef.current.getByteFrequencyData(array);
+        const avg = array.reduce((a,b)=>a+b) / array.length;
+        setVolumeLevel(Math.min(100, avg * 1.5));
+      }
+    }, 100);
+    return () => clearInterval(volumeIntervalRef.current);
+  }, [status]);
+
+  const initAudioContext = () => {
+    if (!audioContextRef.current) {
       // @ts-ignore
       const AudioCtor = window.AudioContext || window.webkitAudioContext;
-      const ctx = new AudioCtor({sampleRate:
-
-
-
+      const ctx = new AudioCtor({ sampleRate: OUTPUT_SAMPLE_RATE });
+
+      const analyser = ctx.createAnalyser();
+      analyser.fftSize = 64;
+
+      const gain = ctx.createGain();
+      gain.connect(ctx.destination); // For output
 
-
-
-
-
-
-
-
-
-      console.log("Microphone access granted");
+      audioContextRef.current = ctx;
+      analyserRef.current = analyser;
+      gainNodeRef.current = gain;
+    }
+    if (audioContextRef.current.state === 'suspended') {
+      audioContextRef.current.resume();
+    }
+  };
 
-
+  const handleConnect = async () => {
+    if (!apiKey) return;
+    setStatus('CONNECTING');
+    setTranscript('正在建立连接...');
+
+    try {
+      initAudioContext();
       const client = new GoogleGenAI({ apiKey });
 
-      const
+      const session = await client.live.connect({
         model: 'gemini-2.5-flash-native-audio-preview-09-2025',
-        config: {
-          responseModalities: [Modality.AUDIO],
-          speechConfig: { voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } } },
-          systemInstruction: { parts: [{ text: "你是一位乐于助人的校园AI助手。请始终使用中文回答。请简短、自然地进行对话,不要使用 Markdown 格式,不要进行搜索。" }] },
-          outputAudioTranscription: { model: true } // Enable transcription to show text
-        },
         callbacks: {
           onopen: () => {
-            console.log(
-            setIsConnected(true);
-            setIsInitializing(false);
-            setLogs(prev => [...prev, {role: 'model', text: '已连接,请点击麦克风说话。'}]);
+            console.log('Session Open');
           },
-          onmessage:
-
-            const audioData = msg.serverContent?.modelTurn?.parts?.[0]?.inlineData?.data;
-            if (audioData && audioContextRef.current && outputNodeRef.current) {
-              setIsSpeaking(true);
-              const ctx = audioContextRef.current;
-              const buffer = await decodeAudioData(decode(audioData), ctx, 24000, 1);
-
-              const source = ctx.createBufferSource();
-              source.buffer = buffer;
-              source.connect(outputNodeRef.current);
-
-              // Scheduling
-              const now = ctx.currentTime;
-              const startTime = Math.max(now, nextStartTimeRef.current);
-              source.start(startTime);
-              nextStartTimeRef.current = startTime + buffer.duration;
-
-              activeSourcesRef.current.add(source);
-              source.onended = () => {
-                activeSourcesRef.current.delete(source);
-                if (activeSourcesRef.current.size === 0) setIsSpeaking(false);
-              };
-            }
-
-            // Handle Text Transcription
-            const transcript = msg.serverContent?.modelTurn?.parts?.[0]?.text;
-            if (transcript) {
-              // Update last model log or add new
-              setLogs(prev => {
-                const last = prev[prev.length - 1];
-                const isInitialMessage = last && last.text === '已连接,请点击麦克风说话。';
-
-                // IMPORTANT: Do not append to the initial system message
-                if (last && last.role === 'model' && !isInitialMessage && !last.text.endsWith('\n')) {
-                  // Append to existing turn (simplified logic)
-                  return [...prev.slice(0, -1), { ...last, text: last.text + transcript }];
-                }
-                return [...prev, { role: 'model', text: transcript }];
-              });
-            }
-
-            // Handle Transcription of User Input (Echo)
-            // @ts-ignore - types might be missing in some SDK versions
-            const userTranscript = msg.serverContent?.outputAudioTranscription?.text || msg.serverContent?.turnComplete && "User input processed";
-            // Note: Standard API usually doesn't echo user transcript in serverContent easily without config, relying on model turn.
+          onmessage: (msg: LiveServerMessage) => {
+            handleServerMessage(msg);
           },
           onclose: () => {
-            console.log(
-
-            setLogs(prev => [...prev, {role: 'model', text: '连接已断开'}]);
+            console.log('Session Close');
+            handleDisconnect();
           },
           onerror: (e) => {
-            console.error(
-
+            console.error('Session Error', e);
+            setTranscript('连接发生错误,请重试');
+            handleDisconnect();
           }
+        },
+        config: {
+          responseModalities: [Modality.AUDIO],
+          speechConfig: { voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } } },
+          // Strong instruction to force Chinese
+          systemInstruction: "You are a helpful school assistant. You MUST reply in spoken Chinese (Mandarin). Keep answers concise and friendly. Do not use markdown.",
         }
       });
-
-
+
+      sessionRef.current = session;
+      setStatus('CONNECTED');
+      setTranscript('连接成功,请按住麦克风说话');
 
     } catch (e) {
-      console.error("
-
+      console.error("Connect failed", e);
+      setStatus('DISCONNECTED');
+      setTranscript('连接失败');
     }
   };
 
-  const
-
-    if (sessionPromiseRef.current) {
-      sessionPromiseRef.current.then(s => s.close());
-      sessionPromiseRef.current = null;
-    }
+  const handleServerMessage = async (msg: LiveServerMessage) => {
+    const serverContent = msg.serverContent;
 
-    //
-
-    if (
-
-
+    // 1. Audio Output
+    const audioData = serverContent?.modelTurn?.parts?.[0]?.inlineData?.data;
+    if (audioData && audioContextRef.current) {
+      setStatus('SPEAKING'); // Receiving audio means speaking
+      const ctx = audioContextRef.current;
+      const bytes = base64ToUint8Array(audioData);
+
+      // Decode Raw PCM (16-bit, 24kHz, Mono)
+      const int16 = new Int16Array(bytes.buffer);
+      const float32 = new Float32Array(int16.length);
+      for(let i=0; i<int16.length; i++) float32[i] = int16[i] / 32768.0;
+
+      const buffer = ctx.createBuffer(1, float32.length, OUTPUT_SAMPLE_RATE);
+      buffer.copyToChannel(float32, 0);
+
+      const source = ctx.createBufferSource();
+      source.buffer = buffer;
+
+      // Connect to analyser for visuals
+      if (analyserRef.current && gainNodeRef.current) {
+        source.connect(analyserRef.current);
+        analyserRef.current.connect(gainNodeRef.current);
+      } else {
+        source.connect(ctx.destination);
+      }
+
+      // Schedule gapless playback
+      const now = ctx.currentTime;
+      const startTime = Math.max(now, nextPlayTimeRef.current);
+      source.start(startTime);
+      nextPlayTimeRef.current = startTime + buffer.duration;
+
+      source.onended = () => {
+        // If gap is large, we assume finished
+        if (ctx.currentTime >= nextPlayTimeRef.current - 0.1) {
+          setStatus('CONNECTED'); // Back to idle
+        }
+      };
+    }
+
+    // 2. Text Transcription (Subtitle)
+    // Note: The model sometimes returns 'thought' or 'search' logs here.
+    // We rely on audio mostly, but show text if it looks like a response.
+    const text = serverContent?.modelTurn?.parts?.[0]?.text;
+    if (text) {
+      if (!text.startsWith('**') && !text.includes('Finding')) {
+        setTranscript(text);
+      }
+    }
 
-
-
-
+    // 3. User Turn Finished (Model starts thinking)
+    if (serverContent?.turnComplete) {
+      setStatus('THINKING');
+    }
   };
 
-  const
-    if (
+  const startRecording = async () => {
+    if (status !== 'CONNECTED' && status !== 'SPEAKING') return;
 
-
-
-
+    try {
+      // Interrupt model if speaking
+      if (status === 'SPEAKING') {
+        // We can send an interruption message or just stop playing, but API handles new input as interrupt usually
+        setStatus('CONNECTED');
+      }
 
-
-
-    const ctx = audioContextRef.current;
-    // Input context sample rate usually needs to match stream, but we resample manually or rely on createScriptProcessor logic
-    // Simple approach: Use 16k context for input if possible, or downsample.
-    // Here we assume ctx is created at 24k (output), so input might need resampling or just sending as is if API tolerates.
-    // Gemini API expects 16k for input usually.
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: { sampleRate: INPUT_SAMPLE_RATE } });
+      mediaStreamRef.current = stream;
 
-    const
-    const source =
-    const processor =
+      const ctx = new (window.AudioContext || (window as any).webkitAudioContext)({ sampleRate: INPUT_SAMPLE_RATE });
+      const source = ctx.createMediaStreamSource(stream);
+      const processor = ctx.createScriptProcessor(4096, 1, 1);
 
-    let chunkCount = 0;
      processor.onaudioprocess = (e) => {
-      if (!newMicState) return; // Guard
        const inputData = e.inputBuffer.getChannelData(0);
-      const blob = createBlob(inputData);
 
-      //
-
-
+        // Downconvert Float32 to Int16 for Gemini
+        const l = inputData.length;
+        const int16Data = new Int16Array(l);
+        for (let i = 0; i < l; i++) {
+          int16Data[i] = inputData[i] * 32768;
+        }
+
+        // Convert to Base64 manually to avoid large lib dependency
+        let binary = '';
+        const bytes = new Uint8Array(int16Data.buffer);
+        const len = bytes.byteLength;
+        for (let i = 0; i < len; i++) {
+          binary += String.fromCharCode(bytes[i]);
+        }
+        const b64 = btoa(binary);
 
-
-
-
+        if (sessionRef.current) {
+          sessionRef.current.sendRealtimeInput({
+            media: { mimeType: `audio/pcm;rate=${INPUT_SAMPLE_RATE}`, data: b64 }
+          });
+        }
      };
-
+
      source.connect(processor);
-    processor.connect(
+      processor.connect(ctx.destination);
 
-
-
-
-
-
-
-
-
-    // STOP SENDING
-    console.log("Stopping audio stream...");
-    if (inputProcessorRef.current) {
-      inputProcessorRef.current.disconnect();
-      inputProcessorRef.current = null;
-    }
-    if (inputSourceRef.current) {
-      inputSourceRef.current.disconnect();
-      inputSourceRef.current = null;
-    }
+      sourceNodeRef.current = source;
+      processorRef.current = processor;
+      setStatus('LISTENING');
+      setTranscript('正在聆听...');
+
+    } catch (e) {
+      console.error(e);
+      setTranscript('无法访问麦克风');
     }
   };
 
-
-
-
-
+  const stopRecording = () => {
+    if (status !== 'LISTENING') return;
+
+    // Cleanup Mic Processing
+    if (processorRef.current) {
+      processorRef.current.disconnect();
+      processorRef.current = null;
+    }
+    if (sourceNodeRef.current) {
+      sourceNodeRef.current.disconnect();
+      sourceNodeRef.current = null;
+    }
+    if (mediaStreamRef.current) {
+      mediaStreamRef.current.getTracks().forEach(t => t.stop());
+      mediaStreamRef.current = null;
+    }
+
+    setStatus('THINKING');
+    setTranscript('思考中...');
+  };
 
-
+  const handleDisconnect = () => {
+    if (sessionRef.current) {
+      // sessionRef.current.close(); // SDK might not have close method exposed directly depending on version, but usually does
+      sessionRef.current = null;
+    }
+    // Cleanup Audio
+    if (audioContextRef.current) {
+      audioContextRef.current.suspend(); // Suspend instead of close to reuse? Or close.
+      // For robustness, let's just close and nullify.
+      audioContextRef.current.close().catch(()=>{});
+      audioContextRef.current = null;
+    }
+
+    stopRecording(); // Ensure mic is off
+
+    setStatus('DISCONNECTED');
+    setTranscript('');
+    nextPlayTimeRef.current = 0;
+  };
+
+  if (!api.auth.getCurrentUser()) return null;
 
   return (
     <div className="fixed bottom-6 right-6 z-[9999]">
@@ -302,74 +323,109 @@
         </button>
       )}
 
-      {/*
+      {/* Call Interface */}
       {isOpen && (
-        <div className="bg-
+        <div className="bg-slate-900 w-80 md:w-96 rounded-3xl shadow-2xl border border-slate-700 overflow-hidden flex flex-col animate-in slide-in-from-bottom-5 fade-in duration-300 h-[500px]">
           {/* Header */}
-          <div className="bg-
+          <div className="bg-slate-800/50 p-4 flex justify-between items-center text-white shrink-0 backdrop-blur-md">
             <div className="flex items-center gap-2">
-              <
-              <span className="font-bold">AI
+              <div className={`w-2 h-2 rounded-full ${status === 'DISCONNECTED' ? 'bg-red-500' : 'bg-green-500 animate-pulse'}`}></div>
+              <span className="font-bold text-sm">AI 实时通话</span>
             </div>
-            <div className="flex
-              <button onClick={
-              <button onClick={() => setIsOpen(false)} title="最小化" className="hover:bg-white/
+            <div className="flex gap-2">
+              <button onClick={handleDisconnect} title="重置" className="hover:bg-white/10 p-1.5 rounded-full text-gray-400 hover:text-white transition-colors"><RefreshCw size={16}/></button>
+              <button onClick={() => setIsOpen(false)} title="最小化" className="hover:bg-white/10 p-1.5 rounded-full text-gray-400 hover:text-white transition-colors"><ChevronDown size={20}/></button>
             </div>
           </div>
 
-          {/*
-          <div className="flex-1
-            {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+          {/* Main Visual Area */}
+          <div className="flex-1 flex flex-col items-center justify-center p-6 relative">
+            {/* Visualizer Circle */}
+            <div className={`relative w-40 h-40 flex items-center justify-center transition-all duration-500 ${status === 'LISTENING' ? 'scale-110' : 'scale-100'}`}>
+              {/* Outer Glow */}
+              <div
+                className={`absolute inset-0 rounded-full blur-2xl transition-all duration-300 ${
+                  status === 'SPEAKING' ? 'bg-blue-500/40' :
+                  status === 'LISTENING' ? 'bg-green-500/40' :
+                  status === 'THINKING' ? 'bg-purple-500/40' : 'bg-gray-500/10'
+                }`}
+                style={{ opacity: 0.5 + (volumeLevel / 200) }}
+              ></div>
+
+              {/* Dynamic Rings */}
+              <div
+                className={`absolute inset-0 rounded-full border-2 border-white/10 transition-all duration-100`}
+                style={{ transform: `scale(${1 + volumeLevel/100})` }}
+              ></div>
+              <div
+                className={`absolute inset-0 rounded-full border border-white/20 transition-all duration-100 delay-75`}
+                style={{ transform: `scale(${1 + volumeLevel/150})` }}
+              ></div>
+
+              {/* Central Icon */}
+              <div className={`z-10 w-24 h-24 rounded-full flex items-center justify-center text-white shadow-xl transition-colors duration-500 ${
+                status === 'SPEAKING' ? 'bg-blue-600' :
+                status === 'LISTENING' ? 'bg-green-600' :
+                status === 'THINKING' ? 'bg-purple-600' :
+                status === 'CONNECTED' ? 'bg-slate-700' : 'bg-slate-800'
+              }`}>
+                {status === 'SPEAKING' ? <Volume2 size={40} className="animate-pulse"/> :
+                 status === 'LISTENING' ? <Mic size={40} className="animate-bounce"/> :
+                 status === 'THINKING' ? <Loader2 size={40} className="animate-spin"/> :
+                 status === 'CONNECTED' ? <Radio size={40}/> : <Power size={40}/>}
              </div>
-
+            </div>
+
+            {/* Status Text */}
+            <div className="mt-8 text-center px-4 w-full">
+              <p className={`text-sm font-bold uppercase tracking-wider mb-2 ${
+                status === 'SPEAKING' ? 'text-blue-400' :
+                status === 'LISTENING' ? 'text-green-400' :
+                status === 'THINKING' ? 'text-purple-400' : 'text-gray-500'
+              }`}>
+                {status === 'DISCONNECTED' ? '未连接' :
+                 status === 'CONNECTING' ? '连接中...' :
+                 status === 'CONNECTED' ? '准备就绪' :
+                 status === 'LISTENING' ? '正在聆听...' :
+                 status === 'THINKING' ? '思考中...' : '正在说话'}
+              </p>
+              <p className="text-white text-lg font-medium leading-relaxed min-h-[3rem] line-clamp-3 transition-all">
+                {transcript}
+              </p>
+            </div>
          </div>
 
         {/* Controls */}
-          <div className="p-
-          {
+        <div className="p-6 pb-8 bg-slate-800/50 backdrop-blur-md border-t border-slate-700 flex justify-center">
+          {status === 'DISCONNECTED' ? (
             <button
-              onClick={
-              disabled={
-              className="w-full py-
+              onClick={handleConnect}
+              disabled={!apiKey}
+              className="w-full py-4 bg-blue-600 hover:bg-blue-500 text-white rounded-2xl font-bold flex items-center justify-center gap-2 transition-all hover:scale-[1.02] active:scale-95 disabled:opacity-50 disabled:cursor-not-allowed"
             >
-
-              {isInitializing ? '正在连接...' : '开启语音会话'}
+              <Power size={20}/> 开启 AI 语音
             </button>
           ) : (
-            <div className="flex
-              <div className="
+            <div className="flex items-center gap-4 w-full justify-center">
+              <div className="relative group">
                 <button
-
-
+                  onMouseDown={startRecording}
+                  onMouseUp={stopRecording}
+                  onTouchStart={(e) => { e.preventDefault(); startRecording(); }}
+                  onTouchEnd={(e) => { e.preventDefault(); stopRecording(); }}
+                  className={`w-20 h-20 rounded-full flex items-center justify-center shadow-lg transition-all transform ${
+                    status === 'LISTENING' ? 'bg-green-500 scale-110 ring-4 ring-green-500/30' :
+                    'bg-white text-slate-900 hover:bg-gray-100'
+                  }`}
                 >
-
+                  <Mic size={32} fill={status==='LISTENING' ? 'white' : 'currentColor'} className={status==='LISTENING'?'text-white':''}/>
                 </button>
+                {status === 'CONNECTED' && (
+                  <div className="absolute -bottom-8 left-1/2 -translate-x-1/2 text-xs text-gray-400 whitespace-nowrap opacity-0 group-hover:opacity-100 transition-opacity">
+                    按住说话
+                  </div>
+                )}
              </div>
-            <p className="text-center text-xs text-gray-400 font-medium">
-              {isMicOn ? '正在聆听... 点击停止发送' : '点击麦克风开始说话'}
-            </p>
          </div>
        )}
      </div>
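
Note on the audio format: the new code inlines a Float32 -> Int16 -> base64 encode inside processor.onaudioprocess and the matching base64 -> Int16 -> Float32 decode inside handleServerMessage. A minimal standalone sketch of that round trip, with illustrative helper names that are not part of the diff:

// Hypothetical helpers mirroring the inline logic above.
function encodePcm16Base64(samples: Float32Array): string {
  const int16 = new Int16Array(samples.length);
  for (let i = 0; i < samples.length; i++) {
    // The diff scales by 32768 directly; clamping first avoids the
    // wraparound of +1.0 to -32768 that Int16Array conversion produces.
    const s = Math.max(-1, Math.min(1, samples[i]));
    int16[i] = s < 0 ? s * 32768 : s * 32767;
  }
  let binary = '';
  const bytes = new Uint8Array(int16.buffer);
  for (let i = 0; i < bytes.byteLength; i++) {
    binary += String.fromCharCode(bytes[i]);
  }
  return btoa(binary);
}

function decodePcm16Base64(base64: string): Float32Array {
  // Assumes an even byte length, which 16-bit PCM guarantees.
  const binary = atob(base64);
  const bytes = new Uint8Array(binary.length);
  for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
  const int16 = new Int16Array(bytes.buffer);
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) float32[i] = int16[i] / 32768.0;
  return float32;
}

// Round trip: samples survive within 1/32768 quantization error.
const out = decodePcm16Base64(encodePcm16Base64(new Float32Array([0, 0.5, -0.5])));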
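Note on playback scheduling: gapless output in handleServerMessage comes from a single monotonically advancing pointer (nextPlayTimeRef) rather than an explicit queue. A compact sketch of the same pattern, assuming a standalone AudioContext and an illustrative function name:

function scheduleChunk(
  ctx: AudioContext,
  buffer: AudioBuffer,
  state: { nextPlayTime: number }, // mirrors nextPlayTimeRef.current
  onIdle: () => void,
) {
  const source = ctx.createBufferSource();
  source.buffer = buffer;
  source.connect(ctx.destination);

  // Start now, or exactly where the previous chunk ends.
  const startTime = Math.max(ctx.currentTime, state.nextPlayTime);
  source.start(startTime);
  state.nextPlayTime = startTime + buffer.duration;

  source.onended = () => {
    // Mirrors the diff's 0.1 s heuristic: if nothing was queued behind
    // this chunk, treat the model turn as finished.
    if (ctx.currentTime >= state.nextPlayTime - 0.1) onIdle();
  };
}

Because every chunk is scheduled against the shared pointer, back-to-back network chunks play seamlessly even when they arrive faster than real time.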