SentimentDetectiontest / frontend /src /components /VoiceSessionModal.jsx
3v324v23's picture
Migrate chatbot to Socratic sentiment tutor with Gemini Live Voice Session capability
c622774
Raw
History Blame Contribute Delete
8.46 kB
import React, { useEffect, useState, useRef } from 'react';
import { Mic, X, MicOff, AlertCircle } from 'lucide-react';
export default function VoiceSessionModal({ isOpen, onClose, apiKey }) {
const [status, setStatus] = useState('connecting'); // 'connecting', 'listening', 'speaking', 'error'
const [errorMessage, setErrorMessage] = useState('');
const [isMuted, setIsMuted] = useState(false);
const wsRef = useRef(null);
const audioContextRef = useRef(null);
const playbackContextRef = useRef(null);
const streamRef = useRef(null);
const processorRef = useRef(null);
const nextPlayTimeRef = useRef(0);
const textTranscriptRef = useRef('');
const [transcript, setTranscript] = useState('');
// Helper: Convert ArrayBuffer to Base64
const base64ArrayBuffer = (arrayBuffer) => {
let binary = '';
const bytes = new Uint8Array(arrayBuffer);
const len = bytes.byteLength;
for (let i = 0; i < len; i++) {
binary += String.fromCharCode(bytes[i]);
}
return window.btoa(binary);
};
useEffect(() => {
if (!isOpen) return;
setStatus('connecting');
setTranscript('');
textTranscriptRef.current = '';
// Determine WebSocket URL
const protocol = window.location.protocol === 'https:' ? 'wss://' : 'ws://';
const host = window.location.host === 'localhost:5173' ? 'localhost:8000' : window.location.host;
const wsUrl = `${protocol}${host}/api/live-ws?api_key=${apiKey || ''}`;
// Establish WebSocket Connection
const ws = new WebSocket(wsUrl);
wsRef.current = ws;
ws.onopen = async () => {
try {
// Request Microphone access
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
streamRef.current = stream;
// Initialize Audio contexts
audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 });
playbackContextRef.current = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 24000 });
nextPlayTimeRef.current = playbackContextRef.current.currentTime;
// Capture Mic Input
const source = audioContextRef.current.createMediaStreamSource(stream);
const processor = audioContextRef.current.createScriptProcessor(2048, 1, 1);
processorRef.current = processor;
processor.onaudioprocess = (e) => {
if (isMuted) return;
const inputData = e.inputBuffer.getChannelData(0);
// Convert Float32 to Int16 PCM
const pcmData = new Int16Array(inputData.length);
for (let i = 0; i < inputData.length; i++) {
pcmData[i] = Math.max(-1, Math.min(1, inputData[i])) * 0x7FFF;
}
// Send chunk to server
if (ws.readyState === WebSocket.OPEN) {
const base64Audio = base64ArrayBuffer(pcmData.buffer);
ws.send(JSON.stringify({ type: 'audio', data: base64Audio }));
}
};
source.connect(processor);
processor.connect(audioContextRef.current.destination);
setStatus('listening');
} catch (err) {
console.error("Microphone access failed:", err);
setStatus('error');
setErrorMessage("Microphone access is required for voice session. Please allow mic permissions.");
}
};
ws.onmessage = async (event) => {
const message = JSON.parse(event.data);
if (message.type === 'audio') {
setStatus('speaking');
// Decode base64 24kHz PCM back to Float32 for Web Audio playback
const binary = window.atob(message.data);
const bytes = new Uint8Array(binary.length);
for (let i = 0; i < binary.length; i++) {
bytes[i] = binary.charCodeAt(i);
}
const int16Data = new Int16Array(bytes.buffer);
const float32Data = new Float32Array(int16Data.length);
for (let i = 0; i < int16Data.length; i++) {
float32Data[i] = int16Data[i] / 0x7FFF;
}
const pContext = playbackContextRef.current;
if (pContext && pContext.state !== 'suspended') {
const audioBuffer = pContext.createBuffer(1, float32Data.length, 24000);
audioBuffer.getChannelData(0).set(float32Data);
const bufferSource = pContext.createBufferSource();
bufferSource.buffer = audioBuffer;
bufferSource.connect(pContext.destination);
// Gapless scheduling
const startTime = Math.max(pContext.currentTime, nextPlayTimeRef.current);
bufferSource.start(startTime);
nextPlayTimeRef.current = startTime + audioBuffer.duration;
}
} else if (message.type === 'text') {
// Handle incoming Socratic tutor speech transcription
textTranscriptRef.current += message.data;
setTranscript(textTranscriptRef.current);
} else if (message.type === 'turn_complete') {
setStatus('listening');
textTranscriptRef.current = '';
}
};
ws.onerror = (err) => {
console.error("WebSocket error:", err);
setStatus('error');
setErrorMessage("Lost connection to Gemini Live server.");
};
ws.onclose = () => {
setStatus('connecting');
};
return () => {
// Clean up connections and audio context on unmount
if (wsRef.current) wsRef.current.close();
if (processorRef.current) processorRef.current.disconnect();
if (streamRef.current) {
streamRef.current.getTracks().forEach(track => track.stop());
}
if (audioContextRef.current) audioContextRef.current.close();
if (playbackContextRef.current) playbackContextRef.current.close();
};
}, [isOpen, isMuted]);
if (!isOpen) return null;
return (
<div className="voice-modal-backdrop">
<div className="voice-modal-content">
<button className="voice-modal-close" onClick={onClose}>
<X size={20} />
</button>
<div className="voice-modal-header">
<h2>Socratic Voice Space</h2>
<p>Real-Time Bidirectional Dialogue</p>
</div>
{/* Pulse Animations and Mic States */}
<div className="voice-visualizer-container">
{status === 'connecting' && (
<div className="voice-status-indicator connecting">
<div className="pulse-circle" />
<span>Connecting to Gemini Live...</span>
</div>
)}
{status === 'listening' && (
<div className="voice-status-indicator listening">
<div className="pulse-circle active" />
<div className="pulse-ring ring-1" />
<div className="pulse-ring ring-2" />
<span style={{ color: 'var(--color-happy)' }}>Listening to you... Go ahead and speak!</span>
</div>
)}
{status === 'speaking' && (
<div className="voice-status-indicator speaking">
<div className="pulse-circle active speaking-pulse" />
<div className="pulse-ring ring-1 speaking-ring" />
<div className="pulse-ring ring-2 speaking-ring" />
<span style={{ color: 'var(--secondary)' }}>Socratic Tutor is speaking...</span>
</div>
)}
{status === 'error' && (
<div className="voice-status-indicator error" style={{ gap: '0.8rem' }}>
<AlertCircle size={40} color="var(--color-frustrated)" />
<p style={{ color: 'var(--color-frustrated)', fontSize: '0.9rem', textAlign: 'center', maxWidth: '300px' }}>
{errorMessage}
</p>
</div>
)}
</div>
{/* Transcription Display */}
{status === 'speaking' && transcript && (
<div className="voice-transcript-box">
<p>"{transcript}"</p>
</div>
)}
{/* Control Buttons */}
<div className="voice-modal-controls">
<button
className={`voice-control-btn ${isMuted ? 'muted' : ''}`}
onClick={() => setIsMuted(!isMuted)}
disabled={status === 'error' || status === 'connecting'}
title={isMuted ? "Unmute microphone" : "Mute microphone"}
>
{isMuted ? <MicOff size={22} /> : <Mic size={22} />}
<span>{isMuted ? "Muted" : "Active"}</span>
</button>
</div>
</div>
</div>
);
}