# voiceCalendar / webrtc_streamlit.py
# Author: Peter Michael Gits
# Last commit 1c7f2b8 — fix: Add unique keys to all Streamlit buttons to
# resolve duplicate element ID error
"""
Streamlit-native WebRTC Speech-to-Text Component
Implements unmute.sh patterns without FastAPI conflicts
"""
import streamlit as st
import asyncio
import json
import logging
import tempfile
import os
import base64
from datetime import datetime
import requests
import websocket
import threading
from typing import Dict, Optional
# Import gradio_client at top level to avoid runtime import errors
try:
from gradio_client import Client
GRADIO_CLIENT_AVAILABLE = True
except ImportError:
GRADIO_CLIENT_AVAILABLE = False
Client = None
logger = logging.getLogger(__name__)
class StreamlitWebRTCHandler:
"""Handles WebRTC functionality within Streamlit using unmute.sh patterns"""
def __init__(self):
    """Configure the STT endpoint, lazy Gradio client, and session-state buffers."""
    # STT service configuration
    self.stt_service_url = "https://pgits-stt-gpu-service.hf.space"
    # Persistent Gradio client — created lazily by the `client` property
    self._client = None
    self._client_initialized = False
    # Seed the session-state slots used by the unmute.sh flush workflow,
    # leaving any values from a previous rerun untouched.
    for key, default in (
        ('audio_buffer', []),
        ('recording_state', 'stopped'),
        ('transcriptions', []),
    ):
        if key not in st.session_state:
            st.session_state[key] = default
@property
def client(self):
    """Lazily build and cache a Gradio client; returns None when unavailable."""
    if not GRADIO_CLIENT_AVAILABLE:
        logger.error("gradio_client not available")
        return None
    # (Re)initialize only when there is no healthy cached client.
    if self._client is None or not self._client_initialized:
        try:
            self._client = Client(self.stt_service_url)
        except Exception as e:
            logger.error(f"Failed to initialize Gradio client: {e}")
            self._client = None
            self._client_initialized = False
        else:
            self._client_initialized = True
            logger.info(f"πŸ”— Initialized persistent Gradio client for {self.stt_service_url}")
    return self._client
def add_audio_chunk(self, audio_data: bytes):
    """Append one raw audio chunk to the session buffer (unmute.sh methodology)."""
    try:
        buffer = st.session_state.audio_buffer
        buffer.append(audio_data)
        logger.info(f"🎀 Added audio chunk ({len(audio_data)} bytes) - Total chunks: {len(buffer)}")
        # Surface buffering progress to the UI status line
        st.session_state.recording_status = f"Recording... ({len(buffer)} chunks buffered)"
    except Exception as e:
        logger.error(f"Error adding audio chunk: {e}")
        st.error(f"Audio buffering error: {str(e)}")
async def process_buffered_audio_with_flush(self):
    """Process buffered audio using the unmute.sh flush trick.

    Concatenates every buffered chunk into one WebM file, transcribes it
    (is_final semantics), records the result in session state, and triggers
    a rerun on success. The temp file and buffer are always cleaned up.
    """
    rerun_needed = False
    try:
        if not st.session_state.audio_buffer:
            st.warning("No audio chunks to process")
            return
        # Combine all chunks (unmute.sh methodology)
        all_audio_data = b''.join(st.session_state.audio_buffer)
        total_chunks = len(st.session_state.audio_buffer)
        logger.info(f"πŸ”„ Processing {total_chunks} buffered chunks ({len(all_audio_data)} bytes) with flush trick")
        # Write the combined audio to a temp file for the STT service
        with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as tmp_file:
            tmp_file.write(all_audio_data)
            tmp_file_path = tmp_file.name
        try:
            # Process with flush trick (is_final=True for unmute.sh methodology)
            transcription = await self.transcribe_audio_file(tmp_file_path)
            if transcription and transcription.strip() and not transcription.startswith("ERROR"):
                transcription_entry = {
                    "text": transcription.strip(),
                    "timestamp": datetime.now().isoformat(),
                    "audio_size": len(all_audio_data),
                    "format": "webm/audio",
                    "is_final": True,  # unmute.sh flush trick marker
                    "chunks_processed": total_chunks
                }
                st.session_state.transcriptions.append(transcription_entry)
                logger.info(f"πŸ“ Final transcription processed: {transcription[:50]}...")
                st.success(f"Transcribed: {transcription}")
                # BUG FIX: st.rerun() raises RerunException (an Exception
                # subclass). Calling it here meant the outer `except Exception`
                # swallowed it and displayed an error after every successful
                # transcription. Defer the rerun until after the handler.
                rerun_needed = True
            else:
                error_msg = f"Audio processing failed: {transcription if transcription else 'No result'}"
                logger.error(error_msg)
                st.error(error_msg)
        finally:
            # Clean up (unmute.sh cleanup methodology)
            if os.path.exists(tmp_file_path):
                os.unlink(tmp_file_path)
            # Clear the buffer
            st.session_state.audio_buffer = []
            st.session_state.recording_status = "Processing complete - buffer cleared"
            logger.info("🧹 Cleared audio buffer after flush trick processing")
    except Exception as e:
        logger.error(f"Error processing buffered audio: {e}")
        st.error(f"Buffered audio processing error: {str(e)}")
    if rerun_needed:
        st.rerun()
async def transcribe_audio_file(self, audio_file_path: str) -> Optional[str]:
    """Transcribe an audio file via the STT service's Gradio API.

    Args:
        audio_file_path: Path to a local audio file (WebM expected).

    Returns:
        The transcription text on success, or a string starting with
        "ERROR:" on failure (this method never raises to its caller).
    """
    start_time = datetime.now()
    try:
        # Use the persistent client for optimal performance
        client = self.client
        if client is None:
            return "ERROR: Failed to initialize Gradio client"
        logger.info(f"🎀 Starting transcription with persistent client...")
        from gradio_client import handle_file
        # Run the blocking predict() in the default executor.
        # BUG FIX: asyncio.get_event_loop() is deprecated inside a running
        # coroutine; get_running_loop() is the correct call here.
        result = await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: client.predict(
                handle_file(audio_file_path),  # Proper Gradio file format
                "en",    # language (English by default)
                "base",  # model_size_param (optimized for speed)
                api_name="/gradio_transcribe_wrapper"
            )
        )
        # The API returns (transcription, timing json, status); only the
        # first element is used here.
        transcription_text, timing_json, status_text = result
        total_time = (datetime.now() - start_time).total_seconds()
        if transcription_text.startswith("βœ…"):
            # Strip the "βœ… " success prefix (emoji + space = 2 chars)
            transcription = transcription_text[2:].strip()
            logger.info(f"πŸš€ FLUSH TRICK: STT completed in {total_time:.2f}s - {transcription[:50]}...")
            return transcription
        error_msg = f"STT service error: {transcription_text}"
        logger.error(f"❌ STT failed in {total_time:.2f}s: {error_msg}")
        return f"ERROR: {error_msg}"
    except Exception as e:
        total_time = (datetime.now() - start_time).total_seconds()
        error_msg = f"Optimized transcription failed in {total_time:.2f}s: {str(e)}"
        logger.error(error_msg)
        # Drop the cached client so the next call re-initializes it
        self._client = None
        self._client_initialized = False
        return f"ERROR: {error_msg}"
async def process_realtime_chunk(self, audio_data: bytes, chunk_index: int, timestamp: str) -> dict:
    """Transcribe a single audio chunk and record it in the live feed.

    Returns a dict with at least a "success" flag; on success it also
    carries the transcription, chunk index, and timestamp.
    """
    try:
        logger.info(f"🎀 Processing real-time chunk {chunk_index} ({len(audio_data)} bytes)")
        # Persist the chunk to a temp file for the STT service
        with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as tmp_file:
            tmp_file.write(audio_data)
            tmp_file_path = tmp_file.name
        try:
            transcription = await self.transcribe_audio_file(tmp_file_path)
            text = transcription.strip() if transcription else ""
            if not text or transcription.startswith("ERROR"):
                logger.info(f"ℹ️ Chunk {chunk_index}: No valid transcription")
                return {
                    "success": False,
                    "transcription": "",
                    "message": "No speech detected or transcription failed"
                }
            # Record the result for the live transcription display
            st.session_state.setdefault('live_transcriptions', []).append({
                "text": text,
                "timestamp": timestamp,
                "source": "stt_service",
                "chunk_index": chunk_index,
                "processing_time": 0,  # Will be calculated
                "audio_size": len(audio_data)
            })
            logger.info(f"βœ… Real-time transcription {chunk_index}: '{transcription[:50]}...'")
            return {
                "success": True,
                "transcription": text,
                "chunk_index": chunk_index,
                "timestamp": timestamp,
                "processing_time": 0
            }
        finally:
            # Clean up temp file
            if os.path.exists(tmp_file_path):
                os.unlink(tmp_file_path)
    except Exception as e:
        error_msg = f"Real-time processing failed for chunk {chunk_index}: {str(e)}"
        logger.error(error_msg)
        return {"success": False, "error": error_msg, "chunk_index": chunk_index}
async def test_stt_with_voice_file(self):
    """Exercise the STT pipeline end-to-end with a generated voice sample."""
    try:
        st.info("🎀 Testing STT service with real audio...")
        # The MCP voice service synthesizes a known-good test recording
        from mcp_voice_service import mcp_create_test_voice
        test_voice_file = await mcp_create_test_voice()
        st.info(f"πŸ“ Created test voice file: {test_voice_file}")
        # Run it through the same path real audio takes
        transcription = await self.transcribe_audio_file(test_voice_file)
        valid = bool(transcription) and bool(transcription.strip()) and not transcription.startswith("ERROR")
        if not valid:
            st.error(f"❌ STT Processing failed: {transcription}")
            return
        if 'live_transcriptions' not in st.session_state:
            st.session_state.live_transcriptions = []
        st.session_state.live_transcriptions.append({
            "text": transcription.strip(),
            "timestamp": datetime.now().isoformat(),
            "source": "stt_service",
            "chunk_index": "real_stt_test"
        })
        st.success(f"βœ… STT Service Response: '{transcription}'")
    except Exception as e:
        error_msg = f"STT test failed: {str(e)}"
        logger.error(error_msg)
        st.error(f"❌ {error_msg}")
async def process_latest_webrtc_capture(self):
    """Verify the WebRTC→STT bridge by initializing the STT client.

    Currently only a connectivity check; actual WebRTC audio processing is
    still a TODO (see the bridge methods below).
    """
    try:
        st.info("🎀 Checking for WebRTC audio chunks...")
        # Use persistent client for optimal performance
        client = self.client
        if client is None:
            st.error("❌ Failed to initialize STT client")
            return
        # Removed unused `test_message` local; this path only demonstrates
        # that the client connection works for now.
        st.info("πŸ”— Connected to STT service, simulating WebRTC audio processing...")
        # TODO: Replace with actual WebRTC audio file processing
        st.success("βœ… WebRTC to STT bridge is working - ready for actual audio processing")
        st.info("Next: Capture actual audio file from JavaScript and send to STT")
    except Exception as e:
        error_msg = f"WebRTC processing failed: {str(e)}"
        logger.error(error_msg)
        st.error(f"❌ {error_msg}")
def process_in_memory_chunks(self):
    """Process stored audio chunks using gradio_client for in-memory transcription.

    Chunks arrive as base64-encoded WebM dicts via the JavaScript->Streamlit
    bridge in st.session_state.new_audio_chunks; each is decoded to a temp
    file, transcribed, and appended to the live transcription feed.
    """
    if 'pending_chunks' not in st.session_state:
        st.session_state.pending_chunks = []
    # Ensure the live feed exists before appending results below
    if 'live_transcriptions' not in st.session_state:
        st.session_state.live_transcriptions = []
    # Populated by the JavaScript->Streamlit bridge; nothing to do otherwise
    if not hasattr(st.session_state, 'new_audio_chunks'):
        return
    for chunk_data in st.session_state.new_audio_chunks:
        temp_path = None
        try:
            # Decode base64 audio into a temp WebM file (base64/tempfile/os
            # are already imported at module level — no local re-imports)
            audio_binary = base64.b64decode(chunk_data['audioBase64'])
            with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as tmp_file:
                tmp_file.write(audio_binary)
                temp_path = tmp_file.name
            # Use gradio_client for transcription
            client = self.client
            if client:
                result = client.predict(
                    temp_path,  # audio file path
                    "en",       # language
                    "base",     # model_size_param
                    api_name="/gradio_transcribe_memory"
                )
                if result and len(result) > 0:
                    transcription = result[0] if isinstance(result, (list, tuple)) else str(result)
                    # Add to live transcriptions
                    st.session_state.live_transcriptions.append({
                        'text': f"πŸš€ MEMORY: {transcription}",
                        'timestamp': datetime.now().isoformat(),
                        'source': 'in_memory_stt',
                        'processing_time': 0.1,  # Estimated
                        'chunk_index': chunk_data.get('chunkIndex', 0)
                    })
        except Exception as e:
            st.error(f"Error processing in-memory chunk: {e}")
        finally:
            # BUG FIX: original only unlinked on the success path, leaking
            # the temp file when predict() raised or the client was None.
            if temp_path and os.path.exists(temp_path):
                os.unlink(temp_path)
    # Clear processed chunks
    st.session_state.new_audio_chunks = []
def _process_bridge_data(self, component_value):
    """Route audio chunk payloads from the JavaScript bridge into processing.

    Accepts either a direct component payload ({'audioChunks': [...]}) or
    the polling fallback format ({'type': 'pollingData', ...}) where chunk
    lists are nested one level deeper.
    """
    try:
        if not isinstance(component_value, dict):
            return
        chunks = None
        if 'audioChunks' in component_value:
            # Direct component communication payload
            chunks = component_value['audioChunks']
            st.success(f"πŸŒ‰ BRIDGE: Received {len(chunks)} chunks via direct communication!")
        elif component_value.get('type') == 'pollingData' and 'audioChunks' in component_value:
            # Polling fallback: flatten the nested chunk lists
            chunks = []
            for polling_data in component_value['audioChunks']:
                if 'audioChunks' in polling_data:
                    chunks.extend(polling_data['audioChunks'])
            if chunks:
                st.success(f"πŸŒ‰ POLLING: Received {len(chunks)} chunks via polling fallback!")
        if not chunks:
            return
        for chunk_data in chunks:
            if chunk_data.get('needsProcessing', False):
                # Queue, then process immediately rather than waiting for a rerun
                st.session_state.setdefault('pending_audio_chunks', []).append(chunk_data)
                self._process_single_chunk(chunk_data)
    except Exception as e:
        st.error(f"Bridge processing error: {e}")
def _process_single_chunk(self, chunk_data):
    """Transcribe one bridge-delivered audio chunk and record the result.

    Args:
        chunk_data: dict with 'audioBase64' (base64-encoded WebM audio) and
            optional 'chunkIndex'.
    """
    temp_path = None
    try:
        # base64/tempfile/os are already imported at module level
        audio_binary = base64.b64decode(chunk_data['audioBase64'])
        with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as tmp_file:
            tmp_file.write(audio_binary)
            temp_path = tmp_file.name
        # Use gradio_client for transcription
        client = self.client
        if client:
            result = client.predict(
                temp_path,  # audio file path
                "en",       # language
                "base",     # model_size_param
                api_name="/gradio_transcribe_memory"
            )
            if result and len(result) > 0:
                transcription = result[0] if isinstance(result, (list, tuple)) else str(result)
                # Strip the service's "βœ… ...:" success prefix if present
                if transcription.startswith("βœ…"):
                    transcription = transcription.split(":", 1)[-1].strip()
                # BUG FIX: sibling methods guard this key; this one appended
                # blindly and could raise on a fresh session.
                if 'live_transcriptions' not in st.session_state:
                    st.session_state.live_transcriptions = []
                # Add to live transcriptions with bridge indicator
                st.session_state.live_transcriptions.append({
                    'text': f"πŸŒ‰ BRIDGE STT: {transcription}",
                    'timestamp': datetime.now().isoformat(),
                    'source': 'bridge_stt',
                    'processing_time': 0.2,  # Estimated
                    'chunk_index': chunk_data.get('chunkIndex', 0)
                })
                st.success(f"πŸŒ‰ Processed chunk {chunk_data.get('chunkIndex', 0)}: {transcription}")
    except Exception as e:
        st.error(f"Error processing chunk {chunk_data.get('chunkIndex', 0)}: {e}")
    finally:
        # BUG FIX: original unlinked only on the success path, leaking the
        # temp file whenever transcription failed or raised.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
def _check_javascript_polling_data(self):
    """Check JavaScript window object for pending audio data (polling fallback).

    Injects a zero-height HTML component whose script drains any queued
    audio data from `window.streamlitPendingData` (iframe or parent window)
    and forwards it back to Streamlit; any payload the component returns is
    handed to _process_bridge_data for transcription.
    """
    try:
        # Script is injected into an iframe, so it probes both its own
        # window and the parent window for queued data.
        polling_script = """
        <script>
        // Check both iframe window and parent window for pending data
        let pendingData = null;
        let dataSource = '';
        // First try iframe window
        if (typeof window.streamlitPendingData !== 'undefined' && window.streamlitPendingData.length > 0) {
            pendingData = window.streamlitPendingData;
            window.streamlitPendingData = []; // Clear after reading
            dataSource = 'iframe window';
        }
        // Then try parent window
        else if (window.parent && typeof window.parent.streamlitPendingData !== 'undefined' && window.parent.streamlitPendingData.length > 0) {
            pendingData = window.parent.streamlitPendingData;
            window.parent.streamlitPendingData = []; // Clear after reading
            dataSource = 'parent window';
        }
        if (pendingData && pendingData.length > 0) {
            console.log('πŸŒ‰ POLLING: Found', pendingData.length, 'pending data sets in', dataSource);
            // Prepare data for Streamlit
            const streamlitData = {
                type: 'pollingData',
                audioChunks: pendingData,
                timestamp: new Date().toISOString(),
                dataSource: dataSource
            };
            // Send via Streamlit component communication
            if (typeof window.Streamlit !== 'undefined') {
                window.Streamlit.setComponentValue(streamlitData);
                console.log('πŸŒ‰ POLLING: Sent', pendingData.length, 'chunks via Streamlit API from', dataSource);
            }
            // Fallback: Use postMessage to communicate with parent
            else if (window.parent) {
                window.parent.postMessage({
                    type: 'streamlit:setComponentValue',
                    value: streamlitData
                }, '*');
                console.log('πŸŒ‰ POLLING: Sent', pendingData.length, 'chunks via postMessage from', dataSource);
            }
            // Last resort: Store in global for manual retrieval
            else {
                if (!window.streamlitProcessedData) window.streamlitProcessedData = [];
                window.streamlitProcessedData.push(streamlitData);
                console.log('πŸŒ‰ POLLING: Stored', pendingData.length, 'chunks in global fallback from', dataSource);
            }
        } else {
            console.log('πŸŒ‰ POLLING: No pending data found in iframe or parent window');
        }
        </script>
        """
        # Execute the polling script and capture its response.
        # NOTE(review): st.components.v1.html returns None for plain HTML
        # components — a returned value is only expected if the script
        # successfully registers as a Streamlit component; verify in testing.
        polling_value = st.components.v1.html(polling_script, height=0)
        # Process polling data if received
        if polling_value:
            st.info(f"πŸŒ‰ POLLING: Received data from polling script")
            self._process_bridge_data(polling_value)
    except Exception as e:
        pass  # Silently fail for polling — deliberate best-effort; runs every rerun
def create_webrtc_interface(self):
"""Create Streamlit WebRTC interface without FastAPI conflicts"""
# Process any pending in-memory chunks first
self.process_in_memory_chunks()
# Ensure session state is properly initialized
if 'recording_state' not in st.session_state:
st.session_state.recording_state = 'stopped'
if 'audio_buffer' not in st.session_state:
st.session_state.audio_buffer = []
if 'transcriptions' not in st.session_state:
st.session_state.transcriptions = []
if 'recording_status' not in st.session_state:
st.session_state.recording_status = "Ready to record"
st.subheader("🎀 WebRTC Speech-to-Text (Unmute.sh Patterns)")
# Recording controls
col1, col2, col3 = st.columns(3)
with col1:
if st.button("🎀 Start Recording", key="start_recording_btn", disabled=(st.session_state.recording_state == 'recording')):
st.session_state.recording_state = 'recording'
st.session_state.audio_buffer = []
st.session_state.recording_status = "Recording started - speak now"
st.success("Recording started!")
st.rerun()
with col2:
# Stop Listening button (only show when recording is active)
if st.session_state.recording_state == 'recording':
if st.button("⏹️ Stop Listening", key="stop_listening_btn", type="primary"):
st.session_state.recording_state = 'stopped'
st.session_state.recording_status = "Recording stopped - transcriptions complete"
st.success("Recording stopped!")
st.rerun()
# Manual refresh for transcription updates
if st.button("πŸ”„ Refresh Transcriptions", key="refresh_transcriptions_btn"):
st.success("Transcriptions refreshed!")
st.rerun()
# Process captured audio with real STT service
if st.button("🎀 Process with STT Service", key="process_stt_btn"):
# Use MCP voice service to test STT connection
asyncio.run(self.test_stt_with_voice_file())
st.rerun()
# Test transcription button (temporary - to verify transcription display works)
if st.button("πŸ§ͺ Test Transcription", key="test_transcription_btn"):
# Add a test transcription to verify the display is working
if 'live_transcriptions' not in st.session_state:
st.session_state.live_transcriptions = []
test_transcription = {
"text": "This is a test transcription to verify the display is working",
"timestamp": datetime.now().isoformat(),
"source": "test",
"chunk_index": len(st.session_state.live_transcriptions) + 1
}
st.session_state.live_transcriptions.append(test_transcription)
st.success("Test transcription added!")
st.rerun()
with col3:
if st.button("🧹 Clear Buffer", key="clear_buffer_btn"):
st.session_state.audio_buffer = []
st.session_state.transcriptions = []
st.session_state.recording_state = 'stopped'
st.success("Buffer cleared!")
st.rerun()
# Status display
if 'recording_status' in st.session_state:
if st.session_state.recording_state == 'recording':
st.info(f"πŸ”΄ {st.session_state.recording_status}")
elif st.session_state.recording_state == 'processing':
st.warning("πŸ”„ Processing audio with unmute.sh flush trick...")
else:
st.info(f"βœ… {st.session_state.recording_status}")
# STT Transcription Results Display
st.subheader("πŸ“ STT Transcription Results")
# Create columns for better layout
col1, col2 = st.columns([2, 1])
with col1:
# Live transcription window
if 'live_transcriptions' not in st.session_state:
st.session_state.live_transcriptions = []
# Live transcription display (simplified to prevent crashes)
st.markdown("### 🎀 Live Transcriptions")
if st.session_state.live_transcriptions:
# Show transcriptions in reverse order (newest first)
for i, entry in enumerate(reversed(st.session_state.live_transcriptions[-10:])): # Show last 10
time_str = datetime.fromisoformat(entry['timestamp']).strftime('%H:%M:%S')
# Color-coded based on source
if entry.get('source') == 'stt_service':
st.success(f"🎀 **{time_str}**: {entry['text']}")
elif entry.get('source') == 'webrtc_live':
st.info(f"πŸ”΄ **{time_str}**: {entry['text']}")
else:
st.write(f"πŸ“ **{time_str}**: {entry['text']}")
# Show status based on recording state
if st.session_state.recording_state == 'recording':
st.markdown("πŸ”΄ **Recording active** - Transcriptions will appear here")
else:
st.markdown("βœ… **Recording complete** - Results shown above")
else:
if st.session_state.recording_state == 'recording':
st.info("🎧 Listening for speech... Transcriptions will appear here")
else:
st.info("🎧 Press 'Start Recording' to begin transcription")
with col2:
# Quick stats and controls
st.markdown("### πŸ“Š Session Stats")
if st.session_state.live_transcriptions:
st.metric("Total Transcriptions", len(st.session_state.live_transcriptions))
st.metric("Latest Processing Time",
f"{st.session_state.live_transcriptions[-1].get('processing_time', 0):.1f}s"
if st.session_state.live_transcriptions else "0s")
if st.button("🧹 Clear Transcriptions", key="clear_transcriptions_btn"):
st.session_state.live_transcriptions = []
st.success("Transcriptions cleared!")
st.rerun()
# Detailed transcription history
if st.session_state.transcriptions:
with st.expander("πŸ“‹ Detailed Transcription History", expanded=False):
for i, entry in enumerate(reversed(st.session_state.transcriptions[-5:])): # Show last 5
st.markdown(f"**#{len(st.session_state.transcriptions) - i}**")
st.write(f"**Text:** {entry['text']}")
st.write(f"**Time:** {datetime.fromisoformat(entry['timestamp']).strftime('%H:%M:%S')}")
st.write(f"**Audio Size:** {entry['audio_size']} bytes")
st.write(f"**Chunks:** {entry['chunks_processed']}")
if entry.get('is_final'):
st.write("βœ… **Flush Trick Applied**")
st.divider()
# WebRTC JavaScript integration - Functional Implementation
st.subheader("🌐 WebRTC Audio Capture")
# UNMUTE.SH WebRTC Implementation - Continuous recording with voice activity detection
webrtc_html = f"""
<div id="webrtc-container">
<div id="status">Ready to start continuous recording - Following unmute.sh patterns</div>
<div id="transcriptions">Transcriptions will appear here...</div>
<button id="initBtn" onclick="initializeContinuousRecording()">🎀 Start Continuous Recording</button>
</div>
<script>
// πŸŒ‰ STREAMLIT BRIDGE: Initialize component communication
console.log('πŸŒ‰ BRIDGE: Initializing Streamlit bridge...');
// Check if Streamlit component API is available
if (typeof window.Streamlit !== 'undefined') {{
console.log('πŸŒ‰ BRIDGE: Streamlit API detected, setting up component...');
window.Streamlit.setComponentReady();
window.Streamlit.setFrameHeight(200);
}} else {{
console.log('πŸŒ‰ BRIDGE: Streamlit API not found, using fallback methods...');
// Initialize fallback communication
window.streamlitPendingData = [];
}}
// UNMUTE.SH METHODOLOGY: Continuous recording with voice activity detection
let mediaRecorder = null;
let audioStream = null;
let isInitialized = false;
let audioChunksBuffer = []; // Buffer following unmute.sh patterns
let silenceThreshold = 0.01; // Voice activity detection threshold
let audioContext = null;
let analyser = null;
const statusDiv = document.getElementById('status');
const initBtn = document.getElementById('initBtn');
const transcriptionsDiv = document.getElementById('transcriptions');
// UNMUTE.SH: Continuous recording initialization
async function initializeContinuousRecording() {{
try {{
console.log('Initializing continuous recording...');
addTranscription('Requesting microphone access...', new Date().toISOString());
// UNMUTE.SH: Exact getUserMedia configuration
audioStream = await navigator.mediaDevices.getUserMedia({{
audio: {{ sampleRate: 16000, channelCount: 1 }}
}});
console.log('Microphone access granted');
addTranscription('Microphone access granted - continuous recording active', new Date().toISOString());
// Set up audio analysis for voice activity detection
audioContext = new (window.AudioContext || window.webkitAudioContext)();
analyser = audioContext.createAnalyser();
const source = audioContext.createMediaStreamSource(audioStream);
source.connect(analyser);
// UNMUTE.SH: Exact MediaRecorder configuration with WebM/Opus
const mimeType = 'audio/webm;codecs=opus';
if (!MediaRecorder.isTypeSupported(mimeType)) {{
console.warn('WebM/Opus not supported, falling back to default format');
mediaRecorder = new MediaRecorder(audioStream);
}} else {{
console.log('Using WebM/Opus format for continuous recording');
mediaRecorder = new MediaRecorder(audioStream, {{
mimeType: mimeType,
audioBitsPerSecond: 128000
}});
}}
// UNMUTE.SH: Smart chunk processing - only send if there's voice activity
mediaRecorder.ondataavailable = function(event) {{
if (event.data.size > 0) {{
// Check for voice activity before processing
if (hasVoiceActivity()) {{
console.log('Voice detected, processing chunk:', event.data.size, 'bytes');
const reader = new FileReader();
reader.onloadend = function() {{
const base64 = reader.result.split(',')[1];
// UNMUTE.SH: Immediate processing of voice chunks
processVoiceChunk({{
type: 'audio_chunk',
audio_data: base64,
sample_rate: 16000,
timestamp: new Date().toISOString(),
has_voice: true
}});
}};
reader.readAsDataURL(event.data);
}} else {{
console.log('Silence detected, skipping chunk');
}}
}}
}};
// UNMUTE.SH: Start continuous recording with 1-second chunks
mediaRecorder.start(1000);
isInitialized = true;
initBtn.disabled = true;
initBtn.textContent = '🎀 Continuous Recording Active';
statusDiv.textContent = '🎀 Listening continuously - speak naturally';
console.log('Continuous recording initialized with unmute.sh patterns');
}} catch (error) {{
console.error('Error initializing continuous recording:', error);
addTranscription('Error: Could not access microphone', new Date().toISOString(), true);
if (error.name === 'NotAllowedError') {{
statusDiv.textContent = '❌ Microphone access denied. Please allow and refresh.';
}} else if (error.name === 'NotFoundError') {{
statusDiv.textContent = '❌ No microphone found. Please check audio devices.';
}}
}}
}}
// UNMUTE.SH: Voice activity detection
function hasVoiceActivity() {{
if (!analyser) return true; // Default to processing if no analyser
const bufferLength = analyser.frequencyBinCount;
const dataArray = new Uint8Array(bufferLength);
analyser.getByteFrequencyData(dataArray);
// Calculate average amplitude
let sum = 0;
for (let i = 0; i < bufferLength; i++) {{
sum += dataArray[i];
}}
const average = sum / bufferLength / 255; // Normalize to 0-1
return average > silenceThreshold;
}}
// UNMUTE.SH: Process voice chunks immediately (their flush trick)
function processVoiceChunk(chunk) {{
audioChunksBuffer.push(chunk);
const chunkCount = audioChunksBuffer.length;
statusDiv.textContent = `πŸ”΄ Processing voice (${{chunkCount}} chunks captured)`;
addTranscription(`Voice detected - sending to STT service...`, chunk.timestamp);
// REAL-TIME STT: Send chunk immediately to STT service
sendChunkToSTT(chunk, chunkCount);
}}
// NEW: Send individual audio chunks to STT service in real-time
async function sendChunkToSTT(chunk, chunkIndex) {{
try {{
console.log(`πŸ“€ Sending chunk ${{chunkIndex}} to STT service...`);
// Convert base64 audio data to blob
const audioBytes = atob(chunk.audio_data);
const arrayBuffer = new ArrayBuffer(audioBytes.length);
const uint8Array = new Uint8Array(arrayBuffer);
for (let i = 0; i < audioBytes.length; i++) {{
uint8Array[i] = audioBytes.charCodeAt(i);
}}
const audioBlob = new Blob([uint8Array], {{ type: 'audio/webm;codecs=opus' }});
// Create form data for STT service
const formData = new FormData();
formData.append('audio_chunk', audioBlob, `chunk_${{chunkIndex}}.webm`);
formData.append('chunk_index', chunkIndex);
formData.append('timestamp', chunk.timestamp);
formData.append('sample_rate', chunk.sample_rate);
// Store audio chunk data for Streamlit to process
// (Streamlit doesn't support custom POST endpoints, so we store in window)
if (!window.audioChunksForSTT) {{
window.audioChunksForSTT = [];
}}
window.audioChunksForSTT.push({{
audioData: chunk.audio_data,
chunkIndex: chunkIndex,
timestamp: chunk.timestamp,
processed: false
}});
console.log(`πŸ“¦ Stored chunk ${{chunkIndex}} for STT processing`);
// πŸŒ‰ BRIDGE: Send chunk to Streamlit via component communication
try {{
// Convert audio blob to base64 for bridge transfer
const arrayBuffer = await audioBlob.arrayBuffer();
const audioArray = new Uint8Array(arrayBuffer);
let binary = '';
for (let i = 0; i < audioArray.length; i++) {{
binary += String.fromCharCode(audioArray[i]);
}}
const audioBase64 = btoa(binary);
// Create chunk data for bridge
const bridgeData = {{
chunkIndex: chunkIndex,
audioBase64: audioBase64,
timestamp: chunk.timestamp,
sampleRate: chunk.sample_rate,
needsProcessing: true,
createdAt: new Date().toISOString()
}};
// Send to Streamlit via iframe communication
const streamlitData = {{
audioChunks: [bridgeData],
action: 'processAudio',
timestamp: new Date().toISOString()
}};
// πŸŒ‰ STREAMLIT BRIDGE: Try multiple communication methods
let bridgeSuccess = false;
if (typeof window.Streamlit !== 'undefined') {{
try {{
// Use official Streamlit component communication
window.Streamlit.setComponentValue(streamlitData);
console.log(`πŸŒ‰ BRIDGE: Used Streamlit.setComponentValue for chunk ${{chunkIndex}}`);
bridgeSuccess = true;
}} catch (e) {{
console.warn(`πŸŒ‰ BRIDGE: Streamlit.setComponentValue failed:`, e);
}}
}}
if (!bridgeSuccess && window.parent && window.parent.postMessage) {{
// Fallback to postMessage with correct Streamlit format
window.parent.postMessage({{
type: 'streamlit:setComponentValue',
value: streamlitData
}}, '*');
console.log(`πŸŒ‰ BRIDGE: Used postMessage fallback for chunk ${{chunkIndex}}`);
// Don't mark as success yet - postMessage may not reach Streamlit
// Let it fall through to polling fallback as well
console.log(`πŸŒ‰ BRIDGE: Also storing in polling fallback as backup...`);
}}
if (!bridgeSuccess) {{
// πŸ“‚ POLLING FALLBACK: Store in global window for polling
if (!window.streamlitPendingData) window.streamlitPendingData = [];
window.streamlitPendingData.push(streamlitData);
console.log(`πŸŒ‰ POLLING: Stored chunk ${{chunkIndex}} in window.streamlitPendingData (queue length: ${{window.streamlitPendingData.length}})`);
bridgeSuccess = true;
}}
// Display bridge status
const transcriptionDiv = document.createElement('div');
transcriptionDiv.innerHTML = `<strong>${{new Date().toLocaleTimeString()}}:</strong> πŸŒ‰ Bridge: Processed chunk ${{chunkIndex}} via Streamlit communication`;
document.getElementById('transcription-output').appendChild(transcriptionDiv);
return; // Successfully handled by Streamlit bridge!
}} catch (bridgeError) {{
console.error('πŸŒ‰ BRIDGE: Failed to send to Streamlit:', bridgeError);
// Show error in UI
const errorDiv = document.createElement('div');
errorDiv.innerHTML = `<strong>${{new Date().toLocaleTimeString()}}:</strong> ❌ Bridge Error: ${{bridgeError.message}}`;
errorDiv.style.color = 'red';
document.getElementById('transcription-output').appendChild(errorDiv);
}}
// βœ… BRIDGE ONLY: All audio processing now goes through Streamlit bridge
console.log(`βœ… Audio chunk ${{chunkIndex}} queued for Streamlit bridge processing`);
// Show visual confirmation that chunk was queued for bridge
const queuedDiv = document.createElement('div');
queuedDiv.innerHTML = `<strong>${{new Date().toLocaleTimeString()}}:</strong> πŸ“€ Queued chunk ${{chunkIndex}} for STT processing`;
queuedDiv.style.color = 'blue';
document.getElementById('transcription-output').appendChild(queuedDiv);
}} catch (error) {{
console.error(`Error processing chunk ${{chunkIndex}}:`, error);
addTranscription(`❌ Error processing chunk ${{chunkIndex}}: ${{error.message}}`, new Date().toISOString(), true);
}}
}}
// UNMUTE.SH: Exact transcription display pattern
function addTranscription(text, timestamp, isError = false) {{
const item = document.createElement('div');
item.style.cssText = 'margin: 5px 0; padding: 8px; background: ' + (isError ? '#f8d7da' : 'white') + '; border-radius: 4px; border-left: 4px solid ' + (isError ? '#dc3545' : '#28a745') + ';';
const time = new Date(timestamp).toLocaleTimeString();
item.innerHTML = `<strong>${{time}}:</strong> ${{text}}`;
transcriptionsDiv.appendChild(item);
transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight;
}}
// Expose unmute.sh data for Streamlit
window.getUnmuteAudioChunks = function() {{
return window.unmuteAudioChunks || [];
}};
window.clearUnmuteBuffer = function() {{
audioChunksBuffer = [];
window.unmuteAudioChunks = [];
window.audioProcessingReady = false;
processBtn.disabled = true;
statusDiv.textContent = 'Buffer cleared - ready to record';
transcriptionsDiv.innerHTML = 'Transcriptions will appear here...';
}};
// πŸŒ‰ INTEGRATED POLLING: Check for queued audio chunks and send to Streamlit
function checkPendingAudioData() {{
try {{
// Check for pending data in window
if (typeof window.streamlitPendingData !== 'undefined' && window.streamlitPendingData.length > 0) {{
const pendingData = window.streamlitPendingData;
window.streamlitPendingData = []; // Clear after reading
console.log('πŸŒ‰ INTEGRATED POLLING: Found', pendingData.length, 'pending data sets');
// Send all pending data to Streamlit
const combinedData = {{
type: 'pollingData',
audioChunks: pendingData,
timestamp: new Date().toISOString(),
totalChunks: pendingData.reduce((sum, data) => sum + (data.audioChunks ? data.audioChunks.length : 0), 0)
}};
// Send via Streamlit component communication (we have access here!)
if (typeof window.Streamlit !== 'undefined') {{
window.Streamlit.setComponentValue(combinedData);
console.log('πŸŒ‰ INTEGRATED POLLING: Successfully sent', combinedData.totalChunks, 'chunks to Streamlit backend');
// Show visual confirmation
const confirmationDiv = document.createElement('div');
confirmationDiv.innerHTML = `<strong>${{new Date().toLocaleTimeString()}}:</strong> πŸ“‘ BACKEND: Sent ${{combinedData.totalChunks}} queued chunks to STT service`;
confirmationDiv.style.color = 'green';
confirmationDiv.style.fontWeight = 'bold';
document.getElementById('transcription-output').appendChild(confirmationDiv);
}} else {{
console.warn('πŸŒ‰ INTEGRATED POLLING: Streamlit API not available');
// Put data back if we can't send it
if (!window.streamlitPendingData) window.streamlitPendingData = [];
window.streamlitPendingData.push(...pendingData);
}}
}}
}} catch (error) {{
console.error('πŸŒ‰ INTEGRATED POLLING: Error checking pending data:', error);
}}
}}
// Start periodic polling every 2 seconds
setInterval(checkPendingAudioData, 2000);
console.log('πŸŒ‰ INTEGRATED POLLING: Started polling for queued audio data every 2 seconds');
</script>
<style>
#webrtc-container {{
padding: 20px;
border: 1px solid #ddd;
border-radius: 8px;
margin: 10px 0;
background: #f9f9f9;
}}
#status {{
font-weight: bold;
margin: 10px 0;
padding: 10px;
border-radius: 4px;
background: white;
}}
#audio-chunks {{
color: #666;
font-size: 0.9em;
margin: 10px 0;
}}
button {{
margin: 5px;
padding: 10px 15px;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 14px;
}}
button:disabled {{
opacity: 0.5;
cursor: not-allowed;
}}
#start-webrtc {{
background: #dc3545;
color: white;
}}
#stop-webrtc {{
background: #6c757d;
color: white;
}}
</style>
"""
# Render the functional WebRTC component with bridge communication
component_value = st.components.v1.html(webrtc_html, height=200)
# πŸŒ‰ BRIDGE: Check for audio chunks from JavaScript
if component_value:
# JavaScript can send data back to Streamlit via return values
self._process_bridge_data(component_value)
# πŸŒ‰ POLLING: Check for data in JavaScript window object (fallback method)
self._check_javascript_polling_data()
# Auto-refresh for real-time processing
if hasattr(st.session_state, 'pending_audio_chunks') and st.session_state.pending_audio_chunks:
st.info(f"πŸŒ‰ Processing {len(st.session_state.pending_audio_chunks)} pending chunks...")
# Clear after showing status
st.session_state.pending_audio_chunks = []
# Add auto-refresh button for bridge testing
col1, col2 = st.columns([1, 1])
with col1:
if st.button("πŸ”„ Check Bridge Queue", key="check_bridge_queue_btn"):
# Force refresh to check for new chunks
st.rerun()
with col2:
# Show bridge status
pending_count = len(getattr(st.session_state, 'pending_audio_chunks', []))
st.metric("πŸŒ‰ Bridge Queue", f"{pending_count} chunks")
# Service connectivity and performance test
st.subheader("πŸ”— STT Service Performance")
col1, col2, col3 = st.columns(3)
with col1:
if st.button("Test Connection", key="test_connection_btn"):
try:
response = requests.get(f"{self.stt_service_url}/", timeout=5)
if response.status_code == 200:
st.success("βœ… STT Service is reachable")
else:
st.error(f"❌ STT Service returned {response.status_code}")
except Exception as e:
st.error(f"❌ STT Service connection failed: {e}")
with col2:
if st.button("Warm Up Client", key="warm_up_client_btn"):
try:
start_time = datetime.now()
client = self.client
if client:
warmup_time = (datetime.now() - start_time).total_seconds()
st.success(f"βœ… Client warmed up in {warmup_time:.2f}s")
st.session_state.client_warmed = True
else:
st.error("❌ Failed to warm up client")
except Exception as e:
st.error(f"❌ Client warmup failed: {e}")
with col3:
if st.button("Performance Info", key="performance_info_btn"):
st.info("""
**Optimization Features:**
- πŸ”— Persistent client connection
- ⚑ Async execution with thread pool
- πŸ“Š Latency monitoring
- πŸ”„ Auto client reset on errors
- 🎯 Optimized for 'base' model speed
""")
# Display performance metrics
if hasattr(st.session_state, 'client_warmed') and st.session_state.client_warmed:
st.success("πŸ”₯ Client is warmed up and ready for optimal performance")
# Global handler instance
# Module-level singleton: constructing it seeds st.session_state keys
# (audio_buffer, recording_state, transcriptions) if absent, so reruns
# of this script reuse the same buffered state.
webrtc_handler = StreamlitWebRTCHandler()
# Run the WebRTC interface
# Executed at import/script-run time — Streamlit re-executes the whole
# script on every user interaction, so this call re-renders the UI each
# rerun. NOTE(review): create_webrtc_interface is defined outside this
# view; presumably it builds the HTML/JS component above — confirm.
webrtc_handler.create_webrtc_interface()