Spaces:
Sleeping
Sleeping
Peter Michael Gits
fix: Add unique keys to all Streamlit buttons to resolve duplicate element ID error
1c7f2b8 | """ | |
| Streamlit-native WebRTC Speech-to-Text Component | |
| Implements unmute.sh patterns without FastAPI conflicts | |
| """ | |
| import streamlit as st | |
| import asyncio | |
| import json | |
| import logging | |
| import tempfile | |
| import os | |
| import base64 | |
| from datetime import datetime | |
| import requests | |
| import websocket | |
| import threading | |
| from typing import Dict, Optional | |
| # Import gradio_client at top level to avoid runtime import errors | |
| try: | |
| from gradio_client import Client | |
| GRADIO_CLIENT_AVAILABLE = True | |
| except ImportError: | |
| GRADIO_CLIENT_AVAILABLE = False | |
| Client = None | |
| logger = logging.getLogger(__name__) | |
| class StreamlitWebRTCHandler: | |
| """Handles WebRTC functionality within Streamlit using unmute.sh patterns""" | |
| def __init__(self): | |
| # STT service configuration | |
| self.stt_service_url = "https://pgits-stt-gpu-service.hf.space" | |
| # Persistent Gradio client for optimal performance | |
| self._client = None | |
| self._client_initialized = False | |
| # Audio buffer for unmute.sh flush trick | |
| if 'audio_buffer' not in st.session_state: | |
| st.session_state.audio_buffer = [] | |
| if 'recording_state' not in st.session_state: | |
| st.session_state.recording_state = 'stopped' | |
| if 'transcriptions' not in st.session_state: | |
| st.session_state.transcriptions = [] | |
| def client(self): | |
| """Get or create persistent Gradio client for optimal performance""" | |
| if not GRADIO_CLIENT_AVAILABLE: | |
| logger.error("gradio_client not available") | |
| return None | |
| if self._client is None or not self._client_initialized: | |
| try: | |
| self._client = Client(self.stt_service_url) | |
| self._client_initialized = True | |
| logger.info(f"π Initialized persistent Gradio client for {self.stt_service_url}") | |
| except Exception as e: | |
| logger.error(f"Failed to initialize Gradio client: {e}") | |
| self._client = None | |
| self._client_initialized = False | |
| return self._client | |
| def add_audio_chunk(self, audio_data: bytes): | |
| """Add audio chunk to buffer using unmute.sh methodology""" | |
| try: | |
| st.session_state.audio_buffer.append(audio_data) | |
| logger.info(f"π€ Added audio chunk ({len(audio_data)} bytes) - Total chunks: {len(st.session_state.audio_buffer)}") | |
| # Update UI status | |
| st.session_state.recording_status = f"Recording... ({len(st.session_state.audio_buffer)} chunks buffered)" | |
| except Exception as e: | |
| logger.error(f"Error adding audio chunk: {e}") | |
| st.error(f"Audio buffering error: {str(e)}") | |
    async def process_buffered_audio_with_flush(self):
        """Process buffered audio using unmute.sh flush trick.

        Joins every chunk in ``st.session_state.audio_buffer`` into one blob,
        writes it to a temporary .webm file, transcribes it as a final
        (``is_final=True``) utterance, then clears the buffer. On transcription
        failure the buffer is still cleared; on an unexpected exception it is
        left intact.
        """
        try:
            if not st.session_state.audio_buffer:
                st.warning("No audio chunks to process")
                return
            # Combine all chunks (unmute.sh methodology)
            all_audio_data = b''.join(st.session_state.audio_buffer)
            total_chunks = len(st.session_state.audio_buffer)
            logger.info(f"π Processing {total_chunks} buffered chunks ({len(all_audio_data)} bytes) with flush trick")
            # Create temporary file for complete audio (delete=False so the
            # path stays valid after the with-block; removed in the finally)
            with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as tmp_file:
                tmp_file.write(all_audio_data)
                tmp_file_path = tmp_file.name
            try:
                # Process with flush trick (is_final=True for unmute.sh methodology)
                transcription = await self.transcribe_audio_file(tmp_file_path)
                # transcribe_audio_file signals failure with an "ERROR:" prefix
                if transcription and transcription.strip() and not transcription.startswith("ERROR"):
                    # Add to transcriptions with unmute.sh metadata
                    transcription_entry = {
                        "text": transcription.strip(),
                        "timestamp": datetime.now().isoformat(),
                        "audio_size": len(all_audio_data),
                        "format": "webm/audio",
                        "is_final": True,  # unmute.sh flush trick marker
                        "chunks_processed": total_chunks
                    }
                    st.session_state.transcriptions.append(transcription_entry)
                    logger.info(f"π Final transcription processed: {transcription[:50]}...")
                    # Update UI
                    st.success(f"Transcribed: {transcription}")
                    st.rerun()
                else:
                    error_msg = f"Audio processing failed: {transcription if transcription else 'No result'}"
                    logger.error(error_msg)
                    st.error(error_msg)
            finally:
                # Clean up (unmute.sh cleanup methodology)
                if os.path.exists(tmp_file_path):
                    os.unlink(tmp_file_path)
            # Clear the buffer now that its contents have been processed
            st.session_state.audio_buffer = []
            st.session_state.recording_status = "Processing complete - buffer cleared"
            logger.info("π§Ή Cleared audio buffer after flush trick processing")
        except Exception as e:
            logger.error(f"Error processing buffered audio: {e}")
            st.error(f"Buffered audio processing error: {str(e)}")
    async def transcribe_audio_file(self, audio_file_path: str) -> Optional[str]:
        """Transcribe audio file using optimized STT service Gradio API (unmute.sh flush methodology).

        Args:
            audio_file_path: Path to an audio file readable by the STT service.

        Returns:
            The plain transcription text on success, or a string starting with
            "ERROR:" on failure (callers check for this prefix rather than
            catching exceptions).
        """
        start_time = datetime.now()
        try:
            # Use persistent client for optimal performance
            client = self.client
            if client is None:
                return "ERROR: Failed to initialize Gradio client"
            logger.info(f"π€ Starting transcription with persistent client...")
            # Optimized call with minimal overhead - Use proper Gradio file format
            from gradio_client import handle_file
            # Run the blocking Gradio call in a worker thread so the event
            # loop stays responsive
            result = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: client.predict(
                    handle_file(audio_file_path),  # Proper Gradio file format
                    "en",  # language (English by default)
                    "base",  # model_size_param (optimized for speed)
                    api_name="/gradio_transcribe_wrapper"
                )
            )
            # Efficient result parsing: the endpoint returns a 3-tuple
            transcription_text, timing_json, status_text = result
            # Calculate total latency
            total_time = (datetime.now() - start_time).total_seconds()
            # Service marks success by prefixing the text with "β "
            if transcription_text.startswith("β "):
                # Extract transcription from success message (optimized parsing)
                transcription = transcription_text[2:].strip()  # Remove "β " efficiently
                logger.info(f"π FLUSH TRICK: STT completed in {total_time:.2f}s - {transcription[:50]}...")
                return transcription
            else:
                error_msg = f"STT service error: {transcription_text}"
                logger.error(f"β STT failed in {total_time:.2f}s: {error_msg}")
                return f"ERROR: {error_msg}"
        except Exception as e:
            total_time = (datetime.now() - start_time).total_seconds()
            error_msg = f"Optimized transcription failed in {total_time:.2f}s: {str(e)}"
            logger.error(error_msg)
            # Reset client on error so the next attempt re-initializes it
            self._client = None
            self._client_initialized = False
            return f"ERROR: {error_msg}"
| async def process_realtime_chunk(self, audio_data: bytes, chunk_index: int, timestamp: str) -> dict: | |
| """Process individual audio chunk in real-time for streaming transcriptions""" | |
| try: | |
| logger.info(f"π€ Processing real-time chunk {chunk_index} ({len(audio_data)} bytes)") | |
| # Save chunk to temporary file | |
| with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as tmp_file: | |
| tmp_file.write(audio_data) | |
| tmp_file_path = tmp_file.name | |
| try: | |
| # Process with STT service | |
| transcription = await self.transcribe_audio_file(tmp_file_path) | |
| if transcription and transcription.strip() and not transcription.startswith("ERROR"): | |
| # Add to live transcriptions for real-time display | |
| transcription_entry = { | |
| "text": transcription.strip(), | |
| "timestamp": timestamp, | |
| "source": "stt_service", | |
| "chunk_index": chunk_index, | |
| "processing_time": 0, # Will be calculated | |
| "audio_size": len(audio_data) | |
| } | |
| # Initialize session state if needed | |
| if 'live_transcriptions' not in st.session_state: | |
| st.session_state.live_transcriptions = [] | |
| st.session_state.live_transcriptions.append(transcription_entry) | |
| logger.info(f"β Real-time transcription {chunk_index}: '{transcription[:50]}...'") | |
| return { | |
| "success": True, | |
| "transcription": transcription.strip(), | |
| "chunk_index": chunk_index, | |
| "timestamp": timestamp, | |
| "processing_time": 0 | |
| } | |
| else: | |
| logger.info(f"βΉοΈ Chunk {chunk_index}: No valid transcription") | |
| return { | |
| "success": False, | |
| "transcription": "", | |
| "message": "No speech detected or transcription failed" | |
| } | |
| finally: | |
| # Clean up temp file | |
| if os.path.exists(tmp_file_path): | |
| os.unlink(tmp_file_path) | |
| except Exception as e: | |
| error_msg = f"Real-time processing failed for chunk {chunk_index}: {str(e)}" | |
| logger.error(error_msg) | |
| return { | |
| "success": False, | |
| "error": error_msg, | |
| "chunk_index": chunk_index | |
| } | |
| async def test_stt_with_voice_file(self): | |
| """Test STT service with actual voice file and display results""" | |
| try: | |
| st.info("π€ Testing STT service with real audio...") | |
| # Import MCP voice service | |
| from mcp_voice_service import mcp_create_test_voice | |
| # Create test voice file | |
| test_voice_file = await mcp_create_test_voice() | |
| st.info(f"π Created test voice file: {test_voice_file}") | |
| # Process with STT service | |
| transcription = await self.transcribe_audio_file(test_voice_file) | |
| if transcription and transcription.strip() and not transcription.startswith("ERROR"): | |
| # Add real STT result to live transcriptions | |
| if 'live_transcriptions' not in st.session_state: | |
| st.session_state.live_transcriptions = [] | |
| stt_result = { | |
| "text": transcription.strip(), | |
| "timestamp": datetime.now().isoformat(), | |
| "source": "stt_service", | |
| "chunk_index": "real_stt_test" | |
| } | |
| st.session_state.live_transcriptions.append(stt_result) | |
| st.success(f"β STT Service Response: '{transcription}'") | |
| else: | |
| st.error(f"β STT Processing failed: {transcription}") | |
| except Exception as e: | |
| error_msg = f"STT test failed: {str(e)}" | |
| logger.error(error_msg) | |
| st.error(f"β {error_msg}") | |
| async def process_latest_webrtc_capture(self): | |
| """Process WebRTC captured audio using unmute.sh patterns""" | |
| try: | |
| st.info("π€ Checking for WebRTC audio chunks...") | |
| # Create a test audio file to demonstrate STT processing | |
| # In real implementation, this would get actual captured audio | |
| test_message = "Testing WebRTC to STT connection with optimized Gradio client" | |
| # Use persistent client for optimal performance | |
| client = self.client | |
| if client is None: | |
| st.error("β Failed to initialize STT client") | |
| return | |
| st.info(f"π Connected to STT service, simulating WebRTC audio processing...") | |
| # For now, demonstrate the connection works | |
| # TODO: Replace with actual WebRTC audio file processing | |
| st.success("β WebRTC to STT bridge is working - ready for actual audio processing") | |
| st.info("Next: Capture actual audio file from JavaScript and send to STT") | |
| except Exception as e: | |
| error_msg = f"WebRTC processing failed: {str(e)}" | |
| logger.error(error_msg) | |
| st.error(f"β {error_msg}") | |
| def process_in_memory_chunks(self): | |
| """Process stored audio chunks using gradio_client for in-memory transcription""" | |
| if not hasattr(st.session_state, 'pending_chunks'): | |
| st.session_state.pending_chunks = [] | |
| # Check if there are chunks to process | |
| # This would be populated by a JavaScript->Streamlit bridge | |
| if hasattr(st.session_state, 'new_audio_chunks'): | |
| for chunk_data in st.session_state.new_audio_chunks: | |
| try: | |
| # Convert base64 to temp file | |
| import base64 | |
| import tempfile | |
| audio_binary = base64.b64decode(chunk_data['audioBase64']) | |
| with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as tmp_file: | |
| tmp_file.write(audio_binary) | |
| temp_path = tmp_file.name | |
| # Use gradio_client for transcription | |
| client = self.client | |
| if client: | |
| result = client.predict( | |
| temp_path, # audio file path | |
| "en", # language | |
| "base", # model_size_param | |
| api_name="/gradio_transcribe_memory" | |
| ) | |
| if result and len(result) > 0: | |
| transcription = result[0] if isinstance(result, (list, tuple)) else str(result) | |
| # Add to live transcriptions | |
| st.session_state.live_transcriptions.append({ | |
| 'text': f"π MEMORY: {transcription}", | |
| 'timestamp': datetime.now().isoformat(), | |
| 'source': 'in_memory_stt', | |
| 'processing_time': 0.1, # Estimated | |
| 'chunk_index': chunk_data.get('chunkIndex', 0) | |
| }) | |
| # Clean up temp file | |
| import os | |
| os.unlink(temp_path) | |
| except Exception as e: | |
| st.error(f"Error processing in-memory chunk: {e}") | |
| # Clear processed chunks | |
| st.session_state.new_audio_chunks = [] | |
| def _process_bridge_data(self, component_value): | |
| """Process audio chunks received from JavaScript bridge""" | |
| try: | |
| # Handle different types of bridge data | |
| if isinstance(component_value, dict): | |
| chunks = None | |
| # Direct audioChunks | |
| if 'audioChunks' in component_value: | |
| chunks = component_value['audioChunks'] | |
| st.success(f"π BRIDGE: Received {len(chunks)} chunks via direct communication!") | |
| # Polling data format | |
| elif component_value.get('type') == 'pollingData' and 'audioChunks' in component_value: | |
| chunks = [] | |
| for polling_data in component_value['audioChunks']: | |
| if 'audioChunks' in polling_data: | |
| chunks.extend(polling_data['audioChunks']) | |
| if chunks: | |
| st.success(f"π POLLING: Received {len(chunks)} chunks via polling fallback!") | |
| # Process the chunks | |
| if chunks: | |
| for chunk_data in chunks: | |
| if chunk_data.get('needsProcessing', False): | |
| # Add to processing queue | |
| if 'pending_audio_chunks' not in st.session_state: | |
| st.session_state.pending_audio_chunks = [] | |
| st.session_state.pending_audio_chunks.append(chunk_data) | |
| # Process immediately | |
| self._process_single_chunk(chunk_data) | |
| except Exception as e: | |
| st.error(f"Bridge processing error: {e}") | |
| def _process_single_chunk(self, chunk_data): | |
| """Process a single audio chunk with gradio_client""" | |
| try: | |
| import base64 | |
| import tempfile | |
| import os | |
| # Convert base64 to temp file | |
| audio_binary = base64.b64decode(chunk_data['audioBase64']) | |
| with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as tmp_file: | |
| tmp_file.write(audio_binary) | |
| temp_path = tmp_file.name | |
| # Use gradio_client for transcription | |
| client = self.client | |
| if client: | |
| result = client.predict( | |
| temp_path, # audio file path | |
| "en", # language | |
| "base", # model_size_param | |
| api_name="/gradio_transcribe_memory" | |
| ) | |
| if result and len(result) > 0: | |
| transcription = result[0] if isinstance(result, (list, tuple)) else str(result) | |
| # Clean the transcription result | |
| if transcription.startswith("β "): | |
| transcription = transcription.split(":", 1)[-1].strip() | |
| # Add to live transcriptions with bridge indicator | |
| st.session_state.live_transcriptions.append({ | |
| 'text': f"π BRIDGE STT: {transcription}", | |
| 'timestamp': datetime.now().isoformat(), | |
| 'source': 'bridge_stt', | |
| 'processing_time': 0.2, # Estimated | |
| 'chunk_index': chunk_data.get('chunkIndex', 0) | |
| }) | |
| # Show success message | |
| st.success(f"π Processed chunk {chunk_data.get('chunkIndex', 0)}: {transcription}") | |
| # Clean up temp file | |
| os.unlink(temp_path) | |
| except Exception as e: | |
| st.error(f"Error processing chunk {chunk_data.get('chunkIndex', 0)}: {e}") | |
    def _check_javascript_polling_data(self):
        """Check JavaScript window object for pending audio data (polling fallback).

        Injects a zero-height HTML component whose script drains
        ``window.streamlitPendingData`` from the iframe or parent window and
        relays it back via the Streamlit component API (or postMessage / a
        global as fallbacks). Any value the component returns is handed to
        ``_process_bridge_data``. Failures are deliberately swallowed because
        polling is best-effort.
        """
        try:
            # Use JavaScript execution to check for pending data
            polling_script = """
            <script>
            // Check both iframe window and parent window for pending data
            let pendingData = null;
            let dataSource = '';
            // First try iframe window
            if (typeof window.streamlitPendingData !== 'undefined' && window.streamlitPendingData.length > 0) {
                pendingData = window.streamlitPendingData;
                window.streamlitPendingData = []; // Clear after reading
                dataSource = 'iframe window';
            }
            // Then try parent window
            else if (window.parent && typeof window.parent.streamlitPendingData !== 'undefined' && window.parent.streamlitPendingData.length > 0) {
                pendingData = window.parent.streamlitPendingData;
                window.parent.streamlitPendingData = []; // Clear after reading
                dataSource = 'parent window';
            }
            if (pendingData && pendingData.length > 0) {
                console.log('π POLLING: Found', pendingData.length, 'pending data sets in', dataSource);
                // Prepare data for Streamlit
                const streamlitData = {
                    type: 'pollingData',
                    audioChunks: pendingData,
                    timestamp: new Date().toISOString(),
                    dataSource: dataSource
                };
                // Send via Streamlit component communication
                if (typeof window.Streamlit !== 'undefined') {
                    window.Streamlit.setComponentValue(streamlitData);
                    console.log('π POLLING: Sent', pendingData.length, 'chunks via Streamlit API from', dataSource);
                }
                // Fallback: Use postMessage to communicate with parent
                else if (window.parent) {
                    window.parent.postMessage({
                        type: 'streamlit:setComponentValue',
                        value: streamlitData
                    }, '*');
                    console.log('π POLLING: Sent', pendingData.length, 'chunks via postMessage from', dataSource);
                }
                // Last resort: Store in global for manual retrieval
                else {
                    if (!window.streamlitProcessedData) window.streamlitProcessedData = [];
                    window.streamlitProcessedData.push(streamlitData);
                    console.log('π POLLING: Stored', pendingData.length, 'chunks in global fallback from', dataSource);
                }
            } else {
                console.log('π POLLING: No pending data found in iframe or parent window');
            }
            </script>
            """
            # Execute the polling script; height=0 keeps the component invisible
            polling_value = st.components.v1.html(polling_script, height=0)
            # Process polling data if received
            if polling_value:
                st.info(f"π POLLING: Received data from polling script")
                self._process_bridge_data(polling_value)
        except Exception as e:
            pass  # Silently fail for polling — this path is best-effort by design
| def create_webrtc_interface(self): | |
| """Create Streamlit WebRTC interface without FastAPI conflicts""" | |
| # Process any pending in-memory chunks first | |
| self.process_in_memory_chunks() | |
| # Ensure session state is properly initialized | |
| if 'recording_state' not in st.session_state: | |
| st.session_state.recording_state = 'stopped' | |
| if 'audio_buffer' not in st.session_state: | |
| st.session_state.audio_buffer = [] | |
| if 'transcriptions' not in st.session_state: | |
| st.session_state.transcriptions = [] | |
| if 'recording_status' not in st.session_state: | |
| st.session_state.recording_status = "Ready to record" | |
| st.subheader("π€ WebRTC Speech-to-Text (Unmute.sh Patterns)") | |
| # Recording controls | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| if st.button("π€ Start Recording", key="start_recording_btn", disabled=(st.session_state.recording_state == 'recording')): | |
| st.session_state.recording_state = 'recording' | |
| st.session_state.audio_buffer = [] | |
| st.session_state.recording_status = "Recording started - speak now" | |
| st.success("Recording started!") | |
| st.rerun() | |
| with col2: | |
| # Stop Listening button (only show when recording is active) | |
| if st.session_state.recording_state == 'recording': | |
| if st.button("βΉοΈ Stop Listening", key="stop_listening_btn", type="primary"): | |
| st.session_state.recording_state = 'stopped' | |
| st.session_state.recording_status = "Recording stopped - transcriptions complete" | |
| st.success("Recording stopped!") | |
| st.rerun() | |
| # Manual refresh for transcription updates | |
| if st.button("π Refresh Transcriptions", key="refresh_transcriptions_btn"): | |
| st.success("Transcriptions refreshed!") | |
| st.rerun() | |
| # Process captured audio with real STT service | |
| if st.button("π€ Process with STT Service", key="process_stt_btn"): | |
| # Use MCP voice service to test STT connection | |
| asyncio.run(self.test_stt_with_voice_file()) | |
| st.rerun() | |
| # Test transcription button (temporary - to verify transcription display works) | |
| if st.button("π§ͺ Test Transcription", key="test_transcription_btn"): | |
| # Add a test transcription to verify the display is working | |
| if 'live_transcriptions' not in st.session_state: | |
| st.session_state.live_transcriptions = [] | |
| test_transcription = { | |
| "text": "This is a test transcription to verify the display is working", | |
| "timestamp": datetime.now().isoformat(), | |
| "source": "test", | |
| "chunk_index": len(st.session_state.live_transcriptions) + 1 | |
| } | |
| st.session_state.live_transcriptions.append(test_transcription) | |
| st.success("Test transcription added!") | |
| st.rerun() | |
| with col3: | |
| if st.button("π§Ή Clear Buffer", key="clear_buffer_btn"): | |
| st.session_state.audio_buffer = [] | |
| st.session_state.transcriptions = [] | |
| st.session_state.recording_state = 'stopped' | |
| st.success("Buffer cleared!") | |
| st.rerun() | |
| # Status display | |
| if 'recording_status' in st.session_state: | |
| if st.session_state.recording_state == 'recording': | |
| st.info(f"π΄ {st.session_state.recording_status}") | |
| elif st.session_state.recording_state == 'processing': | |
| st.warning("π Processing audio with unmute.sh flush trick...") | |
| else: | |
| st.info(f"β {st.session_state.recording_status}") | |
| # STT Transcription Results Display | |
| st.subheader("π STT Transcription Results") | |
| # Create columns for better layout | |
| col1, col2 = st.columns([2, 1]) | |
| with col1: | |
| # Live transcription window | |
| if 'live_transcriptions' not in st.session_state: | |
| st.session_state.live_transcriptions = [] | |
| # Live transcription display (simplified to prevent crashes) | |
| st.markdown("### π€ Live Transcriptions") | |
| if st.session_state.live_transcriptions: | |
| # Show transcriptions in reverse order (newest first) | |
| for i, entry in enumerate(reversed(st.session_state.live_transcriptions[-10:])): # Show last 10 | |
| time_str = datetime.fromisoformat(entry['timestamp']).strftime('%H:%M:%S') | |
| # Color-coded based on source | |
| if entry.get('source') == 'stt_service': | |
| st.success(f"π€ **{time_str}**: {entry['text']}") | |
| elif entry.get('source') == 'webrtc_live': | |
| st.info(f"π΄ **{time_str}**: {entry['text']}") | |
| else: | |
| st.write(f"π **{time_str}**: {entry['text']}") | |
| # Show status based on recording state | |
| if st.session_state.recording_state == 'recording': | |
| st.markdown("π΄ **Recording active** - Transcriptions will appear here") | |
| else: | |
| st.markdown("β **Recording complete** - Results shown above") | |
| else: | |
| if st.session_state.recording_state == 'recording': | |
| st.info("π§ Listening for speech... Transcriptions will appear here") | |
| else: | |
| st.info("π§ Press 'Start Recording' to begin transcription") | |
| with col2: | |
| # Quick stats and controls | |
| st.markdown("### π Session Stats") | |
| if st.session_state.live_transcriptions: | |
| st.metric("Total Transcriptions", len(st.session_state.live_transcriptions)) | |
| st.metric("Latest Processing Time", | |
| f"{st.session_state.live_transcriptions[-1].get('processing_time', 0):.1f}s" | |
| if st.session_state.live_transcriptions else "0s") | |
| if st.button("π§Ή Clear Transcriptions", key="clear_transcriptions_btn"): | |
| st.session_state.live_transcriptions = [] | |
| st.success("Transcriptions cleared!") | |
| st.rerun() | |
| # Detailed transcription history | |
| if st.session_state.transcriptions: | |
| with st.expander("π Detailed Transcription History", expanded=False): | |
| for i, entry in enumerate(reversed(st.session_state.transcriptions[-5:])): # Show last 5 | |
| st.markdown(f"**#{len(st.session_state.transcriptions) - i}**") | |
| st.write(f"**Text:** {entry['text']}") | |
| st.write(f"**Time:** {datetime.fromisoformat(entry['timestamp']).strftime('%H:%M:%S')}") | |
| st.write(f"**Audio Size:** {entry['audio_size']} bytes") | |
| st.write(f"**Chunks:** {entry['chunks_processed']}") | |
| if entry.get('is_final'): | |
| st.write("β **Flush Trick Applied**") | |
| st.divider() | |
| # WebRTC JavaScript integration - Functional Implementation | |
| st.subheader("π WebRTC Audio Capture") | |
| # UNMUTE.SH WebRTC Implementation - Continuous recording with voice activity detection | |
| webrtc_html = f""" | |
| <div id="webrtc-container"> | |
| <div id="status">Ready to start continuous recording - Following unmute.sh patterns</div> | |
| <div id="transcriptions">Transcriptions will appear here...</div> | |
| <button id="initBtn" onclick="initializeContinuousRecording()">π€ Start Continuous Recording</button> | |
| </div> | |
| <script> | |
| // π STREAMLIT BRIDGE: Initialize component communication | |
| console.log('π BRIDGE: Initializing Streamlit bridge...'); | |
| // Check if Streamlit component API is available | |
| if (typeof window.Streamlit !== 'undefined') {{ | |
| console.log('π BRIDGE: Streamlit API detected, setting up component...'); | |
| window.Streamlit.setComponentReady(); | |
| window.Streamlit.setFrameHeight(200); | |
| }} else {{ | |
| console.log('π BRIDGE: Streamlit API not found, using fallback methods...'); | |
| // Initialize fallback communication | |
| window.streamlitPendingData = []; | |
| }} | |
| // UNMUTE.SH METHODOLOGY: Continuous recording with voice activity detection | |
| let mediaRecorder = null; | |
| let audioStream = null; | |
| let isInitialized = false; | |
| let audioChunksBuffer = []; // Buffer following unmute.sh patterns | |
| let silenceThreshold = 0.01; // Voice activity detection threshold | |
| let audioContext = null; | |
| let analyser = null; | |
| const statusDiv = document.getElementById('status'); | |
| const initBtn = document.getElementById('initBtn'); | |
| const transcriptionsDiv = document.getElementById('transcriptions'); | |
| // UNMUTE.SH: Continuous recording initialization | |
| async function initializeContinuousRecording() {{ | |
| try {{ | |
| console.log('Initializing continuous recording...'); | |
| addTranscription('Requesting microphone access...', new Date().toISOString()); | |
| // UNMUTE.SH: Exact getUserMedia configuration | |
| audioStream = await navigator.mediaDevices.getUserMedia({{ | |
| audio: {{ sampleRate: 16000, channelCount: 1 }} | |
| }}); | |
| console.log('Microphone access granted'); | |
| addTranscription('Microphone access granted - continuous recording active', new Date().toISOString()); | |
| // Set up audio analysis for voice activity detection | |
| audioContext = new (window.AudioContext || window.webkitAudioContext)(); | |
| analyser = audioContext.createAnalyser(); | |
| const source = audioContext.createMediaStreamSource(audioStream); | |
| source.connect(analyser); | |
| // UNMUTE.SH: Exact MediaRecorder configuration with WebM/Opus | |
| const mimeType = 'audio/webm;codecs=opus'; | |
| if (!MediaRecorder.isTypeSupported(mimeType)) {{ | |
| console.warn('WebM/Opus not supported, falling back to default format'); | |
| mediaRecorder = new MediaRecorder(audioStream); | |
| }} else {{ | |
| console.log('Using WebM/Opus format for continuous recording'); | |
| mediaRecorder = new MediaRecorder(audioStream, {{ | |
| mimeType: mimeType, | |
| audioBitsPerSecond: 128000 | |
| }}); | |
| }} | |
| // UNMUTE.SH: Smart chunk processing - only send if there's voice activity | |
| mediaRecorder.ondataavailable = function(event) {{ | |
| if (event.data.size > 0) {{ | |
| // Check for voice activity before processing | |
| if (hasVoiceActivity()) {{ | |
| console.log('Voice detected, processing chunk:', event.data.size, 'bytes'); | |
| const reader = new FileReader(); | |
| reader.onloadend = function() {{ | |
| const base64 = reader.result.split(',')[1]; | |
| // UNMUTE.SH: Immediate processing of voice chunks | |
| processVoiceChunk({{ | |
| type: 'audio_chunk', | |
| audio_data: base64, | |
| sample_rate: 16000, | |
| timestamp: new Date().toISOString(), | |
| has_voice: true | |
| }}); | |
| }}; | |
| reader.readAsDataURL(event.data); | |
| }} else {{ | |
| console.log('Silence detected, skipping chunk'); | |
| }} | |
| }} | |
| }}; | |
| // UNMUTE.SH: Start continuous recording with 1-second chunks | |
| mediaRecorder.start(1000); | |
| isInitialized = true; | |
| initBtn.disabled = true; | |
| initBtn.textContent = 'π€ Continuous Recording Active'; | |
| statusDiv.textContent = 'π€ Listening continuously - speak naturally'; | |
| console.log('Continuous recording initialized with unmute.sh patterns'); | |
| }} catch (error) {{ | |
| console.error('Error initializing continuous recording:', error); | |
| addTranscription('Error: Could not access microphone', new Date().toISOString(), true); | |
| if (error.name === 'NotAllowedError') {{ | |
| statusDiv.textContent = 'β Microphone access denied. Please allow and refresh.'; | |
| }} else if (error.name === 'NotFoundError') {{ | |
| statusDiv.textContent = 'β No microphone found. Please check audio devices.'; | |
| }} | |
| }} | |
| }} | |
| // UNMUTE.SH: Voice activity detection | |
| function hasVoiceActivity() {{ | |
| if (!analyser) return true; // Default to processing if no analyser | |
| const bufferLength = analyser.frequencyBinCount; | |
| const dataArray = new Uint8Array(bufferLength); | |
| analyser.getByteFrequencyData(dataArray); | |
| // Calculate average amplitude | |
| let sum = 0; | |
| for (let i = 0; i < bufferLength; i++) {{ | |
| sum += dataArray[i]; | |
| }} | |
| const average = sum / bufferLength / 255; // Normalize to 0-1 | |
| return average > silenceThreshold; | |
| }} | |
| // UNMUTE.SH: Process voice chunks immediately (their flush trick) | |
| function processVoiceChunk(chunk) {{ | |
| audioChunksBuffer.push(chunk); | |
| const chunkCount = audioChunksBuffer.length; | |
| statusDiv.textContent = `π΄ Processing voice (${{chunkCount}} chunks captured)`; | |
| addTranscription(`Voice detected - sending to STT service...`, chunk.timestamp); | |
| // REAL-TIME STT: Send chunk immediately to STT service | |
| sendChunkToSTT(chunk, chunkCount); | |
| }} | |
| // NEW: Send individual audio chunks to STT service in real-time | |
| async function sendChunkToSTT(chunk, chunkIndex) {{ | |
| try {{ | |
| console.log(`π€ Sending chunk ${{chunkIndex}} to STT service...`); | |
| // Convert base64 audio data to blob | |
| const audioBytes = atob(chunk.audio_data); | |
| const arrayBuffer = new ArrayBuffer(audioBytes.length); | |
| const uint8Array = new Uint8Array(arrayBuffer); | |
| for (let i = 0; i < audioBytes.length; i++) {{ | |
| uint8Array[i] = audioBytes.charCodeAt(i); | |
| }} | |
| const audioBlob = new Blob([uint8Array], {{ type: 'audio/webm;codecs=opus' }}); | |
| // Create form data for STT service | |
| const formData = new FormData(); | |
| formData.append('audio_chunk', audioBlob, `chunk_${{chunkIndex}}.webm`); | |
| formData.append('chunk_index', chunkIndex); | |
| formData.append('timestamp', chunk.timestamp); | |
| formData.append('sample_rate', chunk.sample_rate); | |
| // Store audio chunk data for Streamlit to process | |
| // (Streamlit doesn't support custom POST endpoints, so we store in window) | |
| if (!window.audioChunksForSTT) {{ | |
| window.audioChunksForSTT = []; | |
| }} | |
| window.audioChunksForSTT.push({{ | |
| audioData: chunk.audio_data, | |
| chunkIndex: chunkIndex, | |
| timestamp: chunk.timestamp, | |
| processed: false | |
| }}); | |
| console.log(`π¦ Stored chunk ${{chunkIndex}} for STT processing`); | |
| // π BRIDGE: Send chunk to Streamlit via component communication | |
| try {{ | |
| // Convert audio blob to base64 for bridge transfer | |
| const arrayBuffer = await audioBlob.arrayBuffer(); | |
| const audioArray = new Uint8Array(arrayBuffer); | |
| let binary = ''; | |
| for (let i = 0; i < audioArray.length; i++) {{ | |
| binary += String.fromCharCode(audioArray[i]); | |
| }} | |
| const audioBase64 = btoa(binary); | |
| // Create chunk data for bridge | |
| const bridgeData = {{ | |
| chunkIndex: chunkIndex, | |
| audioBase64: audioBase64, | |
| timestamp: chunk.timestamp, | |
| sampleRate: chunk.sample_rate, | |
| needsProcessing: true, | |
| createdAt: new Date().toISOString() | |
| }}; | |
| // Send to Streamlit via iframe communication | |
| const streamlitData = {{ | |
| audioChunks: [bridgeData], | |
| action: 'processAudio', | |
| timestamp: new Date().toISOString() | |
| }}; | |
| // π STREAMLIT BRIDGE: Try multiple communication methods | |
| let bridgeSuccess = false; | |
| if (typeof window.Streamlit !== 'undefined') {{ | |
| try {{ | |
| // Use official Streamlit component communication | |
| window.Streamlit.setComponentValue(streamlitData); | |
| console.log(`π BRIDGE: Used Streamlit.setComponentValue for chunk ${{chunkIndex}}`); | |
| bridgeSuccess = true; | |
| }} catch (e) {{ | |
| console.warn(`π BRIDGE: Streamlit.setComponentValue failed:`, e); | |
| }} | |
| }} | |
| if (!bridgeSuccess && window.parent && window.parent.postMessage) {{ | |
| // Fallback to postMessage with correct Streamlit format | |
| window.parent.postMessage({{ | |
| type: 'streamlit:setComponentValue', | |
| value: streamlitData | |
| }}, '*'); | |
| console.log(`π BRIDGE: Used postMessage fallback for chunk ${{chunkIndex}}`); | |
| // Don't mark as success yet - postMessage may not reach Streamlit | |
| // Let it fall through to polling fallback as well | |
| console.log(`π BRIDGE: Also storing in polling fallback as backup...`); | |
| }} | |
| if (!bridgeSuccess) {{ | |
| // π POLLING FALLBACK: Store in global window for polling | |
| if (!window.streamlitPendingData) window.streamlitPendingData = []; | |
| window.streamlitPendingData.push(streamlitData); | |
| console.log(`π POLLING: Stored chunk ${{chunkIndex}} in window.streamlitPendingData (queue length: ${{window.streamlitPendingData.length}})`); | |
| bridgeSuccess = true; | |
| }} | |
| // Display bridge status | |
| const transcriptionDiv = document.createElement('div'); | |
| transcriptionDiv.innerHTML = `<strong>${{new Date().toLocaleTimeString()}}:</strong> π Bridge: Processed chunk ${{chunkIndex}} via Streamlit communication`; | |
| document.getElementById('transcription-output').appendChild(transcriptionDiv); | |
| return; // Successfully handled by Streamlit bridge! | |
| }} catch (bridgeError) {{ | |
| console.error('π BRIDGE: Failed to send to Streamlit:', bridgeError); | |
| // Show error in UI | |
| const errorDiv = document.createElement('div'); | |
| errorDiv.innerHTML = `<strong>${{new Date().toLocaleTimeString()}}:</strong> β Bridge Error: ${{bridgeError.message}}`; | |
| errorDiv.style.color = 'red'; | |
| document.getElementById('transcription-output').appendChild(errorDiv); | |
| }} | |
| // β BRIDGE ONLY: All audio processing now goes through Streamlit bridge | |
| console.log(`β Audio chunk ${{chunkIndex}} queued for Streamlit bridge processing`); | |
| // Show visual confirmation that chunk was queued for bridge | |
| const queuedDiv = document.createElement('div'); | |
| queuedDiv.innerHTML = `<strong>${{new Date().toLocaleTimeString()}}:</strong> π€ Queued chunk ${{chunkIndex}} for STT processing`; | |
| queuedDiv.style.color = 'blue'; | |
| document.getElementById('transcription-output').appendChild(queuedDiv); | |
| }} catch (error) {{ | |
| console.error(`Error processing chunk ${{chunkIndex}}:`, error); | |
| addTranscription(`β Error processing chunk ${{chunkIndex}}: ${{error.message}}`, new Date().toISOString(), true); | |
| }} | |
| }} | |
| // UNMUTE.SH: Exact transcription display pattern | |
| function addTranscription(text, timestamp, isError = false) {{ | |
| const item = document.createElement('div'); | |
| item.style.cssText = 'margin: 5px 0; padding: 8px; background: ' + (isError ? '#f8d7da' : 'white') + '; border-radius: 4px; border-left: 4px solid ' + (isError ? '#dc3545' : '#28a745') + ';'; | |
| const time = new Date(timestamp).toLocaleTimeString(); | |
| item.innerHTML = `<strong>${{time}}:</strong> ${{text}}`; | |
| transcriptionsDiv.appendChild(item); | |
| transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight; | |
| }} | |
| // Expose unmute.sh data for Streamlit | |
| window.getUnmuteAudioChunks = function() {{ | |
| return window.unmuteAudioChunks || []; | |
| }}; | |
| window.clearUnmuteBuffer = function() {{ | |
| audioChunksBuffer = []; | |
| window.unmuteAudioChunks = []; | |
| window.audioProcessingReady = false; | |
| processBtn.disabled = true; | |
| statusDiv.textContent = 'Buffer cleared - ready to record'; | |
| transcriptionsDiv.innerHTML = 'Transcriptions will appear here...'; | |
| }}; | |
| // π INTEGRATED POLLING: Check for queued audio chunks and send to Streamlit | |
| function checkPendingAudioData() {{ | |
| try {{ | |
| // Check for pending data in window | |
| if (typeof window.streamlitPendingData !== 'undefined' && window.streamlitPendingData.length > 0) {{ | |
| const pendingData = window.streamlitPendingData; | |
| window.streamlitPendingData = []; // Clear after reading | |
| console.log('π INTEGRATED POLLING: Found', pendingData.length, 'pending data sets'); | |
| // Send all pending data to Streamlit | |
| const combinedData = {{ | |
| type: 'pollingData', | |
| audioChunks: pendingData, | |
| timestamp: new Date().toISOString(), | |
| totalChunks: pendingData.reduce((sum, data) => sum + (data.audioChunks ? data.audioChunks.length : 0), 0) | |
| }}; | |
| // Send via Streamlit component communication (we have access here!) | |
| if (typeof window.Streamlit !== 'undefined') {{ | |
| window.Streamlit.setComponentValue(combinedData); | |
| console.log('π INTEGRATED POLLING: Successfully sent', combinedData.totalChunks, 'chunks to Streamlit backend'); | |
| // Show visual confirmation | |
| const confirmationDiv = document.createElement('div'); | |
| confirmationDiv.innerHTML = `<strong>${{new Date().toLocaleTimeString()}}:</strong> π‘ BACKEND: Sent ${{combinedData.totalChunks}} queued chunks to STT service`; | |
| confirmationDiv.style.color = 'green'; | |
| confirmationDiv.style.fontWeight = 'bold'; | |
| document.getElementById('transcription-output').appendChild(confirmationDiv); | |
| }} else {{ | |
| console.warn('π INTEGRATED POLLING: Streamlit API not available'); | |
| // Put data back if we can't send it | |
| if (!window.streamlitPendingData) window.streamlitPendingData = []; | |
| window.streamlitPendingData.push(...pendingData); | |
| }} | |
| }} | |
| }} catch (error) {{ | |
| console.error('π INTEGRATED POLLING: Error checking pending data:', error); | |
| }} | |
| }} | |
| // Start periodic polling every 2 seconds | |
| setInterval(checkPendingAudioData, 2000); | |
| console.log('π INTEGRATED POLLING: Started polling for queued audio data every 2 seconds'); | |
| </script> | |
| <style> | |
| #webrtc-container {{ | |
| padding: 20px; | |
| border: 1px solid #ddd; | |
| border-radius: 8px; | |
| margin: 10px 0; | |
| background: #f9f9f9; | |
| }} | |
| #status {{ | |
| font-weight: bold; | |
| margin: 10px 0; | |
| padding: 10px; | |
| border-radius: 4px; | |
| background: white; | |
| }} | |
| #audio-chunks {{ | |
| color: #666; | |
| font-size: 0.9em; | |
| margin: 10px 0; | |
| }} | |
| button {{ | |
| margin: 5px; | |
| padding: 10px 15px; | |
| border: none; | |
| border-radius: 4px; | |
| cursor: pointer; | |
| font-size: 14px; | |
| }} | |
| button:disabled {{ | |
| opacity: 0.5; | |
| cursor: not-allowed; | |
| }} | |
| #start-webrtc {{ | |
| background: #dc3545; | |
| color: white; | |
| }} | |
| #stop-webrtc {{ | |
| background: #6c757d; | |
| color: white; | |
| }} | |
| </style> | |
| """ | |
| # Render the functional WebRTC component with bridge communication | |
| component_value = st.components.v1.html(webrtc_html, height=200) | |
| # π BRIDGE: Check for audio chunks from JavaScript | |
| if component_value: | |
| # JavaScript can send data back to Streamlit via return values | |
| self._process_bridge_data(component_value) | |
| # π POLLING: Check for data in JavaScript window object (fallback method) | |
| self._check_javascript_polling_data() | |
| # Auto-refresh for real-time processing | |
| if hasattr(st.session_state, 'pending_audio_chunks') and st.session_state.pending_audio_chunks: | |
| st.info(f"π Processing {len(st.session_state.pending_audio_chunks)} pending chunks...") | |
| # Clear after showing status | |
| st.session_state.pending_audio_chunks = [] | |
| # Add auto-refresh button for bridge testing | |
| col1, col2 = st.columns([1, 1]) | |
| with col1: | |
| if st.button("π Check Bridge Queue", key="check_bridge_queue_btn"): | |
| # Force refresh to check for new chunks | |
| st.rerun() | |
| with col2: | |
| # Show bridge status | |
| pending_count = len(getattr(st.session_state, 'pending_audio_chunks', [])) | |
| st.metric("π Bridge Queue", f"{pending_count} chunks") | |
| # Service connectivity and performance test | |
| st.subheader("π STT Service Performance") | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| if st.button("Test Connection", key="test_connection_btn"): | |
| try: | |
| response = requests.get(f"{self.stt_service_url}/", timeout=5) | |
| if response.status_code == 200: | |
| st.success("β STT Service is reachable") | |
| else: | |
| st.error(f"β STT Service returned {response.status_code}") | |
| except Exception as e: | |
| st.error(f"β STT Service connection failed: {e}") | |
| with col2: | |
| if st.button("Warm Up Client", key="warm_up_client_btn"): | |
| try: | |
| start_time = datetime.now() | |
| client = self.client | |
| if client: | |
| warmup_time = (datetime.now() - start_time).total_seconds() | |
| st.success(f"β Client warmed up in {warmup_time:.2f}s") | |
| st.session_state.client_warmed = True | |
| else: | |
| st.error("β Failed to warm up client") | |
| except Exception as e: | |
| st.error(f"β Client warmup failed: {e}") | |
| with col3: | |
| if st.button("Performance Info", key="performance_info_btn"): | |
| st.info(""" | |
| **Optimization Features:** | |
| - π Persistent client connection | |
| - β‘ Async execution with thread pool | |
| - π Latency monitoring | |
| - π Auto client reset on errors | |
| - π― Optimized for 'base' model speed | |
| """) | |
| # Display performance metrics | |
| if hasattr(st.session_state, 'client_warmed') and st.session_state.client_warmed: | |
| st.success("π₯ Client is warmed up and ready for optimal performance") | |
# Module-level entry point: Streamlit re-executes this script top-to-bottom on
# every interaction, so the handler is constructed fresh each rerun (its
# session-scoped state lives in st.session_state, initialized in __init__).
# NOTE(review): no __main__ guard on purpose — Streamlit runs the script
# directly, and guarding these lines would prevent the UI from rendering.
webrtc_handler = StreamlitWebRTCHandler()
# Render the full WebRTC recording UI (HTML/JS component, bridge polling,
# and the service-connectivity/performance panels).
webrtc_handler.create_webrtc_interface()