# D:\jan-contract\components\chat_interface.py
"""Reusable Streamlit chat component with text and voice (WebRTC) input."""

import io
import threading
import time
import wave
from typing import Optional

import av
import speech_recognition as sr
import streamlit as st
from gtts import gTTS
from streamlit_webrtc import WebRtcMode, webrtc_streamer

# --- Setup ---
recognizer = sr.Recognizer()
recognizer.energy_threshold = 300  # Lower threshold for better sensitivity
recognizer.dynamic_energy_threshold = True
recognizer.pause_threshold = 0.8

# Audio captured by the WebRTC callback, keyed by component instance. The
# callback runs in a worker thread where st.session_state is not available,
# so frames are collected in this lock-guarded module-level buffer instead.
_audio_buffers: dict = {}
_audio_lock = threading.Lock()


def text_to_speech(text: str) -> Optional[bytes]:
    """Converts text to in-memory MP3 bytes, or None on failure."""
    try:
        audio_io = io.BytesIO()
        tts = gTTS(text=text, lang="en", slow=False)
        tts.write_to_fp(audio_io)
        audio_io.seek(0)
        return audio_io.read()
    except Exception as e:
        st.error(f"Error during Text-to-Speech: {e}")
        return None


def chat_interface(handler_function, session_state_key: str):
    """
    A reusable component that provides a full text and voice chat interface.

    Args:
        handler_function: The function to call with the user's text input.
        session_state_key (str): A unique key used to store chat history and
            as a base for widget keys.
    """
    st.subheader("💬 Chat via Text")

    if session_state_key not in st.session_state:
        st.session_state[session_state_key] = []

    for message in st.session_state[session_state_key]:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("Ask a question...", key=f"chat_input_{session_state_key}"):
        st.session_state[session_state_key].append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                response = handler_function(prompt)
            st.markdown(response)
        st.session_state[session_state_key].append({"role": "assistant", "content": response})

    st.divider()
    st.subheader("🎙️ Chat via Voice")
    st.info("🎤 **Instructions:** Click START to begin recording, speak your question clearly, then click STOP.")

    # Initialize per-component state
    voice_key = f"voice_{session_state_key}"
    if f"{voice_key}_processing" not in st.session_state:
        st.session_state[f"{voice_key}_processing"] = False
    if f"{voice_key}_recording_start" not in st.session_state:
        st.session_state[f"{voice_key}_recording_start"] = None
    if f"{voice_key}_component_key" not in st.session_state:
        # Unique key per instance to avoid component registration clashes
        st.session_state[f"{voice_key}_component_key"] = f"voice-chat-{session_state_key}-{int(time.time())}"

    with _audio_lock:
        buf = _audio_buffers.setdefault(voice_key, {"frames": [], "bytes": 0, "recording": False})

    # Converts every incoming frame to 16 kHz mono, 16-bit PCM for SpeechRecognition.
    resampler = av.AudioResampler(format="s16", layout="mono", rate=16000)

    def audio_frame_callback(frame: av.AudioFrame):
        """Collects audio frames while recording.

        Runs in a WebRTC worker thread, where st.session_state and st.*
        widgets are unavailable, so frames go into the module-level buffer.
        """
        with _audio_lock:
            recording = buf["recording"]
        if not recording:
            return
        try:
            # av.AudioFrame has no reformat(); resample via AudioResampler
            # (PyAV >= 9 returns a list of frames).
            for resampled in resampler.resample(frame):
                chunk = bytes(resampled.planes[0])
                with _audio_lock:
                    buf["frames"].append(chunk)
                    buf["bytes"] += len(chunk)
        except Exception:
            # st.error cannot be called from this thread; drop the bad frame.
            pass

    def process_voice_input():
        """Transcribes the collected audio and runs it through the handler."""
        with _audio_lock:
            audio_data = b"".join(buf["frames"])
            total_bytes = buf["bytes"]
            buf["frames"] = []
            buf["bytes"] = 0
        st.session_state[f"{voice_key}_processing"] = False

        # Short-audio guard (~0.5 s at 16 kHz, 16-bit mono)
        if total_bytes < int(16000 * 2 * 0.5):
            st.error("❌ No audio captured or recording too short. Please speak for at least 1 second and try again.")
            return

        status_placeholder = st.empty()
        status_placeholder.info("🔄 Processing audio...")

        try:
            # Wrap the raw PCM (s16, mono, 16 kHz) in an in-memory WAV container
            with io.BytesIO() as wav_buffer:
                with wave.open(wav_buffer, "wb") as wf:
                    wf.setnchannels(1)      # Mono
                    wf.setsampwidth(2)      # 16-bit
                    wf.setframerate(16000)  # 16 kHz
                    wf.writeframes(audio_data)
                wav_buffer.seek(0)

                with sr.AudioFile(wav_buffer) as source:
                    # Adjust for ambient noise quickly; avoid long pauses
                    recognizer.adjust_for_ambient_noise(source, duration=0.1)
                    audio = recognizer.record(source)

            # Recognize speech, falling back to a second locale
            try:
                user_input = recognizer.recognize_google(audio, language="en-US")
            except sr.UnknownValueError:
                try:
                    user_input = recognizer.recognize_google(audio, language="en-GB")
                except sr.UnknownValueError:
                    st.error("❌ Could not understand the audio. Please speak more clearly and try again.")
                    return

            if not user_input.strip():
                st.error("❌ No speech detected. Please try again.")
                return

            st.write(f"🎤 **You said:** *{user_input}*")

            # Get response from handler
            with st.spinner("🤔 Getting response..."):
                response_text = handler_function(user_input)
            st.write(f"🤖 **Assistant says:** *{response_text}*")

            # Generate audio response
            with st.spinner("🔊 Generating audio response..."):
                audio_response = text_to_speech(response_text)
            if audio_response:
                st.audio(audio_response, format="audio/mp3", start_time=0)
                st.success("✅ Audio response generated!")

            # Add to chat history
            st.session_state[session_state_key].append({"role": "user", "content": user_input})
            st.session_state[session_state_key].append({"role": "assistant", "content": response_text})

        except sr.RequestError as e:
            st.error(f"❌ Speech recognition service error: {e}")
        except Exception as e:
            st.error(f"❌ Error processing audio: {e}")
        finally:
            status_placeholder.empty()

    component_key = st.session_state[f"{voice_key}_component_key"]

    # WebRTC streamer with error handling around component setup
    try:
        ctx = webrtc_streamer(
            key=component_key,
            mode=WebRtcMode.SENDONLY,
            rtc_configuration={
                "iceServers": [
                    {"urls": ["stun:stun.l.google.com:19302"]},
                    {"urls": ["stun:stun1.l.google.com:19302"]},
                ]
            },
            audio_frame_callback=audio_frame_callback,
            media_stream_constraints={
                "video": False,
                "audio": {
                    "echoCancellation": True,
                    "noiseSuppression": True,
                    "autoGainControl": True,
                },
            },
            async_processing=True,
        )

        with _audio_lock:
            bytes_captured = buf["bytes"]

        # Handle recording state transitions
        if ctx.state.playing and not st.session_state[f"{voice_key}_processing"]:
            # START was clicked: begin a fresh recording
            st.session_state[f"{voice_key}_processing"] = True
            st.session_state[f"{voice_key}_recording_start"] = time.time()
            with _audio_lock:
                buf["frames"] = []
                buf["bytes"] = 0
                buf["recording"] = True
            st.success("🔴 **Recording started!** Speak your question now...")
        elif ctx.state.playing and st.session_state[f"{voice_key}_processing"]:
            # Show recording progress
            if st.session_state.get(f"{voice_key}_recording_start"):
                approx_seconds = bytes_captured / (16000 * 2)
                st.caption(f"🎤 Recording... ~{approx_seconds:.1f}s captured")

        # Process audio when recording stops
        if not ctx.state.playing and st.session_state[f"{voice_key}_processing"]:
            with _audio_lock:
                buf["recording"] = False
            process_voice_input()

    except Exception as e:
        st.error(f"❌ WebRTC Error: {e}")
        st.info("💡 Try refreshing the page or using a different browser (Chrome recommended).")

        # Fallback: manual audio input
        st.subheader("🔄 Fallback: Manual Audio Input")
        if st.button("Try Alternative Audio Method", key=f"fallback_{voice_key}"):
            st.info("This feature requires WebRTC support. Please ensure your browser supports WebRTC and try again.")
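
# --- Usage sketch ---
# A minimal, illustrative example of wiring this component into a page script.
# The page title and the `answer_question` handler below are hypothetical
# stand-ins, not part of this module; the import path assumes the repo's
# components/ layout shown in the header comment.
#
#     import streamlit as st
#     from components.chat_interface import chat_interface
#
#     def answer_question(prompt: str) -> str:
#         # Hypothetical handler: replace with the real backend call
#         # (LLM, retrieval chain, etc.) that returns a text answer.
#         return f"You asked: {prompt}"
#
#     st.title("Contract Q&A")
#     chat_interface(answer_question, session_state_key="contract_qa_chat")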