import io
import logging
import logging.handlers
import os
import time

import numpy as np
import pydub
import requests
import soundfile as sf
import streamlit as st
from streamlit_webrtc import WebRtcMode, webrtc_streamer
from twilio.rest import Client

from stt_module import AudioBufferProcessor  # Import our custom processor

logger = logging.getLogger(__name__)

# Endpoint of the local FastAPI server hosting the Whisper model.
WHISPER_API_URL = "http://localhost:1990/transcribe_audio/"

# --- Session State Initialization ---
if 'is_recording' not in st.session_state:
    st.session_state.is_recording = False
if 'transcribed_text' not in st.session_state:
    st.session_state.transcribed_text = ""
if 'audio_processor_instance' not in st.session_state:
    st.session_state.audio_processor_instance = None


# --- Utility Functions ---
@st.cache_data(ttl=3600)  # Twilio ICE tokens expire; refresh hourly rather than caching forever
def get_ice_servers():
    """Fetch ICE servers for the WebRTC connection.

    Returns:
        A list of ICE server dicts. Uses Twilio when TWILIO_ACCOUNT_SID and
        TWILIO_AUTH_TOKEN are set in the environment; otherwise falls back to
        Google's free public STUN server (less reliable behind strict NATs).
    """
    try:
        account_sid = os.environ["TWILIO_ACCOUNT_SID"]
        auth_token = os.environ["TWILIO_AUTH_TOKEN"]
    except KeyError:
        logger.warning(
            "Twilio credentials (TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN) are not set. "
            "Falling back to a free STUN server from Google. "
            "This might be less reliable for WebRTC connections."
        )
        return [{"urls": ["stun:stun.l.google.com:19302"]}]

    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    return token.ice_servers


def _segment_to_wav_buffer(segment):
    """Serialize a pydub AudioSegment to an in-memory 16-bit PCM WAV file.

    Args:
        segment: The recorded ``pydub.AudioSegment``.

    Returns:
        An ``io.BytesIO`` positioned at offset 0, containing the WAV bytes.
    """
    # Keep the native integer samples. (The previous code cast them to
    # float32 WITHOUT normalizing to [-1, 1]; soundfile treats float input
    # as full-scale [-1, 1], so every sample was clipped to +/-1.)
    samples = np.array(segment.get_array_of_samples())
    # pydub interleaves channels; soundfile expects shape (frames, channels).
    if segment.channels > 1:
        samples = samples.reshape((-1, segment.channels))
    wav_file_buffer = io.BytesIO()
    sf.write(wav_file_buffer, samples, segment.frame_rate, format='WAV', subtype='PCM_16')
    wav_file_buffer.seek(0)  # Rewind the buffer to the beginning
    return wav_file_buffer


def _transcribe_and_display(wav_file_buffer, placeholder):
    """POST the WAV buffer to the Whisper API and animate the transcription.

    Args:
        wav_file_buffer: In-memory WAV file to upload.
        placeholder: An ``st.empty()`` container that hosts the result text area.

    Side effects:
        Updates ``st.session_state.transcribed_text`` and renders status
        messages / the animated text area. Never raises: all request errors
        are caught and surfaced via ``st.error``.
    """
    try:
        files = {'audio_file': ('recorded_audio.wav', wav_file_buffer, 'audio/wav')}
        # Generous timeout: transcription of long clips can be slow.
        response = requests.post(WHISPER_API_URL, files=files, timeout=120)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)

        transcription_data = response.json()
        full_transcribed_text = transcription_data.get("transcription", "No transcription found.")
        st.session_state.transcribed_text = full_transcribed_text

        # --- Character-by-character display logic ---
        # Each call replaces the placeholder's previous widget, so exactly one
        # text area exists at a time (avoids DuplicateWidgetID).
        animated_text = ""
        for char in full_transcribed_text:
            animated_text += char
            placeholder.text_area("Transcription Result", value=animated_text,
                                  height=150, disabled=True)
            time.sleep(0.02)  # Adjust speed as desired (e.g., 0.05 for slower)
        # The last loop iteration already shows the full text; a second render
        # with identical params would collide with it.

        st.success("Transcription complete!")
        logger.info("Transcription received: '%s...'", full_transcribed_text[:100])
    except requests.exceptions.ConnectionError as e:
        st.error(f"Could not connect to Whisper API at {WHISPER_API_URL}. Is the FastAPI server running on port 1990?")
        logger.error(f"Connection Error: {e}", exc_info=True)
    except requests.exceptions.Timeout:
        st.error("Whisper API request timed out. The model might be busy or the audio too long. Try a shorter recording.")
        logger.error("Request Timeout.", exc_info=True)
    except requests.exceptions.RequestException as e:
        st.error(f"Error during API request: {e}. Response: {e.response.text if e.response else 'No response'}")
        logger.error(f"API Request Error: {e}", exc_info=True)
    except Exception as e:
        st.error(f"An unexpected error occurred during transcription: {e}")
        logger.error(f"Unexpected Transcription Error: {e}", exc_info=True)


def main():
    """Render the recording UI, capture microphone audio, and transcribe it."""
    st.header("Whisper Speech-to-Text with Recording")
    st.markdown(
        """
        Click "Start Recording" to begin capturing audio from your microphone.
        Click "Stop Recording" to end the capture, save the audio, and send it
        to the Whisper model for transcription.
        The transcribed text will appear character by character below.
        """
    )

    # Initialize the webrtc_streamer once.
    webrtc_ctx = webrtc_streamer(
        key="audio_recorder",
        mode=WebRtcMode.SENDONLY,
        audio_processor_factory=AudioBufferProcessor,
        rtc_configuration={"iceServers": get_ice_servers()},
        media_stream_constraints={"video": False, "audio": True},
        async_processing=True,
    )

    # Always track the CURRENT processor: a WebRTC reconnect creates a new
    # instance, and holding on to the first one would read a dead buffer.
    if webrtc_ctx.audio_processor is not None:
        st.session_state.audio_processor_instance = webrtc_ctx.audio_processor

    # Display status of the WebRTC connection
    if webrtc_ctx.state.playing:
        st.success("Microphone connected. Ready to record.")
    else:
        st.warning("Waiting for microphone connection... Please allow microphone access.")

    # --- Recording Controls ---
    col1, col2 = st.columns(2)
    with col1:
        # Disable "Start Recording" if already recording or mic not connected
        start_button = st.button(
            "Start Recording",
            key="start_rec_btn",
            disabled=st.session_state.is_recording or not webrtc_ctx.state.playing,
        )
    with col2:
        # Disable "Stop Recording" if not recording
        stop_button = st.button(
            "Stop Recording",
            key="stop_rec_btn",
            disabled=not st.session_state.is_recording,
        )

    # st.text_area() returns the widget's VALUE (a plain string), so it cannot
    # be re-rendered later. Use an st.empty() placeholder container instead and
    # render the text area inside it.
    transcription_placeholder = st.empty()
    transcription_placeholder.text_area(
        "Transcription Result",
        value=st.session_state.transcribed_text,
        height=150,
        disabled=True,
    )

    # Logic for Start/Stop buttons
    if start_button:
        if webrtc_ctx.state.playing:
            st.session_state.is_recording = True
            st.session_state.transcribed_text = ""  # Clear previous text
            # Drop any audio buffered before the user pressed Start, so the
            # transcription covers only the intended recording window.
            if st.session_state.audio_processor_instance:
                st.session_state.audio_processor_instance.get_and_clear_buffered_audio()
            st.info("Recording... Click 'Stop Recording' to transcribe.")
            logger.info("Recording started.")
            st.rerun()  # Use st.rerun() to immediately update UI state
        else:
            st.error("Cannot start recording: Microphone not connected. Please allow microphone access.")

    if stop_button:
        if st.session_state.is_recording:  # Only process if recording was active
            st.session_state.is_recording = False
            st.info("Processing recording... Please wait.")
            logger.info("Recording stopped. Processing audio...")

            # Retrieve all buffered audio from the processor instance
            processor = st.session_state.audio_processor_instance
            if processor is None:
                st.error("Audio processor instance not found. Please refresh the app and allow microphone access.")
            else:
                recorded_audio = processor.get_and_clear_buffered_audio()
                if len(recorded_audio) > 0:
                    wav_file_buffer = _segment_to_wav_buffer(recorded_audio)
                    _transcribe_and_display(wav_file_buffer, transcription_placeholder)
                else:
                    st.warning("No audio recorded. Please ensure your microphone is active and you spoke.")
                    logger.warning("No audio recorded after stopping.")

            # Trigger a rerun to update button states and display transcription
            st.rerun()


if __name__ == "__main__":
    DEBUG = os.environ.get("DEBUG", "false").lower() not in ["false", "no", "0"]

    logging.basicConfig(
        format="[%(asctime)s] %(levelname)7s from %(name)s in %(pathname)s:%(lineno)d: "
        "%(message)s",
        force=True,
    )
    logger.setLevel(level=logging.DEBUG if DEBUG else logging.INFO)

    st_webrtc_logger = logging.getLogger("streamlit_webrtc")
    st_webrtc_logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)

    main()