# Streamlit speech-to-text client (Hugging Face Space; page-scrape header removed).
| import logging | |
| import logging.handlers | |
| import time | |
| import os | |
| import io | |
| import soundfile as sf | |
| import requests | |
| import numpy as np | |
| import pydub | |
| import streamlit as st | |
| from twilio.rest import Client | |
| from streamlit_webrtc import WebRtcMode, webrtc_streamer | |
| from stt_module import AudioBufferProcessor # Import our custom processor | |
logger = logging.getLogger(__name__)

# --- Session State Initialization ---
# Seed every key this app relies on so later reads never KeyError across reruns.
_SESSION_DEFAULTS = {
    "is_recording": False,            # True while the user is capturing audio
    "transcribed_text": "",           # last transcription result shown in the UI
    "audio_processor_instance": None, # AudioBufferProcessor kept across reruns
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
| # --- Utility Functions --- | |
def get_ice_servers():
    """Return the ICE server list for the WebRTC connection.

    Prefers Twilio's TURN/STUN servers when TWILIO_ACCOUNT_SID and
    TWILIO_AUTH_TOKEN are set in the environment; otherwise — or if the
    Twilio API call itself fails — falls back to Google's public STUN server.

    Returns:
        A list of ICE server descriptions suitable for
        ``rtc_configuration={"iceServers": ...}``.
    """
    stun_fallback = [{"urls": ["stun:stun.l.google.com:19302"]}]
    try:
        account_sid = os.environ["TWILIO_ACCOUNT_SID"]
        auth_token = os.environ["TWILIO_AUTH_TOKEN"]
    except KeyError:
        logger.warning(
            "Twilio credentials (TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN) are not set. "
            "Falling back to a free STUN server from Google. "
            "This might be less reliable for WebRTC connections."
        )
        return stun_fallback

    try:
        client = Client(account_sid, auth_token)
        # Twilio's Network Traversal Service: a fresh token carries a list of
        # usable STUN/TURN servers in .ice_servers.
        token = client.tokens.create()
        return token.ice_servers
    except Exception:
        # ROBUSTNESS FIX: previously a Twilio API/network failure raised at
        # render time and crashed the whole app. Degrade to the STUN fallback,
        # mirroring the missing-credentials path above.
        logger.exception("Failed to fetch ICE servers from Twilio; falling back to STUN.")
        return stun_fallback
# Endpoint of the FastAPI service that hosts the Whisper model.
_WHISPER_API_URL = "http://localhost:1990/transcribe_audio/"


def _audio_segment_to_wav_buffer(segment):
    """Convert a pydub AudioSegment to an in-memory 16-bit PCM WAV file."""
    samples = np.array(segment.get_array_of_samples())
    # Interleaved samples must be reshaped to (frames, channels) for soundfile.
    if segment.channels > 1:
        samples = samples.reshape((-1, segment.channels))
    # BUG FIX: the samples are integer PCM, but the original cast them to raw
    # float32 and wrote with subtype='PCM_16' — soundfile interprets float
    # input as being in [-1.0, 1.0], so full-scale int16 values clipped the
    # output into noise. Writing int16 lets soundfile scale correctly.
    # NOTE(review): assumes 16-bit capture (the WebRTC default) — confirm
    # against AudioBufferProcessor's sample width.
    buf = io.BytesIO()
    sf.write(buf, samples.astype(np.int16), segment.frame_rate, format='WAV', subtype='PCM_16')
    buf.seek(0)  # Rewind so requests streams the file from the start.
    return buf


def _request_transcription(wav_buffer):
    """POST the WAV buffer to the Whisper API and return the transcription text.

    Raises:
        requests.exceptions.RequestException: on connection, timeout, or HTTP errors.
    """
    files = {'audio_file': ('recorded_audio.wav', wav_buffer, 'audio/wav')}
    response = requests.post(_WHISPER_API_URL, files=files, timeout=120)  # long timeout: transcription is slow
    response.raise_for_status()  # Raise for HTTP 4xx/5xx
    return response.json().get("transcription", "No transcription found.")


def _animate_transcription(placeholder, text, delay=0.02):
    """Reveal *text* in *placeholder* one character at a time."""
    shown = ""
    for char in text:
        shown += char
        # st.text is stateless (it has no widget key), so re-rendering it every
        # frame inside the placeholder cannot raise DuplicateWidgetID the way a
        # keyed widget such as st.text_area would.
        placeholder.text(shown)
        time.sleep(delay)  # Adjust speed as desired (e.g., 0.05 for slower)


def main():
    """Render the recording UI, capture microphone audio, and transcribe it."""
    st.header("Whisper Speech-to-Text with Recording")
    st.markdown(
        """
        Click "Start Recording" to begin capturing audio from your microphone.
        Click "Stop Recording" to end the capture, save the audio,
        and send it to the Whisper model for transcription.
        The transcribed text will appear character by character below.
        """
    )

    # Initialize the webrtc_streamer once.
    webrtc_ctx = webrtc_streamer(
        key="audio_recorder",
        mode=WebRtcMode.SENDONLY,
        audio_processor_factory=AudioBufferProcessor,
        rtc_configuration={"iceServers": get_ice_servers()},
        media_stream_constraints={"video": False, "audio": True},
        async_processing=True,
    )

    # Keep a handle on the processor so buffered audio survives script reruns.
    if webrtc_ctx.audio_processor and st.session_state.audio_processor_instance is None:
        st.session_state.audio_processor_instance = webrtc_ctx.audio_processor

    # Display status of the WebRTC connection.
    if webrtc_ctx.state.playing:
        st.success("Microphone connected. Ready to record.")
    else:
        st.warning("Waiting for microphone connection... Please allow microphone access.")

    # --- Recording Controls ---
    col1, col2 = st.columns(2)
    with col1:
        start_button = st.button(
            "Start Recording",
            key="start_rec_btn",
            # Can't start while already recording or before the mic connects.
            disabled=st.session_state.is_recording or not webrtc_ctx.state.playing,
        )
    with col2:
        stop_button = st.button(
            "Stop Recording",
            key="stop_rec_btn",
            disabled=not st.session_state.is_recording,
        )

    # BUG FIX: st.text_area(...) returns the widget's *value* (a str), not a
    # container, so the original's transcription_text_area.empty() and
    # .text_area(...) calls raised AttributeError. Render the widget into an
    # st.empty() placeholder instead; the same placeholder is reused for the
    # character animation.
    transcription_placeholder = st.empty()
    transcription_placeholder.text_area(
        "Transcription Result",
        value=st.session_state.transcribed_text,
        height=150,
        disabled=True,
    )

    if start_button:
        if webrtc_ctx.state.playing:
            st.session_state.is_recording = True
            st.session_state.transcribed_text = ""  # Clear previous text
            st.info("Recording... Click 'Stop Recording' to transcribe.")
            logger.info("Recording started.")
            st.rerun()  # Immediately update button states / cleared display
        else:
            st.error("Cannot start recording: Microphone not connected. Please allow microphone access.")

    if stop_button:
        if st.session_state.is_recording:  # Only process if recording was active
            st.session_state.is_recording = False
            st.info("Processing recording... Please wait.")
            logger.info("Recording stopped. Processing audio...")

            processor = st.session_state.audio_processor_instance
            if processor:
                # Retrieve all buffered audio from the processor instance.
                recorded_audio = processor.get_and_clear_buffered_audio()
                if len(recorded_audio) > 0:
                    wav_buffer = _audio_segment_to_wav_buffer(recorded_audio)
                    try:
                        full_transcribed_text = _request_transcription(wav_buffer)
                        st.session_state.transcribed_text = full_transcribed_text
                        _animate_transcription(transcription_placeholder, full_transcribed_text)
                        st.success("Transcription complete!")
                        logger.info(f"Transcription received: '{full_transcribed_text[:100]}...'")
                    except requests.exceptions.ConnectionError as e:
                        st.error(f"Could not connect to Whisper API at {_WHISPER_API_URL}. Is the FastAPI server running on port 1990?")
                        logger.error(f"Connection Error: {e}", exc_info=True)
                    except requests.exceptions.Timeout:
                        st.error("Whisper API request timed out. The model might be busy or the audio too long. Try a shorter recording.")
                        logger.error("Request Timeout.", exc_info=True)
                    except requests.exceptions.RequestException as e:
                        st.error(f"Error during API request: {e}. Response: {e.response.text if e.response else 'No response'}")
                        logger.error(f"API Request Error: {e}", exc_info=True)
                    except Exception as e:
                        st.error(f"An unexpected error occurred during transcription: {e}")
                        logger.error(f"Unexpected Transcription Error: {e}", exc_info=True)
                else:
                    st.warning("No audio recorded. Please ensure your microphone is active and you spoke.")
                    logger.warning("No audio recorded after stopping.")
            else:
                st.error("Audio processor instance not found. Please refresh the app and allow microphone access.")
        # Rerun to update button states and show the final transcription in the
        # (disabled) text area rendered above from session state.
        st.rerun()
| if __name__ == "__main__": | |
| DEBUG = os.environ.get("DEBUG", "false").lower() not in ["false", "no", "0"] | |
| logging.basicConfig( | |
| format="[%(asctime)s] %(levelname)7s from %(name)s in %(pathname)s:%(lineno)d: " | |
| "%(message)s", | |
| force=True, | |
| ) | |
| logger.setLevel(level=logging.DEBUG if DEBUG else logging.INFO) | |
| st_webrtc_logger = logging.getLogger("streamlit_webrtc") | |
| st_webrtc_logger.setLevel(logging.DEBUG if DEBUG else logging.INFO) | |
| # Removed fsevents logger as Pathlib is not explicitly imported or used as much here | |
| # fsevents_logger = logging.getLogger("fsevents") | |
| # fsevents_logger.setLevel(logging.WARNING) | |
| main() |