# fast_rep_voice / streamlit_app.py
# Source: Hugging Face Space by Krish-05 — "Update streamlit_app.py" (commit 073f4d8, verified)
import logging
import logging.handlers
import time
import os
import io
import soundfile as sf
import requests
import numpy as np
import pydub
import streamlit as st
from twilio.rest import Client
from streamlit_webrtc import WebRtcMode, webrtc_streamer
from stt_module import AudioBufferProcessor # Import our custom processor
logger = logging.getLogger(__name__)
# --- Session State Initialization ---
# Seed every key this app relies on exactly once per browser session.
_SESSION_DEFAULTS = {
    "is_recording": False,            # True while audio is being captured
    "transcribed_text": "",           # last Whisper transcription result
    "audio_processor_instance": None, # AudioBufferProcessor kept across reruns
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
# --- Utility Functions ---
# BUG FIX: the bare @st.cache_data cached the Twilio token's ICE servers for
# the lifetime of the process, but Network Traversal Service credentials
# expire. Bound the cache with a TTL so fresh servers are fetched periodically.
@st.cache_data(ttl=3600)
def get_ice_servers():
    """Return ICE server configuration for the WebRTC connection.

    Uses Twilio's Network Traversal Service when TWILIO_ACCOUNT_SID and
    TWILIO_AUTH_TOKEN are set in the environment. Otherwise falls back to
    Google's public STUN server, which provides no TURN relay and may be
    less reliable behind restrictive NATs.

    Returns:
        list[dict]: ICE server entries suitable for the ``iceServers`` key
        of an RTCConfiguration.
    """
    try:
        account_sid = os.environ["TWILIO_ACCOUNT_SID"]
        auth_token = os.environ["TWILIO_AUTH_TOKEN"]
    except KeyError:
        logger.warning(
            "Twilio credentials (TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN) are not set. "
            "Falling back to a free STUN server from Google. "
            "This might be less reliable for WebRTC connections."
        )
        return [{"urls": ["stun:stun.l.google.com:19302"]}]

    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    return token.ice_servers
def _segment_to_wav_buffer(segment):
    """Convert a pydub AudioSegment into an in-memory 16-bit PCM WAV file.

    Args:
        segment: the recorded pydub.AudioSegment.

    Returns:
        io.BytesIO positioned at the start of the WAV data.
    """
    samples = np.array(segment.get_array_of_samples())
    # pydub returns interleaved samples; soundfile expects (frames, channels).
    if segment.channels > 1:
        samples = samples.reshape((-1, segment.channels))
    # BUG FIX: the original cast to float32 WITHOUT dividing by 32768 and then
    # wrote subtype='PCM_16'. soundfile interprets float input as full-scale
    # [-1, 1], so every sample clipped and the WAV was unintelligible.
    # Writing native int16 maps directly to PCM_16 with no scaling.
    # NOTE(review): assumes 16-bit source audio (sample_width == 2), the
    # WebRTC default — confirm against AudioBufferProcessor.
    wav_file_buffer = io.BytesIO()
    sf.write(wav_file_buffer, samples.astype(np.int16), segment.frame_rate,
             format='WAV', subtype='PCM_16')
    wav_file_buffer.seek(0)  # Rewind so the upload starts at byte 0
    return wav_file_buffer


def _animate_transcription(placeholder, text, delay=0.02):
    """Reveal *text* character by character inside *placeholder*.

    Args:
        placeholder: an st.empty() container to render into.
        text: the full transcription to animate.
        delay: seconds to sleep between characters.
    """
    shown = ""
    for i, char in enumerate(text):
        shown += char
        # Unique key per frame avoids DuplicateWidgetID within one run.
        placeholder.text_area("Transcription Result", value=shown,
                              height=150, disabled=True, key=f"anim_{i}")
        time.sleep(delay)
    # Ensure the final, complete text is what remains on screen.
    placeholder.text_area("Transcription Result", value=text,
                          height=150, disabled=True, key="anim_final")


def _transcribe_and_display(wav_file_buffer, placeholder):
    """POST the WAV buffer to the Whisper FastAPI endpoint and show the result.

    Stores the transcription in st.session_state.transcribed_text and
    animates it into *placeholder*. All request errors are reported via
    st.error and logged; nothing is raised to the caller.
    """
    WHISPER_API_URL = "http://localhost:1990/transcribe_audio/"
    try:
        files = {'audio_file': ('recorded_audio.wav', wav_file_buffer, 'audio/wav')}
        # Generous timeout: Whisper transcription of long clips is slow.
        response = requests.post(WHISPER_API_URL, files=files, timeout=120)
        response.raise_for_status()  # Surface HTTP 4xx/5xx as exceptions
        transcription_data = response.json()
        full_transcribed_text = transcription_data.get("transcription", "No transcription found.")
        st.session_state.transcribed_text = full_transcribed_text
        _animate_transcription(placeholder, full_transcribed_text)
        st.success("Transcription complete!")
        logger.info(f"Transcription received: '{full_transcribed_text[:100]}...'")
    except requests.exceptions.ConnectionError as e:
        st.error(f"Could not connect to Whisper API at {WHISPER_API_URL}. Is the FastAPI server running on port 1990?")
        logger.error(f"Connection Error: {e}", exc_info=True)
    except requests.exceptions.Timeout:
        st.error("Whisper API request timed out. The model might be busy or the audio too long. Try a shorter recording.")
        logger.error("Request Timeout.", exc_info=True)
    except requests.exceptions.RequestException as e:
        # BUG FIX: `if e.response` is falsy for any 4xx/5xx Response object
        # (Response truthiness follows `ok`), which hid the error body.
        # Test identity against None instead.
        st.error(f"Error during API request: {e}. Response: {e.response.text if e.response is not None else 'No response'}")
        logger.error(f"API Request Error: {e}", exc_info=True)
    except Exception as e:
        st.error(f"An unexpected error occurred during transcription: {e}")
        logger.error(f"Unexpected Transcription Error: {e}", exc_info=True)


def main():
    """Render the recording UI: capture microphone audio over WebRTC, then
    send it to a local Whisper endpoint and animate the transcription."""
    st.header("Whisper Speech-to-Text with Recording")
    st.markdown(
        """
    Click "Start Recording" to begin capturing audio from your microphone.
    Click "Stop Recording" to end the capture, save the audio,
    and send it to the Whisper model for transcription.
    The transcribed text will appear character by character below.
    """
    )

    # Initialize the webrtc_streamer once.
    webrtc_ctx = webrtc_streamer(
        key="audio_recorder",
        mode=WebRtcMode.SENDONLY,
        audio_processor_factory=AudioBufferProcessor,
        rtc_configuration={"iceServers": get_ice_servers()},
        media_stream_constraints={"video": False, "audio": True},
        async_processing=True,
    )

    # Keep a handle to the processor so its buffered audio survives reruns.
    if webrtc_ctx.audio_processor and st.session_state.audio_processor_instance is None:
        st.session_state.audio_processor_instance = webrtc_ctx.audio_processor

    # Display status of the WebRTC connection
    if webrtc_ctx.state.playing:
        st.success("Microphone connected. Ready to record.")
    else:
        st.warning("Waiting for microphone connection... Please allow microphone access.")

    # --- Recording Controls ---
    col1, col2 = st.columns(2)
    with col1:
        # Disable "Start Recording" if already recording or mic not connected
        start_button = st.button(
            "Start Recording",
            key="start_rec_btn",
            disabled=st.session_state.is_recording or not webrtc_ctx.state.playing,
        )
    with col2:
        # Disable "Stop Recording" if not recording
        stop_button = st.button(
            "Stop Recording",
            key="stop_rec_btn",
            disabled=not st.session_state.is_recording,
        )

    # BUG FIX: st.text_area() returns the widget's *value* (a string), not a
    # container, so the original crashed calling .empty()/.text_area() on it.
    # Render the transcription inside an st.empty() placeholder instead.
    transcription_placeholder = st.empty()
    transcription_placeholder.text_area(
        "Transcription Result",
        value=st.session_state.transcribed_text,
        height=150,
        disabled=True,
        key="transcription_display",
    )

    # Logic for Start/Stop buttons
    if start_button:
        if webrtc_ctx.state.playing:
            st.session_state.is_recording = True
            st.session_state.transcribed_text = ""  # Clear previous text
            st.info("Recording... Click 'Stop Recording' to transcribe.")
            logger.info("Recording started.")
            st.rerun()  # Immediately update button states
        else:
            st.error("Cannot start recording: Microphone not connected. Please allow microphone access.")

    if stop_button and st.session_state.is_recording:  # Only process if recording was active
        st.session_state.is_recording = False
        st.info("Processing recording... Please wait.")
        logger.info("Recording stopped. Processing audio...")

        processor = st.session_state.audio_processor_instance
        if processor is None:
            st.error("Audio processor instance not found. Please refresh the app and allow microphone access.")
        else:
            # Retrieve all buffered audio from the processor instance
            recorded_audio = processor.get_and_clear_buffered_audio()
            if len(recorded_audio) > 0:
                wav_file_buffer = _segment_to_wav_buffer(recorded_audio)
                _transcribe_and_display(wav_file_buffer, transcription_placeholder)
            else:
                st.warning("No audio recorded. Please ensure your microphone is active and you spoke.")
                logger.warning("No audio recorded after stopping.")
        # Trigger a rerun to update button states and display transcription
        st.rerun()
if __name__ == "__main__":
    # Debug logging is ON unless DEBUG is explicitly one of the "off" values.
    debug_enabled = os.environ.get("DEBUG", "false").lower() not in {"false", "no", "0"}
    log_level = logging.DEBUG if debug_enabled else logging.INFO

    logging.basicConfig(
        format=(
            "[%(asctime)s] %(levelname)7s from %(name)s in %(pathname)s:%(lineno)d: "
            "%(message)s"
        ),
        force=True,
    )
    logger.setLevel(level=log_level)

    # Keep streamlit_webrtc's verbosity in step with our own logger.
    logging.getLogger("streamlit_webrtc").setLevel(log_level)

    main()