# fast_rep_voice / streamlit_app.py
# Source: Hugging Face Space by Krish-05 — "Update streamlit_app.py" (commit 073f4d8, verified)
import logging
import logging.handlers
import time
import os
import io
import soundfile as sf
import requests
import numpy as np
import pydub
import streamlit as st
from twilio.rest import Client
from streamlit_webrtc import WebRtcMode, webrtc_streamer
from stt_module import AudioBufferProcessor # Import our custom processor
logger = logging.getLogger(__name__)
# --- Session State Initialization ---
# Seed every key this app relies on exactly once per browser session.
_SESSION_DEFAULTS = {
    "is_recording": False,            # True while audio is being captured
    "transcribed_text": "",           # last Whisper transcription result
    "audio_processor_instance": None, # AudioBufferProcessor kept across reruns
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
# --- Utility Functions ---
# BUG FIX: the bare @st.cache_data cached the Twilio token's ICE servers for
# the lifetime of the process, but Network Traversal Service credentials
# expire. Bound the cache with a TTL so fresh servers are fetched periodically.
@st.cache_data(ttl=3600)
def get_ice_servers():
    """Return ICE server configuration for the WebRTC connection.

    Uses Twilio's Network Traversal Service when TWILIO_ACCOUNT_SID and
    TWILIO_AUTH_TOKEN are set in the environment. Otherwise falls back to
    Google's public STUN server, which provides no TURN relay and may be
    less reliable behind restrictive NATs.

    Returns:
        list[dict]: ICE server entries suitable for the ``iceServers`` key
        of an RTCConfiguration.
    """
    try:
        account_sid = os.environ["TWILIO_ACCOUNT_SID"]
        auth_token = os.environ["TWILIO_AUTH_TOKEN"]
    except KeyError:
        logger.warning(
            "Twilio credentials (TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN) are not set. "
            "Falling back to a free STUN server from Google. "
            "This might be less reliable for WebRTC connections."
        )
        return [{"urls": ["stun:stun.l.google.com:19302"]}]

    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    return token.ice_servers
def _segment_to_wav_buffer(segment):
    """Convert a pydub AudioSegment into an in-memory 16-bit PCM WAV file.

    Args:
        segment: the recorded pydub.AudioSegment.

    Returns:
        io.BytesIO positioned at the start of the WAV data.
    """
    samples = np.array(segment.get_array_of_samples())
    # pydub returns interleaved samples; soundfile expects (frames, channels).
    if segment.channels > 1:
        samples = samples.reshape((-1, segment.channels))
    # BUG FIX: the original cast to float32 WITHOUT dividing by 32768 and then
    # wrote subtype='PCM_16'. soundfile interprets float input as full-scale
    # [-1, 1], so every sample clipped and the WAV was unintelligible.
    # Writing native int16 maps directly to PCM_16 with no scaling.
    # NOTE(review): assumes 16-bit source audio (sample_width == 2), the
    # WebRTC default — confirm against AudioBufferProcessor.
    wav_file_buffer = io.BytesIO()
    sf.write(wav_file_buffer, samples.astype(np.int16), segment.frame_rate,
             format='WAV', subtype='PCM_16')
    wav_file_buffer.seek(0)  # Rewind so the upload starts at byte 0
    return wav_file_buffer


def _animate_transcription(placeholder, text, delay=0.02):
    """Reveal *text* character by character inside *placeholder*.

    Args:
        placeholder: an st.empty() container to render into.
        text: the full transcription to animate.
        delay: seconds to sleep between characters.
    """
    shown = ""
    for i, char in enumerate(text):
        shown += char
        # Unique key per frame avoids DuplicateWidgetID within one run.
        placeholder.text_area("Transcription Result", value=shown,
                              height=150, disabled=True, key=f"anim_{i}")
        time.sleep(delay)
    # Ensure the final, complete text is what remains on screen.
    placeholder.text_area("Transcription Result", value=text,
                          height=150, disabled=True, key="anim_final")


def _transcribe_and_display(wav_file_buffer, placeholder):
    """POST the WAV buffer to the Whisper FastAPI endpoint and show the result.

    Stores the transcription in st.session_state.transcribed_text and
    animates it into *placeholder*. All request errors are reported via
    st.error and logged; nothing is raised to the caller.
    """
    WHISPER_API_URL = "http://localhost:1990/transcribe_audio/"
    try:
        files = {'audio_file': ('recorded_audio.wav', wav_file_buffer, 'audio/wav')}
        # Generous timeout: Whisper transcription of long clips is slow.
        response = requests.post(WHISPER_API_URL, files=files, timeout=120)
        response.raise_for_status()  # Surface HTTP 4xx/5xx as exceptions
        transcription_data = response.json()
        full_transcribed_text = transcription_data.get("transcription", "No transcription found.")
        st.session_state.transcribed_text = full_transcribed_text
        _animate_transcription(placeholder, full_transcribed_text)
        st.success("Transcription complete!")
        logger.info(f"Transcription received: '{full_transcribed_text[:100]}...'")
    except requests.exceptions.ConnectionError as e:
        st.error(f"Could not connect to Whisper API at {WHISPER_API_URL}. Is the FastAPI server running on port 1990?")
        logger.error(f"Connection Error: {e}", exc_info=True)
    except requests.exceptions.Timeout:
        st.error("Whisper API request timed out. The model might be busy or the audio too long. Try a shorter recording.")
        logger.error("Request Timeout.", exc_info=True)
    except requests.exceptions.RequestException as e:
        # BUG FIX: `if e.response` is falsy for any 4xx/5xx Response object
        # (Response truthiness follows `ok`), which hid the error body.
        # Test identity against None instead.
        st.error(f"Error during API request: {e}. Response: {e.response.text if e.response is not None else 'No response'}")
        logger.error(f"API Request Error: {e}", exc_info=True)
    except Exception as e:
        st.error(f"An unexpected error occurred during transcription: {e}")
        logger.error(f"Unexpected Transcription Error: {e}", exc_info=True)


def main():
    """Render the recording UI: capture microphone audio over WebRTC, then
    send it to a local Whisper endpoint and animate the transcription."""
    st.header("Whisper Speech-to-Text with Recording")
    st.markdown(
        """
    Click "Start Recording" to begin capturing audio from your microphone.
    Click "Stop Recording" to end the capture, save the audio,
    and send it to the Whisper model for transcription.
    The transcribed text will appear character by character below.
    """
    )

    # Initialize the webrtc_streamer once.
    webrtc_ctx = webrtc_streamer(
        key="audio_recorder",
        mode=WebRtcMode.SENDONLY,
        audio_processor_factory=AudioBufferProcessor,
        rtc_configuration={"iceServers": get_ice_servers()},
        media_stream_constraints={"video": False, "audio": True},
        async_processing=True,
    )

    # Keep a handle to the processor so its buffered audio survives reruns.
    if webrtc_ctx.audio_processor and st.session_state.audio_processor_instance is None:
        st.session_state.audio_processor_instance = webrtc_ctx.audio_processor

    # Display status of the WebRTC connection
    if webrtc_ctx.state.playing:
        st.success("Microphone connected. Ready to record.")
    else:
        st.warning("Waiting for microphone connection... Please allow microphone access.")

    # --- Recording Controls ---
    col1, col2 = st.columns(2)
    with col1:
        # Disable "Start Recording" if already recording or mic not connected
        start_button = st.button(
            "Start Recording",
            key="start_rec_btn",
            disabled=st.session_state.is_recording or not webrtc_ctx.state.playing,
        )
    with col2:
        # Disable "Stop Recording" if not recording
        stop_button = st.button(
            "Stop Recording",
            key="stop_rec_btn",
            disabled=not st.session_state.is_recording,
        )

    # BUG FIX: st.text_area() returns the widget's *value* (a string), not a
    # container, so the original crashed calling .empty()/.text_area() on it.
    # Render the transcription inside an st.empty() placeholder instead.
    transcription_placeholder = st.empty()
    transcription_placeholder.text_area(
        "Transcription Result",
        value=st.session_state.transcribed_text,
        height=150,
        disabled=True,
        key="transcription_display",
    )

    # Logic for Start/Stop buttons
    if start_button:
        if webrtc_ctx.state.playing:
            st.session_state.is_recording = True
            st.session_state.transcribed_text = ""  # Clear previous text
            st.info("Recording... Click 'Stop Recording' to transcribe.")
            logger.info("Recording started.")
            st.rerun()  # Immediately update button states
        else:
            st.error("Cannot start recording: Microphone not connected. Please allow microphone access.")

    if stop_button and st.session_state.is_recording:  # Only process if recording was active
        st.session_state.is_recording = False
        st.info("Processing recording... Please wait.")
        logger.info("Recording stopped. Processing audio...")

        processor = st.session_state.audio_processor_instance
        if processor is None:
            st.error("Audio processor instance not found. Please refresh the app and allow microphone access.")
        else:
            # Retrieve all buffered audio from the processor instance
            recorded_audio = processor.get_and_clear_buffered_audio()
            if len(recorded_audio) > 0:
                wav_file_buffer = _segment_to_wav_buffer(recorded_audio)
                _transcribe_and_display(wav_file_buffer, transcription_placeholder)
            else:
                st.warning("No audio recorded. Please ensure your microphone is active and you spoke.")
                logger.warning("No audio recorded after stopping.")
        # Trigger a rerun to update button states and display transcription
        st.rerun()
if __name__ == "__main__":
    # Debug logging is ON unless DEBUG is explicitly one of the "off" values.
    debug_enabled = os.environ.get("DEBUG", "false").lower() not in {"false", "no", "0"}
    log_level = logging.DEBUG if debug_enabled else logging.INFO

    logging.basicConfig(
        format=(
            "[%(asctime)s] %(levelname)7s from %(name)s in %(pathname)s:%(lineno)d: "
            "%(message)s"
        ),
        force=True,
    )
    logger.setLevel(level=log_level)

    # Keep streamlit_webrtc's verbosity in step with our own logger.
    logging.getLogger("streamlit_webrtc").setLevel(log_level)

    main()