Spaces:

Krish-05
/

fast_rep_voice

Paused

App Files Community

Krish-05 committed on Jul 25, 2025

Commit

073f4d8

verified ·

1 Parent(s): a9ca228

Update streamlit_app.py

Browse files

Files changed (1) hide show

streamlit_app.py +34 -89

streamlit_app.py CHANGED Viewed

@@ -1,24 +1,18 @@
 import logging
 import logging.handlers
-import threading
 import time
-import urllib.request
 import os
-from pathlib import Path
-from typing import List
 import io
 import soundfile as sf
 import requests
-import av
 import numpy as np
 import pydub
 import streamlit as st
 from twilio.rest import Client
-from streamlit_webrtc import WebRtcMode, webrtc_streamer, AudioProcessorBase
-HERE = Path(__file__).parent
 logger = logging.getLogger(__name__)
@@ -31,49 +25,7 @@ if 'audio_processor_instance' not in st.session_state:
     st.session_state.audio_processor_instance = None
-# --- Utility Functions (from original code, kept for completeness) ---
-def download_file(url, download_to: Path, expected_size=None):
-    # This function is retained but might not be strictly necessary for this new workflow
-    # as Whisper model is loaded by FastAPI server.
-    if download_to.exists():
-        if expected_size:
-            if download_to.stat().st_size == expected_size:
-                return
-        else:
-            st.info(f"{url} is already downloaded.")
-            if not st.button("Download again?"):
-                return
-    download_to.parent.mkdir(parents=True, exist_ok=True)
-    weights_warning, progress_bar = None, None
-    try:
-        weights_warning = st.warning("Downloading %s..." % url)
-        progress_bar = st.progress(0)
-        with open(download_to, "wb") as output_file:
-            with urllib.request.urlopen(url) as response:
-                length = int(response.info()["Content-Length"])
-                counter = 0.0
-                MEGABYTES = 2.0 ** 20.0
-                while True:
-                    data = response.read(8192)
-                    if not data:
-                        break
-                    counter += len(data)
-                    output_file.write(data)
-                    weights_warning.warning(
-                        "Downloading %s... (%6.2f/%6.2f MB)"
-                        % (url, counter / MEGABYTES, length / MEGABYTES)
-                    )
-                    progress_bar.progress(min(counter / length, 1.0))
-    finally:
-        if weights_warning is not None:
-            weights_warning.empty()
-        if progress_bar is not None:
-            progress_bar.empty()
 @st.cache_data
 def get_ice_servers():
     """Fetches ICE servers for WebRTC connection."""
@@ -82,7 +34,9 @@ def get_ice_servers():
         auth_token = os.environ["TWILIO_AUTH_TOKEN"]
     except KeyError:
         logger.warning(
-            "Twilio credentials are not set. Fallback to a free STUN server from Google."
         )
         return [{"urls": ["stun:stun.l.google.com:19302"]}]
@@ -91,31 +45,6 @@ def get_ice_servers():
     return token.ice_servers
-# --- Custom Audio Processor for streamlit-webrtc ---
-class AudioBufferProcessor(AudioProcessorBase):
-    def __init__(self) -> None:
-        self._audio_buffer = pydub.AudioSegment.empty()
-        self._lock = threading.Lock()
-    def recv(self, frame: av.AudioFrame) -> None:
-        if st.session_state.is_recording:
-            sound = pydub.AudioSegment(
-                data=frame.to_ndarray().tobytes(),
-                sample_width=frame.format.bytes,
-                frame_rate=frame.sample_rate,
-                channels=len(frame.layout.channels),
-            )
-            sound = sound.set_channels(1).set_frame_rate(16000)
-            with self._lock:
-                self._audio_buffer += sound
-    def get_and_clear_buffered_audio(self) -> pydub.AudioSegment:
-        with self._lock:
-            recorded_audio = self._audio_buffer
-            self._audio_buffer = pydub.AudioSegment.empty()
-            return recorded_audio
 def main():
     st.header("Whisper Speech-to-Text with Recording")
     st.markdown(
@@ -123,9 +52,11 @@ def main():
         Click "Start Recording" to begin capturing audio from your microphone.
         Click "Stop Recording" to end the capture, save the audio,
         and send it to the Whisper model for transcription.
         """
     )
     webrtc_ctx = webrtc_streamer(
         key="audio_recorder",
         mode=WebRtcMode.SENDONLY,
@@ -135,9 +66,11 @@ def main():
         async_processing=True
     )
     if webrtc_ctx.audio_processor and st.session_state.audio_processor_instance is None:
         st.session_state.audio_processor_instance = webrtc_ctx.audio_processor
     if webrtc_ctx.state.playing:
         st.success("Microphone connected. Ready to record.")
     else:
@@ -148,54 +81,64 @@ def main():
     col1, col2 = st.columns(2)
     with col1:
         start_button = st.button(
             "Start Recording",
             disabled=st.session_state.is_recording or not webrtc_ctx.state.playing
         )
     with col2:
         stop_button = st.button(
             "Stop Recording",
             disabled=not st.session_state.is_recording
         )
     # Placeholder for the animated text area
-    transcription_text_area = st.text_area("Transcription Result", value="", height=150, disabled=True)
     if start_button:
         if webrtc_ctx.state.playing:
             st.session_state.is_recording = True
-            st.session_state.transcribed_text = ""
-            # Clear text area immediately
-            transcription_text_area.empty()
             st.info("Recording... Click 'Stop Recording' to transcribe.")
             logger.info("Recording started.")
-            st.rerun()
         else:
             st.error("Cannot start recording: Microphone not connected. Please allow microphone access.")
     if stop_button:
-        if st.session_state.is_recording:
             st.session_state.is_recording = False
             st.info("Processing recording... Please wait.")
             logger.info("Recording stopped. Processing audio...")
             if st.session_state.audio_processor_instance:
                 recorded_audio = st.session_state.audio_processor_instance.get_and_clear_buffered_audio()
                 if len(recorded_audio) > 0:
                     wav_file_buffer = io.BytesIO()
                     audio_array = np.array(recorded_audio.get_array_of_samples())
                     audio_array = audio_array.astype(np.float32)
                     sf.write(wav_file_buffer, audio_array, recorded_audio.frame_rate, format='WAV', subtype='PCM_16')
-                    wav_file_buffer.seek(0)
                     WHISPER_API_URL = "http://localhost:1990/transcribe_audio/"
                     try:
                         files = {'audio_file': ('recorded_audio.wav', wav_file_buffer, 'audio/wav')}
-                        response = requests.post(WHISPER_API_URL, files=files, timeout=120)
-                        response.raise_for_status()
                         transcription_data = response.json()
                         full_transcribed_text = transcription_data.get("transcription", "No transcription found.")
                         st.session_state.transcribed_text = full_transcribed_text
                         # --- Character-by-character display logic ---
@@ -230,7 +173,8 @@ def main():
                     logger.warning("No audio recorded after stopping.")
             else:
                 st.error("Audio processor instance not found. Please refresh the app and allow microphone access.")
-            st.rerun()
 if __name__ == "__main__":
@@ -247,7 +191,8 @@ if __name__ == "__main__":
     st_webrtc_logger = logging.getLogger("streamlit_webrtc")
     st_webrtc_logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
-    fsevents_logger = logging.getLogger("fsevents")
-    fsevents_logger.setLevel(logging.WARNING)
     main()

 import logging
 import logging.handlers
 import time
 import os
 import io
 import soundfile as sf
 import requests
 import numpy as np
 import pydub
 import streamlit as st
 from twilio.rest import Client
+from streamlit_webrtc import WebRtcMode, webrtc_streamer
+from stt_module import AudioBufferProcessor # Import our custom processor
 logger = logging.getLogger(__name__)
     st.session_state.audio_processor_instance = None
+# --- Utility Functions ---
 @st.cache_data
 def get_ice_servers():
     """Fetches ICE servers for WebRTC connection."""
         auth_token = os.environ["TWILIO_AUTH_TOKEN"]
     except KeyError:
         logger.warning(
+            "Twilio credentials (TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN) are not set. "
+            "Falling back to a free STUN server from Google. "
+            "This might be less reliable for WebRTC connections."
         )
         return [{"urls": ["stun:stun.l.google.com:19302"]}]
     return token.ice_servers
 def main():
     st.header("Whisper Speech-to-Text with Recording")
     st.markdown(
         Click "Start Recording" to begin capturing audio from your microphone.
         Click "Stop Recording" to end the capture, save the audio,
         and send it to the Whisper model for transcription.
+        The transcribed text will appear character by character below.
         """
     )
+    # Initialize the webrtc_streamer once.
     webrtc_ctx = webrtc_streamer(
         key="audio_recorder",
         mode=WebRtcMode.SENDONLY,
         async_processing=True
     )
+    # Store the audio_processor instance in session_state for later retrieval
     if webrtc_ctx.audio_processor and st.session_state.audio_processor_instance is None:
         st.session_state.audio_processor_instance = webrtc_ctx.audio_processor
+    # Display status of the WebRTC connection
     if webrtc_ctx.state.playing:
         st.success("Microphone connected. Ready to record.")
     else:
     col1, col2 = st.columns(2)
     with col1:
+        # Disable "Start Recording" if already recording or mic not connected
         start_button = st.button(
             "Start Recording",
+            key="start_rec_btn",
             disabled=st.session_state.is_recording or not webrtc_ctx.state.playing
         )
     with col2:
+        # Disable "Stop Recording" if not recording
         stop_button = st.button(
             "Stop Recording",
+            key="stop_rec_btn",
             disabled=not st.session_state.is_recording
         )
     # Placeholder for the animated text area
+    # Initialize it with current session state text
+    transcription_text_area = st.text_area("Transcription Result", value=st.session_state.transcribed_text, height=150, disabled=True)
+    # Logic for Start/Stop buttons
     if start_button:
         if webrtc_ctx.state.playing:
             st.session_state.is_recording = True
+            st.session_state.transcribed_text = "" # Clear previous text
+            transcription_text_area.empty() # Clear the display
             st.info("Recording... Click 'Stop Recording' to transcribe.")
             logger.info("Recording started.")
+            st.rerun() # Use st.rerun() to immediately update UI state
         else:
             st.error("Cannot start recording: Microphone not connected. Please allow microphone access.")
     if stop_button:
+        if st.session_state.is_recording: # Only process if recording was active
             st.session_state.is_recording = False
             st.info("Processing recording... Please wait.")
             logger.info("Recording stopped. Processing audio...")
+            # Retrieve all buffered audio from the processor instance
             if st.session_state.audio_processor_instance:
                 recorded_audio = st.session_state.audio_processor_instance.get_and_clear_buffered_audio()
                 if len(recorded_audio) > 0:
+                    # Save the audio to an in-memory WAV file
                     wav_file_buffer = io.BytesIO()
                     audio_array = np.array(recorded_audio.get_array_of_samples())
                     audio_array = audio_array.astype(np.float32)
                     sf.write(wav_file_buffer, audio_array, recorded_audio.frame_rate, format='WAV', subtype='PCM_16')
+                    wav_file_buffer.seek(0) # Rewind the buffer to the beginning
+                    # Send the WAV file to the FastAPI Whisper endpoint
                     WHISPER_API_URL = "http://localhost:1990/transcribe_audio/"
                     try:
                         files = {'audio_file': ('recorded_audio.wav', wav_file_buffer, 'audio/wav')}
+                        response = requests.post(WHISPER_API_URL, files=files, timeout=120) # Increased timeout for transcription
+                        response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
                         transcription_data = response.json()
                         full_transcribed_text = transcription_data.get("transcription", "No transcription found.")
                         st.session_state.transcribed_text = full_transcribed_text
                         # --- Character-by-character display logic ---
                     logger.warning("No audio recorded after stopping.")
             else:
                 st.error("Audio processor instance not found. Please refresh the app and allow microphone access.")
+        # Trigger a rerun to update button states and display transcription
+        st.rerun()
 if __name__ == "__main__":
     st_webrtc_logger = logging.getLogger("streamlit_webrtc")
     st_webrtc_logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
+    # Removed fsevents logger as Pathlib is not explicitly imported or used as much here
+    # fsevents_logger = logging.getLogger("fsevents")
+    # fsevents_logger.setLevel(logging.WARNING)
     main()