Krish-05 committed on
Commit
aa6e621
·
verified ·
1 Parent(s): 9f4d48c

Update streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +138 -200
streamlit_app.py CHANGED
@@ -1,13 +1,14 @@
1
  import logging
2
  import logging.handlers
3
- import queue
4
  import threading
5
  import time
6
  import urllib.request
7
  import os
8
- from collections import deque
9
  from pathlib import Path
10
  from typing import List
 
 
 
11
 
12
  import av
13
  import numpy as np
@@ -21,11 +22,19 @@ HERE = Path(__file__).parent
21
 
22
  logger = logging.getLogger(__name__)
23
 
 
 
 
 
 
 
 
24
 
25
- # This code is based on https://github.com/streamlit/demo-self-driving/blob/230245391f2dda0cb464008195a470751c01770b/streamlit_app.py#L48 # noqa: E501
 
26
  def download_file(url, download_to: Path, expected_size=None):
27
- # Don't download the file twice.
28
- # (If possible, verify the download using the file length.)
29
  if download_to.exists():
30
  if expected_size:
31
  if download_to.stat().st_size == expected_size:
@@ -37,7 +46,6 @@ def download_file(url, download_to: Path, expected_size=None):
37
 
38
  download_to.parent.mkdir(parents=True, exist_ok=True)
39
 
40
- # These are handles to two visual elements to animate.
41
  weights_warning, progress_bar = None, None
42
  try:
43
  weights_warning = st.warning("Downloading %s..." % url)
@@ -54,13 +62,11 @@ def download_file(url, download_to: Path, expected_size=None):
54
  counter += len(data)
55
  output_file.write(data)
56
 
57
- # We perform animation by overwriting the elements.
58
  weights_warning.warning(
59
  "Downloading %s... (%6.2f/%6.2f MB)"
60
  % (url, counter / MEGABYTES, length / MEGABYTES)
61
  )
62
  progress_bar.progress(min(counter / length, 1.0))
63
- # Finally, we remove these visual elements by calling .empty().
64
  finally:
65
  if weights_warning is not None:
66
  weights_warning.empty()
@@ -68,234 +74,166 @@ def download_file(url, download_to: Path, expected_size=None):
68
  progress_bar.empty()
69
 
70
 
71
- # This code is based on https://github.com/whitphx/streamlit-webrtc/blob/c1fe3c783c9e8042ce0c95d789e833233fd82e74/sample_utils/turn.py
72
- @st.cache_data # type: ignore
73
  def get_ice_servers():
74
- """Use Twilio's TURN server because Streamlit Community Cloud has changed
75
- its infrastructure and WebRTC connection cannot be established without TURN server now. # noqa: E501
76
- We considered Open Relay Project (https://www.metered.ca/tools/openrelay/) too,
77
- but it is not stable and hardly works as some people reported like https://github.com/aiortc/aiortc/issues/832#issuecomment-1482420656 # noqa: E501
78
- See https://github.com/whitphx/streamlit-webrtc/issues/1213
79
- """
80
-
81
- # Ref: https://www.twilio.com/docs/stun-turn/api
82
  try:
83
  account_sid = os.environ["TWILIO_ACCOUNT_SID"]
84
  auth_token = os.environ["TWILIO_AUTH_TOKEN"]
85
  except KeyError:
86
  logger.warning(
87
- "Twilio credentials are not set. Fallback to a free STUN server from Google." # noqa: E501
88
  )
89
  return [{"urls": ["stun:stun.l.google.com:19302"]}]
90
 
91
  client = Client(account_sid, auth_token)
92
-
93
  token = client.tokens.create()
94
-
95
  return token.ice_servers
96
 
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  def main():
100
- st.header("Real Time Speech-to-Text")
101
  st.markdown(
102
  """
103
- This demo app is using [DeepSpeech](https://github.com/mozilla/DeepSpeech),
104
- an open speech-to-text engine.
105
-
106
- A pre-trained model released with
107
- [v0.9.3](https://github.com/mozilla/DeepSpeech/releases/tag/v0.9.3),
108
- trained on American English is being served.
109
- """
110
  )
111
 
112
- # https://github.com/mozilla/DeepSpeech/releases/tag/v0.9.3
113
- MODEL_URL = "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm" # noqa
114
- LANG_MODEL_URL = "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer" # noqa
115
- MODEL_LOCAL_PATH = HERE / "models/deepspeech-0.9.3-models.pbmm"
116
- LANG_MODEL_LOCAL_PATH = HERE / "models/deepspeech-0.9.3-models.scorer"
117
-
118
- download_file(MODEL_URL, MODEL_LOCAL_PATH, expected_size=188915987)
119
- download_file(LANG_MODEL_URL, LANG_MODEL_LOCAL_PATH, expected_size=953363776)
120
-
121
- lm_alpha = 0.931289039105002
122
- lm_beta = 1.1834137581510284
123
- beam = 100
124
-
125
- sound_only_page = "Sound only (sendonly)"
126
- with_video_page = "With video (sendrecv)"
127
- app_mode = st.selectbox("Choose the app mode", [sound_only_page, with_video_page])
128
-
129
- if app_mode == sound_only_page:
130
- app_sst(
131
- str(MODEL_LOCAL_PATH), str(LANG_MODEL_LOCAL_PATH), lm_alpha, lm_beta, beam
132
- )
133
- elif app_mode == with_video_page:
134
- app_sst_with_video(
135
- str(MODEL_LOCAL_PATH), str(LANG_MODEL_LOCAL_PATH), lm_alpha, lm_beta, beam
136
- )
137
-
138
-
139
- def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int):
140
  webrtc_ctx = webrtc_streamer(
141
- key="speech-to-text",
142
  mode=WebRtcMode.SENDONLY,
143
- audio_receiver_size=1024,
144
  rtc_configuration={"iceServers": get_ice_servers()},
145
  media_stream_constraints={"video": False, "audio": True},
 
146
  )
147
 
148
- status_indicator = st.empty()
149
-
150
- if not webrtc_ctx.state.playing:
151
- return
152
-
153
- status_indicator.write("Loading...")
154
- text_output = st.empty()
155
- stream = None
156
-
157
- while True:
158
- if webrtc_ctx.audio_receiver:
159
- if stream is None:
160
- from deepspeech import Model
161
-
162
- model = Model(model_path)
163
- model.enableExternalScorer(lm_path)
164
- model.setScorerAlphaBeta(lm_alpha, lm_beta)
165
- model.setBeamWidth(beam)
166
-
167
- stream = model.createStream()
168
-
169
- status_indicator.write("Model loaded.")
170
-
171
- sound_chunk = pydub.AudioSegment.empty()
172
- try:
173
- audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
174
- except queue.Empty:
175
- time.sleep(0.1)
176
- status_indicator.write("No frame arrived.")
177
- continue
178
-
179
- status_indicator.write("Running. Say something!")
180
-
181
- for audio_frame in audio_frames:
182
- sound = pydub.AudioSegment(
183
- data=audio_frame.to_ndarray().tobytes(),
184
- sample_width=audio_frame.format.bytes,
185
- frame_rate=audio_frame.sample_rate,
186
- channels=len(audio_frame.layout.channels),
187
- )
188
- sound_chunk += sound
189
-
190
- if len(sound_chunk) > 0:
191
- sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
192
- model.sampleRate()
193
- )
194
- buffer = np.array(sound_chunk.get_array_of_samples())
195
- stream.feedAudioContent(buffer)
196
- text = stream.intermediateDecode()
197
- text_output.markdown(f"**Text:** {text}")
198
- else:
199
- status_indicator.write("AudioReciver is not set. Abort.")
200
- break
201
-
202
-
203
- def app_sst_with_video(
204
- model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int
205
- ):
206
- frames_deque_lock = threading.Lock()
207
- frames_deque: deque = deque([])
208
-
209
- async def queued_audio_frames_callback(
210
- frames: List[av.AudioFrame],
211
- ) -> av.AudioFrame:
212
- with frames_deque_lock:
213
- frames_deque.extend(frames)
214
-
215
- # Return empty frames to be silent.
216
- new_frames = []
217
- for frame in frames:
218
- input_array = frame.to_ndarray()
219
- new_frame = av.AudioFrame.from_ndarray(
220
- np.zeros(input_array.shape, dtype=input_array.dtype),
221
- layout=frame.layout.name,
222
- )
223
- new_frame.sample_rate = frame.sample_rate
224
- new_frames.append(new_frame)
225
 
226
- return new_frames
 
 
 
227
 
228
- webrtc_ctx = webrtc_streamer(
229
- key="speech-to-text-w-video",
230
- mode=WebRtcMode.SENDRECV,
231
- queued_audio_frames_callback=queued_audio_frames_callback,
232
- rtc_configuration={"iceServers": get_ice_servers()},
233
- media_stream_constraints={"video": True, "audio": True},
234
- )
235
 
236
- status_indicator = st.empty()
 
237
 
238
- if not webrtc_ctx.state.playing:
239
- return
 
 
 
 
 
 
 
 
240
 
241
- status_indicator.write("Loading...")
242
- text_output = st.empty()
243
- stream = None
244
 
245
- while True:
246
  if webrtc_ctx.state.playing:
247
- if stream is None:
248
- from deepspeech import Model
249
-
250
- model = Model(model_path)
251
- model.enableExternalScorer(lm_path)
252
- model.setScorerAlphaBeta(lm_alpha, lm_beta)
253
- model.setBeamWidth(beam)
254
-
255
- stream = model.createStream()
256
-
257
- status_indicator.write("Model loaded.")
258
-
259
- sound_chunk = pydub.AudioSegment.empty()
260
-
261
- audio_frames = []
262
- with frames_deque_lock:
263
- while len(frames_deque) > 0:
264
- frame = frames_deque.popleft()
265
- audio_frames.append(frame)
266
-
267
- if len(audio_frames) == 0:
268
- time.sleep(0.1)
269
- status_indicator.write("No frame arrived.")
270
- continue
271
-
272
- status_indicator.write("Running. Say something!")
273
-
274
- for audio_frame in audio_frames:
275
- sound = pydub.AudioSegment(
276
- data=audio_frame.to_ndarray().tobytes(),
277
- sample_width=audio_frame.format.bytes,
278
- frame_rate=audio_frame.sample_rate,
279
- channels=len(audio_frame.layout.channels),
280
- )
281
- sound_chunk += sound
282
-
283
- if len(sound_chunk) > 0:
284
- sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
285
- model.sampleRate()
286
- )
287
- buffer = np.array(sound_chunk.get_array_of_samples())
288
- stream.feedAudioContent(buffer)
289
- text = stream.intermediateDecode()
290
- text_output.markdown(f"**Text:** {text}")
291
  else:
292
- status_indicator.write("Stopped.")
293
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
 
296
  if __name__ == "__main__":
297
- import os
298
-
299
  DEBUG = os.environ.get("DEBUG", "false").lower() not in ["false", "no", "0"]
300
 
301
  logging.basicConfig(
@@ -307,7 +245,7 @@ if __name__ == "__main__":
307
  logger.setLevel(level=logging.DEBUG if DEBUG else logging.INFO)
308
 
309
  st_webrtc_logger = logging.getLogger("streamlit_webrtc")
310
- st_webrtc_logger.setLevel(logging.DEBUG)
311
 
312
  fsevents_logger = logging.getLogger("fsevents")
313
  fsevents_logger.setLevel(logging.WARNING)
 
1
  import logging
2
  import logging.handlers
 
3
  import threading
4
  import time
5
  import urllib.request
6
  import os
 
7
  from pathlib import Path
8
  from typing import List
9
+ import io
10
+ import soundfile as sf
11
+ import requests
12
 
13
  import av
14
  import numpy as np
 
22
 
23
  logger = logging.getLogger(__name__)
24
 
25
+ # --- Session State Initialization ---
26
+ if 'is_recording' not in st.session_state:
27
+ st.session_state.is_recording = False
28
+ if 'transcribed_text' not in st.session_state:
29
+ st.session_state.transcribed_text = ""
30
+ if 'audio_processor_instance' not in st.session_state:
31
+ st.session_state.audio_processor_instance = None
32
 
33
+
34
+ # --- Utility Functions (from original code, kept for completeness) ---
35
  def download_file(url, download_to: Path, expected_size=None):
36
+ # This function is retained but might not be strictly necessary for this new workflow
37
+ # as Whisper model is loaded by FastAPI server.
38
  if download_to.exists():
39
  if expected_size:
40
  if download_to.stat().st_size == expected_size:
 
46
 
47
  download_to.parent.mkdir(parents=True, exist_ok=True)
48
 
 
49
  weights_warning, progress_bar = None, None
50
  try:
51
  weights_warning = st.warning("Downloading %s..." % url)
 
62
  counter += len(data)
63
  output_file.write(data)
64
 
 
65
  weights_warning.warning(
66
  "Downloading %s... (%6.2f/%6.2f MB)"
67
  % (url, counter / MEGABYTES, length / MEGABYTES)
68
  )
69
  progress_bar.progress(min(counter / length, 1.0))
 
70
  finally:
71
  if weights_warning is not None:
72
  weights_warning.empty()
 
74
  progress_bar.empty()
75
 
76
 
77
+ @st.cache_data
 
78
  def get_ice_servers():
79
+ """Fetches ICE servers for WebRTC connection."""
 
 
 
 
 
 
 
80
  try:
81
  account_sid = os.environ["TWILIO_ACCOUNT_SID"]
82
  auth_token = os.environ["TWILIO_AUTH_TOKEN"]
83
  except KeyError:
84
  logger.warning(
85
+ "Twilio credentials are not set. Fallback to a free STUN server from Google."
86
  )
87
  return [{"urls": ["stun:stun.l.google.com:19302"]}]
88
 
89
  client = Client(account_sid, auth_token)
 
90
  token = client.tokens.create()
 
91
  return token.ice_servers
92
 
93
 
94
+ # --- Custom Audio Processor for streamlit-webrtc ---
95
+ class AudioBufferProcessor(AudioProcessorBase):
96
+ def __init__(self) -> None:
97
+ self._audio_buffer = pydub.AudioSegment.empty()
98
+ self._lock = threading.Lock()
99
+
100
+ def recv(self, frame: av.AudioFrame) -> None:
101
+ if st.session_state.is_recording:
102
+ sound = pydub.AudioSegment(
103
+ data=frame.to_ndarray().tobytes(),
104
+ sample_width=frame.format.bytes,
105
+ frame_rate=frame.sample_rate,
106
+ channels=len(frame.layout.channels),
107
+ )
108
+ sound = sound.set_channels(1).set_frame_rate(16000)
109
+ with self._lock:
110
+ self._audio_buffer += sound
111
+
112
+ def get_and_clear_buffered_audio(self) -> pydub.AudioSegment:
113
+ with self._lock:
114
+ recorded_audio = self._audio_buffer
115
+ self._audio_buffer = pydub.AudioSegment.empty()
116
+ return recorded_audio
117
+
118
 
119
  def main():
120
+ st.header("Whisper Speech-to-Text with Recording")
121
  st.markdown(
122
  """
123
+ Click "Start Recording" to begin capturing audio from your microphone.
124
+ Click "Stop Recording" to end the capture, save the audio,
125
+ and send it to the Whisper model for transcription.
126
+ """
 
 
 
127
  )
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  webrtc_ctx = webrtc_streamer(
130
+ key="audio_recorder",
131
  mode=WebRtcMode.SENDONLY,
132
+ audio_processor_factory=AudioBufferProcessor,
133
  rtc_configuration={"iceServers": get_ice_servers()},
134
  media_stream_constraints={"video": False, "audio": True},
135
+ async_processing=True
136
  )
137
 
138
+ if webrtc_ctx.audio_processor and st.session_state.audio_processor_instance is None:
139
+ st.session_state.audio_processor_instance = webrtc_ctx.audio_processor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
+ if webrtc_ctx.state.playing:
142
+ st.success("Microphone connected. Ready to record.")
143
+ else:
144
+ st.warning("Waiting for microphone connection... Please allow microphone access.")
145
 
 
 
 
 
 
 
 
146
 
147
+ # --- Recording Controls ---
148
+ col1, col2 = st.columns(2)
149
 
150
+ with col1:
151
+ start_button = st.button(
152
+ "Start Recording",
153
+ disabled=st.session_state.is_recording or not webrtc_ctx.state.playing
154
+ )
155
+ with col2:
156
+ stop_button = st.button(
157
+ "Stop Recording",
158
+ disabled=not st.session_state.is_recording
159
+ )
160
 
161
+ # Placeholder for the animated text area
162
+ transcription_text_area = st.text_area("Transcription Result", value="", height=150, disabled=True)
 
163
 
164
+ if start_button:
165
  if webrtc_ctx.state.playing:
166
+ st.session_state.is_recording = True
167
+ st.session_state.transcribed_text = ""
168
+ # Clear text area immediately
169
+ transcription_text_area.empty()
170
+ st.info("Recording... Click 'Stop Recording' to transcribe.")
171
+ logger.info("Recording started.")
172
+ st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  else:
174
+ st.error("Cannot start recording: Microphone not connected. Please allow microphone access.")
175
+
176
+ if stop_button:
177
+ if st.session_state.is_recording:
178
+ st.session_state.is_recording = False
179
+ st.info("Processing recording... Please wait.")
180
+ logger.info("Recording stopped. Processing audio...")
181
+
182
+ if st.session_state.audio_processor_instance:
183
+ recorded_audio = st.session_state.audio_processor_instance.get_and_clear_buffered_audio()
184
+
185
+ if len(recorded_audio) > 0:
186
+ wav_file_buffer = io.BytesIO()
187
+ audio_array = np.array(recorded_audio.get_array_of_samples())
188
+ audio_array = audio_array.astype(np.float32)
189
+ sf.write(wav_file_buffer, audio_array, recorded_audio.frame_rate, format='WAV', subtype='PCM_16')
190
+ wav_file_buffer.seek(0)
191
+
192
+ WHISPER_API_URL = "http://localhost:1990/transcribe_audio/"
193
+ try:
194
+ files = {'audio_file': ('recorded_audio.wav', wav_file_buffer, 'audio/wav')}
195
+ response = requests.post(WHISPER_API_URL, files=files, timeout=120)
196
+ response.raise_for_status()
197
+ transcription_data = response.json()
198
+ full_transcribed_text = transcription_data.get("transcription", "No transcription found.")
199
+ st.session_state.transcribed_text = full_transcribed_text
200
+
201
+ # --- Character-by-character display logic ---
202
+ animated_text = ""
203
+ # Re-display the placeholder to clear previous content
204
+ transcription_text_area.empty()
205
+ for char in full_transcribed_text:
206
+ animated_text += char
207
+ transcription_text_area.text_area("Transcription Result", value=animated_text, height=150, disabled=True)
208
+ time.sleep(0.02) # Adjust speed as desired (e.g., 0.05 for slower)
209
+ # Ensure the final text is displayed
210
+ transcription_text_area.text_area("Transcription Result", value=full_transcribed_text, height=150, disabled=True)
211
+ # --- End character-by-character display logic ---
212
+
213
+ st.success("Transcription complete!")
214
+ logger.info(f"Transcription received: '{full_transcribed_text[:100]}...'")
215
+ except requests.exceptions.ConnectionError as e:
216
+ st.error(f"Could not connect to Whisper API at {WHISPER_API_URL}. Is the FastAPI server running on port 1990?")
217
+ logger.error(f"Connection Error: {e}", exc_info=True)
218
+ except requests.exceptions.Timeout:
219
+ st.error("Whisper API request timed out. The model might be busy or the audio too long. Try a shorter recording.")
220
+ logger.error("Request Timeout.", exc_info=True)
221
+ except requests.exceptions.RequestException as e:
222
+ st.error(f"Error during API request: {e}. Response: {e.response.text if e.response else 'No response'}")
223
+ logger.error(f"API Request Error: {e}", exc_info=True)
224
+ except Exception as e:
225
+ st.error(f"An unexpected error occurred during transcription: {e}")
226
+ logger.error(f"Unexpected Transcription Error: {e}", exc_info=True)
227
+
228
+ else:
229
+ st.warning("No audio recorded. Please ensure your microphone is active and you spoke.")
230
+ logger.warning("No audio recorded after stopping.")
231
+ else:
232
+ st.error("Audio processor instance not found. Please refresh the app and allow microphone access.")
233
+ st.rerun()
234
 
235
 
236
  if __name__ == "__main__":
 
 
237
  DEBUG = os.environ.get("DEBUG", "false").lower() not in ["false", "no", "0"]
238
 
239
  logging.basicConfig(
 
245
  logger.setLevel(level=logging.DEBUG if DEBUG else logging.INFO)
246
 
247
  st_webrtc_logger = logging.getLogger("streamlit_webrtc")
248
+ st_webrtc_logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
249
 
250
  fsevents_logger = logging.getLogger("fsevents")
251
  fsevents_logger.setLevel(logging.WARNING)