Update streamlit_app.py

streamlit_app.py  CHANGED  (+300 -349)

@@ -1,364 +1,315 @@
- import streamlit as st
- import requests
- import json
- import time
  import logging
  import numpy as np
- import
- import
- from
-

- # Configure logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  logger = logging.getLogger(__name__)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-     st.sidebar.write("faster-whisper not found (expected for current app logic).")
- except Exception as e:
-     st.sidebar.write(f"Could not get faster-whisper version: {e}")
  try:
-
-     st.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- #
-
-
-
-
-
-
- # --- Custom Audio Processor for VAD and Audio Buffering ---
- class VADAudioProcessor(AudioProcessorBase):
-     """
-     Processes audio frames from WebRTC. It buffers audio and
-     implements a simple volume-based Voice Activity Detection (VAD).
      """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-         """
-         Processes each incoming audio chunk from the microphone.
          """
-
-
-
-
-
-
-
-
-         if volume > self.threshold:
-             self.voice_detected = True
-             self.silent_frames_count = 0  # Reset silence count on voice detection
-             # logger.debug("Voice detected!")
-         elif self.voice_detected:  # Only count silence if voice was previously detected
-             self.silent_frames_count += 1
-             # logger.debug(f"Silence detected. Silent frames: {self.silent_frames_count}")
-
-         # This processor simply collects data. The stopping logic is handled
-         # by the Streamlit app's main loop reacting to this processor's state.
-         return audio_chunk  # Return the chunk (pass-through)
-
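Most of the VADAudioProcessor body is elided in this view. As a rough illustration of the volume-based VAD its docstring describes, the surviving names (threshold, voice_detected, silent_frames_count) suggest a recv() along the following lines. This is a sketch, not the original code, and the RMS volume computation is an assumption:

    import av
    import numpy as np

    # Illustrative sketch only; the original method body is elided above.
    def recv(self, audio_chunk: av.AudioFrame) -> av.AudioFrame:
        samples = audio_chunk.to_ndarray().astype(np.float32)
        volume = float(np.sqrt(np.mean(samples ** 2)))  # RMS level (assumed metric)
        if volume > self.threshold:
            self.voice_detected = True
            self.silent_frames_count = 0  # reset silence count on voice detection
        elif self.voice_detected:  # only count silence after voice was heard
            self.silent_frames_count += 1
        return audio_chunk  # pass-through, as in the removed class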
-
# --- App Header ---
|
| 126 |
-
st.title("🤖 Ollama AI Assistant")
|
| 127 |
-
st.caption("Start chatting with our AI assistant. Type your message or use the microphone.")
|
| 128 |
-
|
| 129 |
-
# --- Chat Display ---
|
| 130 |
-
st.markdown("---") # Separator for visual clarity
|
| 131 |
-
for chat in st.session_state.chat_history:
|
| 132 |
-
# Use Streamlit's chat_message container for distinct roles
|
| 133 |
-
with st.chat_message(chat["role"], avatar="🤖" if chat["role"] == "assistant" else "👤"):
|
| 134 |
-
st.write(chat["message"])
|
| 135 |
-
|
| 136 |
-
# --- Input Area ---
|
| 137 |
-
# Use a form to handle user input and submission
|
| 138 |
-
with st.form("chat_form", clear_on_submit=True):
|
| 139 |
-
# Store the user's prompt in session state so it can be pre-filled by STT
|
| 140 |
-
user_prompt_key = "user_input_text_area" # A unique key for the text area
|
| 141 |
-
user_prompt = st.text_area(
|
| 142 |
-
"Type your message here...",
|
| 143 |
-
height=100,
|
| 144 |
-
placeholder="e.g., Explain quantum computing in simple terms.",
|
| 145 |
-
label_visibility="collapsed", # Hide the default label for a cleaner look
|
| 146 |
-
key=user_prompt_key,
|
| 147 |
-
value=st.session_state.transcribed_text # Pre-fill with transcribed text from STT
|
| 148 |
)
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
# Display a "Thinking..." message while waiting for the AI response
|
| 180 |
-
with st.chat_message("assistant", avatar="🤖"):
|
| 181 |
-
response_placeholder = st.empty() # Create an empty placeholder for streaming content
|
| 182 |
-
response_placeholder.write("Thinking...") # Initial message
|
| 183 |
-
logger.info("Displaying 'Thinking...' message.")
|
| 184 |
-
|
| 185 |
-
full_response = "" # Initialize an empty string to build the full response
|
| 186 |
-
byte_buffer = b"" # Initialize a buffer for incomplete UTF-8 characters for streaming
|
| 187 |
-
try:
|
| 188 |
-
# Prepare the request payload for FastAPI LLM endpoint
|
| 189 |
-
payload = {"text": user_prompt}
|
| 190 |
-
headers = {"Content-Type": "application/json"}
|
| 191 |
-
logger.info(f"Sending LLM request to FastAPI at {FASTAPI_LLM_URL}")
|
| 192 |
-
|
| 193 |
-
# Make a streaming POST request to the FastAPI endpoint
|
| 194 |
-
with requests.post(FASTAPI_LLM_URL, json=payload, headers=headers, stream=True) as response:
|
| 195 |
-
logger.info(f"Received LLM response from FastAPI with status code: {response.status_code}")
|
| 196 |
-
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
|
| 197 |
-
|
| 198 |
-
# Iterate over the response content as it streams (byte by byte)
|
| 199 |
-
for chunk in response.iter_content(chunk_size=1):
|
| 200 |
-
if chunk: # Filter out potential empty keep-alive chunks
|
| 201 |
-
byte_buffer += chunk # Append new bytes to the buffer
|
| 202 |
-
try:
|
| 203 |
-
# Attempt to decode the entire buffer using 'strict' error handling
|
| 204 |
-
decoded_text = byte_buffer.decode("utf-8", errors="strict")
|
| 205 |
-
full_response += decoded_text
|
| 206 |
-
response_placeholder.markdown(full_response + "▌") # Update display, add cursor
|
| 207 |
-
byte_buffer = b"" # Clear the buffer if decoding was successful
|
| 208 |
-
except UnicodeDecodeError:
|
| 209 |
-
# This is expected if a multi-byte character is split across chunks.
|
| 210 |
-
# Do nothing, just wait for the next chunk to complete the character.
|
| 211 |
-
pass
|
| 212 |
-
except Exception as e:
|
| 213 |
-
# Catch any other unexpected decoding errors
|
| 214 |
-
logger.error(f"Error decoding stream chunk: {e} - Raw bytes: {chunk}")
|
| 215 |
-
try:
|
| 216 |
-
full_response += chunk.decode("utf-8", errors="replace")
|
| 217 |
-
except Exception as decode_err:
|
| 218 |
-
logger.error(f"Failed to decode even with replace errors: {decode_err}")
|
| 219 |
-
full_response += "[Decoding Error]" # Indicate a severe decoding issue
|
| 220 |
-
response_placeholder.markdown(full_response + "▌")
|
| 221 |
-
byte_buffer = b"" # Clear buffer to try and recover
|
| 222 |
-
|
| 223 |
-
# After the loop, if there are any remaining bytes in the buffer, try to decode them
|
| 224 |
-
if byte_buffer:
|
| 225 |
-
try:
|
| 226 |
-
full_response += byte_buffer.decode("utf-8", errors="replace")
|
| 227 |
-
logger.warning("Remaining bytes in buffer decoded with replacement.")
|
| 228 |
-
except Exception as e:
|
| 229 |
-
logger.error(f"Failed to decode final buffer bytes: {e}")
|
| 230 |
-
full_response += "[Final Decoding Error]"
|
| 231 |
-
response_placeholder.markdown(full_response) # Final update without cursor
|
| 232 |
-
logger.info("Streaming complete. Full LLM response received.")
|
| 233 |
-
|
| 234 |
-
except requests.exceptions.ConnectionError:
|
| 235 |
-
# Handle cases where Streamlit cannot connect to FastAPI
|
| 236 |
-
full_response = (f"Error: Could not connect to the FastAPI server. "
|
| 237 |
-
f"Please ensure it is running at {FASTAPI_LLM_URL}.")
|
| 238 |
-
response_placeholder.error(full_response) # Display error in the placeholder
|
| 239 |
-
logger.error(f"ConnectionError: Could not connect to FastAPI at {FASTAPI_LLM_URL}")
|
| 240 |
-
except requests.exceptions.RequestException as e:
|
| 241 |
-
# Handle other request-related errors (e.g., HTTP errors from raise_for_status)
|
| 242 |
-
error_details = e.response.text if e.response is not None else str(e)
|
| 243 |
-
status_code = e.response.status_code if e.response is not None else "N/A"
|
| 244 |
-
full_response = (f"An error occurred during the request to FastAPI. "
|
| 245 |
-
f"Status code: {status_code}\nDetails: {error_details}")
|
| 246 |
-
response_placeholder.error(full_response) # Display error in the placeholder
|
| 247 |
-
logger.error(f"Request error to FastAPI: {e}", exc_info=True)
|
| 248 |
-
except Exception as e:
|
| 249 |
-
# Catch any other unexpected errors during the request or processing
|
| 250 |
-
full_response = f"An unexpected error occurred: {e}"
|
| 251 |
-
response_placeholder.error(full_response) # Display error in the placeholder
|
| 252 |
-
logger.exception("An unexpected error occurred during API request.") # Logs traceback
|
| 253 |
-
|
| 254 |
-
# After the streaming is complete (or an error occurred), add the final response
|
| 255 |
-
# to the chat history. This ensures it persists across reruns.
|
| 256 |
-
st.session_state.chat_history.append({"role": "assistant", "message": full_response})
|
| 257 |
-
logger.info("Final LLM response added to chat history.")
|
| 258 |
-
# Rerun the app to display the updated chat history with the final response
|
| 259 |
-
st.rerun()
|
| 260 |
-
elif submitted and not user_prompt:
|
| 261 |
-
# Warn user if no prompt is entered for the 'Send' button
|
| 262 |
-
st.warning("Please enter a prompt before clicking 'Send'.")
|
| 263 |
-
logger.warning("User attempted to send an empty text prompt.")
|
| 264 |
-
|
| 265 |
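The removed streaming handler above copes with multi-byte UTF-8 characters split across chunks by buffering raw bytes and retrying a strict decode. Python's codecs module can do that bookkeeping itself; a minimal sketch of the same technique, where the URL and payload are placeholders:

    import codecs
    import requests

    def stream_text(url, payload):
        decoder = codecs.getincrementaldecoder("utf-8")()
        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=1024):
                # Incomplete multi-byte sequences are held inside the decoder
                # instead of raising UnicodeDecodeError.
                yield decoder.decode(chunk)
            yield decoder.decode(b"", final=True)  # flush any trailing bytes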
-
# --- WebRTC Streamer for Microphone Input ---
|
| 266 |
-
webrtc_ctx = None
|
| 267 |
-
if st.session_state.webrtc_state in ["listening", "processing_audio"]:
|
| 268 |
-
logger.info(f"Initiating webrtc_streamer with state: {st.session_state.webrtc_state}")
|
| 269 |
webrtc_ctx = webrtc_streamer(
|
| 270 |
-
key="
|
| 271 |
-
mode=WebRtcMode.SENDONLY,
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
"muted": "muted", # Mute local playback to avoid echo
|
| 276 |
-
},
|
| 277 |
-
# Use our custom processor to handle audio frames and VAD
|
| 278 |
-
in_audio_frames_processor_factory=VADAudioProcessor,
|
| 279 |
-
client_settings=ClientSettings(
|
| 280 |
-
rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}, # STUN server for NAT traversal
|
| 281 |
-
media_stream_constraints={"audio": True, "video": False}, # Only request audio stream
|
| 282 |
-
),
|
| 283 |
)
|
| 284 |
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
if
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
processor: VADAudioProcessor = webrtc_ctx.audio_processor
|
| 310 |
-
if processor.audio_buffer.tell() > 0: # Check if any audio data was recorded
|
| 311 |
-
recorded_audio_bytes = processor.audio_buffer.getvalue()
|
| 312 |
-
logger.info(f"Recorded audio buffer size: {len(recorded_audio_bytes)} bytes.")
|
| 313 |
-
|
| 314 |
-
# Convert raw 16-bit PCM (from WebRTC) to WAV format using pydub
|
| 315 |
try:
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
              )
-
-
-
-
-             st.info("Sending recorded audio to STT backend for transcription...")
-             # Send the WAV audio bytes to the FastAPI STT endpoint
-             files = {'audio_file': ('audio.wav', wav_io.getvalue(), 'audio/wav')}
-             response = requests.post(FASTAPI_STT_URL, files=files)
-             response.raise_for_status()  # Raise HTTPError for bad responses
-
-             transcription_result = response.json()
-             transcribed_text = transcription_result.get("transcribed_text", "").strip()
-             st.session_state.transcribed_text = transcribed_text  # Store transcribed text
-
-             logger.info(f"Transcription received: {transcribed_text[:100]}...")
-             if transcribed_text:
-                 st.success("Transcription complete!")
-             else:
-                 st.warning("No clear speech detected or transcription resulted in empty text.")
-         except requests.exceptions.RequestException as e:
-             st.error(f"Error sending audio to STT backend: {e}")
-             logger.error(f"STT Backend error: {e}", exc_info=True)
-             st.session_state.transcribed_text = ""  # Clear on error
-         except Exception as e:
-             st.error(f"An unexpected error occurred during audio processing or STT: {e}")
-             logger.exception("Unexpected error in STT processing.")
-             st.session_state.transcribed_text = ""  # Clear on error
      else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
  import logging
+ import logging.handlers
+ import queue
+ import threading
+ import time
+ import urllib.request
+ import os
+ from collections import deque
+ from pathlib import Path
+ from typing import List
+
+ import av
  import numpy as np
+ import pydub
+ import streamlit as st
+ from twilio.rest import Client
+
+ from streamlit_webrtc import WebRtcMode, webrtc_streamer, AudioProcessorBase
+
+ HERE = Path(__file__).parent

  logger = logging.getLogger(__name__)

+
+ # This code is based on https://github.com/streamlit/demo-self-driving/blob/230245391f2dda0cb464008195a470751c01770b/streamlit_app.py#L48  # noqa: E501
+ def download_file(url, download_to: Path, expected_size=None):
+     # Don't download the file twice.
+     # (If possible, verify the download using the file length.)
+     if download_to.exists():
+         if expected_size:
+             if download_to.stat().st_size == expected_size:
+                 return
+         else:
+             st.info(f"{url} is already downloaded.")
+             if not st.button("Download again?"):
+                 return
+
+     download_to.parent.mkdir(parents=True, exist_ok=True)
+
+     # These are handles to two visual elements to animate.
+     weights_warning, progress_bar = None, None
      try:
+         weights_warning = st.warning("Downloading %s..." % url)
+         progress_bar = st.progress(0)
+         with open(download_to, "wb") as output_file:
+             with urllib.request.urlopen(url) as response:
+                 length = int(response.info()["Content-Length"])
+                 counter = 0.0
+                 MEGABYTES = 2.0 ** 20.0
+                 while True:
+                     data = response.read(8192)
+                     if not data:
+                         break
+                     counter += len(data)
+                     output_file.write(data)
+
+                     # We perform animation by overwriting the elements.
+                     weights_warning.warning(
+                         "Downloading %s... (%6.2f/%6.2f MB)"
+                         % (url, counter / MEGABYTES, length / MEGABYTES)
+                     )
+                     progress_bar.progress(min(counter / length, 1.0))
+     # Finally, we remove these visual elements by calling .empty().
+     finally:
+         if weights_warning is not None:
+             weights_warning.empty()
+         if progress_bar is not None:
+             progress_bar.empty()
+
+
+ # This code is based on https://github.com/whitphx/streamlit-webrtc/blob/c1fe3c783c9e8042ce0c95d789e833233fd82e74/sample_utils/turn.py
+ @st.cache_data  # type: ignore
+ def get_ice_servers():
+     """Use Twilio's TURN server because Streamlit Community Cloud has changed
+     its infrastructure and WebRTC connection cannot be established without TURN server now.  # noqa: E501
+     We considered Open Relay Project (https://www.metered.ca/tools/openrelay/) too,
+     but it is not stable and hardly works as some people reported like https://github.com/aiortc/aiortc/issues/832#issuecomment-1482420656  # noqa: E501
+     See https://github.com/whitphx/streamlit-webrtc/issues/1213
      """
+
+     # Ref: https://www.twilio.com/docs/stun-turn/api
+     try:
+         account_sid = os.environ["TWILIO_ACCOUNT_SID"]
+         auth_token = os.environ["TWILIO_AUTH_TOKEN"]
+     except KeyError:
+         logger.warning(
+             "Twilio credentials are not set. Fallback to a free STUN server from Google."  # noqa: E501
+         )
+         return [{"urls": ["stun:stun.l.google.com:19302"]}]
+
+     client = Client(account_sid, auth_token)
+
+     token = client.tokens.create()
+
+     return token.ice_servers
+
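For reference, both branches of get_ice_servers() return the list shape that webrtc_streamer's rtc_configuration expects; Twilio's token.ice_servers additionally carries TURN entries with ephemeral credentials. The values below are illustrative only, not actual API output:

    stun_only = [{"urls": ["stun:stun.l.google.com:19302"]}]
    turn_like = [
        {
            "urls": "turn:global.turn.twilio.com:3478?transport=udp",  # illustrative
            "username": "<ephemeral username>",
            "credential": "<ephemeral credential>",
        },
    ]
    # Either form can be passed straight through:
    # webrtc_streamer(..., rtc_configuration={"iceServers": get_ice_servers()})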
+
+
+ def main():
+     st.header("Real Time Speech-to-Text")
+     st.markdown(
          """
+ This demo app is using [DeepSpeech](https://github.com/mozilla/DeepSpeech),
+ an open speech-to-text engine.
+
+ A pre-trained model released with
+ [v0.9.3](https://github.com/mozilla/DeepSpeech/releases/tag/v0.9.3),
+ trained on American English is being served.
+ """
      )
+
+     # https://github.com/mozilla/DeepSpeech/releases/tag/v0.9.3
+     MODEL_URL = "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm"  # noqa
+     LANG_MODEL_URL = "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer"  # noqa
+     MODEL_LOCAL_PATH = HERE / "models/deepspeech-0.9.3-models.pbmm"
+     LANG_MODEL_LOCAL_PATH = HERE / "models/deepspeech-0.9.3-models.scorer"
+
+     download_file(MODEL_URL, MODEL_LOCAL_PATH, expected_size=188915987)
+     download_file(LANG_MODEL_URL, LANG_MODEL_LOCAL_PATH, expected_size=953363776)
+
+     lm_alpha = 0.931289039105002
+     lm_beta = 1.1834137581510284
+     beam = 100
+
+     sound_only_page = "Sound only (sendonly)"
+     with_video_page = "With video (sendrecv)"
+     app_mode = st.selectbox("Choose the app mode", [sound_only_page, with_video_page])
+
+     if app_mode == sound_only_page:
+         app_sst(
+             str(MODEL_LOCAL_PATH), str(LANG_MODEL_LOCAL_PATH), lm_alpha, lm_beta, beam
+         )
+     elif app_mode == with_video_page:
+         app_sst_with_video(
+             str(MODEL_LOCAL_PATH), str(LANG_MODEL_LOCAL_PATH), lm_alpha, lm_beta, beam
+         )
+
+
+ def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int):
      webrtc_ctx = webrtc_streamer(
+         key="speech-to-text",
+         mode=WebRtcMode.SENDONLY,
+         audio_receiver_size=1024,
+         rtc_configuration={"iceServers": get_ice_servers()},
+         media_stream_constraints={"video": False, "audio": True},
      )

+     status_indicator = st.empty()
+
+     if not webrtc_ctx.state.playing:
+         return
+
+     status_indicator.write("Loading...")
+     text_output = st.empty()
+     stream = None
+
+     while True:
+         if webrtc_ctx.audio_receiver:
+             if stream is None:
+                 from deepspeech import Model
+
+                 model = Model(model_path)
+                 model.enableExternalScorer(lm_path)
+                 model.setScorerAlphaBeta(lm_alpha, lm_beta)
+                 model.setBeamWidth(beam)
+
+                 stream = model.createStream()
+
+                 status_indicator.write("Model loaded.")
+
+             sound_chunk = pydub.AudioSegment.empty()
              try:
+                 audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
+             except queue.Empty:
+                 time.sleep(0.1)
+                 status_indicator.write("No frame arrived.")
+                 continue
+
+             status_indicator.write("Running. Say something!")
+
+             for audio_frame in audio_frames:
+                 sound = pydub.AudioSegment(
+                     data=audio_frame.to_ndarray().tobytes(),
+                     sample_width=audio_frame.format.bytes,
+                     frame_rate=audio_frame.sample_rate,
+                     channels=len(audio_frame.layout.channels),
+                 )
+                 sound_chunk += sound
+
+             if len(sound_chunk) > 0:
+                 sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
+                     model.sampleRate()
                  )
+                 buffer = np.array(sound_chunk.get_array_of_samples())
+                 stream.feedAudioContent(buffer)
+                 text = stream.intermediateDecode()
+                 text_output.markdown(f"**Text:** {text}")
          else:
+             status_indicator.write("AudioReceiver is not set. Abort.")
+             break
+
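Both app_sst above and app_sst_with_video below drive the same DeepSpeech streaming API. Stripped of the Streamlit loop, the lifecycle looks roughly like this, assuming the deepspeech 0.9.3 package, the model files on disk, and 16-bit mono audio already at model.sampleRate() (the app resamples with pydub first); blocks_of_int16_audio is a hypothetical iterable of numpy int16 arrays:

    from deepspeech import Model

    model = Model("deepspeech-0.9.3-models.pbmm")
    model.enableExternalScorer("deepspeech-0.9.3-models.scorer")

    stream = model.createStream()
    for pcm_block in blocks_of_int16_audio:   # hypothetical source of np.int16 chunks
        stream.feedAudioContent(pcm_block)    # feed 16-bit mono at model.sampleRate()
        print(stream.intermediateDecode())    # partial transcript so far
    print(stream.finishStream())              # final transcript; ends the stream

Note the app only ever calls intermediateDecode() on a long-lived stream; finishStream() would close it for a final result.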
+
+ def app_sst_with_video(
+     model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int
+ ):
+     frames_deque_lock = threading.Lock()
+     frames_deque: deque = deque([])
+
+     async def queued_audio_frames_callback(
+         frames: List[av.AudioFrame],
+
) -> av.AudioFrame:
|
| 212 |
+
with frames_deque_lock:
|
| 213 |
+
frames_deque.extend(frames)
|
| 214 |
+
|
| 215 |
+
# Return empty frames to be silent.
|
| 216 |
+
new_frames = []
|
| 217 |
+
for frame in frames:
|
| 218 |
+
input_array = frame.to_ndarray()
|
| 219 |
+
new_frame = av.AudioFrame.from_ndarray(
|
| 220 |
+
np.zeros(input_array.shape, dtype=input_array.dtype),
|
| 221 |
+
layout=frame.layout.name,
|
| 222 |
+
)
|
| 223 |
+
new_frame.sample_rate = frame.sample_rate
|
| 224 |
+
new_frames.append(new_frame)
|
| 225 |
+
|
| 226 |
+
return new_frames
|
| 227 |
+
|
| 228 |
+
webrtc_ctx = webrtc_streamer(
|
| 229 |
+
key="speech-to-text-w-video",
|
| 230 |
+
mode=WebRtcMode.SENDRECV,
|
| 231 |
+
queued_audio_frames_callback=queued_audio_frames_callback,
|
| 232 |
+
rtc_configuration={"iceServers": get_ice_servers()},
|
| 233 |
+
media_stream_constraints={"video": True, "audio": True},
|
| 234 |
+
)
|
| 235 |
+
|
| 236 |
+
status_indicator = st.empty()
|
| 237 |
+
|
| 238 |
+
if not webrtc_ctx.state.playing:
|
| 239 |
+
return
|
| 240 |
+
|
| 241 |
+
status_indicator.write("Loading...")
|
| 242 |
+
text_output = st.empty()
|
| 243 |
+
stream = None
|
| 244 |
+
|
| 245 |
+
while True:
|
| 246 |
+
if webrtc_ctx.state.playing:
|
| 247 |
+
if stream is None:
|
| 248 |
+
from deepspeech import Model
|
| 249 |
+
|
| 250 |
+
model = Model(model_path)
|
| 251 |
+
model.enableExternalScorer(lm_path)
|
| 252 |
+
model.setScorerAlphaBeta(lm_alpha, lm_beta)
|
| 253 |
+
model.setBeamWidth(beam)
|
| 254 |
+
|
| 255 |
+
stream = model.createStream()
|
| 256 |
+
|
| 257 |
+
status_indicator.write("Model loaded.")
|
| 258 |
+
|
| 259 |
+
sound_chunk = pydub.AudioSegment.empty()
|
| 260 |
+
|
| 261 |
+
audio_frames = []
|
| 262 |
+
with frames_deque_lock:
|
| 263 |
+
while len(frames_deque) > 0:
|
| 264 |
+
frame = frames_deque.popleft()
|
| 265 |
+
audio_frames.append(frame)
|
| 266 |
+
|
| 267 |
+
if len(audio_frames) == 0:
|
| 268 |
+
time.sleep(0.1)
|
| 269 |
+
status_indicator.write("No frame arrived.")
|
| 270 |
+
continue
|
| 271 |
+
|
| 272 |
+
status_indicator.write("Running. Say something!")
|
| 273 |
+
|
| 274 |
+
for audio_frame in audio_frames:
|
| 275 |
+
sound = pydub.AudioSegment(
|
| 276 |
+
data=audio_frame.to_ndarray().tobytes(),
|
| 277 |
+
sample_width=audio_frame.format.bytes,
|
| 278 |
+
frame_rate=audio_frame.sample_rate,
|
| 279 |
+
channels=len(audio_frame.layout.channels),
|
| 280 |
+
)
|
| 281 |
+
sound_chunk += sound
|
| 282 |
+
|
| 283 |
+
if len(sound_chunk) > 0:
|
| 284 |
+
sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
|
| 285 |
+
model.sampleRate()
|
| 286 |
+
)
|
| 287 |
+
buffer = np.array(sound_chunk.get_array_of_samples())
|
| 288 |
+
stream.feedAudioContent(buffer)
|
| 289 |
+
text = stream.intermediateDecode()
|
| 290 |
+
text_output.markdown(f"**Text:** {text}")
|
| 291 |
+
else:
|
| 292 |
+
status_indicator.write("Stopped.")
|
| 293 |
+
break
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
if __name__ == "__main__":
|
| 297 |
+
import os
|
| 298 |
+
|
| 299 |
+
DEBUG = os.environ.get("DEBUG", "false").lower() not in ["false", "no", "0"]
|
| 300 |
+
|
| 301 |
+
logging.basicConfig(
|
| 302 |
+
format="[%(asctime)s] %(levelname)7s from %(name)s in %(pathname)s:%(lineno)d: "
|
| 303 |
+
"%(message)s",
|
| 304 |
+
force=True,
|
| 305 |
+
)
|
| 306 |
+
|
| 307 |
+
logger.setLevel(level=logging.DEBUG if DEBUG else logging.INFO)
|
| 308 |
+
|
| 309 |
+
st_webrtc_logger = logging.getLogger("streamlit_webrtc")
|
| 310 |
+
st_webrtc_logger.setLevel(logging.DEBUG)
|
| 311 |
+
|
| 312 |
+
fsevents_logger = logging.getLogger("fsevents")
|
| 313 |
+
fsevents_logger.setLevel(logging.WARNING)
|
| 314 |
+
|
| 315 |
+
main()
|
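To try the updated app locally, install the packages its imports imply (streamlit, streamlit-webrtc, deepspeech, pydub, av, numpy, twilio; exact pins are not part of this diff) and run `streamlit run streamlit_app.py`. Setting TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN enables the TURN path in get_ice_servers(), and DEBUG=true switches the logger configured above to verbose output.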