Krish-05 committed (verified)
Commit 5b53a0a · 1 Parent(s): 5afac77

Update streamlit_app.py

Files changed (1)
  1. streamlit_app.py +300 -349
streamlit_app.py CHANGED
@@ -1,364 +1,315 @@
- import streamlit as st
- import requests
- import json
- import time
  import logging
  import numpy as np
- import sys
- import io # New: For handling audio bytes
- from pydub import AudioSegment # New: For converting audio formats (requires ffmpeg)
- from streamlit_webrtc import WebRtcMode, webrtc_streamer, AudioProcessorBase, ClientSettings # New: For microphone access

- # Configure logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  logger = logging.getLogger(__name__)

- # --- Debugging: Display installed package versions ---
- try:
-     import pkg_resources
-     st.sidebar.write(f"Streamlit version: {pkg_resources.get_distribution('streamlit').version}")
-     st.sidebar.write(f"Requests version: {pkg_resources.get_distribution('requests').version}")
-
-     try:
-         webrtc_version = pkg_resources.get_distribution("streamlit-webrtc").version
-         st.sidebar.write(f"streamlit-webrtc version: {webrtc_version}")
-     except pkg_resources.DistributionNotFound:
-         st.sidebar.write("streamlit-webrtc not found (expected for current app logic).")
-     except Exception as e:
-         st.sidebar.write(f"Could not get streamlit-webrtc version: {e}")
-     try:
-         # Check for faster-whisper and pydub
-         fw_version = pkg_resources.get_distribution("faster-whisper").version
-         st.sidebar.write(f"faster-whisper version: {fw_version}")
-     except pkg_resources.DistributionNotFound:
-         st.sidebar.write("faster-whisper not found (expected for current app logic).")
-     except Exception as e:
-         st.sidebar.write(f"Could not get faster-whisper version: {e}")
      try:
-         pd_version = pkg_resources.get_distribution("pydub").version
-         st.sidebar.write(f"pydub version: {pd_version}")
-     except pkg_resources.DistributionNotFound:
-         st.sidebar.write("pydub not found (expected for current app logic).")
-     except Exception as e:
-         st.sidebar.write(f"Could not get pydub version: {e}")
-
-     # Not expecting transformers here, removed for clarity.
- except Exception as e:
-     st.sidebar.write(f"Could not get package versions: {e}")
- # --- End Debugging Section ---
-
- # Configuration for the FastAPI backend
- FASTAPI_HOST = "localhost"
- FASTAPI_PORT = 7860
- FASTAPI_LLM_URL = f"http://{FASTAPI_HOST}:{FASTAPI_PORT}/ask" # For LLM requests
- FASTAPI_STT_URL = f"http://{FASTAPI_HOST}:{FASTAPI_PORT}/transcribe_audio" # For STT requests
-
- # Set Streamlit page configuration
- st.set_page_config(page_title="Ollama AI Assistant", page_icon="🤖", layout="wide")
-
- # --- Session state for chat history ---
- # Initialize chat history if it doesn't exist in session state
- if 'chat_history' not in st.session_state:
-     st.session_state.chat_history = [
-         {"role": "assistant", "message": "Hello! How can I assist you today?"}
-     ]
-     logger.info("Chat history initialized.")
-
- # --- Session state for STT and WebRTC ---
- # This controls the microphone recording lifecycle
- if 'transcribed_text' not in st.session_state:
-     st.session_state.transcribed_text = "" # Stores the last transcribed text
- if 'webrtc_state' not in st.session_state:
-     st.session_state.webrtc_state = "idle" # idle, listening, processing_audio
-
- # --- Custom Audio Processor for VAD and Audio Buffering ---
- class VADAudioProcessor(AudioProcessorBase):
-     """
-     Processes audio frames from WebRTC. It buffers audio and
-     implements a simple volume-based Voice Activity Detection (VAD).
      """
-     def __init__(self):
-         self.audio_buffer = io.BytesIO()
-         self.silent_frames_count = 0
-         self.voice_detected = False
-         self.frame_rate = 16000 # Standard for WebRTC audio
-         self.samples_width = 2 # 16-bit audio (2 bytes per sample)
-         self.threshold = 500 # Adjust this based on environment noise and microphone sensitivity
-         self.max_silent_frames = 30 # Stop after N silent frames (~0.3 seconds at 10ms/frame)
-         self.total_frames_processed = 0
-         logger.info("VADAudioProcessor initialized.")
-
-     def _calculate_volume(self, audio_chunk: bytes) -> float:
-         """Calculate RMS (Root Mean Square) volume of an audio chunk."""
-         # Convert bytes to a numpy array of 16-bit integers
-         audio_array = np.frombuffer(audio_chunk, dtype=np.int16)
-         if audio_array.size == 0:
-             return 0.0
-         # Calculate RMS
-         rms = np.sqrt(np.mean(audio_array**2))
-         return rms
-
-     def process(self, audio_chunk: bytes) -> bytes:
-         """
-         Processes each incoming audio chunk from the microphone.
          """
-         # Write the raw audio chunk to the buffer
-         self.audio_buffer.write(audio_chunk)
-         self.total_frames_processed += 1
-
-         # Perform simple VAD
-         volume = self._calculate_volume(audio_chunk)
-         # logger.debug(f"Audio chunk received, volume: {volume:.2f}") # Use debug for less verbose logging
-
-         if volume > self.threshold:
-             self.voice_detected = True
-             self.silent_frames_count = 0 # Reset silence count on voice detection
-             # logger.debug("Voice detected!")
-         elif self.voice_detected: # Only count silence if voice was previously detected
-             self.silent_frames_count += 1
-             # logger.debug(f"Silence detected. Silent frames: {self.silent_frames_count}")
-
-         # This processor simply collects data. The stopping logic is handled
-         # by the Streamlit app's main loop reacting to this processor's state.
-         return audio_chunk # Return the chunk (pass-through)
-
- # --- App Header ---
- st.title("🤖 Ollama AI Assistant")
- st.caption("Start chatting with our AI assistant. Type your message or use the microphone.")
-
- # --- Chat Display ---
- st.markdown("---") # Separator for visual clarity
- for chat in st.session_state.chat_history:
-     # Use Streamlit's chat_message container for distinct roles
-     with st.chat_message(chat["role"], avatar="🤖" if chat["role"] == "assistant" else "👤"):
-         st.write(chat["message"])
-
- # --- Input Area ---
- # Use a form to handle user input and submission
- with st.form("chat_form", clear_on_submit=True):
-     # Store the user's prompt in session state so it can be pre-filled by STT
-     user_prompt_key = "user_input_text_area" # A unique key for the text area
-     user_prompt = st.text_area(
-         "Type your message here...",
-         height=100,
-         placeholder="e.g., Explain quantum computing in simple terms.",
-         label_visibility="collapsed", # Hide the default label for a cleaner look
-         key=user_prompt_key,
-         value=st.session_state.transcribed_text # Pre-fill with transcribed text from STT
      )
-
-     col1, col2 = st.columns([1, 1])
-     with col1:
-         submitted = st.form_submit_button("Send")
-     with col2:
-         # Microphone button logic
-         record_button_label = "Stop Listening" if st.session_state.webrtc_state == "listening" else "Start Listening"
-         microphone_button = st.form_submit_button(record_button_label, key="microphone_button")
-
- # Handle microphone button press to control WebRTC state
- if microphone_button:
-     if st.session_state.webrtc_state == "idle":
-         # Transition to 'listening' state
-         st.session_state.webrtc_state = "listening"
-         st.session_state.transcribed_text = "" # Clear any previous transcription
-         st.info("Listening... Tap 'Stop Listening' or wait for silence to auto-stop.")
-         st.rerun() # Rerun to activate the WebRTC streamer
-     elif st.session_state.webrtc_state == "listening":
-         # User manually clicked 'Stop Listening', transition to 'processing_audio'
-         st.session_state.webrtc_state = "processing_audio"
-         st.info("Stopping recording and processing audio...")
-         st.rerun() # Rerun to trigger audio processing
-
- # Process the prompt when the 'Send' button is submitted and prompt is not empty
- if submitted and user_prompt:
-     logger.info(f"User submitted prompt: {user_prompt[:50]}...") # Log the submitted prompt
-     # Add user's message to chat history immediately
-     st.session_state.chat_history.append({"role": "user", "message": user_prompt})
-     st.session_state.transcribed_text = "" # Clear transcribed text after it's sent to LLM
-
-     # Display a "Thinking..." message while waiting for the AI response
-     with st.chat_message("assistant", avatar="🤖"):
-         response_placeholder = st.empty() # Create an empty placeholder for streaming content
-         response_placeholder.write("Thinking...") # Initial message
-         logger.info("Displaying 'Thinking...' message.")
-
-         full_response = "" # Initialize an empty string to build the full response
-         byte_buffer = b"" # Initialize a buffer for incomplete UTF-8 characters for streaming
-         try:
-             # Prepare the request payload for FastAPI LLM endpoint
-             payload = {"text": user_prompt}
-             headers = {"Content-Type": "application/json"}
-             logger.info(f"Sending LLM request to FastAPI at {FASTAPI_LLM_URL}")
-
-             # Make a streaming POST request to the FastAPI endpoint
-             with requests.post(FASTAPI_LLM_URL, json=payload, headers=headers, stream=True) as response:
-                 logger.info(f"Received LLM response from FastAPI with status code: {response.status_code}")
-                 response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
-
-                 # Iterate over the response content as it streams (byte by byte)
-                 for chunk in response.iter_content(chunk_size=1):
-                     if chunk: # Filter out potential empty keep-alive chunks
-                         byte_buffer += chunk # Append new bytes to the buffer
-                         try:
-                             # Attempt to decode the entire buffer using 'strict' error handling
-                             decoded_text = byte_buffer.decode("utf-8", errors="strict")
-                             full_response += decoded_text
-                             response_placeholder.markdown(full_response + "▌") # Update display, add cursor
-                             byte_buffer = b"" # Clear the buffer if decoding was successful
-                         except UnicodeDecodeError:
-                             # This is expected if a multi-byte character is split across chunks.
-                             # Do nothing, just wait for the next chunk to complete the character.
-                             pass
-                         except Exception as e:
-                             # Catch any other unexpected decoding errors
-                             logger.error(f"Error decoding stream chunk: {e} - Raw bytes: {chunk}")
-                             try:
-                                 full_response += chunk.decode("utf-8", errors="replace")
-                             except Exception as decode_err:
-                                 logger.error(f"Failed to decode even with replace errors: {decode_err}")
-                                 full_response += "[Decoding Error]" # Indicate a severe decoding issue
-                             response_placeholder.markdown(full_response + "▌")
-                             byte_buffer = b"" # Clear buffer to try and recover
-
-                 # After the loop, if there are any remaining bytes in the buffer, try to decode them
-                 if byte_buffer:
-                     try:
-                         full_response += byte_buffer.decode("utf-8", errors="replace")
-                         logger.warning("Remaining bytes in buffer decoded with replacement.")
-                     except Exception as e:
-                         logger.error(f"Failed to decode final buffer bytes: {e}")
-                         full_response += "[Final Decoding Error]"
-                 response_placeholder.markdown(full_response) # Final update without cursor
-                 logger.info("Streaming complete. Full LLM response received.")
-
-         except requests.exceptions.ConnectionError:
-             # Handle cases where Streamlit cannot connect to FastAPI
-             full_response = (f"Error: Could not connect to the FastAPI server. "
-                              f"Please ensure it is running at {FASTAPI_LLM_URL}.")
-             response_placeholder.error(full_response) # Display error in the placeholder
-             logger.error(f"ConnectionError: Could not connect to FastAPI at {FASTAPI_LLM_URL}")
-         except requests.exceptions.RequestException as e:
-             # Handle other request-related errors (e.g., HTTP errors from raise_for_status)
-             error_details = e.response.text if e.response is not None else str(e)
-             status_code = e.response.status_code if e.response is not None else "N/A"
-             full_response = (f"An error occurred during the request to FastAPI. "
-                              f"Status code: {status_code}\nDetails: {error_details}")
-             response_placeholder.error(full_response) # Display error in the placeholder
-             logger.error(f"Request error to FastAPI: {e}", exc_info=True)
-         except Exception as e:
-             # Catch any other unexpected errors during the request or processing
-             full_response = f"An unexpected error occurred: {e}"
-             response_placeholder.error(full_response) # Display error in the placeholder
-             logger.exception("An unexpected error occurred during API request.") # Logs traceback
-
-     # After the streaming is complete (or an error occurred), add the final response
-     # to the chat history. This ensures it persists across reruns.
-     st.session_state.chat_history.append({"role": "assistant", "message": full_response})
-     logger.info("Final LLM response added to chat history.")
-     # Rerun the app to display the updated chat history with the final response
-     st.rerun()
- elif submitted and not user_prompt:
-     # Warn user if no prompt is entered for the 'Send' button
-     st.warning("Please enter a prompt before clicking 'Send'.")
-     logger.warning("User attempted to send an empty text prompt.")
-
- # --- WebRTC Streamer for Microphone Input ---
- webrtc_ctx = None
- if st.session_state.webrtc_state in ["listening", "processing_audio"]:
-     logger.info(f"Initiating webrtc_streamer with state: {st.session_state.webrtc_state}")
      webrtc_ctx = webrtc_streamer(
-         key="ollama-audio-input", # Unique key for this component
-         mode=WebRtcMode.SENDONLY, # Only send audio from browser to Python
-         audio_html_attrs={
-             "autoPlay": "true",
-             "controls": "",
-             "muted": "muted", # Mute local playback to avoid echo
-         },
-         # Use our custom processor to handle audio frames and VAD
-         in_audio_frames_processor_factory=VADAudioProcessor,
-         client_settings=ClientSettings(
-             rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}, # STUN server for NAT traversal
-             media_stream_constraints={"audio": True, "video": False}, # Only request audio stream
-         ),
      )

-     # Display status messages while recording
-     if webrtc_ctx.state.playing and st.session_state.webrtc_state == "listening":
-         st.info("Microphone active. Speak clearly now...")
-     elif not webrtc_ctx.state.playing and st.session_state.webrtc_state == "listening":
-         st.warning("Waiting for microphone permissions... Please grant access if prompted.")
-
-     # Check VAD status from the audio processor
-     if webrtc_ctx.audio_processor:
-         processor: VADAudioProcessor = webrtc_ctx.audio_processor
-         # If voice was detected, and now prolonged silence is detected
-         if processor.voice_detected and processor.silent_frames_count >= processor.max_silent_frames:
-             logger.info("VAD detected prolonged silence. Transitioning to processing audio.")
-             # Set state to processing, which will cause a rerun and stop the streamer
-             if st.session_state.webrtc_state == "listening": # Only auto-stop if currently listening
-                 st.session_state.webrtc_state = "processing_audio"
-                 st.info("Silence detected. Processing audio for transcription...")
-                 st.rerun() # Trigger a rerun to process the audio
-
- # --- Audio Processing and STT Call after Recording Stops ---
- # This block runs when we transition to 'processing_audio' state and the WebRTC session is truly stopped.
- if st.session_state.webrtc_state == "processing_audio" and (webrtc_ctx is None or not webrtc_ctx.state.playing):
-     logger.info("WebRTC session stopped (or never started in processing_audio state). Attempting to get audio.")
-     # Ensure we have an audio processor instance from the stopped session
-     if webrtc_ctx and webrtc_ctx.audio_processor:
-         processor: VADAudioProcessor = webrtc_ctx.audio_processor
-         if processor.audio_buffer.tell() > 0: # Check if any audio data was recorded
-             recorded_audio_bytes = processor.audio_buffer.getvalue()
-             logger.info(f"Recorded audio buffer size: {len(recorded_audio_bytes)} bytes.")
-
-             # Convert raw 16-bit PCM (from WebRTC) to WAV format using pydub
              try:
-                 audio = AudioSegment(
-                     recorded_audio_bytes,
-                     sample_width=processor.samples_width,
-                     frame_rate=processor.frame_rate,
-                     channels=1 # WebRTC typically provides mono audio
                  )
-                 wav_io = io.BytesIO()
-                 audio.export(wav_io, format="wav") # Export to WAV format
-                 wav_io.seek(0) # Rewind the buffer to the beginning for reading
-
-                 st.info("Sending recorded audio to STT backend for transcription...")
-                 # Send the WAV audio bytes to the FastAPI STT endpoint
-                 files = {'audio_file': ('audio.wav', wav_io.getvalue(), 'audio/wav')}
-                 response = requests.post(FASTAPI_STT_URL, files=files)
-                 response.raise_for_status() # Raise HTTPError for bad responses
-
-                 transcription_result = response.json()
-                 transcribed_text = transcription_result.get("transcribed_text", "").strip()
-                 st.session_state.transcribed_text = transcribed_text # Store transcribed text
-
-                 logger.info(f"Transcription received: {transcribed_text[:100]}...")
-                 if transcribed_text:
-                     st.success("Transcription complete!")
-                 else:
-                     st.warning("No clear speech detected or transcription resulted in empty text.")
-             except requests.exceptions.RequestException as e:
-                 st.error(f"Error sending audio to STT backend: {e}")
-                 logger.error(f"STT Backend error: {e}", exc_info=True)
-                 st.session_state.transcribed_text = "" # Clear on error
-             except Exception as e:
-                 st.error(f"An unexpected error occurred during audio processing or STT: {e}")
-                 logger.exception("Unexpected error in STT processing.")
-                 st.session_state.transcribed_text = "" # Clear on error
          else:
-             st.warning("No audio was recorded during the session.")
-             st.session_state.transcribed_text = ""
-
-         # Reset WebRTC state to idle after processing is complete
-         st.session_state.webrtc_state = "idle"
-         st.rerun() # Rerun to update the text area with transcription and reset UI
-     elif st.session_state.webrtc_state == "processing_audio":
-         st.warning("WebRTC context or audio processor was not available for transcription. Retrying or check permissions.")
-         st.session_state.webrtc_state = "idle" # Reset for next attempt
-         st.rerun()
-
-
- # --- Footer ---
- st.markdown("---")
- st.caption("Powered by Ollama, FastAPI, Streamlit, and WebRTC.")
  import logging
+ import logging.handlers
+ import queue
+ import threading
+ import time
+ import urllib.request
+ import os
+ from collections import deque
+ from pathlib import Path
+ from typing import List
+
+ import av
  import numpy as np
+ import pydub
+ import streamlit as st
+ from twilio.rest import Client
+
+ from streamlit_webrtc import WebRtcMode, webrtc_streamer, AudioProcessorBase
+
+ HERE = Path(__file__).parent

  logger = logging.getLogger(__name__)

+
+ # This code is based on https://github.com/streamlit/demo-self-driving/blob/230245391f2dda0cb464008195a470751c01770b/streamlit_app.py#L48 # noqa: E501
+ def download_file(url, download_to: Path, expected_size=None):
+     # Don't download the file twice.
+     # (If possible, verify the download using the file length.)
+     if download_to.exists():
+         if expected_size:
+             if download_to.stat().st_size == expected_size:
+                 return
+         else:
+             st.info(f"{url} is already downloaded.")
+             if not st.button("Download again?"):
+                 return
+
+     download_to.parent.mkdir(parents=True, exist_ok=True)
+
+     # These are handles to two visual elements to animate.
+     weights_warning, progress_bar = None, None
      try:
+         weights_warning = st.warning("Downloading %s..." % url)
+         progress_bar = st.progress(0)
+         with open(download_to, "wb") as output_file:
+             with urllib.request.urlopen(url) as response:
+                 length = int(response.info()["Content-Length"])
+                 counter = 0.0
+                 MEGABYTES = 2.0 ** 20.0
+                 while True:
+                     data = response.read(8192)
+                     if not data:
+                         break
+                     counter += len(data)
+                     output_file.write(data)
+
+                     # We perform animation by overwriting the elements.
+                     weights_warning.warning(
+                         "Downloading %s... (%6.2f/%6.2f MB)"
+                         % (url, counter / MEGABYTES, length / MEGABYTES)
+                     )
+                     progress_bar.progress(min(counter / length, 1.0))
+     # Finally, we remove these visual elements by calling .empty().
+     finally:
+         if weights_warning is not None:
+             weights_warning.empty()
+         if progress_bar is not None:
+             progress_bar.empty()
+
+
+ # This code is based on https://github.com/whitphx/streamlit-webrtc/blob/c1fe3c783c9e8042ce0c95d789e833233fd82e74/sample_utils/turn.py
+ @st.cache_data  # type: ignore
+ def get_ice_servers():
+     """Use Twilio's TURN server because Streamlit Community Cloud has changed
+     its infrastructure and WebRTC connection cannot be established without TURN server now. # noqa: E501
+     We considered Open Relay Project (https://www.metered.ca/tools/openrelay/) too,
+     but it is not stable and hardly works as some people reported like https://github.com/aiortc/aiortc/issues/832#issuecomment-1482420656 # noqa: E501
+     See https://github.com/whitphx/streamlit-webrtc/issues/1213
      """
+
+     # Ref: https://www.twilio.com/docs/stun-turn/api
+     try:
+         account_sid = os.environ["TWILIO_ACCOUNT_SID"]
+         auth_token = os.environ["TWILIO_AUTH_TOKEN"]
+     except KeyError:
+         logger.warning(
+             "Twilio credentials are not set. Fallback to a free STUN server from Google." # noqa: E501
+         )
+         return [{"urls": ["stun:stun.l.google.com:19302"]}]
+
+     client = Client(account_sid, auth_token)
+
+     token = client.tokens.create()
+
+     return token.ice_servers
+
+
+
+ def main():
+     st.header("Real Time Speech-to-Text")
+     st.markdown(
          """
+ This demo app is using [DeepSpeech](https://github.com/mozilla/DeepSpeech),
+ an open speech-to-text engine.
+
+ A pre-trained model released with
+ [v0.9.3](https://github.com/mozilla/DeepSpeech/releases/tag/v0.9.3),
+ trained on American English is being served.
+ """
      )
+
+     # https://github.com/mozilla/DeepSpeech/releases/tag/v0.9.3
+     MODEL_URL = "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm" # noqa
+     LANG_MODEL_URL = "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer" # noqa
+     MODEL_LOCAL_PATH = HERE / "models/deepspeech-0.9.3-models.pbmm"
+     LANG_MODEL_LOCAL_PATH = HERE / "models/deepspeech-0.9.3-models.scorer"
+
+     download_file(MODEL_URL, MODEL_LOCAL_PATH, expected_size=188915987)
+     download_file(LANG_MODEL_URL, LANG_MODEL_LOCAL_PATH, expected_size=953363776)
+
+     lm_alpha = 0.931289039105002
+     lm_beta = 1.1834137581510284
+     beam = 100
+
+     sound_only_page = "Sound only (sendonly)"
+     with_video_page = "With video (sendrecv)"
+     app_mode = st.selectbox("Choose the app mode", [sound_only_page, with_video_page])
+
+     if app_mode == sound_only_page:
+         app_sst(
+             str(MODEL_LOCAL_PATH), str(LANG_MODEL_LOCAL_PATH), lm_alpha, lm_beta, beam
+         )
+     elif app_mode == with_video_page:
+         app_sst_with_video(
+             str(MODEL_LOCAL_PATH), str(LANG_MODEL_LOCAL_PATH), lm_alpha, lm_beta, beam
+         )
+
+
+ def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int):
      webrtc_ctx = webrtc_streamer(
+         key="speech-to-text",
+         mode=WebRtcMode.SENDONLY,
+         audio_receiver_size=1024,
+         rtc_configuration={"iceServers": get_ice_servers()},
+         media_stream_constraints={"video": False, "audio": True},
      )

+     status_indicator = st.empty()
+
+     if not webrtc_ctx.state.playing:
+         return
+
+     status_indicator.write("Loading...")
+     text_output = st.empty()
+     stream = None
+
+     while True:
+         if webrtc_ctx.audio_receiver:
+             if stream is None:
+                 from deepspeech import Model
+
+                 model = Model(model_path)
+                 model.enableExternalScorer(lm_path)
+                 model.setScorerAlphaBeta(lm_alpha, lm_beta)
+                 model.setBeamWidth(beam)
+
+                 stream = model.createStream()
+
+                 status_indicator.write("Model loaded.")
+
+             sound_chunk = pydub.AudioSegment.empty()
              try:
+                 audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
+             except queue.Empty:
+                 time.sleep(0.1)
+                 status_indicator.write("No frame arrived.")
+                 continue
+
+             status_indicator.write("Running. Say something!")
+
+             for audio_frame in audio_frames:
+                 sound = pydub.AudioSegment(
+                     data=audio_frame.to_ndarray().tobytes(),
+                     sample_width=audio_frame.format.bytes,
+                     frame_rate=audio_frame.sample_rate,
+                     channels=len(audio_frame.layout.channels),
+                 )
+                 sound_chunk += sound
+
+             if len(sound_chunk) > 0:
+                 sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
+                     model.sampleRate()
                  )
+                 buffer = np.array(sound_chunk.get_array_of_samples())
+                 stream.feedAudioContent(buffer)
+                 text = stream.intermediateDecode()
+                 text_output.markdown(f"**Text:** {text}")
          else:
+             status_indicator.write("AudioReceiver is not set. Abort.")
+             break
+
+
+ def app_sst_with_video(
+     model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int
+ ):
+     frames_deque_lock = threading.Lock()
+     frames_deque: deque = deque([])
+
+     async def queued_audio_frames_callback(
+         frames: List[av.AudioFrame],
+     ) -> List[av.AudioFrame]:
+         with frames_deque_lock:
+             frames_deque.extend(frames)
+
+         # Return empty frames to be silent.
+         new_frames = []
+         for frame in frames:
+             input_array = frame.to_ndarray()
+             new_frame = av.AudioFrame.from_ndarray(
+                 np.zeros(input_array.shape, dtype=input_array.dtype),
+                 layout=frame.layout.name,
+             )
+             new_frame.sample_rate = frame.sample_rate
+             new_frames.append(new_frame)
+
+         return new_frames
+
+     webrtc_ctx = webrtc_streamer(
+         key="speech-to-text-w-video",
+         mode=WebRtcMode.SENDRECV,
+         queued_audio_frames_callback=queued_audio_frames_callback,
+         rtc_configuration={"iceServers": get_ice_servers()},
+         media_stream_constraints={"video": True, "audio": True},
+     )
+
+     status_indicator = st.empty()
+
+     if not webrtc_ctx.state.playing:
+         return
+
+     status_indicator.write("Loading...")
+     text_output = st.empty()
+     stream = None
+
+     while True:
+         if webrtc_ctx.state.playing:
+             if stream is None:
+                 from deepspeech import Model
+
+                 model = Model(model_path)
+                 model.enableExternalScorer(lm_path)
+                 model.setScorerAlphaBeta(lm_alpha, lm_beta)
+                 model.setBeamWidth(beam)
+
+                 stream = model.createStream()
+
+                 status_indicator.write("Model loaded.")
+
+             sound_chunk = pydub.AudioSegment.empty()
+
+             audio_frames = []
+             with frames_deque_lock:
+                 while len(frames_deque) > 0:
+                     frame = frames_deque.popleft()
+                     audio_frames.append(frame)
+
+             if len(audio_frames) == 0:
+                 time.sleep(0.1)
+                 status_indicator.write("No frame arrived.")
+                 continue
+
+             status_indicator.write("Running. Say something!")
+
+             for audio_frame in audio_frames:
+                 sound = pydub.AudioSegment(
+                     data=audio_frame.to_ndarray().tobytes(),
+                     sample_width=audio_frame.format.bytes,
+                     frame_rate=audio_frame.sample_rate,
+                     channels=len(audio_frame.layout.channels),
+                 )
+                 sound_chunk += sound
+
+             if len(sound_chunk) > 0:
+                 sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
+                     model.sampleRate()
+                 )
+                 buffer = np.array(sound_chunk.get_array_of_samples())
+                 stream.feedAudioContent(buffer)
+                 text = stream.intermediateDecode()
+                 text_output.markdown(f"**Text:** {text}")
+         else:
+             status_indicator.write("Stopped.")
+             break
+
+
+ if __name__ == "__main__":
+     import os
+
+     DEBUG = os.environ.get("DEBUG", "false").lower() not in ["false", "no", "0"]
+
+     logging.basicConfig(
+         format="[%(asctime)s] %(levelname)7s from %(name)s in %(pathname)s:%(lineno)d: "
+         "%(message)s",
+         force=True,
+     )
+
+     logger.setLevel(level=logging.DEBUG if DEBUG else logging.INFO)
+
+     st_webrtc_logger = logging.getLogger("streamlit_webrtc")
+     st_webrtc_logger.setLevel(logging.DEBUG)
+
+     fsevents_logger = logging.getLogger("fsevents")
+     fsevents_logger.setLevel(logging.WARNING)
+
+     main()
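
Both app_sst() and app_sst_with_video() in the new file repeat the same conversion before decoding: incoming av.AudioFrame objects are wrapped in pydub segments, concatenated, downmixed to mono, resampled to the model's sample rate, and handed to the DeepSpeech stream as 16-bit samples. A minimal standalone sketch of that step follows; the helper name frames_to_pcm and the 16000 Hz default are illustrative assumptions, not part of the committed file.

    import numpy as np
    import pydub

    def frames_to_pcm(audio_frames, target_rate=16000):
        # Hypothetical helper mirroring the per-iteration loop in app_sst() / app_sst_with_video().
        sound_chunk = pydub.AudioSegment.empty()
        for audio_frame in audio_frames:
            sound_chunk += pydub.AudioSegment(
                data=audio_frame.to_ndarray().tobytes(),
                sample_width=audio_frame.format.bytes,
                frame_rate=audio_frame.sample_rate,
                channels=len(audio_frame.layout.channels),
            )
        if len(sound_chunk) == 0:
            return np.array([], dtype=np.int16)
        # The DeepSpeech 0.9.3 models expect mono 16-bit PCM at model.sampleRate(), i.e. 16 kHz.
        sound_chunk = sound_chunk.set_channels(1).set_frame_rate(target_rate)
        return np.array(sound_chunk.get_array_of_samples())

The returned buffer is what stream.feedAudioContent(buffer) consumes before stream.intermediateDecode() produces the partial transcript shown in the UI.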
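
The new entry point is configured entirely through environment variables: TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN let get_ice_servers() request Twilio TURN credentials (without them the app falls back to Google's public STUN server), and DEBUG switches the module logger to DEBUG level. A hedged local-launch sketch, where the placeholder values and the subprocess call are assumptions rather than part of the repository:

    import os
    import subprocess

    os.environ["TWILIO_ACCOUNT_SID"] = "<your-account-sid>"  # assumption: only needed for TURN
    os.environ["TWILIO_AUTH_TOKEN"] = "<your-auth-token>"
    os.environ["DEBUG"] = "true"  # any value other than "false"/"no"/"0" enables DEBUG logging

    subprocess.run(["streamlit", "run", "streamlit_app.py"], check=True)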