Aakash jammula committed
Commit 3d326e4 · 1 Parent(s): 06adce9
Files changed (3):
  1. Dockerfile +12 -14
  2. app.py +600 -748
  3. requirements.txt +2 -0
Dockerfile CHANGED
@@ -6,29 +6,27 @@ RUN apt-get update && \
      portaudio19-dev build-essential ffmpeg \
      && rm -rf /var/lib/apt/lists/*
 
- # 2) Create non-root user and workdir
  WORKDIR /app
- RUN useradd -m -u 1000 user
 
- # 3) Redirect HF & tqdm caches / disable progress
- ENV PATH="/home/user/.local/bin:$PATH"
- ENV HF_HOME=/app/.cache/huggingface
- ENV HF_HUB_DISABLE_PROGRESS_BARS=1
- ENV TQDM_DISABLE=1
+ # 2) Create non-root user, set up cache/env
+ RUN useradd -m -u 1000 user
+ ENV HF_HOME=/app/.cache/huggingface \
+     HF_HUB_DISABLE_PROGRESS_BARS=1 \
+     TQDM_DISABLE=1 \
+     PATH="/home/user/.local/bin:$PATH"
 
- # 4) Install Python deps as root
+ # 3) Install Python deps
  COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt
 
- # 5) Copy app code and fix permissions
+ # 4) Copy code and fix ownership
  COPY . .
  RUN chown -R user:user /app
 
- # 6) Switch to non-root
  USER user
 
- # 7) Expose STT WebSocket ports
- EXPOSE 8011 8012
+ # 5) Expose HTTP + WS port
+ EXPOSE 7860
 
- # 8) Launch server with tiny.en
- CMD ["python", "app.py", "-m", "tiny.en", "-c", "8011", "-d", "8012"]
+ # 6) Launch via the Python entrypoint so our __main__ block picks up "-m tiny.en"
+ CMD ["python", "app.py", "-m", "tiny.en", "--server_host", "0.0.0.0", "--server_port", "7860"]
app.py CHANGED
@@ -1,80 +1,17 @@
  """
- Speech-to-Text (STT) Server with Real-Time Transcription and WebSocket Interface
-
- This server provides real-time speech-to-text (STT) transcription using the RealtimeSTT library. It allows clients to connect via WebSocket to send audio data and receive real-time transcription updates. The server supports configurable audio recording parameters, voice activity detection (VAD), and wake word detection. It is designed to handle continuous transcription as well as post-recording processing, enabling real-time feedback with the option to improve final transcription quality after the complete sentence is recognized.
-
- ### Features:
- - Real-time transcription using pre-configured or user-defined STT models.
- - WebSocket-based communication for control and data handling.
- - Flexible recording and transcription options, including configurable pauses for sentence detection.
- - Supports Silero and WebRTC VAD for robust voice activity detection.
-
- ### Starting the Server:
- You can start the server using the command-line interface (CLI) command `stt-server`, passing the desired configuration options.
-
- ```bash
- stt-server [OPTIONS]
- ```
-
- ### Available Parameters:
- - `-m, --model`: Model path or size; default 'large-v2'.
- - `-r, --rt-model, --realtime_model_type`: Real-time model size; default 'tiny.en'.
- - `-l, --lang, --language`: Language code for transcription; default 'en'.
- - `-i, --input-device, --input_device_index`: Audio input device index; default 1.
- - `-c, --control, --control_port`: WebSocket control port; default 8011.
- - `-d, --data, --data_port`: WebSocket data port; default 8012.
- - `-w, --wake_words`: Wake word(s) to trigger listening; default "".
- - `-D, --debug`: Enable debug logging.
- - `-W, --write`: Save audio to WAV file.
- - `-s, --silence_timing`: Enable dynamic silence duration for sentence detection; default True.
- - `-b, --batch, --batch_size`: Batch size for inference; default 16.
- - `--root, --download_root`: Specifies the root path where the Whisper models are downloaded to.
- - `--silero_sensitivity`: Silero VAD sensitivity (0-1); default 0.05.
- - `--silero_use_onnx`: Use Silero ONNX model; default False.
- - `--webrtc_sensitivity`: WebRTC VAD sensitivity (0-3); default 3.
- - `--min_length_of_recording`: Minimum recording duration in seconds; default 1.1.
- - `--min_gap_between_recordings`: Min time between recordings in seconds; default 0.
- - `--enable_realtime_transcription`: Enable real-time transcription; default True.
- - `--realtime_processing_pause`: Pause between audio chunk processing; default 0.02.
- - `--silero_deactivity_detection`: Use Silero for end-of-speech detection; default True.
- - `--early_transcription_on_silence`: Start transcription after silence in seconds; default 0.2.
- - `--beam_size`: Beam size for main model; default 5.
- - `--beam_size_realtime`: Beam size for real-time model; default 3.
- - `--init_realtime_after_seconds`: Initial waiting time for realtime transcription; default 0.2.
- - `--realtime_batch_size`: Batch size for the real-time transcription model; default 16.
- - `--initial_prompt`: Initial main transcription guidance prompt.
- - `--initial_prompt_realtime`: Initial realtime transcription guidance prompt.
- - `--end_of_sentence_detection_pause`: Silence duration for sentence end detection; default 0.45.
- - `--unknown_sentence_detection_pause`: Pause duration for incomplete sentence detection; default 0.7.
- - `--mid_sentence_detection_pause`: Pause for mid-sentence break; default 2.0.
- - `--wake_words_sensitivity`: Wake word detection sensitivity (0-1); default 0.5.
- - `--wake_word_timeout`: Wake word timeout in seconds; default 5.0.
- - `--wake_word_activation_delay`: Delay before wake word activation; default 20.
- - `--wakeword_backend`: Backend for wake word detection; default 'none'.
- - `--openwakeword_model_paths`: Paths to OpenWakeWord models.
- - `--openwakeword_inference_framework`: OpenWakeWord inference framework; default 'tensorflow'.
- - `--wake_word_buffer_duration`: Wake word buffer duration in seconds; default 1.0.
- - `--use_main_model_for_realtime`: Use main model for real-time transcription.
- - `--use_extended_logging`: Enable extensive log messages.
- - `--logchunks`: Log incoming audio chunks.
- - `--compute_type`: Type of computation to use.
- - `--input_device_index`: Index of the audio input device.
- - `--gpu_device_index`: Index of the GPU device.
- - `--device`: Device to use for computation.
- - `--handle_buffer_overflow`: Handle buffer overflow during transcription.
- - `--suppress_tokens`: Suppress tokens during transcription.
- - `--allowed_latency_limit`: Allowed latency limit for real-time transcription.
- - `--faster_whisper_vad_filter`: Enable VAD filter for Faster Whisper; default False.
-
- ### WebSocket Interface:
- The server supports two WebSocket connections:
- 1. **Control WebSocket**: Used to send and receive commands, such as setting parameters or calling recorder methods.
- 2. **Data WebSocket**: Used to send audio data for transcription and receive real-time transcription updates.
-
- The server will broadcast real-time transcription updates to all connected clients on the data WebSocket.
  """
 
  from difflib import SequenceMatcher
  from collections import deque
  from datetime import datetime
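The removed docstring above is the only written description of the old two-socket protocol, so a concrete illustration may help. A minimal listener for the (now deleted) data socket might have looked like this sketch, using the `websockets` package the old server itself imported; the port and message shapes come from the docstring and the handlers later in this diff:

```python
# Sketch of a client for the OLD data WebSocket (removed by this commit).
# Port 8012 and the JSON message shapes come from the docstring above.
import asyncio
import json
import websockets

async def listen_for_transcripts():
    async with websockets.connect("ws://localhost:8012") as data_ws:
        while True:
            update = json.loads(await data_ws.recv())
            # e.g. {'type': 'realtime', 'text': ...} or {'type': 'fullSentence', ...}
            print(update["type"], update.get("text", ""))

asyncio.run(listen_for_transcripts())
```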
@@ -83,709 +20,373 @@ import asyncio
  import pyaudio
  import base64
  import sys
 
- debug_logging = False
- extended_logging = False
- send_recorded_chunk = False
- log_incoming_chunks = False
- silence_timing = False
- writechunks = False
- wav_file = None
-
- hard_break_even_on_background_noise = 3.0
- hard_break_even_on_background_noise_min_texts = 3
- hard_break_even_on_background_noise_min_similarity = 0.99
- hard_break_even_on_background_noise_min_chars = 15
-
-
- text_time_deque = deque()
- loglevel = logging.WARNING
-
- FORMAT = pyaudio.paInt16
- CHANNELS = 1
-
-
- if sys.platform == 'win32':
-     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
-
-
- # Define ANSI color codes for terminal output
  class bcolors:
-     HEADER = '\033[95m'   # Magenta
-     OKBLUE = '\033[94m'   # Blue
-     OKCYAN = '\033[96m'   # Cyan
-     OKGREEN = '\033[92m'  # Green
-     WARNING = '\033[93m'  # Yellow
-     FAIL = '\033[91m'     # Red
-     ENDC = '\033[0m'      # Reset to default
      BOLD = '\033[1m'
      UNDERLINE = '\033[4m'
 
- print(f"{bcolors.BOLD}{bcolors.OKCYAN}Starting server, please wait...{bcolors.ENDC}")
-
- # Initialize colorama
- from colorama import init, Fore, Style
- init()
-
- from RealtimeSTT import AudioToTextRecorder
- from scipy.signal import resample
- import numpy as np
- import websockets
- import threading
- import logging
- import wave
- import json
- import time
 
- global_args = None
- recorder = None
- recorder_config = {}
- recorder_ready = threading.Event()
- recorder_thread = None
- stop_recorder = False
- prev_text = ""
-
- # Define allowed methods and parameters for security
- allowed_methods = [
-     'set_microphone',
-     'abort',
-     'stop',
-     'clear_audio_queue',
-     'wakeup',
-     'shutdown',
-     'text',
- ]
- allowed_parameters = [
-     'language',
-     'silero_sensitivity',
-     'wake_word_activation_delay',
-     'post_speech_silence_duration',
-     'listen_start',
-     'recording_stop_time',
-     'last_transcription_bytes',
-     'last_transcription_bytes_b64',
-     'speech_end_silence_start',
-     'is_recording',
-     'use_wake_words',
- ]
-
- # Queues and connections for control and data
- control_connections = set()
- data_connections = set()
- control_queue = asyncio.Queue()
- audio_queue = asyncio.Queue()
 
  def preprocess_text(text):
-     # Remove leading whitespaces
      text = text.lstrip()
-
-     # Remove starting ellipses if present
      if text.startswith("..."):
          text = text[3:]
-
      if text.endswith("...'."):
          text = text[:-1]
-
      if text.endswith("...'"):
          text = text[:-1]
-
-     # Remove any leading whitespaces again after ellipses removal
      text = text.lstrip()
-
-     # Uppercase the first letter
      if text:
          text = text[0].upper() + text[1:]
-
      return text
 
  def debug_print(message):
-     if debug_logging:
          timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-         thread_name = threading.current_thread().name
          print(f"{Fore.CYAN}[DEBUG][{timestamp}][{thread_name}] {message}{Style.RESET_ALL}", file=sys.stderr)
 
  def format_timestamp_ns(timestamp_ns: int) -> str:
-     # Split into whole seconds and the nanosecond remainder
      seconds = timestamp_ns // 1_000_000_000
      remainder_ns = timestamp_ns % 1_000_000_000
-
-     # Convert seconds part into a datetime object (local time)
      dt = datetime.fromtimestamp(seconds)
-
-     # Format the main time as HH:MM:SS
      time_str = dt.strftime("%H:%M:%S")
-
-     # For instance, if you want milliseconds, divide the remainder by 1e6 and format as 3-digit
      milliseconds = remainder_ns // 1_000_000
-     formatted_timestamp = f"{time_str}.{milliseconds:03d}"
 
-     return formatted_timestamp
 
- def text_detected(text, loop):
-     global prev_text
      text = preprocess_text(text)
 
-     if silence_timing:
-         def ends_with_ellipsis(text: str):
-             if text.endswith("..."):
-                 return True
-             if len(text) > 1 and text[:-1].endswith("..."):
-                 return True
-             return False
-
-         def sentence_end(text: str):
-             sentence_end_marks = ['.', '!', '?', '。']
-             if text and text[-1] in sentence_end_marks:
-                 return True
-             return False
-
          if ends_with_ellipsis(text):
-             recorder.post_speech_silence_duration = global_args.mid_sentence_detection_pause
-         elif sentence_end(text) and sentence_end(prev_text) and not ends_with_ellipsis(prev_text):
-             recorder.post_speech_silence_duration = global_args.end_of_sentence_detection_pause
          else:
-             recorder.post_speech_silence_duration = global_args.unknown_sentence_detection_pause
 
-         # Append the new text with its timestamp
          current_time = time.time()
-         text_time_deque.append((current_time, text))
-
-         # Remove texts older than hard_break_even_on_background_noise seconds
-         while text_time_deque and text_time_deque[0][0] < current_time - hard_break_even_on_background_noise:
-             text_time_deque.popleft()
-
-         # Check if at least hard_break_even_on_background_noise_min_texts texts have arrived within the last hard_break_even_on_background_noise seconds
-         if len(text_time_deque) >= hard_break_even_on_background_noise_min_texts:
-             texts = [t[1] for t in text_time_deque]
-             first_text = texts[0]
-             last_text = texts[-1]
 
-             # Compute the similarity ratio between the first and last texts
              similarity = SequenceMatcher(None, first_text, last_text).ratio()
-
-             if similarity > hard_break_even_on_background_noise_min_similarity and len(first_text) > hard_break_even_on_background_noise_min_chars:
                  recorder.stop()
                  recorder.clear_audio_queue()
-                 prev_text = ""
 
-     prev_text = text
 
-     # Put the message in the audio queue to be sent to clients
-     message = json.dumps({
-         'type': 'realtime',
-         'text': text
-     })
-     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
-
-     # Get current timestamp in HH:MM:SS.nnn format
      timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
 
-     if extended_logging:
          print(f"  [{timestamp}] Realtime text: {bcolors.OKCYAN}{text}{bcolors.ENDC}\n", flush=True, end="")
      else:
-         print(f"\r[{timestamp}] {bcolors.OKCYAN}{text}{bcolors.ENDC}", flush=True, end='')
 
- def on_recording_start(loop):
-     message = json.dumps({
-         'type': 'recording_start'
-     })
-     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
- def on_recording_stop(loop):
-     message = json.dumps({
-         'type': 'recording_stop'
-     })
-     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
- def on_vad_detect_start(loop):
-     message = json.dumps({
-         'type': 'vad_detect_start'
-     })
-     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
- def on_vad_detect_stop(loop):
-     message = json.dumps({
-         'type': 'vad_detect_stop'
-     })
-     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
- def on_wakeword_detected(loop):
-     message = json.dumps({
-         'type': 'wakeword_detected'
-     })
-     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
- def on_wakeword_detection_start(loop):
-     message = json.dumps({
-         'type': 'wakeword_detection_start'
-     })
-     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
- def on_wakeword_detection_end(loop):
-     message = json.dumps({
-         'type': 'wakeword_detection_end'
-     })
-     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
- def on_transcription_start(_audio_bytes, loop):
      bytes_b64 = base64.b64encode(_audio_bytes.tobytes()).decode('utf-8')
      message = json.dumps({
          'type': 'transcription_start',
          'audio_bytes_base64': bytes_b64
      })
-     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
- def on_turn_detection_start(loop):
      print("&&& stt_server on_turn_detection_start")
-     message = json.dumps({
-         'type': 'start_turn_detection'
-     })
-     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
- def on_turn_detection_stop(loop):
      print("&&& stt_server on_turn_detection_stop")
-     message = json.dumps({
-         'type': 'stop_turn_detection'
-     })
-     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
-
-
- # def on_realtime_transcription_update(text, loop):
- #     # Send real-time transcription updates to the client
- #     text = preprocess_text(text)
- #     message = json.dumps({
- #         'type': 'realtime_update',
- #         'text': text
- #     })
- #     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
-
- # def on_recorded_chunk(chunk, loop):
- #     if send_recorded_chunk:
- #         bytes_b64 = base64.b64encode(chunk.tobytes()).decode('utf-8')
- #         message = json.dumps({
- #             'type': 'recorded_chunk',
- #             'bytes': bytes_b64
- #         })
- #         asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
-
- # Define the server's arguments
- def parse_arguments():
-     global debug_logging, extended_logging, loglevel, writechunks, log_incoming_chunks, dynamic_silence_timing
 
-     import argparse
      parser = argparse.ArgumentParser(description='Start the Speech-to-Text (STT) server with various configuration options.')
-
-     parser.add_argument('-m', '--model', type=str, default='large-v2',
-                         help='Path to the STT model or model size. Options include: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, or any huggingface CTranslate2 STT model such as deepdml/faster-whisper-large-v3-turbo-ct2. Default is large-v2.')
-
-     parser.add_argument('-r', '--rt-model', '--realtime_model_type', type=str, default='tiny',
-                         help='Model size for real-time transcription. Options same as --model. This is used only if real-time transcription is enabled (enable_realtime_transcription). Default is tiny.en.')
 
-     parser.add_argument('-l', '--lang', '--language', type=str, default='en',
-                         help='Language code for the STT model to transcribe in a specific language. Leave this empty for auto-detection based on input audio. Default is en. List of supported language codes: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L11-L110')
-
-     parser.add_argument('-i', '--input-device', '--input-device-index', type=int, default=1,
-                         help='Index of the audio input device to use. Use this option to specify a particular microphone or audio input device based on your system. Default is 1.')
-
-     parser.add_argument('-c', '--control', '--control_port', type=int, default=8011,
-                         help='The port number used for the control WebSocket connection. Control connections are used to send and receive commands to the server. Default is port 8011.')
-
-     parser.add_argument('-d', '--data', '--data_port', type=int, default=8012,
-                         help='The port number used for the data WebSocket connection. Data connections are used to send audio data and receive transcription updates in real time. Default is port 8012.')
-
-     parser.add_argument('-w', '--wake_words', type=str, default="",
-                         help='Specify the wake word(s) that will trigger the server to start listening. For example, setting this to "Jarvis" will make the system start transcribing when it detects the wake word "Jarvis". Default is "Jarvis".')
-
-     parser.add_argument('-D', '--debug', action='store_true', help='Enable debug logging for detailed server operations')
-
-     parser.add_argument('--debug_websockets', action='store_true', help='Enable debug logging for detailed server websocket operations')
-
-     parser.add_argument('-W', '--write', metavar='FILE', help='Save received audio to a WAV file')
-
-     parser.add_argument('-b', '--batch', '--batch_size', type=int, default=16, help='Batch size for inference. This parameter controls the number of audio chunks processed in parallel during transcription. Default is 16.')
-
-     parser.add_argument('--root', '--download_root', type=str, default=None, help='Specifies the root path where the Whisper models are downloaded to. Default is None.')
-
-     parser.add_argument('-s', '--silence_timing', action='store_true', default=True,
-                         help='Enable dynamic adjustment of silence duration for sentence detection. Adjusts post-speech silence duration based on detected sentence structure and punctuation. Default is False.')
-
-     parser.add_argument('--init_realtime_after_seconds', type=float, default=0.2,
-                         help='The initial waiting time in seconds before real-time transcription starts. This delay helps prevent false positives at the beginning of a session. Default is 0.2 seconds.')
-
-     parser.add_argument('--realtime_batch_size', type=int, default=16,
-                         help='Batch size for the real-time transcription model. This parameter controls the number of audio chunks processed in parallel during real-time transcription. Default is 16.')
-
-     parser.add_argument('--initial_prompt_realtime', type=str, default="", help='Initial prompt that guides the real-time transcription model to produce transcriptions in a particular style or format.')
-
-     parser.add_argument('--silero_sensitivity', type=float, default=0.05,
-                         help='Sensitivity level for Silero Voice Activity Detection (VAD), with a range from 0 to 1. Lower values make the model less sensitive, useful for noisy environments. Default is 0.05.')
-
-     parser.add_argument('--silero_use_onnx', action='store_true', default=False,
-                         help='Enable ONNX version of Silero model for faster performance with lower resource usage. Default is False.')
-
-     parser.add_argument('--webrtc_sensitivity', type=int, default=3,
-                         help='Sensitivity level for WebRTC Voice Activity Detection (VAD), with a range from 0 to 3. Higher values make the model less sensitive, useful for cleaner environments. Default is 3.')
-
-     parser.add_argument('--min_length_of_recording', type=float, default=1.1,
-                         help='Minimum duration of valid recordings in seconds. This prevents very short recordings from being processed, which could be caused by noise or accidental sounds. Default is 1.1 seconds.')
-
-     parser.add_argument('--min_gap_between_recordings', type=float, default=0,
-                         help='Minimum time (in seconds) between consecutive recordings. Setting this helps avoid overlapping recordings when there’s a brief silence between them. Default is 0 seconds.')
-
-     parser.add_argument('--enable_realtime_transcription', action='store_true', default=True,
-                         help='Enable continuous real-time transcription of audio as it is received. When enabled, transcriptions are sent in near real-time. Default is True.')
-
-     parser.add_argument('--realtime_processing_pause', type=float, default=0.02,
-                         help='Time interval (in seconds) between processing audio chunks for real-time transcription. Lower values increase responsiveness but may put more load on the CPU. Default is 0.02 seconds.')
-
-     parser.add_argument('--silero_deactivity_detection', action='store_true', default=True,
-                         help='Use the Silero model for end-of-speech detection. This option can provide more robust silence detection in noisy environments, though it consumes more GPU resources. Default is True.')
-
-     parser.add_argument('--early_transcription_on_silence', type=float, default=0.2,
-                         help='Start transcription after the specified seconds of silence. This is useful when you want to trigger transcription mid-speech when there is a brief pause. Should be lower than post_speech_silence_duration. Set to 0 to disable. Default is 0.2 seconds.')
-
-     parser.add_argument('--beam_size', type=int, default=5,
-                         help='Beam size for the main transcription model. Larger values may improve transcription accuracy but increase the processing time. Default is 5.')
-
-     parser.add_argument('--beam_size_realtime', type=int, default=3,
-                         help='Beam size for the real-time transcription model. A smaller beam size allows for faster real-time processing but may reduce accuracy. Default is 3.')
-
-     parser.add_argument('--initial_prompt', type=str,
-                         default="Incomplete thoughts should end with '...'. Examples of complete thoughts: 'The sky is blue.' 'She walked home.' Examples of incomplete thoughts: 'When the sky...' 'Because he...'",
-                         help='Initial prompt that guides the transcription model to produce transcriptions in a particular style or format. The default provides instructions for handling sentence completions and ellipsis usage.')
-
-     parser.add_argument('--end_of_sentence_detection_pause', type=float, default=0.45,
-                         help='The duration of silence (in seconds) that the model should interpret as the end of a sentence. This helps the system detect when to finalize the transcription of a sentence. Default is 0.45 seconds.')
-
-     parser.add_argument('--unknown_sentence_detection_pause', type=float, default=0.7,
-                         help='The duration of pause (in seconds) that the model should interpret as an incomplete or unknown sentence. This is useful for identifying when a sentence is trailing off or unfinished. Default is 0.7 seconds.')
-
-     parser.add_argument('--mid_sentence_detection_pause', type=float, default=2.0,
-                         help='The duration of pause (in seconds) that the model should interpret as a mid-sentence break. Longer pauses can indicate a pause in speech but not necessarily the end of a sentence. Default is 2.0 seconds.')
-
-     parser.add_argument('--wake_words_sensitivity', type=float, default=0.5,
-                         help='Sensitivity level for wake word detection, with a range from 0 (most sensitive) to 1 (least sensitive). Adjust this value based on your environment to ensure reliable wake word detection. Default is 0.5.')
-
-     parser.add_argument('--wake_word_timeout', type=float, default=5.0,
-                         help='Maximum time in seconds that the system will wait for a wake word before timing out. After this timeout, the system stops listening for wake words until reactivated. Default is 5.0 seconds.')
-
-     parser.add_argument('--wake_word_activation_delay', type=float, default=0,
-                         help='The delay in seconds before the wake word detection is activated after the system starts listening. This prevents false positives during the start of a session. Default is 0 seconds.')
 
-     parser.add_argument('--wakeword_backend', type=str, default='none',
-                         help='The backend used for wake word detection. You can specify different backends such as "default" or any custom implementations depending on your setup. Default is "pvporcupine".')
 
-     parser.add_argument('--openwakeword_model_paths', type=str, nargs='*',
-                         help='A list of file paths to OpenWakeWord models. This is useful if you are using OpenWakeWord for wake word detection and need to specify custom models.')
-
-     parser.add_argument('--openwakeword_inference_framework', type=str, default='tensorflow',
-                         help='The inference framework to use for OpenWakeWord models. Supported frameworks could include "tensorflow", "pytorch", etc. Default is "tensorflow".')
-
-     parser.add_argument('--wake_word_buffer_duration', type=float, default=1.0,
-                         help='Duration of the buffer in seconds for wake word detection. This sets how long the system will store the audio before and after detecting the wake word. Default is 1.0 seconds.')
-
-     parser.add_argument('--use_main_model_for_realtime', action='store_true',
-                         help='Enable this option if you want to use the main model for real-time transcription, instead of the smaller, faster real-time model. Using the main model may provide better accuracy but at the cost of higher processing time.')
-
-     parser.add_argument('--use_extended_logging', action='store_true',
-                         help='Writes extensive log messages for the recording worker, that processes the audio chunks.')
-
-     parser.add_argument('--compute_type', type=str, default='default',
-                         help='Type of computation to use. See https://opennmt.net/CTranslate2/quantization.html')
-
-     parser.add_argument('--gpu_device_index', type=int, default=0,
-                         help='Index of the GPU device to use. Default is None.')
-
-     parser.add_argument('--device', type=str, default='cuda',
-                         help='Device for model to use. Can either be "cuda" or "cpu". Default is cuda.')
-
-     parser.add_argument('--handle_buffer_overflow', action='store_true',
-                         help='Handle buffer overflow during transcription. Default is False.')
-
-     parser.add_argument('--suppress_tokens', type=int, default=[-1], nargs='*', help='Suppress tokens during transcription. Default is [-1].')
-
-     parser.add_argument('--allowed_latency_limit', type=int, default=100,
-                         help='Maximal amount of chunks that can be unprocessed in queue before discarding chunks. Default is 100.')
-
-     parser.add_argument('--faster_whisper_vad_filter', action='store_true',
-                         help='Enable VAD filter for Faster Whisper. Default is False.')
-
-     parser.add_argument('--logchunks', action='store_true', help='Enable logging of incoming audio chunks (periods)')
-
-     # Parse arguments
      args = parser.parse_args()
 
-     debug_logging = args.debug
-     extended_logging = args.use_extended_logging
-     writechunks = args.write
-     log_incoming_chunks = args.logchunks
-     dynamic_silence_timing = args.silence_timing
-
-     ws_logger = logging.getLogger('websockets')
      if args.debug_websockets:
-         # If app debug is on, let websockets be verbose too
          ws_logger.setLevel(logging.DEBUG)
-         # Ensure it uses the handler configured by basicConfig
-         ws_logger.propagate = False  # Prevent duplicate messages if it also propagates to root
      else:
-         # If app debug is off, silence websockets below WARNING
         ws_logger.setLevel(logging.WARNING)
-         ws_logger.propagate = True  # Allow WARNING/ERROR messages to reach root logger's handler
 
-     # Replace escaped newlines with actual newlines in initial_prompt
      if args.initial_prompt:
          args.initial_prompt = args.initial_prompt.replace("\\n", "\n")
-
      if args.initial_prompt_realtime:
          args.initial_prompt_realtime = args.initial_prompt_realtime.replace("\\n", "\n")
-
      return args
 
- def _recorder_thread(loop):
-     global recorder, stop_recorder
-     print(f"{bcolors.OKGREEN}Initializing RealtimeSTT server with parameters:{bcolors.ENDC}")
-     for key, value in recorder_config.items():
-         print(f"    {bcolors.OKBLUE}{key}{bcolors.ENDC}: {value}")
-     recorder = AudioToTextRecorder(**recorder_config)
-     print(f"{bcolors.OKGREEN}{bcolors.BOLD}RealtimeSTT initialized{bcolors.ENDC}")
-     recorder_ready.set()
-
-     def process_text(full_sentence):
-         global prev_text
-         prev_text = ""
          full_sentence = preprocess_text(full_sentence)
-         message = json.dumps({
-             'type': 'fullSentence',
-             'text': full_sentence
-         })
-         # Use the passed event loop here
-         asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
          timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
 
-         if extended_logging:
              print(f"  [{timestamp}] Full text: {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n", flush=True, end="")
          else:
-             print(f"\r[{timestamp}] {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n")
      try:
-         while not stop_recorder:
-             recorder.text(process_text)
-     except KeyboardInterrupt:
-         print(f"{bcolors.WARNING}Exiting application due to keyboard interrupt{bcolors.ENDC}")
 
- def decode_and_resample(
-         audio_data,
-         original_sample_rate,
-         target_sample_rate):
 
-     # Decode 16-bit PCM data to numpy array
      if original_sample_rate == target_sample_rate:
          return audio_data
-
      audio_np = np.frombuffer(audio_data, dtype=np.int16)
-
-     # Calculate the number of samples after resampling
      num_original_samples = len(audio_np)
-     num_target_samples = int(num_original_samples * target_sample_rate /
-                              original_sample_rate)
-
-     # Resample the audio
      resampled_audio = resample(audio_np, num_target_samples)
-
      return resampled_audio.astype(np.int16).tobytes()
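decode_and_resample survives the rewrite largely intact (it reappears on the new side of this diff), so a short usage sketch may be worth having: clients can send 16-bit PCM at any rate and the server normalizes it to 16 kHz. The 44.1 kHz sine-wave input below is made up for the example:

```python
# Hedged usage sketch for decode_and_resample (input data is hypothetical).
import numpy as np

sr_in, sr_out = 44100, 16000
t = np.linspace(0, 1, sr_in, endpoint=False)
pcm_44k = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16).tobytes()

pcm_16k = decode_and_resample(pcm_44k, sr_in, sr_out)
print(len(pcm_44k) // 2, "samples in ->", len(pcm_16k) // 2, "samples out")  # 44100 -> 16000
```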
 
- async def control_handler(websocket):
-     debug_print(f"New control connection from {websocket.remote_address}")
-     print(f"{bcolors.OKGREEN}Control client connected{bcolors.ENDC}")
-     global recorder
-     control_connections.add(websocket)
-     try:
-         async for message in websocket:
-             debug_print(f"Received control message: {message[:200]}...")
-             if not recorder_ready.is_set():
-                 print(f"{bcolors.WARNING}Recorder not ready{bcolors.ENDC}")
-                 continue
-             if isinstance(message, str):
-                 # Handle text message (command)
-                 try:
-                     command_data = json.loads(message)
-                     command = command_data.get("command")
-                     if command == "set_parameter":
-                         parameter = command_data.get("parameter")
-                         value = command_data.get("value")
-                         if parameter in allowed_parameters and hasattr(recorder, parameter):
-                             setattr(recorder, parameter, value)
-                             # Format the value for output
-                             if isinstance(value, float):
-                                 value_formatted = f"{value:.2f}"
-                             else:
-                                 value_formatted = value
-                             timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
-                             if extended_logging:
-                                 print(f"  [{timestamp}] {bcolors.OKGREEN}Set recorder.{parameter} to: {bcolors.OKBLUE}{value_formatted}{bcolors.ENDC}")
-                             # Optionally send a response back to the client
-                             await websocket.send(json.dumps({"status": "success", "message": f"Parameter {parameter} set to {value}"}))
-                         else:
-                             if not parameter in allowed_parameters:
-                                 print(f"{bcolors.WARNING}Parameter {parameter} is not allowed (set_parameter){bcolors.ENDC}")
-                                 await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} is not allowed (set_parameter)"}))
-                             else:
-                                 print(f"{bcolors.WARNING}Parameter {parameter} does not exist (set_parameter){bcolors.ENDC}")
-                                 await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} does not exist (set_parameter)"}))
-
-                     elif command == "get_parameter":
-                         parameter = command_data.get("parameter")
-                         request_id = command_data.get("request_id")  # Get the request_id from the command data
-                         if parameter in allowed_parameters and hasattr(recorder, parameter):
-                             value = getattr(recorder, parameter)
-                             if isinstance(value, float):
-                                 value_formatted = f"{value:.2f}"
-                             else:
-                                 value_formatted = f"{value}"
-
-                             value_truncated = value_formatted[:39] + "…" if len(value_formatted) > 40 else value_formatted
-
-                             timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
-                             if extended_logging:
-                                 print(f"  [{timestamp}] {bcolors.OKGREEN}Get recorder.{parameter}: {bcolors.OKBLUE}{value_truncated}{bcolors.ENDC}")
-                             response = {"status": "success", "parameter": parameter, "value": value}
-                             if request_id is not None:
-                                 response["request_id"] = request_id
-                             await websocket.send(json.dumps(response))
-                         else:
-                             if not parameter in allowed_parameters:
-                                 print(f"{bcolors.WARNING}Parameter {parameter} is not allowed (get_parameter){bcolors.ENDC}")
-                                 await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} is not allowed (get_parameter)"}))
-                             else:
-                                 print(f"{bcolors.WARNING}Parameter {parameter} does not exist (get_parameter){bcolors.ENDC}")
-                                 await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} does not exist (get_parameter)"}))
-                     elif command == "call_method":
-                         method_name = command_data.get("method")
-                         if method_name in allowed_methods:
-                             method = getattr(recorder, method_name, None)
-                             if method and callable(method):
-                                 args = command_data.get("args", [])
-                                 kwargs = command_data.get("kwargs", {})
-                                 method(*args, **kwargs)
-                                 timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
-                                 print(f"  [{timestamp}] {bcolors.OKGREEN}Called method recorder.{bcolors.OKBLUE}{method_name}{bcolors.ENDC}")
-                                 await websocket.send(json.dumps({"status": "success", "message": f"Method {method_name} called"}))
-                             else:
-                                 print(f"{bcolors.WARNING}Recorder does not have method {method_name}{bcolors.ENDC}")
-                                 await websocket.send(json.dumps({"status": "error", "message": f"Recorder does not have method {method_name}"}))
-                         else:
-                             print(f"{bcolors.WARNING}Method {method_name} is not allowed{bcolors.ENDC}")
-                             await websocket.send(json.dumps({"status": "error", "message": f"Method {method_name} is not allowed"}))
-                     else:
-                         print(f"{bcolors.WARNING}Unknown command: {command}{bcolors.ENDC}")
-                         await websocket.send(json.dumps({"status": "error", "message": f"Unknown command {command}"}))
-                 except json.JSONDecodeError:
-                     print(f"{bcolors.WARNING}Received invalid JSON command{bcolors.ENDC}")
-                     await websocket.send(json.dumps({"status": "error", "message": "Invalid JSON command"}))
-             else:
-                 print(f"{bcolors.WARNING}Received unknown message type on control connection{bcolors.ENDC}")
-     except websockets.exceptions.ConnectionClosed as e:
-         print(f"{bcolors.WARNING}Control client disconnected: {e}{bcolors.ENDC}")
-     finally:
-         control_connections.remove(websocket)
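control_handler defines a small JSON command protocol (set_parameter, get_parameter, call_method), gated by the allowed_parameters and allowed_methods whitelists above. The three message shapes it accepts look like this; the parameter names are taken from the whitelists and the values are illustrative:

```python
# Illustrative control messages for the (old) control_handler above.
import json

set_param = json.dumps({
    "command": "set_parameter",
    "parameter": "silero_sensitivity",  # must appear in allowed_parameters
    "value": 0.1,
})
get_param = json.dumps({
    "command": "get_parameter",
    "parameter": "is_recording",
    "request_id": 42,  # optional; echoed back so replies can be matched
})
call_method = json.dumps({
    "command": "call_method",
    "method": "clear_audio_queue",  # must appear in allowed_methods
    "args": [],
    "kwargs": {},
})
```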
 
- async def data_handler(websocket):
-     global writechunks, wav_file
-     print(f"{bcolors.OKGREEN}Data client connected{bcolors.ENDC}")
-     data_connections.add(websocket)
-     try:
-         while True:
-             message = await websocket.recv()
-             if isinstance(message, bytes):
-                 if extended_logging:
-                     debug_print(f"Received audio chunk (size: {len(message)} bytes)")
-                 elif log_incoming_chunks:
-                     print(".", end='', flush=True)
-                 # Handle binary message (audio data)
-                 metadata_length = int.from_bytes(message[:4], byteorder='little')
-                 metadata_json = message[4:4+metadata_length].decode('utf-8')
-                 metadata = json.loads(metadata_json)
-                 sample_rate = metadata['sampleRate']
-
-                 if 'server_sent_to_stt' in metadata:
-                     stt_received_ns = time.time_ns()
-                     metadata["stt_received"] = stt_received_ns
-                     metadata["stt_received_formatted"] = format_timestamp_ns(stt_received_ns)
-                     print(f"Server received audio chunk of length {len(message)} bytes, metadata: {metadata}")
-
-                 if extended_logging:
-                     debug_print(f"Processing audio chunk with sample rate {sample_rate}")
-                 chunk = message[4+metadata_length:]
-
-                 if writechunks:
-                     if not wav_file:
-                         wav_file = wave.open(writechunks, 'wb')
-                         wav_file.setnchannels(CHANNELS)
-                         wav_file.setsampwidth(pyaudio.get_sample_size(FORMAT))
-                         wav_file.setframerate(sample_rate)
 
-                     wav_file.writeframes(chunk)
 
-                 if sample_rate != 16000:
-                     resampled_chunk = decode_and_resample(chunk, sample_rate, 16000)
-                     if extended_logging:
-                         debug_print(f"Resampled chunk size: {len(resampled_chunk)} bytes")
-                     recorder.feed_audio(resampled_chunk)
-                 else:
-                     recorder.feed_audio(chunk)
-             else:
-                 print(f"{bcolors.WARNING}Received non-binary message on data connection{bcolors.ENDC}")
-     except websockets.exceptions.ConnectionClosed as e:
-         print(f"{bcolors.WARNING}Data client disconnected: {e}{bcolors.ENDC}")
-     finally:
-         data_connections.remove(websocket)
-         recorder.clear_audio_queue()  # Ensure audio queue is cleared if client disconnects
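data_handler fixes the wire format for audio chunks: a 4-byte little-endian metadata length, a JSON metadata blob carrying at least sampleRate, then raw 16-bit PCM. A client-side framing sketch (the one second of silence is a placeholder payload):

```python
# Sketch: framing one audio chunk the way data_handler parses it.
import json
import struct

def frame_audio_chunk(pcm_bytes: bytes, sample_rate: int) -> bytes:
    metadata = json.dumps({"sampleRate": sample_rate}).encode("utf-8")
    # 4-byte little-endian length prefix, then metadata JSON, then PCM.
    return struct.pack("<I", len(metadata)) + metadata + pcm_bytes

silence = b"\x00\x00" * 16000  # 1 s of silent 16-bit mono PCM (placeholder)
payload = frame_audio_chunk(silence, 16000)
# `payload` would be sent as a single binary WebSocket message.
```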
-
- async def broadcast_audio_messages():
-     while True:
-         message = await audio_queue.get()
-         for conn in list(data_connections):
-             try:
-                 timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
-
-                 if extended_logging:
-                     print(f"  [{timestamp}] Sending message: {bcolors.OKBLUE}{message}{bcolors.ENDC}\n", flush=True, end="")
-                 await conn.send(message)
-             except websockets.exceptions.ConnectionClosed:
-                 data_connections.remove(conn)
-
- # Helper function to create event loop bound closures for callbacks
- def make_callback(loop, callback):
-     def inner_callback(*args, **kwargs):
-         callback(*args, **kwargs, loop=loop)
-     return inner_callback
-
- async def main_async():
-     global stop_recorder, recorder_config, global_args
-     args = parse_arguments()
-     global_args = args
-
-     # Get the event loop here and pass it to the recorder thread
-     loop = asyncio.get_event_loop()
-
-     recorder_config = {
-         'model': args.model,
-         'download_root': args.root,
-         'realtime_model_type': args.rt_model,
-         'language': args.lang,
-         'batch_size': args.batch,
          'init_realtime_after_seconds': args.init_realtime_after_seconds,
          'realtime_batch_size': args.realtime_batch_size,
          'initial_prompt_realtime': args.initial_prompt_realtime,
-         'input_device_index': args.input_device,
-         'silero_sensitivity': args.silero_sensitivity,
-         'silero_use_onnx': args.silero_use_onnx,
-         'webrtc_sensitivity': args.webrtc_sensitivity,
          'post_speech_silence_duration': args.unknown_sentence_detection_pause,
          'min_length_of_recording': args.min_length_of_recording,
          'min_gap_between_recordings': args.min_gap_between_recordings,
@@ -793,10 +394,8 @@ async def main_async():
          'realtime_processing_pause': args.realtime_processing_pause,
          'silero_deactivity_detection': args.silero_deactivity_detection,
          'early_transcription_on_silence': args.early_transcription_on_silence,
-         'beam_size': args.beam_size,
-         'beam_size_realtime': args.beam_size_realtime,
-         'initial_prompt': args.initial_prompt,
-         'wake_words': args.wake_words,
          'wake_words_sensitivity': args.wake_words_sensitivity,
          'wake_word_timeout': args.wake_word_timeout,
          'wake_word_activation_delay': args.wake_word_activation_delay,
@@ -805,88 +404,341 @@
          'openwakeword_inference_framework': args.openwakeword_inference_framework,
          'wake_word_buffer_duration': args.wake_word_buffer_duration,
          'use_main_model_for_realtime': args.use_main_model_for_realtime,
-         'spinner': False,
-         'use_microphone': False,
-
-         'on_realtime_transcription_update': make_callback(loop, text_detected),
-         'on_recording_start': make_callback(loop, on_recording_start),
-         'on_recording_stop': make_callback(loop, on_recording_stop),
-         'on_vad_detect_start': make_callback(loop, on_vad_detect_start),
-         'on_vad_detect_stop': make_callback(loop, on_vad_detect_stop),
-         'on_wakeword_detected': make_callback(loop, on_wakeword_detected),
-         'on_wakeword_detection_start': make_callback(loop, on_wakeword_detection_start),
-         'on_wakeword_detection_end': make_callback(loop, on_wakeword_detection_end),
-         'on_transcription_start': make_callback(loop, on_transcription_start),
-         'on_turn_detection_start': make_callback(loop, on_turn_detection_start),
-         'on_turn_detection_stop': make_callback(loop, on_turn_detection_stop),
-
-         # 'on_recorded_chunk': make_callback(loop, on_recorded_chunk),
-         'no_log_file': True,  # Disable logging to file
-         'use_extended_logging': args.use_extended_logging,
-         'level': loglevel,
-         'compute_type': args.compute_type,
-         'gpu_device_index': args.gpu_device_index,
-         'device': args.device,
-         'handle_buffer_overflow': args.handle_buffer_overflow,
          'suppress_tokens': args.suppress_tokens,
          'allowed_latency_limit': args.allowed_latency_limit,
          'faster_whisper_vad_filter': args.faster_whisper_vad_filter,
      }
 
-     try:
-         # Attempt to start control and data servers
-         control_server = await websockets.serve(control_handler, "localhost", args.control)
-         data_server = await websockets.serve(data_handler, "localhost", args.data)
-         print(f"{bcolors.OKGREEN}Control server started on {bcolors.OKBLUE}ws://localhost:{args.control}{bcolors.ENDC}")
-         print(f"{bcolors.OKGREEN}Data server started on {bcolors.OKBLUE}ws://localhost:{args.data}{bcolors.ENDC}")
-
-         # Start the broadcast and recorder threads
-         broadcast_task = asyncio.create_task(broadcast_audio_messages())
-
-         recorder_thread = threading.Thread(target=_recorder_thread, args=(loop,))
-         recorder_thread.start()
-         recorder_ready.wait()
-
-         print(f"{bcolors.OKGREEN}Server started. Press Ctrl+C to stop the server.{bcolors.ENDC}")
-
-         # Run server tasks
-         await asyncio.gather(control_server.wait_closed(), data_server.wait_closed(), broadcast_task)
-     except OSError as e:
-         print(f"{bcolors.FAIL}Error: Could not start server on specified ports. It’s possible another instance of the server is already running, or the ports are being used by another application.{bcolors.ENDC}")
-     except KeyboardInterrupt:
-         print(f"{bcolors.WARNING}Server interrupted by user, shutting down...{bcolors.ENDC}")
-     finally:
-         # Shutdown procedures for recorder and server threads
-         await shutdown_procedure()
-         print(f"{bcolors.OKGREEN}Server shutdown complete.{bcolors.ENDC}")
 
- async def shutdown_procedure():
-     global stop_recorder, recorder_thread
      if recorder:
-         stop_recorder = True
-         recorder.abort()
-         recorder.stop()
-         recorder.shutdown()
-         print(f"{bcolors.OKGREEN}Recorder shut down{bcolors.ENDC}")
 
-     if recorder_thread:
-         recorder_thread.join()
-         print(f"{bcolors.OKGREEN}Recorder thread finished{bcolors.ENDC}")
 
-     tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
-     for task in tasks:
-         task.cancel()
-     await asyncio.gather(*tasks, return_exceptions=True)
 
-     print(f"{bcolors.OKGREEN}All tasks cancelled, closing event loop now.{bcolors.ENDC}")
 
- def main():
      try:
-         asyncio.run(main_async())
-     except KeyboardInterrupt:
-         # Capture any final KeyboardInterrupt to prevent it from showing up in logs
-         print(f"{bcolors.WARNING}Server interrupted by user.{bcolors.ENDC}")
-         exit(0)
-
- if __name__ == '__main__':
-     main()
  """
+ FastAPI Speech-to-Text (STT) Server with Real-Time Transcription and WebSocket Interface
+
+ This server provides real-time speech-to-text (STT) transcription using the RealtimeSTT library.
+ It allows clients to connect via WebSocket to send audio data and receive real-time transcription updates.
+ The server supports configurable audio recording parameters, voice activity detection (VAD), and wake word detection.
+ It is designed to handle continuous transcription as well as post-recording processing,
+ enabling real-time feedback with the option to improve final transcription quality after the complete sentence is recognized.
  """
 
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+ from fastapi.responses import HTMLResponse  # Optional: for a simple test page
+ import uvicorn
+
  from difflib import SequenceMatcher
  from collections import deque
  from datetime import datetime
  import pyaudio
  import base64
  import sys
+ import argparse  # Keeping argparse for CLI options
+ import threading
+ import wave
+ import json
+ import time
+ import numpy as np
+ from scipy.signal import resample
+ from colorama import init, Fore, Style
+ init()  # Initialize colorama
 
+ # Assuming RealtimeSTT is installed and accessible
+ from RealtimeSTT import AudioToTextRecorder
 
+ # --- Original Global Variables and Helper Functions (adapted or kept) ---
+ # (Many of these will remain similar, but their interaction with FastAPI needs care)
+
+ # Global state (consider encapsulating these in a class or app state for larger apps)
+ app_state = {
+     "recorder": None,
+     "recorder_config": {},
+     "recorder_ready": asyncio.Event(),  # Use asyncio.Event
+     "stop_recorder_flag": False,
+     "prev_text": "",
+     "control_connections": set(),
+     "data_connections": set(),
+     "audio_queue": asyncio.Queue(),
+     "global_args": None,  # To store parsed CLI arguments
+     "debug_logging": False,
+     "extended_logging": False,
+     "log_incoming_chunks": False,
+     "silence_timing": False,  # Renamed from dynamic_silence_timing for consistency
+     "writechunks": None,  # Store filename if writing
+     "wav_file": None,
+     "text_time_deque": deque(),
+     "hard_break_even_on_background_noise": 3.0,
+     "hard_break_even_on_background_noise_min_texts": 3,
+     "hard_break_even_on_background_noise_min_similarity": 0.99,
+     "hard_break_even_on_background_noise_min_chars": 15,
+     "allowed_methods": [
+         'set_microphone', 'abort', 'stop', 'clear_audio_queue', 'wakeup', 'shutdown', 'text',
+     ],
+     "allowed_parameters": [
+         'language', 'silero_sensitivity', 'wake_word_activation_delay',
+         'post_speech_silence_duration', 'listen_start', 'recording_stop_time',
+         'last_transcription_bytes', 'last_transcription_bytes_b64',
+         'speech_end_silence_start', 'is_recording', 'use_wake_words',
+     ],
+     "CHANNELS": 1,
+     "FORMAT": pyaudio.paInt16,
+ }
+
+ # Define ANSI color codes for terminal output (same as original)
  class bcolors:
+     HEADER = '\033[95m'
+     OKBLUE = '\033[94m'
+     OKCYAN = '\033[96m'
+     OKGREEN = '\033[92m'
+     WARNING = '\033[93m'
+     FAIL = '\033[91m'
+     ENDC = '\033[0m'
      BOLD = '\033[1m'
      UNDERLINE = '\033[4m'
 
+ print(f"{bcolors.BOLD}{bcolors.OKCYAN}Starting FastAPI STT server, please wait...{bcolors.ENDC}")
 
+ # --- Helper Functions (preprocess_text, debug_print, format_timestamp_ns etc.) ---
+ # These functions can be kept largely as they are, ensuring they use app_state for global flags.
 
  def preprocess_text(text):
      text = text.lstrip()
      if text.startswith("..."):
          text = text[3:]
      if text.endswith("...'."):
          text = text[:-1]
      if text.endswith("...'"):
          text = text[:-1]
      text = text.lstrip()
      if text:
          text = text[0].upper() + text[1:]
      return text
 
  def debug_print(message):
+     if app_state["debug_logging"]:
          timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+         # FastAPI might not run recorder in a named thread in the same way, adjust if needed
+         thread_name = threading.current_thread().name
          print(f"{Fore.CYAN}[DEBUG][{timestamp}][{thread_name}] {message}{Style.RESET_ALL}", file=sys.stderr)
 
  def format_timestamp_ns(timestamp_ns: int) -> str:
      seconds = timestamp_ns // 1_000_000_000
      remainder_ns = timestamp_ns % 1_000_000_000
      dt = datetime.fromtimestamp(seconds)
      time_str = dt.strftime("%H:%M:%S")
      milliseconds = remainder_ns // 1_000_000
+     return f"{time_str}.{milliseconds:03d}"
 
+ # --- Callback Functions for RealtimeSTT (adapted for asyncio Queue) ---
+ # The loop argument is no longer needed as we'll use app_state["audio_queue"] which is an asyncio.Queue
 
+ def text_detected(text):  # Removed loop argument
+     app_state["prev_text"] = app_state.get("prev_text", "")  # Ensure prev_text exists
      text = preprocess_text(text)
 
+     if app_state["silence_timing"]:
+         def ends_with_ellipsis(s: str):
+             return s.endswith("...") or (len(s) > 1 and s[:-1].endswith("..."))
+         def sentence_end(s: str):
+             return s and s[-1] in ['.', '!', '?', '。']
 
+         recorder = app_state["recorder"]
          if ends_with_ellipsis(text):
+             recorder.post_speech_silence_duration = app_state["global_args"].mid_sentence_detection_pause
+         elif sentence_end(text) and sentence_end(app_state["prev_text"]) and not ends_with_ellipsis(app_state["prev_text"]):
+             recorder.post_speech_silence_duration = app_state["global_args"].end_of_sentence_detection_pause
          else:
+             recorder.post_speech_silence_duration = app_state["global_args"].unknown_sentence_detection_pause
 
          current_time = time.time()
+         app_state["text_time_deque"].append((current_time, text))
+         while app_state["text_time_deque"] and app_state["text_time_deque"][0][0] < current_time - app_state["hard_break_even_on_background_noise"]:
+             app_state["text_time_deque"].popleft()
 
+         if len(app_state["text_time_deque"]) >= app_state["hard_break_even_on_background_noise_min_texts"]:
+             texts = [t[1] for t in app_state["text_time_deque"]]
+             first_text, last_text = texts[0], texts[-1]
              similarity = SequenceMatcher(None, first_text, last_text).ratio()
+             if similarity > app_state["hard_break_even_on_background_noise_min_similarity"] and \
+                len(first_text) > app_state["hard_break_even_on_background_noise_min_chars"]:
                  recorder.stop()
                  recorder.clear_audio_queue()
+                 app_state["prev_text"] = ""
+
+     app_state["prev_text"] = text
+     message = json.dumps({'type': 'realtime', 'text': text})
+     try:
+         app_state["audio_queue"].put_nowait(message)
+     except asyncio.QueueFull:
+         logging.warning("Audio queue full, dropping realtime message.")
 
      timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+     log_msg = f"\r[{timestamp}] {bcolors.OKCYAN}{text}{bcolors.ENDC}"
+     if app_state["extended_logging"]:
          print(f"  [{timestamp}] Realtime text: {bcolors.OKCYAN}{text}{bcolors.ENDC}\n", flush=True, end="")
      else:
+         print(log_msg, flush=True, end='')
 
+ def on_recording_start():
+     message = json.dumps({'type': 'recording_start'})
+     try:
+         app_state["audio_queue"].put_nowait(message)
+     except asyncio.QueueFull: logging.warning("Audio queue full, dropping recording_start message.")
 
+ def on_recording_stop():
+     message = json.dumps({'type': 'recording_stop'})
+     try:
+         app_state["audio_queue"].put_nowait(message)
+     except asyncio.QueueFull: logging.warning("Audio queue full, dropping recording_stop message.")
 
+ def on_vad_detect_start():
+     message = json.dumps({'type': 'vad_detect_start'})
+     try:
+         app_state["audio_queue"].put_nowait(message)
+     except asyncio.QueueFull: logging.warning("Audio queue full, dropping vad_detect_start message.")
+
+ def on_vad_detect_stop():
+     message = json.dumps({'type': 'vad_detect_stop'})
+     try:
+         app_state["audio_queue"].put_nowait(message)
+     except asyncio.QueueFull: logging.warning("Audio queue full, dropping vad_detect_stop message.")
 
+ def on_wakeword_detected():
+     message = json.dumps({'type': 'wakeword_detected'})
+     try:
+         app_state["audio_queue"].put_nowait(message)
+     except asyncio.QueueFull: logging.warning("Audio queue full, dropping wakeword_detected message.")
 
+ def on_wakeword_detection_start():
+     message = json.dumps({'type': 'wakeword_detection_start'})
+     try:
+         app_state["audio_queue"].put_nowait(message)
+     except asyncio.QueueFull: logging.warning("Audio queue full, dropping wakeword_detection_start message.")
 
+ def on_wakeword_detection_end():
+     message = json.dumps({'type': 'wakeword_detection_end'})
+     try:
+         app_state["audio_queue"].put_nowait(message)
+     except asyncio.QueueFull: logging.warning("Audio queue full, dropping wakeword_detection_end message.")
 
+ def on_transcription_start(_audio_bytes):
      bytes_b64 = base64.b64encode(_audio_bytes.tobytes()).decode('utf-8')
      message = json.dumps({
          'type': 'transcription_start',
          'audio_bytes_base64': bytes_b64
      })
+     try:
+         app_state["audio_queue"].put_nowait(message)
+     except asyncio.QueueFull: logging.warning("Audio queue full, dropping transcription_start message.")
 
+ def on_turn_detection_start():
      print("&&& stt_server on_turn_detection_start")
+     message = json.dumps({'type': 'start_turn_detection'})
+     try:
+         app_state["audio_queue"].put_nowait(message)
+     except asyncio.QueueFull: logging.warning("Audio queue full, dropping start_turn_detection message.")
 
+ def on_turn_detection_stop():
      print("&&& stt_server on_turn_detection_stop")
+     message = json.dumps({'type': 'stop_turn_detection'})
+     try:
+         app_state["audio_queue"].put_nowait(message)
+     except asyncio.QueueFull: logging.warning("Audio queue full, dropping stop_turn_detection message.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
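+ # Each callback above pushes a small JSON event onto audio_queue, which the
+ # broadcast task later fans out to every /ws/data client. A minimal listening
+ # client (illustrative sketch; assumes the Docker defaults of host 0.0.0.0 and
+ # port 7860, and uses the `websockets` package from requirements.txt):
+ #
+ #     import asyncio, json
+ #     import websockets
+ #
+ #     async def listen():
+ #         async with websockets.connect("ws://localhost:7860/ws/data") as ws:
+ #             async for raw in ws:
+ #                 event = json.loads(raw)
+ #                 if event["type"] == "fullSentence":
+ #                     print("Final:", event["text"])
+ #                 elif event["type"] == "realtime":
+ #                     print("Partial:", event["text"])
+ #
+ #     asyncio.run(listen())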
+ # --- Argument Parser (same flags as the original server, but now invoked from startup) ---
+ def parse_arguments():
+     # Besides returning args, this also sets app_state["debug_logging"] and
+     # the other logging-related flags below.
      parser = argparse.ArgumentParser(description='Start the Speech-to-Text (STT) server with various configuration options.')
+     parser.add_argument('-m', '--model', type=str, default='large-v2', help='Path to the STT model or model size.')
+     parser.add_argument('-r', '--rt-model', '--realtime_model_type', type=str, default='tiny', help='Model size for real-time transcription.')
+     parser.add_argument('-l', '--lang', '--language', type=str, default='en', help='Language code for STT.')
+     parser.add_argument('-i', '--input-device', '--input-device-index', type=int, default=1, help='Audio input device index.')
+     # For FastAPI, host and port for the HTTP server are set in uvicorn.run();
+     # the WebSockets are now paths on that server (e.g. /ws/control), so the
+     # old per-socket port flags are no longer needed:
+     # parser.add_argument('-c', '--control', '--control_port', type=int, default=8011, help='Control WebSocket port (deprecated for FastAPI).')
+     # parser.add_argument('-d', '--data', '--data_port', type=int, default=8012, help='Data WebSocket port (deprecated for FastAPI).')
+     parser.add_argument('-w', '--wake_words', type=str, default="", help='Wake word(s).')
+     parser.add_argument('-D', '--debug', action='store_true', help='Enable debug logging.')
+     parser.add_argument('--debug_websockets', action='store_true', help='Enable debug logging for websockets.')
+     parser.add_argument('-W', '--write', metavar='FILE', help='Save received audio to a WAV file.')
+     parser.add_argument('-b', '--batch', '--batch_size', type=int, default=16, help='Batch size for inference.')
+     parser.add_argument('--root', '--download_root', type=str, default=None, help='Whisper models download root path.')
+     parser.add_argument('-s', '--silence_timing', action='store_true', default=True, help='Enable dynamic silence duration.')
+     parser.add_argument('--init_realtime_after_seconds', type=float, default=0.2, help='Initial wait for realtime transcription.')
+     parser.add_argument('--realtime_batch_size', type=int, default=16, help='Batch size for real-time model.')
+     parser.add_argument('--initial_prompt_realtime', type=str, default="", help='Initial prompt for real-time model.')
+     parser.add_argument('--silero_sensitivity', type=float, default=0.05, help='Silero VAD sensitivity (0-1).')
+     parser.add_argument('--silero_use_onnx', action='store_true', default=False, help='Use Silero ONNX model.')
+     parser.add_argument('--webrtc_sensitivity', type=int, default=3, help='WebRTC VAD sensitivity (0-3).')
+     parser.add_argument('--min_length_of_recording', type=float, default=1.1, help='Min recording duration (s).')
+     parser.add_argument('--min_gap_between_recordings', type=float, default=0, help='Min time between recordings (s).')
+     parser.add_argument('--enable_realtime_transcription', action='store_true', default=True, help='Enable real-time transcription.')
+     parser.add_argument('--realtime_processing_pause', type=float, default=0.02, help='Pause between audio chunk processing (s).')
+     parser.add_argument('--silero_deactivity_detection', action='store_true', default=True, help='Use Silero for end-of-speech detection.')
+     parser.add_argument('--early_transcription_on_silence', type=float, default=0.2, help='Start transcription after silence (s).')
+     parser.add_argument('--beam_size', type=int, default=5, help='Beam size for main model.')
+     parser.add_argument('--beam_size_realtime', type=int, default=3, help='Beam size for real-time model.')
+     parser.add_argument('--initial_prompt', type=str, default="Incomplete thoughts should end with '...'.", help='Initial main transcription prompt.')
+     parser.add_argument('--end_of_sentence_detection_pause', type=float, default=0.45, help='Silence for sentence end (s).')
+     parser.add_argument('--unknown_sentence_detection_pause', type=float, default=0.7, help='Pause for incomplete sentence (s).')
+     parser.add_argument('--mid_sentence_detection_pause', type=float, default=2.0, help='Pause for mid-sentence break (s).')
+     parser.add_argument('--wake_words_sensitivity', type=float, default=0.5, help='Wake word detection sensitivity (0-1).')
+     parser.add_argument('--wake_word_timeout', type=float, default=5.0, help='Wake word timeout (s).')
+     parser.add_argument('--wake_word_activation_delay', type=float, default=0, help='Delay before wake word activation (s).')
+     parser.add_argument('--wakeword_backend', type=str, default='none', help='Backend for wake word detection.')
+     parser.add_argument('--openwakeword_model_paths', type=str, nargs='*', help='Paths to OpenWakeWord models.')
+     parser.add_argument('--openwakeword_inference_framework', type=str, default='tensorflow', help='OpenWakeWord inference framework.')
+     parser.add_argument('--wake_word_buffer_duration', type=float, default=1.0, help='Wake word buffer duration (s).')
+     parser.add_argument('--use_main_model_for_realtime', action='store_true', help='Use main model for real-time transcription.')
+     parser.add_argument('--use_extended_logging', action='store_true', help='Enable extensive log messages.')
+     parser.add_argument('--compute_type', type=str, default='default', help='Computation type for CTranslate2.')
+     parser.add_argument('--gpu_device_index', type=int, default=0, help='GPU device index.')
+     parser.add_argument('--device', type=str, default='cuda', help='Device for model ("cuda" or "cpu").')
+     parser.add_argument('--handle_buffer_overflow', action='store_true', help='Handle buffer overflow.')
+     parser.add_argument('--suppress_tokens', type=int, default=[-1], nargs='*', help='Tokens to suppress during transcription.')
+     parser.add_argument('--allowed_latency_limit', type=int, default=100, help='Allowed latency limit for real-time processing.')
+     parser.add_argument('--faster_whisper_vad_filter', action='store_true', help='Enable VAD filter for Faster Whisper.')
+     parser.add_argument('--logchunks', action='store_true', help='Log incoming audio chunks.')

+     # FastAPI-specific args (uvicorn serves HTTP and WebSockets on this host/port)
+     parser.add_argument('--server_host', type=str, default='localhost', help='Host for the FastAPI server.')
+     parser.add_argument('--server_port', type=int, default=8000, help='Port for the FastAPI server.')
      args = parser.parse_args()

+     app_state["debug_logging"] = args.debug
+     app_state["extended_logging"] = args.use_extended_logging
+     app_state["writechunks"] = args.write  # Filename or None
+     app_state["log_incoming_chunks"] = args.logchunks
+     app_state["silence_timing"] = args.silence_timing

+     ws_logger = logging.getLogger('websockets')  # For uvicorn's websockets
+     uvicorn_logger = logging.getLogger('uvicorn')
      if args.debug_websockets:
          ws_logger.setLevel(logging.DEBUG)
+         uvicorn_logger.setLevel(logging.DEBUG)
+         ws_logger.propagate = True
+         uvicorn_logger.propagate = True
      else:
          ws_logger.setLevel(logging.WARNING)
+         uvicorn_logger.setLevel(logging.INFO)  # Uvicorn access logs are INFO

      if args.initial_prompt:
          args.initial_prompt = args.initial_prompt.replace("\\n", "\n")
      if args.initial_prompt_realtime:
          args.initial_prompt_realtime = args.initial_prompt_realtime.replace("\\n", "\n")

+     app_state["global_args"] = args
      return args
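+ # Example invocations (illustrative, using the flags defined above):
+ #   python app.py -m tiny.en --server_host 0.0.0.0 --server_port 7860
+ #   python app.py -m large-v2 -l en --device cpu --compute_type int8 -D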
+ # --- Recorder Thread Function (runs in a background thread; see startup) ---
+ def recorder_processing_loop():
+     """Runs the blocking recorder.text() call in a loop until shutdown."""
+     recorder = app_state["recorder"]
+     if not recorder:
+         logging.error("Recorder not initialized in recorder_processing_loop.")
+         return

+     def process_text_callback(full_sentence):
+         app_state["prev_text"] = ""  # Reset for the next sentence
          full_sentence = preprocess_text(full_sentence)
+         message = json.dumps({'type': 'fullSentence', 'text': full_sentence})
+         try:
+             app_state["audio_queue"].put_nowait(message)
+         except asyncio.QueueFull:
+             logging.warning("Audio queue full, dropping fullSentence message.")

          timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+         log_msg = f"\r[{timestamp}] {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n"
+         if app_state["extended_logging"]:
              print(f" [{timestamp}] Full text: {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n", flush=True, end="")
          else:
+             print(log_msg, end='')  # Ensure a newline after the sentence

+     try:
+         while not app_state["stop_recorder_flag"]:
+             if recorder:
+                 recorder.text(process_text_callback)  # Blocking call
+             else:  # Should not happen if startup logic is correct
+                 time.sleep(0.1)
+     except Exception as e:
+         logging.error(f"Error in recorder_processing_loop: {e}")
+     finally:
+         logging.info("Recorder processing loop finished.")
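+ # Note: asyncio.Queue is not thread-safe, and process_text_callback runs on the
+ # recorder thread. A more robust variant hands the put over to the event loop;
+ # a sketch, assuming the loop is stashed as app_state["event_loop"] at startup
+ # (that key is hypothetical and is not set anywhere in this file):
+ #
+ #     loop = app_state["event_loop"]
+ #     loop.call_soon_threadsafe(app_state["audio_queue"].put_nowait, message)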
+ def decode_and_resample(audio_data, original_sample_rate, target_sample_rate):
      if original_sample_rate == target_sample_rate:
          return audio_data
      audio_np = np.frombuffer(audio_data, dtype=np.int16)
      num_original_samples = len(audio_np)
+     num_target_samples = int(num_original_samples * target_sample_rate / original_sample_rate)
      resampled_audio = resample(audio_np, num_target_samples)
      return resampled_audio.astype(np.int16).tobytes()
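+ # Quick sanity check for decode_and_resample (illustrative): one second of
+ # int16 silence at 48 kHz should come back as 16000 samples, i.e. 32000 bytes.
+ #
+ #     silence_48k = b"\x00\x00" * 48000
+ #     out = decode_and_resample(silence_48k, 48000, 16000)
+ #     assert len(out) == 2 * 16000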
+ # --- FastAPI App and Endpoints ---
+ app = FastAPI()
+ @app.on_event("startup")
+ async def startup_event():
+     args = parse_arguments()  # Parses CLI args and sets the app_state logging flags

+     app_state["recorder_config"] = {
+         'model': args.model, 'download_root': args.root, 'realtime_model_type': args.rt_model,
+         'language': args.lang, 'batch_size': args.batch,
          'init_realtime_after_seconds': args.init_realtime_after_seconds,
          'realtime_batch_size': args.realtime_batch_size,
          'initial_prompt_realtime': args.initial_prompt_realtime,
+         'input_device_index': args.input_device, 'silero_sensitivity': args.silero_sensitivity,
+         'silero_use_onnx': args.silero_use_onnx, 'webrtc_sensitivity': args.webrtc_sensitivity,
          'post_speech_silence_duration': args.unknown_sentence_detection_pause,
          'min_length_of_recording': args.min_length_of_recording,
          'min_gap_between_recordings': args.min_gap_between_recordings,
          'enable_realtime_transcription': args.enable_realtime_transcription,
          'realtime_processing_pause': args.realtime_processing_pause,
          'silero_deactivity_detection': args.silero_deactivity_detection,
          'early_transcription_on_silence': args.early_transcription_on_silence,
+         'beam_size': args.beam_size, 'beam_size_realtime': args.beam_size_realtime,
+         'initial_prompt': args.initial_prompt, 'wake_words': args.wake_words,
          'wake_words_sensitivity': args.wake_words_sensitivity,
          'wake_word_timeout': args.wake_word_timeout,
          'wake_word_activation_delay': args.wake_word_activation_delay,
          'wakeword_backend': args.wakeword_backend,
          'openwakeword_model_paths': args.openwakeword_model_paths,
          'openwakeword_inference_framework': args.openwakeword_inference_framework,
          'wake_word_buffer_duration': args.wake_word_buffer_duration,
          'use_main_model_for_realtime': args.use_main_model_for_realtime,
+         'spinner': False, 'use_microphone': False,  # server mode never reads the mic directly
+         'on_realtime_transcription_update': text_detected,
+         'on_recording_start': on_recording_start, 'on_recording_stop': on_recording_stop,
+         'on_vad_detect_start': on_vad_detect_start, 'on_vad_detect_stop': on_vad_detect_stop,
+         'on_wakeword_detected': on_wakeword_detected,
+         'on_wakeword_detection_start': on_wakeword_detection_start,
+         'on_wakeword_detection_end': on_wakeword_detection_end,
+         'on_transcription_start': on_transcription_start,
+         'on_turn_detection_start': on_turn_detection_start,
+         'on_turn_detection_stop': on_turn_detection_stop,
+         'no_log_file': True, 'use_extended_logging': args.use_extended_logging,
+         'compute_type': args.compute_type, 'gpu_device_index': args.gpu_device_index,
+         'device': args.device, 'handle_buffer_overflow': args.handle_buffer_overflow,
          'suppress_tokens': args.suppress_tokens,
          'allowed_latency_limit': args.allowed_latency_limit,
          'faster_whisper_vad_filter': args.faster_whisper_vad_filter,
      }

+     # Configure logging level based on the debug flag
+     log_level = logging.DEBUG if args.debug else logging.INFO
+     logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')
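+     # Note: the AudioToTextRecorder constructor below loads Whisper models and
+     # can block the event loop for a noticeable time. A non-blocking variant
+     # (sketch, Python 3.9+):
+     #
+     #     app_state["recorder"] = await asyncio.to_thread(
+     #         AudioToTextRecorder, **app_state["recorder_config"])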
+ print(f"{bcolors.OKGREEN}Initializing RealtimeSTT server with parameters:{bcolors.ENDC}")
432
+ for key, value in app_state["recorder_config"].items():
433
+ # Truncate long values like initial_prompt for display
434
+ display_value = str(value)
435
+ if len(display_value) > 70:
436
+ display_value = display_value[:67] + "..."
437
+ print(f" {bcolors.OKBLUE}{key}{bcolors.ENDC}: {display_value}")
438
+
439
+ try:
440
+ app_state["recorder"] = AudioToTextRecorder(**app_state["recorder_config"])
441
+ print(f"{bcolors.OKGREEN}{bcolors.BOLD}RealtimeSTT initialized{bcolors.ENDC}")
442
+ app_state["recorder_ready"].set()
443
+
444
+ # Start the recorder processing loop in a separate thread
445
+ # This is crucial because recorder.text() is blocking.
446
+ app_state["recorder_thread"] = threading.Thread(target=recorder_processing_loop, daemon=True)
447
+ app_state["recorder_thread"].start()
448
+
449
+ # Start broadcasting messages from the audio_queue
450
+ asyncio.create_task(broadcast_audio_messages_task())
451
+ print(f"{bcolors.OKGREEN}FastAPI STT Server started. Waiting for connections...{bcolors.ENDC}")
452
+ print(f"Control WebSocket: ws://{args.server_host}:{args.server_port}/ws/control")
453
+ print(f"Data WebSocket: ws://{args.server_host}:{args.server_port}/ws/data")
454
+
455
+ except Exception as e:
456
+ print(f"{bcolors.FAIL}Error during RealtimeSTT initialization: {e}{bcolors.ENDC}")
457
+ # Potentially exit or prevent server from fully starting if recorder fails
458
+ # For now, it will proceed but recorder-dependent endpoints will fail.
459
+ # Consider raising an exception here to stop uvicorn if recorder is critical.
460
+ # sys.exit(1) # Or handle more gracefully
461
+
462
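+ # Note: @app.on_event("startup"/"shutdown") is deprecated in recent FastAPI
+ # releases in favor of a lifespan context manager; an equivalent sketch:
+ #
+ #     from contextlib import asynccontextmanager
+ #
+ #     @asynccontextmanager
+ #     async def lifespan(app: FastAPI):
+ #         await startup_event()
+ #         yield
+ #         await shutdown_event()
+ #
+ #     app = FastAPI(lifespan=lifespan)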
+ @app.on_event("shutdown")
+ async def shutdown_event():
+     print(f"{bcolors.WARNING}Server shutting down...{bcolors.ENDC}")
+     app_state["stop_recorder_flag"] = True
+     recorder = app_state["recorder"]
      if recorder:
+         print(f"{bcolors.OKBLUE}Aborting recorder...{bcolors.ENDC}")
+         recorder.abort()  # Non-blocking
+         print(f"{bcolors.OKBLUE}Stopping recorder...{bcolors.ENDC}")
+         recorder.stop()   # Non-blocking; signals the internal loop to stop

+     recorder_thread = app_state.get("recorder_thread")
+     if recorder_thread and recorder_thread.is_alive():
+         print(f"{bcolors.OKBLUE}Waiting for recorder thread to finish...{bcolors.ENDC}")
+         recorder_thread.join(timeout=5.0)
+         if recorder_thread.is_alive():
+             print(f"{bcolors.WARNING}Recorder thread did not finish in time.{bcolors.ENDC}")

+     if recorder:  # Final cleanup, once the processing thread has exited
+         try:
+             print(f"{bcolors.OKBLUE}Finalizing recorder shutdown...{bcolors.ENDC}")
+             if hasattr(recorder, 'shutdown') and callable(recorder.shutdown):
+                 recorder.shutdown()
+         except Exception as e:
+             print(f"{bcolors.FAIL}Error during recorder.shutdown(): {e}{bcolors.ENDC}")

+     # Close WebSocket connections
+     for ws in list(app_state["control_connections"]):
+         try:
+             await ws.close(code=1000)
+         except Exception:
+             pass  # Ignore errors on close
+     for ws in list(app_state["data_connections"]):
+         try:
+             await ws.close(code=1000)
+         except Exception:
+             pass

+     if app_state["wav_file"]:
+         app_state["wav_file"].close()
+         print(f"{bcolors.OKGREEN}Closed WAV file: {app_state['writechunks']}{bcolors.ENDC}")

+     print(f"{bcolors.OKGREEN}Server shutdown complete.{bcolors.ENDC}")
510
+ """ Task to broadcast messages from audio_queue to data clients """
511
+ while True:
512
+ try:
513
+ message = await app_state["audio_queue"].get()
514
+ if app_state["extended_logging"]:
515
+ timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
516
+ print(f" [{timestamp}] Broadcasting: {bcolors.OKBLUE}{message[:100]}...{bcolors.ENDC}\n", flush=True, end="")
517
+
518
+ # Create a list of connections to iterate over, to avoid issues if set changes
519
+ current_connections = list(app_state["data_connections"])
520
+ for conn in current_connections:
521
+ try:
522
+ await conn.send_text(message)
523
+ except WebSocketDisconnect:
524
+ app_state["data_connections"].discard(conn) # Remove if disconnected
525
+ except Exception as e:
526
+ logging.error(f"Error sending message to data client: {e}")
527
+ app_state["data_connections"].discard(conn) # Remove on other errors too
528
+ except asyncio.CancelledError:
529
+ logging.info("Broadcast task cancelled.")
530
+ break
531
+ except Exception as e:
532
+ logging.error(f"Error in broadcast_audio_messages_task: {e}")
533
+ await asyncio.sleep(1) # Avoid tight loop on unexpected errors
534
+
535
+ @app.websocket("/ws/control")
536
+ async def websocket_control_endpoint(websocket: WebSocket):
537
+ await websocket.accept()
538
+ app_state["control_connections"].add(websocket)
539
+ debug_print(f"New control connection from {websocket.client}")
540
+ print(f"{bcolors.OKGREEN}Control client connected: {websocket.client}{bcolors.ENDC}")
541
+
542
+ recorder = app_state["recorder"]
543
+ if not await app_state["recorder_ready"].wait(): # Wait for recorder to be ready
544
+ print(f"{bcolors.WARNING}Recorder not ready, control client may experience issues.{bcolors.ENDC}")
545
+ # Optionally send an error message and close
546
+
547
+ try:
548
+ while True:
549
+ message = await websocket.receive_text()
550
+ debug_print(f"Received control message: {message[:200]}...")
551
+ if not recorder: # Or not app_state["recorder_ready"].is_set()
552
+ print(f"{bcolors.WARNING}Recorder not ready, cannot process command.{bcolors.ENDC}")
553
+ await websocket.send_json({"status": "error", "message": "Recorder not ready"})
554
+ continue
555
 
556
+ try:
557
+ command_data = json.loads(message)
558
+ command = command_data.get("command")
559
+
560
+ if command == "set_parameter":
561
+ parameter = command_data.get("parameter")
562
+ value = command_data.get("value")
563
+ if parameter in app_state["allowed_parameters"] and hasattr(recorder, parameter):
564
+ setattr(recorder, parameter, value)
565
+ value_formatted = f"{value:.2f}" if isinstance(value, float) else str(value)
566
+ timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
567
+ if app_state["extended_logging"]:
568
+ print(f" [{timestamp}] {bcolors.OKGREEN}Set recorder.{parameter} to: {bcolors.OKBLUE}{value_formatted}{bcolors.ENDC}")
569
+ await websocket.send_json({"status": "success", "message": f"Parameter {parameter} set to {value}"})
570
+ else:
571
+ errmsg = f"Parameter {parameter} is not allowed or does not exist (set_parameter)"
572
+ print(f"{bcolors.WARNING}{errmsg}{bcolors.ENDC}")
573
+ await websocket.send_json({"status": "error", "message": errmsg})
574
+
575
+ elif command == "get_parameter":
576
+ parameter = command_data.get("parameter")
577
+ request_id = command_data.get("request_id")
578
+ if parameter in app_state["allowed_parameters"] and hasattr(recorder, parameter):
579
+ value = getattr(recorder, parameter)
580
+ value_formatted = f"{value:.2f}" if isinstance(value, float) else str(value)
581
+ value_truncated = value_formatted[:39] + "…" if len(value_formatted) > 40 else value_formatted
582
+ timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
583
+ if app_state["extended_logging"]:
584
+ print(f" [{timestamp}] {bcolors.OKGREEN}Get recorder.{parameter}: {bcolors.OKBLUE}{value_truncated}{bcolors.ENDC}")
585
+ response = {"status": "success", "parameter": parameter, "value": value}
586
+ if request_id is not None: response["request_id"] = request_id
587
+ await websocket.send_json(response)
588
+ else:
589
+ errmsg = f"Parameter {parameter} is not allowed or does not exist (get_parameter)"
590
+ print(f"{bcolors.WARNING}{errmsg}{bcolors.ENDC}")
591
+ await websocket.send_json({"status": "error", "message": errmsg})
592
+
593
+ elif command == "call_method":
594
+ method_name = command_data.get("method")
595
+ if method_name in app_state["allowed_methods"]:
596
+ method = getattr(recorder, method_name, None)
597
+ if method and callable(method):
598
+ args_list = command_data.get("args", [])
599
+ kwargs_dict = command_data.get("kwargs", {})
600
+ # Consider running blocking methods in a thread if any exist
601
+ method(*args_list, **kwargs_dict)
602
+ timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
603
+ print(f" [{timestamp}] {bcolors.OKGREEN}Called method recorder.{bcolors.OKBLUE}{method_name}{bcolors.ENDC}")
604
+ await websocket.send_json({"status": "success", "message": f"Method {method_name} called"})
605
+ else:
606
+ print(f"{bcolors.WARNING}Recorder does not have method {method_name}{bcolors.ENDC}")
607
+ await websocket.send_json({"status": "error", "message": f"Recorder does not have method {method_name}"})
608
+ else:
609
+ print(f"{bcolors.WARNING}Method {method_name} is not allowed{bcolors.ENDC}")
610
+ await websocket.send_json({"status": "error", "message": f"Method {method_name} is not allowed"})
611
+ else:
612
+ print(f"{bcolors.WARNING}Unknown command: {command}{bcolors.ENDC}")
613
+ await websocket.send_json({"status": "error", "message": f"Unknown command {command}"})
614
+ except json.JSONDecodeError:
615
+ print(f"{bcolors.WARNING}Received invalid JSON command from control client{bcolors.ENDC}")
616
+ await websocket.send_json({"status": "error", "message": "Invalid JSON command"})
617
+ except Exception as e:
618
+ print(f"{bcolors.FAIL}Error processing control message: {e}{bcolors.ENDC}")
619
+ try:
620
+ await websocket.send_json({"status": "error", "message": f"Server error: {type(e).__name__}"})
621
+ except Exception: pass # Ignore if send fails
622
 
623
+ except WebSocketDisconnect:
624
+ print(f"{bcolors.WARNING}Control client disconnected: {websocket.client}{bcolors.ENDC}")
625
+ except Exception as e:
626
+ print(f"{bcolors.FAIL}Control WebSocket error for {websocket.client}: {e}{bcolors.ENDC}")
627
+ finally:
628
+ app_state["control_connections"].discard(websocket)
629
 
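+ # A control client sketch exercising the JSON protocol above (illustrative;
+ # assumes port 7860 as in the Dockerfile, and that the parameter below is
+ # listed in app_state["allowed_parameters"]):
+ #
+ #     import asyncio, json
+ #     import websockets
+ #
+ #     async def set_pause():
+ #         async with websockets.connect("ws://localhost:7860/ws/control") as ws:
+ #             await ws.send(json.dumps({
+ #                 "command": "set_parameter",
+ #                 "parameter": "post_speech_silence_duration",
+ #                 "value": 0.6,
+ #             }))
+ #             print(await ws.recv())  # e.g. {"status": "success", ...}
+ #
+ #     asyncio.run(set_pause())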
+ @app.websocket("/ws/data")
+ async def websocket_data_endpoint(websocket: WebSocket):
+     await websocket.accept()
+     app_state["data_connections"].add(websocket)
+     print(f"{bcolors.OKGREEN}Data client connected: {websocket.client}{bcolors.ENDC}")

+     # Same readiness guard as the control endpoint (see the note there).
+     try:
+         await asyncio.wait_for(app_state["recorder_ready"].wait(), timeout=30.0)
+     except asyncio.TimeoutError:
+         print(f"{bcolors.WARNING}Recorder not ready, data client may experience issues.{bcolors.ENDC}")
+     recorder = app_state["recorder"]

      try:
+         while True:
+             message_bytes = await websocket.receive_bytes()  # Expecting binary data

+             if app_state["extended_logging"]:
+                 debug_print(f"Received audio chunk (size: {len(message_bytes)} bytes)")
+             elif app_state["log_incoming_chunks"]:
+                 print(".", end='', flush=True)

+             if not recorder:
+                 print(f"{bcolors.WARNING}Recorder not ready, cannot process audio data.{bcolors.ENDC}")
+                 continue

+             try:
+                 # Frame layout: 4-byte little-endian metadata length,
+                 # UTF-8 JSON metadata, then raw 16-bit PCM samples.
+                 metadata_length = int.from_bytes(message_bytes[:4], byteorder='little')
+                 metadata_json = message_bytes[4:4 + metadata_length].decode('utf-8')
+                 metadata = json.loads(metadata_json)
+                 sample_rate = metadata['sampleRate']

+                 if 'server_sent_to_stt' in metadata:  # Optional timing metadata
+                     stt_received_ns = time.time_ns()
+                     metadata["stt_received"] = stt_received_ns
+                     metadata["stt_received_formatted"] = format_timestamp_ns(stt_received_ns)

+                 if app_state["extended_logging"]:
+                     debug_print(f"Processing audio chunk with sample rate {sample_rate}")

+                 chunk = message_bytes[4 + metadata_length:]

+                 if app_state["writechunks"]:
+                     if not app_state["wav_file"]:
+                         try:
+                             app_state["wav_file"] = wave.open(app_state["writechunks"], 'wb')
+                             app_state["wav_file"].setnchannels(app_state["CHANNELS"])
+                             app_state["wav_file"].setsampwidth(pyaudio.get_sample_size(app_state["FORMAT"]))
+                             app_state["wav_file"].setframerate(sample_rate)  # Use the incoming sample rate
+                         except Exception as e:
+                             print(f"{bcolors.FAIL}Error opening WAV file {app_state['writechunks']}: {e}{bcolors.ENDC}")
+                             app_state["writechunks"] = None  # Disable further writing attempts

+                     if app_state["wav_file"]:
+                         app_state["wav_file"].writeframes(chunk)

+                 # Resample to 16 kHz if necessary and feed to the recorder.
+                 # recorder.feed_audio may block briefly, but RealtimeSTT is
+                 # built for streaming input, so calling it here is fine.
+                 if sample_rate != 16000:
+                     resampled_chunk = decode_and_resample(chunk, sample_rate, 16000)
+                     if app_state["extended_logging"]:
+                         debug_print(f"Resampled chunk size: {len(resampled_chunk)} bytes")
+                     recorder.feed_audio(resampled_chunk)
+                 else:
+                     recorder.feed_audio(chunk)

+             except json.JSONDecodeError:
+                 print(f"{bcolors.WARNING}Invalid metadata JSON in audio message.{bcolors.ENDC}")
+             except Exception as e:
+                 print(f"{bcolors.FAIL}Error processing audio data: {e}{bcolors.ENDC}")

+     except WebSocketDisconnect:
+         print(f"{bcolors.WARNING}Data client disconnected: {websocket.client}{bcolors.ENDC}")
+     except Exception as e:
+         print(f"{bcolors.FAIL}Data WebSocket error for {websocket.client}: {e}{bcolors.ENDC}")
+     finally:
+         app_state["data_connections"].discard(websocket)
+         if recorder:
+             recorder.clear_audio_queue()  # Drop pending audio when the client disconnects
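+ # How a client frames one audio chunk for /ws/data, matching the parsing above
+ # (4-byte little-endian metadata length, UTF-8 JSON metadata, raw 16-bit PCM);
+ # frame_chunk is an illustrative helper, not part of this server:
+ #
+ #     import json, struct
+ #
+ #     def frame_chunk(pcm_bytes: bytes, sample_rate: int = 16000) -> bytes:
+ #         metadata = json.dumps({"sampleRate": sample_rate}).encode("utf-8")
+ #         return struct.pack("<I", len(metadata)) + metadata + pcm_bytes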
+ # Optional: a simple HTML page for testing WebSocket connections
+ # (you would typically have a proper frontend for this).
+ # @app.get("/", response_class=HTMLResponse)
+ # async def get_test_page():
+ #     return """
+ #     <html>
+ #       <head><title>STT Server Test</title></head>
+ #       <body>
+ #         <h1>STT Server WebSocket Test</h1>
+ #         <p>Open your browser's developer console to interact with WebSockets.</p>
+ #         <p>Control: <code id="control_ws_url"></code></p>
+ #         <p>Data: <code id="data_ws_url"></code></p>
+ #         <script>
+ #           document.getElementById('control_ws_url').textContent = `ws://${window.location.host}/ws/control`;
+ #           document.getElementById('data_ws_url').textContent = `ws://${window.location.host}/ws/data`;
+ #         </script>
+ #       </body>
+ #     </html>
+ #     """
+ if __name__ == "__main__":
+     # Parse once here to get host/port for uvicorn; startup_event parses the
+     # same sys.argv again, which is redundant but harmless.
+     cli_args = parse_arguments()

+     # Windows needs the selector event loop policy for some audio libraries
+     if sys.platform == 'win32':
+         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

+     print(f"Starting Uvicorn server on {cli_args.server_host}:{cli_args.server_port}")
+     uvicorn.run(app, host=cli_args.server_host, port=cli_args.server_port)
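+     # Equivalent launch without this block (CLI flags from parse_arguments are
+     # then only read inside startup_event):
+     #   uvicorn app:app --host 0.0.0.0 --port 7860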
requirements.txt CHANGED
@@ -1,3 +1,5 @@
+ fastapi
+ uvicorn[standard]
  RealtimeSTT==0.3.104
  websockets
  numpy