Spaces:
Runtime error
Runtime error
Aakash jammula commited on
Commit ·
3d326e4
1
Parent(s): 06adce9
init
Browse files- Dockerfile +12 -14
- app.py +600 -748
- requirements.txt +2 -0
Dockerfile
CHANGED
|
@@ -6,29 +6,27 @@ RUN apt-get update && \
|
|
| 6 |
portaudio19-dev build-essential ffmpeg \
|
| 7 |
&& rm -rf /var/lib/apt/lists/*
|
| 8 |
|
| 9 |
-
# 2) Create non-root user and workdir
|
| 10 |
WORKDIR /app
|
| 11 |
-
RUN useradd -m -u 1000 user
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
ENV HF_HOME=/app/.cache/huggingface
|
| 16 |
-
|
| 17 |
-
|
|
|
|
| 18 |
|
| 19 |
-
#
|
| 20 |
COPY requirements.txt .
|
| 21 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 22 |
|
| 23 |
-
#
|
| 24 |
COPY . .
|
| 25 |
RUN chown -R user:user /app
|
| 26 |
|
| 27 |
-
# 6) Switch to non-root
|
| 28 |
USER user
|
| 29 |
|
| 30 |
-
#
|
| 31 |
-
EXPOSE
|
| 32 |
|
| 33 |
-
#
|
| 34 |
-
CMD ["python", "app.py", "-m", "tiny.en", "-
|
|
|
|
| 6 |
portaudio19-dev build-essential ffmpeg \
|
| 7 |
&& rm -rf /var/lib/apt/lists/*
|
| 8 |
|
|
|
|
| 9 |
WORKDIR /app
|
|
|
|
| 10 |
|
| 11 |
+
# 2) Create non-root user, set up cache/env
|
| 12 |
+
RUN useradd -m -u 1000 user
|
| 13 |
+
ENV HF_HOME=/app/.cache/huggingface \
|
| 14 |
+
HF_HUB_DISABLE_PROGRESS_BARS=1 \
|
| 15 |
+
TQDM_DISABLE=1 \
|
| 16 |
+
PATH="/home/user/.local/bin:$PATH"
|
| 17 |
|
| 18 |
+
# 3) Install Python deps
|
| 19 |
COPY requirements.txt .
|
| 20 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 21 |
|
| 22 |
+
# 4) Copy code and fix ownership
|
| 23 |
COPY . .
|
| 24 |
RUN chown -R user:user /app
|
| 25 |
|
|
|
|
| 26 |
USER user
|
| 27 |
|
| 28 |
+
# 5) Expose HTTP + WS port
|
| 29 |
+
EXPOSE 7860
|
| 30 |
|
| 31 |
+
# 6) Launch via the Python entrypoint so our __main__ block picks up "-m tiny.en"
|
| 32 |
+
CMD ["python", "app.py", "-m", "tiny.en", "--server_host", "0.0.0.0", "--server_port", "7860"]
|
app.py
CHANGED
|
@@ -1,80 +1,17 @@
|
|
| 1 |
"""
|
| 2 |
-
Speech-to-Text (STT) Server with Real-Time Transcription and WebSocket Interface
|
| 3 |
-
|
| 4 |
-
This server provides real-time speech-to-text (STT) transcription using the RealtimeSTT library.
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
- Flexible recording and transcription options, including configurable pauses for sentence detection.
|
| 10 |
-
- Supports Silero and WebRTC VAD for robust voice activity detection.
|
| 11 |
-
|
| 12 |
-
### Starting the Server:
|
| 13 |
-
You can start the server using the command-line interface (CLI) command `stt-server`, passing the desired configuration options.
|
| 14 |
-
|
| 15 |
-
```bash
|
| 16 |
-
stt-server [OPTIONS]
|
| 17 |
-
```
|
| 18 |
-
|
| 19 |
-
### Available Parameters:
|
| 20 |
-
- `-m, --model`: Model path or size; default 'large-v2'.
|
| 21 |
-
- `-r, --rt-model, --realtime_model_type`: Real-time model size; default 'tiny.en'.
|
| 22 |
-
- `-l, --lang, --language`: Language code for transcription; default 'en'.
|
| 23 |
-
- `-i, --input-device, --input_device_index`: Audio input device index; default 1.
|
| 24 |
-
- `-c, --control, --control_port`: WebSocket control port; default 8011.
|
| 25 |
-
- `-d, --data, --data_port`: WebSocket data port; default 8012.
|
| 26 |
-
- `-w, --wake_words`: Wake word(s) to trigger listening; default "".
|
| 27 |
-
- `-D, --debug`: Enable debug logging.
|
| 28 |
-
- `-W, --write`: Save audio to WAV file.
|
| 29 |
-
- `-s, --silence_timing`: Enable dynamic silence duration for sentence detection; default True.
|
| 30 |
-
- `-b, --batch, --batch_size`: Batch size for inference; default 16.
|
| 31 |
-
- `--root, --download_root`: Specifies the root path were the Whisper models are downloaded to.
|
| 32 |
-
- `--silero_sensitivity`: Silero VAD sensitivity (0-1); default 0.05.
|
| 33 |
-
- `--silero_use_onnx`: Use Silero ONNX model; default False.
|
| 34 |
-
- `--webrtc_sensitivity`: WebRTC VAD sensitivity (0-3); default 3.
|
| 35 |
-
- `--min_length_of_recording`: Minimum recording duration in seconds; default 1.1.
|
| 36 |
-
- `--min_gap_between_recordings`: Min time between recordings in seconds; default 0.
|
| 37 |
-
- `--enable_realtime_transcription`: Enable real-time transcription; default True.
|
| 38 |
-
- `--realtime_processing_pause`: Pause between audio chunk processing; default 0.02.
|
| 39 |
-
- `--silero_deactivity_detection`: Use Silero for end-of-speech detection; default True.
|
| 40 |
-
- `--early_transcription_on_silence`: Start transcription after silence in seconds; default 0.2.
|
| 41 |
-
- `--beam_size`: Beam size for main model; default 5.
|
| 42 |
-
- `--beam_size_realtime`: Beam size for real-time model; default 3.
|
| 43 |
-
- `--init_realtime_after_seconds`: Initial waiting time for realtime transcription; default 0.2.
|
| 44 |
-
- `--realtime_batch_size`: Batch size for the real-time transcription model; default 16.
|
| 45 |
-
- `--initial_prompt`: Initial main transcription guidance prompt.
|
| 46 |
-
- `--initial_prompt_realtime`: Initial realtime transcription guidance prompt.
|
| 47 |
-
- `--end_of_sentence_detection_pause`: Silence duration for sentence end detection; default 0.45.
|
| 48 |
-
- `--unknown_sentence_detection_pause`: Pause duration for incomplete sentence detection; default 0.7.
|
| 49 |
-
- `--mid_sentence_detection_pause`: Pause for mid-sentence break; default 2.0.
|
| 50 |
-
- `--wake_words_sensitivity`: Wake word detection sensitivity (0-1); default 0.5.
|
| 51 |
-
- `--wake_word_timeout`: Wake word timeout in seconds; default 5.0.
|
| 52 |
-
- `--wake_word_activation_delay`: Delay before wake word activation; default 20.
|
| 53 |
-
- `--wakeword_backend`: Backend for wake word detection; default 'none'.
|
| 54 |
-
- `--openwakeword_model_paths`: Paths to OpenWakeWord models.
|
| 55 |
-
- `--openwakeword_inference_framework`: OpenWakeWord inference framework; default 'tensorflow'.
|
| 56 |
-
- `--wake_word_buffer_duration`: Wake word buffer duration in seconds; default 1.0.
|
| 57 |
-
- `--use_main_model_for_realtime`: Use main model for real-time transcription.
|
| 58 |
-
- `--use_extended_logging`: Enable extensive log messages.
|
| 59 |
-
- `--logchunks`: Log incoming audio chunks.
|
| 60 |
-
- `--compute_type`: Type of computation to use.
|
| 61 |
-
- `--input_device_index`: Index of the audio input device.
|
| 62 |
-
- `--gpu_device_index`: Index of the GPU device.
|
| 63 |
-
- `--device`: Device to use for computation.
|
| 64 |
-
- `--handle_buffer_overflow`: Handle buffer overflow during transcription.
|
| 65 |
-
- `--suppress_tokens`: Suppress tokens during transcription.
|
| 66 |
-
- `--allowed_latency_limit`: Allowed latency limit for real-time transcription.
|
| 67 |
-
- `--faster_whisper_vad_filter`: Enable VAD filter for Faster Whisper; default False.
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
### WebSocket Interface:
|
| 71 |
-
The server supports two WebSocket connections:
|
| 72 |
-
1. **Control WebSocket**: Used to send and receive commands, such as setting parameters or calling recorder methods.
|
| 73 |
-
2. **Data WebSocket**: Used to send audio data for transcription and receive real-time transcription updates.
|
| 74 |
-
|
| 75 |
-
The server will broadcast real-time transcription updates to all connected clients on the data WebSocket.
|
| 76 |
"""
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
from difflib import SequenceMatcher
|
| 79 |
from collections import deque
|
| 80 |
from datetime import datetime
|
|
@@ -83,709 +20,373 @@ import asyncio
|
|
| 83 |
import pyaudio
|
| 84 |
import base64
|
| 85 |
import sys
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
class bcolors:
|
| 115 |
-
HEADER = '\033[95m'
|
| 116 |
-
OKBLUE = '\033[94m'
|
| 117 |
-
OKCYAN = '\033[96m'
|
| 118 |
-
OKGREEN = '\033[92m'
|
| 119 |
-
WARNING = '\033[93m'
|
| 120 |
-
FAIL = '\033[91m'
|
| 121 |
-
ENDC = '\033[0m'
|
| 122 |
BOLD = '\033[1m'
|
| 123 |
UNDERLINE = '\033[4m'
|
| 124 |
|
| 125 |
-
print(f"{bcolors.BOLD}{bcolors.OKCYAN}Starting server, please wait...{bcolors.ENDC}")
|
| 126 |
-
|
| 127 |
-
# Initialize colorama
|
| 128 |
-
from colorama import init, Fore, Style
|
| 129 |
-
init()
|
| 130 |
-
|
| 131 |
-
from RealtimeSTT import AudioToTextRecorder
|
| 132 |
-
from scipy.signal import resample
|
| 133 |
-
import numpy as np
|
| 134 |
-
import websockets
|
| 135 |
-
import threading
|
| 136 |
-
import logging
|
| 137 |
-
import wave
|
| 138 |
-
import json
|
| 139 |
-
import time
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
recorder_config = {}
|
| 144 |
-
recorder_ready = threading.Event()
|
| 145 |
-
recorder_thread = None
|
| 146 |
-
stop_recorder = False
|
| 147 |
-
prev_text = ""
|
| 148 |
-
|
| 149 |
-
# Define allowed methods and parameters for security
|
| 150 |
-
allowed_methods = [
|
| 151 |
-
'set_microphone',
|
| 152 |
-
'abort',
|
| 153 |
-
'stop',
|
| 154 |
-
'clear_audio_queue',
|
| 155 |
-
'wakeup',
|
| 156 |
-
'shutdown',
|
| 157 |
-
'text',
|
| 158 |
-
]
|
| 159 |
-
allowed_parameters = [
|
| 160 |
-
'language',
|
| 161 |
-
'silero_sensitivity',
|
| 162 |
-
'wake_word_activation_delay',
|
| 163 |
-
'post_speech_silence_duration',
|
| 164 |
-
'listen_start',
|
| 165 |
-
'recording_stop_time',
|
| 166 |
-
'last_transcription_bytes',
|
| 167 |
-
'last_transcription_bytes_b64',
|
| 168 |
-
'speech_end_silence_start',
|
| 169 |
-
'is_recording',
|
| 170 |
-
'use_wake_words',
|
| 171 |
-
]
|
| 172 |
-
|
| 173 |
-
# Queues and connections for control and data
|
| 174 |
-
control_connections = set()
|
| 175 |
-
data_connections = set()
|
| 176 |
-
control_queue = asyncio.Queue()
|
| 177 |
-
audio_queue = asyncio.Queue()
|
| 178 |
|
| 179 |
def preprocess_text(text):
|
| 180 |
-
# Remove leading whitespaces
|
| 181 |
text = text.lstrip()
|
| 182 |
-
|
| 183 |
-
# Remove starting ellipses if present
|
| 184 |
if text.startswith("..."):
|
| 185 |
text = text[3:]
|
| 186 |
-
|
| 187 |
if text.endswith("...'."):
|
| 188 |
text = text[:-1]
|
| 189 |
-
|
| 190 |
if text.endswith("...'"):
|
| 191 |
text = text[:-1]
|
| 192 |
-
|
| 193 |
-
# Remove any leading whitespaces again after ellipses removal
|
| 194 |
text = text.lstrip()
|
| 195 |
-
|
| 196 |
-
# Uppercase the first letter
|
| 197 |
if text:
|
| 198 |
text = text[0].upper() + text[1:]
|
| 199 |
-
|
| 200 |
return text
|
| 201 |
|
| 202 |
def debug_print(message):
|
| 203 |
-
if debug_logging:
|
| 204 |
timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
|
| 205 |
-
|
|
|
|
| 206 |
print(f"{Fore.CYAN}[DEBUG][{timestamp}][{thread_name}] {message}{Style.RESET_ALL}", file=sys.stderr)
|
| 207 |
|
| 208 |
def format_timestamp_ns(timestamp_ns: int) -> str:
|
| 209 |
-
# Split into whole seconds and the nanosecond remainder
|
| 210 |
seconds = timestamp_ns // 1_000_000_000
|
| 211 |
remainder_ns = timestamp_ns % 1_000_000_000
|
| 212 |
-
|
| 213 |
-
# Convert seconds part into a datetime object (local time)
|
| 214 |
dt = datetime.fromtimestamp(seconds)
|
| 215 |
-
|
| 216 |
-
# Format the main time as HH:MM:SS
|
| 217 |
time_str = dt.strftime("%H:%M:%S")
|
| 218 |
-
|
| 219 |
-
# For instance, if you want milliseconds, divide the remainder by 1e6 and format as 3-digit
|
| 220 |
milliseconds = remainder_ns // 1_000_000
|
| 221 |
-
|
| 222 |
|
| 223 |
-
return formatted_timestamp
|
| 224 |
|
| 225 |
-
|
| 226 |
-
|
| 227 |
|
|
|
|
|
|
|
| 228 |
text = preprocess_text(text)
|
| 229 |
|
| 230 |
-
if silence_timing:
|
| 231 |
-
def ends_with_ellipsis(
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
return True
|
| 236 |
-
return False
|
| 237 |
-
|
| 238 |
-
def sentence_end(text: str):
|
| 239 |
-
sentence_end_marks = ['.', '!', '?', '。']
|
| 240 |
-
if text and text[-1] in sentence_end_marks:
|
| 241 |
-
return True
|
| 242 |
-
return False
|
| 243 |
-
|
| 244 |
|
|
|
|
| 245 |
if ends_with_ellipsis(text):
|
| 246 |
-
recorder.post_speech_silence_duration = global_args.mid_sentence_detection_pause
|
| 247 |
-
elif sentence_end(text) and sentence_end(prev_text) and not ends_with_ellipsis(prev_text):
|
| 248 |
-
recorder.post_speech_silence_duration = global_args.end_of_sentence_detection_pause
|
| 249 |
else:
|
| 250 |
-
recorder.post_speech_silence_duration = global_args.unknown_sentence_detection_pause
|
| 251 |
|
| 252 |
-
|
| 253 |
-
# Append the new text with its timestamp
|
| 254 |
current_time = time.time()
|
| 255 |
-
text_time_deque.append((current_time, text))
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
while text_time_deque and text_time_deque[0][0] < current_time - hard_break_even_on_background_noise:
|
| 259 |
-
text_time_deque.popleft()
|
| 260 |
-
|
| 261 |
-
# Check if at least hard_break_even_on_background_noise_min_texts texts have arrived within the last hard_break_even_on_background_noise seconds
|
| 262 |
-
if len(text_time_deque) >= hard_break_even_on_background_noise_min_texts:
|
| 263 |
-
texts = [t[1] for t in text_time_deque]
|
| 264 |
-
first_text = texts[0]
|
| 265 |
-
last_text = texts[-1]
|
| 266 |
|
| 267 |
-
|
|
|
|
|
|
|
| 268 |
similarity = SequenceMatcher(None, first_text, last_text).ratio()
|
| 269 |
-
|
| 270 |
-
|
| 271 |
recorder.stop()
|
| 272 |
recorder.clear_audio_queue()
|
| 273 |
-
prev_text = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
-
prev_text = text
|
| 276 |
|
| 277 |
-
# Put the message in the audio queue to be sent to clients
|
| 278 |
-
message = json.dumps({
|
| 279 |
-
'type': 'realtime',
|
| 280 |
-
'text': text
|
| 281 |
-
})
|
| 282 |
-
asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
|
| 283 |
-
|
| 284 |
-
# Get current timestamp in HH:MM:SS.nnn format
|
| 285 |
timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
| 286 |
-
|
| 287 |
-
if extended_logging:
|
| 288 |
print(f" [{timestamp}] Realtime text: {bcolors.OKCYAN}{text}{bcolors.ENDC}\n", flush=True, end="")
|
| 289 |
else:
|
| 290 |
-
print(
|
| 291 |
|
| 292 |
-
def on_recording_start(loop):
|
| 293 |
-
message = json.dumps({
|
| 294 |
-
'type': 'recording_start'
|
| 295 |
-
})
|
| 296 |
-
asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
|
| 297 |
|
| 298 |
-
def
|
| 299 |
-
message = json.dumps({
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
asyncio.
|
| 303 |
|
| 304 |
-
def
|
| 305 |
-
message = json.dumps({
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
asyncio.
|
| 309 |
|
| 310 |
-
def
|
| 311 |
-
message = json.dumps({
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
asyncio.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
|
| 316 |
-
def on_wakeword_detected(
|
| 317 |
-
message = json.dumps({
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
asyncio.
|
| 321 |
|
| 322 |
-
def on_wakeword_detection_start(
|
| 323 |
-
message = json.dumps({
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
asyncio.
|
| 327 |
|
| 328 |
-
def on_wakeword_detection_end(
|
| 329 |
-
message = json.dumps({
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
asyncio.
|
| 333 |
|
| 334 |
-
def on_transcription_start(_audio_bytes
|
| 335 |
bytes_b64 = base64.b64encode(_audio_bytes.tobytes()).decode('utf-8')
|
| 336 |
message = json.dumps({
|
| 337 |
'type': 'transcription_start',
|
| 338 |
'audio_bytes_base64': bytes_b64
|
| 339 |
})
|
| 340 |
-
|
|
|
|
|
|
|
| 341 |
|
| 342 |
-
def on_turn_detection_start(
|
| 343 |
print("&&& stt_server on_turn_detection_start")
|
| 344 |
-
message = json.dumps({
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
asyncio.
|
| 348 |
|
| 349 |
-
def on_turn_detection_stop(
|
| 350 |
print("&&& stt_server on_turn_detection_stop")
|
| 351 |
-
message = json.dumps({
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
asyncio.
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
# def on_realtime_transcription_update(text, loop):
|
| 358 |
-
# # Send real-time transcription updates to the client
|
| 359 |
-
# text = preprocess_text(text)
|
| 360 |
-
# message = json.dumps({
|
| 361 |
-
# 'type': 'realtime_update',
|
| 362 |
-
# 'text': text
|
| 363 |
-
# })
|
| 364 |
-
# asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
|
| 365 |
-
|
| 366 |
-
# def on_recorded_chunk(chunk, loop):
|
| 367 |
-
# if send_recorded_chunk:
|
| 368 |
-
# bytes_b64 = base64.b64encode(chunk.tobytes()).decode('utf-8')
|
| 369 |
-
# message = json.dumps({
|
| 370 |
-
# 'type': 'recorded_chunk',
|
| 371 |
-
# 'bytes': bytes_b64
|
| 372 |
-
# })
|
| 373 |
-
# asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
|
| 374 |
-
|
| 375 |
-
# Define the server's arguments
|
| 376 |
-
def parse_arguments():
|
| 377 |
-
global debug_logging, extended_logging, loglevel, writechunks, log_incoming_chunks, dynamic_silence_timing
|
| 378 |
|
| 379 |
-
|
|
|
|
|
|
|
|
|
|
| 380 |
parser = argparse.ArgumentParser(description='Start the Speech-to-Text (STT) server with various configuration options.')
|
| 381 |
-
|
| 382 |
-
parser.add_argument('-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
parser.add_argument('-i', '--input-device', '--input-device-index', type=int, default=1,
|
| 392 |
-
help='Index of the audio input device to use. Use this option to specify a particular microphone or audio input device based on your system. Default is 1.')
|
| 393 |
-
|
| 394 |
-
parser.add_argument('-c', '--control', '--control_port', type=int, default=8011,
|
| 395 |
-
help='The port number used for the control WebSocket connection. Control connections are used to send and receive commands to the server. Default is port 8011.')
|
| 396 |
-
|
| 397 |
-
parser.add_argument('-d', '--data', '--data_port', type=int, default=8012,
|
| 398 |
-
help='The port number used for the data WebSocket connection. Data connections are used to send audio data and receive transcription updates in real time. Default is port 8012.')
|
| 399 |
-
|
| 400 |
-
parser.add_argument('-w', '--wake_words', type=str, default="",
|
| 401 |
-
help='Specify the wake word(s) that will trigger the server to start listening. For example, setting this to "Jarvis" will make the system start transcribing when it detects the wake word "Jarvis". Default is "Jarvis".')
|
| 402 |
-
|
| 403 |
-
parser.add_argument('-D', '--debug', action='store_true', help='Enable debug logging for detailed server operations')
|
| 404 |
-
|
| 405 |
-
parser.add_argument('--debug_websockets', action='store_true', help='Enable debug logging for detailed server websocket operations')
|
| 406 |
-
|
| 407 |
-
parser.add_argument('-W', '--write', metavar='FILE', help='Save received audio to a WAV file')
|
| 408 |
-
|
| 409 |
-
parser.add_argument('-b', '--batch', '--batch_size', type=int, default=16, help='Batch size for inference. This parameter controls the number of audio chunks processed in parallel during transcription. Default is 16.')
|
| 410 |
-
|
| 411 |
-
parser.add_argument('--root', '--download_root', type=str,default=None, help='Specifies the root path where the Whisper models are downloaded to. Default is None.')
|
| 412 |
-
|
| 413 |
-
parser.add_argument('-s', '--silence_timing', action='store_true', default=True,
|
| 414 |
-
help='Enable dynamic adjustment of silence duration for sentence detection. Adjusts post-speech silence duration based on detected sentence structure and punctuation. Default is False.')
|
| 415 |
-
|
| 416 |
-
parser.add_argument('--init_realtime_after_seconds', type=float, default=0.2,
|
| 417 |
-
help='The initial waiting time in seconds before real-time transcription starts. This delay helps prevent false positives at the beginning of a session. Default is 0.2 seconds.')
|
| 418 |
-
|
| 419 |
-
parser.add_argument('--realtime_batch_size', type=int, default=16,
|
| 420 |
-
help='Batch size for the real-time transcription model. This parameter controls the number of audio chunks processed in parallel during real-time transcription. Default is 16.')
|
| 421 |
-
|
| 422 |
-
parser.add_argument('--initial_prompt_realtime', type=str, default="", help='Initial prompt that guides the real-time transcription model to produce transcriptions in a particular style or format.')
|
| 423 |
-
|
| 424 |
-
parser.add_argument('--silero_sensitivity', type=float, default=0.05,
|
| 425 |
-
help='Sensitivity level for Silero Voice Activity Detection (VAD), with a range from 0 to 1. Lower values make the model less sensitive, useful for noisy environments. Default is 0.05.')
|
| 426 |
-
|
| 427 |
-
parser.add_argument('--silero_use_onnx', action='store_true', default=False,
|
| 428 |
-
help='Enable ONNX version of Silero model for faster performance with lower resource usage. Default is False.')
|
| 429 |
-
|
| 430 |
-
parser.add_argument('--webrtc_sensitivity', type=int, default=3,
|
| 431 |
-
help='Sensitivity level for WebRTC Voice Activity Detection (VAD), with a range from 0 to 3. Higher values make the model less sensitive, useful for cleaner environments. Default is 3.')
|
| 432 |
-
|
| 433 |
-
parser.add_argument('--min_length_of_recording', type=float, default=1.1,
|
| 434 |
-
help='Minimum duration of valid recordings in seconds. This prevents very short recordings from being processed, which could be caused by noise or accidental sounds. Default is 1.1 seconds.')
|
| 435 |
-
|
| 436 |
-
parser.add_argument('--min_gap_between_recordings', type=float, default=0,
|
| 437 |
-
help='Minimum time (in seconds) between consecutive recordings. Setting this helps avoid overlapping recordings when there’s a brief silence between them. Default is 0 seconds.')
|
| 438 |
-
|
| 439 |
-
parser.add_argument('--enable_realtime_transcription', action='store_true', default=True,
|
| 440 |
-
help='Enable continuous real-time transcription of audio as it is received. When enabled, transcriptions are sent in near real-time. Default is True.')
|
| 441 |
-
|
| 442 |
-
parser.add_argument('--realtime_processing_pause', type=float, default=0.02,
|
| 443 |
-
help='Time interval (in seconds) between processing audio chunks for real-time transcription. Lower values increase responsiveness but may put more load on the CPU. Default is 0.02 seconds.')
|
| 444 |
-
|
| 445 |
-
parser.add_argument('--silero_deactivity_detection', action='store_true', default=True,
|
| 446 |
-
help='Use the Silero model for end-of-speech detection. This option can provide more robust silence detection in noisy environments, though it consumes more GPU resources. Default is True.')
|
| 447 |
-
|
| 448 |
-
parser.add_argument('--early_transcription_on_silence', type=float, default=0.2,
|
| 449 |
-
help='Start transcription after the specified seconds of silence. This is useful when you want to trigger transcription mid-speech when there is a brief pause. Should be lower than post_speech_silence_duration. Set to 0 to disable. Default is 0.2 seconds.')
|
| 450 |
-
|
| 451 |
-
parser.add_argument('--beam_size', type=int, default=5,
|
| 452 |
-
help='Beam size for the main transcription model. Larger values may improve transcription accuracy but increase the processing time. Default is 5.')
|
| 453 |
-
|
| 454 |
-
parser.add_argument('--beam_size_realtime', type=int, default=3,
|
| 455 |
-
help='Beam size for the real-time transcription model. A smaller beam size allows for faster real-time processing but may reduce accuracy. Default is 3.')
|
| 456 |
-
|
| 457 |
-
parser.add_argument('--initial_prompt', type=str,
|
| 458 |
-
default="Incomplete thoughts should end with '...'. Examples of complete thoughts: 'The sky is blue.' 'She walked home.' Examples of incomplete thoughts: 'When the sky...' 'Because he...'",
|
| 459 |
-
help='Initial prompt that guides the transcription model to produce transcriptions in a particular style or format. The default provides instructions for handling sentence completions and ellipsis usage.')
|
| 460 |
-
|
| 461 |
-
parser.add_argument('--end_of_sentence_detection_pause', type=float, default=0.45,
|
| 462 |
-
help='The duration of silence (in seconds) that the model should interpret as the end of a sentence. This helps the system detect when to finalize the transcription of a sentence. Default is 0.45 seconds.')
|
| 463 |
-
|
| 464 |
-
parser.add_argument('--unknown_sentence_detection_pause', type=float, default=0.7,
|
| 465 |
-
help='The duration of pause (in seconds) that the model should interpret as an incomplete or unknown sentence. This is useful for identifying when a sentence is trailing off or unfinished. Default is 0.7 seconds.')
|
| 466 |
-
|
| 467 |
-
parser.add_argument('--mid_sentence_detection_pause', type=float, default=2.0,
|
| 468 |
-
help='The duration of pause (in seconds) that the model should interpret as a mid-sentence break. Longer pauses can indicate a pause in speech but not necessarily the end of a sentence. Default is 2.0 seconds.')
|
| 469 |
-
|
| 470 |
-
parser.add_argument('--wake_words_sensitivity', type=float, default=0.5,
|
| 471 |
-
help='Sensitivity level for wake word detection, with a range from 0 (most sensitive) to 1 (least sensitive). Adjust this value based on your environment to ensure reliable wake word detection. Default is 0.5.')
|
| 472 |
-
|
| 473 |
-
parser.add_argument('--wake_word_timeout', type=float, default=5.0,
|
| 474 |
-
help='Maximum time in seconds that the system will wait for a wake word before timing out. After this timeout, the system stops listening for wake words until reactivated. Default is 5.0 seconds.')
|
| 475 |
-
|
| 476 |
-
parser.add_argument('--wake_word_activation_delay', type=float, default=0,
|
| 477 |
-
help='The delay in seconds before the wake word detection is activated after the system starts listening. This prevents false positives during the start of a session. Default is 0 seconds.')
|
| 478 |
|
| 479 |
-
parser.add_argument('--wakeword_backend', type=str, default='none',
|
| 480 |
-
help='The backend used for wake word detection. You can specify different backends such as "default" or any custom implementations depending on your setup. Default is "pvporcupine".')
|
| 481 |
|
| 482 |
-
parser.add_argument('--openwakeword_model_paths', type=str, nargs='*',
|
| 483 |
-
help='A list of file paths to OpenWakeWord models. This is useful if you are using OpenWakeWord for wake word detection and need to specify custom models.')
|
| 484 |
-
|
| 485 |
-
parser.add_argument('--openwakeword_inference_framework', type=str, default='tensorflow',
|
| 486 |
-
help='The inference framework to use for OpenWakeWord models. Supported frameworks could include "tensorflow", "pytorch", etc. Default is "tensorflow".')
|
| 487 |
-
|
| 488 |
-
parser.add_argument('--wake_word_buffer_duration', type=float, default=1.0,
|
| 489 |
-
help='Duration of the buffer in seconds for wake word detection. This sets how long the system will store the audio before and after detecting the wake word. Default is 1.0 seconds.')
|
| 490 |
-
|
| 491 |
-
parser.add_argument('--use_main_model_for_realtime', action='store_true',
|
| 492 |
-
help='Enable this option if you want to use the main model for real-time transcription, instead of the smaller, faster real-time model. Using the main model may provide better accuracy but at the cost of higher processing time.')
|
| 493 |
-
|
| 494 |
-
parser.add_argument('--use_extended_logging', action='store_true',
|
| 495 |
-
help='Writes extensive log messages for the recording worker, that processes the audio chunks.')
|
| 496 |
-
|
| 497 |
-
parser.add_argument('--compute_type', type=str, default='default',
|
| 498 |
-
help='Type of computation to use. See https://opennmt.net/CTranslate2/quantization.html')
|
| 499 |
-
|
| 500 |
-
parser.add_argument('--gpu_device_index', type=int, default=0,
|
| 501 |
-
help='Index of the GPU device to use. Default is None.')
|
| 502 |
-
|
| 503 |
-
parser.add_argument('--device', type=str, default='cuda',
|
| 504 |
-
help='Device for model to use. Can either be "cuda" or "cpu". Default is cuda.')
|
| 505 |
-
|
| 506 |
-
parser.add_argument('--handle_buffer_overflow', action='store_true',
|
| 507 |
-
help='Handle buffer overflow during transcription. Default is False.')
|
| 508 |
-
|
| 509 |
-
parser.add_argument('--suppress_tokens', type=int, default=[-1], nargs='*', help='Suppress tokens during transcription. Default is [-1].')
|
| 510 |
-
|
| 511 |
-
parser.add_argument('--allowed_latency_limit', type=int, default=100,
|
| 512 |
-
help='Maximal amount of chunks that can be unprocessed in queue before discarding chunks.. Default is 100.')
|
| 513 |
-
|
| 514 |
-
parser.add_argument('--faster_whisper_vad_filter', action='store_true',
|
| 515 |
-
help='Enable VAD filter for Faster Whisper. Default is False.')
|
| 516 |
-
|
| 517 |
-
parser.add_argument('--logchunks', action='store_true', help='Enable logging of incoming audio chunks (periods)')
|
| 518 |
-
|
| 519 |
-
# Parse arguments
|
| 520 |
args = parser.parse_args()
|
| 521 |
|
| 522 |
-
debug_logging = args.debug
|
| 523 |
-
extended_logging = args.use_extended_logging
|
| 524 |
-
writechunks = args.write
|
| 525 |
-
log_incoming_chunks = args.logchunks
|
| 526 |
-
|
| 527 |
-
|
| 528 |
|
| 529 |
-
ws_logger = logging.getLogger('websockets')
|
|
|
|
| 530 |
if args.debug_websockets:
|
| 531 |
-
# If app debug is on, let websockets be verbose too
|
| 532 |
ws_logger.setLevel(logging.DEBUG)
|
| 533 |
-
|
| 534 |
-
ws_logger.propagate =
|
|
|
|
| 535 |
else:
|
| 536 |
-
# If app debug is off, silence websockets below WARNING
|
| 537 |
ws_logger.setLevel(logging.WARNING)
|
| 538 |
-
|
| 539 |
|
| 540 |
-
# Replace escaped newlines with actual newlines in initial_prompt
|
| 541 |
if args.initial_prompt:
|
| 542 |
args.initial_prompt = args.initial_prompt.replace("\\n", "\n")
|
| 543 |
-
|
| 544 |
if args.initial_prompt_realtime:
|
| 545 |
args.initial_prompt_realtime = args.initial_prompt_realtime.replace("\\n", "\n")
|
| 546 |
-
|
|
|
|
| 547 |
return args
|
| 548 |
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
def
|
| 559 |
-
|
| 560 |
-
prev_text = ""
|
| 561 |
full_sentence = preprocess_text(full_sentence)
|
| 562 |
-
message = json.dumps({
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
|
| 568 |
|
| 569 |
timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
| 570 |
-
|
| 571 |
-
if extended_logging:
|
| 572 |
print(f" [{timestamp}] Full text: {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n", flush=True, end="")
|
| 573 |
else:
|
| 574 |
-
print(
|
|
|
|
| 575 |
try:
|
| 576 |
-
while not
|
| 577 |
-
recorder
|
| 578 |
-
|
| 579 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 580 |
|
| 581 |
-
def decode_and_resample(
|
| 582 |
-
audio_data,
|
| 583 |
-
original_sample_rate,
|
| 584 |
-
target_sample_rate):
|
| 585 |
|
| 586 |
-
|
| 587 |
if original_sample_rate == target_sample_rate:
|
| 588 |
return audio_data
|
| 589 |
-
|
| 590 |
audio_np = np.frombuffer(audio_data, dtype=np.int16)
|
| 591 |
-
|
| 592 |
-
# Calculate the number of samples after resampling
|
| 593 |
num_original_samples = len(audio_np)
|
| 594 |
-
num_target_samples = int(num_original_samples * target_sample_rate /
|
| 595 |
-
original_sample_rate)
|
| 596 |
-
|
| 597 |
-
# Resample the audio
|
| 598 |
resampled_audio = resample(audio_np, num_target_samples)
|
| 599 |
-
|
| 600 |
return resampled_audio.astype(np.int16).tobytes()
|
| 601 |
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
print(f"{bcolors.OKGREEN}Control client connected{bcolors.ENDC}")
|
| 605 |
-
global recorder
|
| 606 |
-
control_connections.add(websocket)
|
| 607 |
-
try:
|
| 608 |
-
async for message in websocket:
|
| 609 |
-
debug_print(f"Received control message: {message[:200]}...")
|
| 610 |
-
if not recorder_ready.is_set():
|
| 611 |
-
print(f"{bcolors.WARNING}Recorder not ready{bcolors.ENDC}")
|
| 612 |
-
continue
|
| 613 |
-
if isinstance(message, str):
|
| 614 |
-
# Handle text message (command)
|
| 615 |
-
try:
|
| 616 |
-
command_data = json.loads(message)
|
| 617 |
-
command = command_data.get("command")
|
| 618 |
-
if command == "set_parameter":
|
| 619 |
-
parameter = command_data.get("parameter")
|
| 620 |
-
value = command_data.get("value")
|
| 621 |
-
if parameter in allowed_parameters and hasattr(recorder, parameter):
|
| 622 |
-
setattr(recorder, parameter, value)
|
| 623 |
-
# Format the value for output
|
| 624 |
-
if isinstance(value, float):
|
| 625 |
-
value_formatted = f"{value:.2f}"
|
| 626 |
-
else:
|
| 627 |
-
value_formatted = value
|
| 628 |
-
timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
| 629 |
-
if extended_logging:
|
| 630 |
-
print(f" [{timestamp}] {bcolors.OKGREEN}Set recorder.{parameter} to: {bcolors.OKBLUE}{value_formatted}{bcolors.ENDC}")
|
| 631 |
-
# Optionally send a response back to the client
|
| 632 |
-
await websocket.send(json.dumps({"status": "success", "message": f"Parameter {parameter} set to {value}"}))
|
| 633 |
-
else:
|
| 634 |
-
if not parameter in allowed_parameters:
|
| 635 |
-
print(f"{bcolors.WARNING}Parameter {parameter} is not allowed (set_parameter){bcolors.ENDC}")
|
| 636 |
-
await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} is not allowed (set_parameter)"}))
|
| 637 |
-
else:
|
| 638 |
-
print(f"{bcolors.WARNING}Parameter {parameter} does not exist (set_parameter){bcolors.ENDC}")
|
| 639 |
-
await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} does not exist (set_parameter)"}))
|
| 640 |
-
|
| 641 |
-
elif command == "get_parameter":
|
| 642 |
-
parameter = command_data.get("parameter")
|
| 643 |
-
request_id = command_data.get("request_id") # Get the request_id from the command data
|
| 644 |
-
if parameter in allowed_parameters and hasattr(recorder, parameter):
|
| 645 |
-
value = getattr(recorder, parameter)
|
| 646 |
-
if isinstance(value, float):
|
| 647 |
-
value_formatted = f"{value:.2f}"
|
| 648 |
-
else:
|
| 649 |
-
value_formatted = f"{value}"
|
| 650 |
-
|
| 651 |
-
value_truncated = value_formatted[:39] + "…" if len(value_formatted) > 40 else value_formatted
|
| 652 |
-
|
| 653 |
-
timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
| 654 |
-
if extended_logging:
|
| 655 |
-
print(f" [{timestamp}] {bcolors.OKGREEN}Get recorder.{parameter}: {bcolors.OKBLUE}{value_truncated}{bcolors.ENDC}")
|
| 656 |
-
response = {"status": "success", "parameter": parameter, "value": value}
|
| 657 |
-
if request_id is not None:
|
| 658 |
-
response["request_id"] = request_id
|
| 659 |
-
await websocket.send(json.dumps(response))
|
| 660 |
-
else:
|
| 661 |
-
if not parameter in allowed_parameters:
|
| 662 |
-
print(f"{bcolors.WARNING}Parameter {parameter} is not allowed (get_parameter){bcolors.ENDC}")
|
| 663 |
-
await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} is not allowed (get_parameter)"}))
|
| 664 |
-
else:
|
| 665 |
-
print(f"{bcolors.WARNING}Parameter {parameter} does not exist (get_parameter){bcolors.ENDC}")
|
| 666 |
-
await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} does not exist (get_parameter)"}))
|
| 667 |
-
elif command == "call_method":
|
| 668 |
-
method_name = command_data.get("method")
|
| 669 |
-
if method_name in allowed_methods:
|
| 670 |
-
method = getattr(recorder, method_name, None)
|
| 671 |
-
if method and callable(method):
|
| 672 |
-
args = command_data.get("args", [])
|
| 673 |
-
kwargs = command_data.get("kwargs", {})
|
| 674 |
-
method(*args, **kwargs)
|
| 675 |
-
timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
| 676 |
-
print(f" [{timestamp}] {bcolors.OKGREEN}Called method recorder.{bcolors.OKBLUE}{method_name}{bcolors.ENDC}")
|
| 677 |
-
await websocket.send(json.dumps({"status": "success", "message": f"Method {method_name} called"}))
|
| 678 |
-
else:
|
| 679 |
-
print(f"{bcolors.WARNING}Recorder does not have method {method_name}{bcolors.ENDC}")
|
| 680 |
-
await websocket.send(json.dumps({"status": "error", "message": f"Recorder does not have method {method_name}"}))
|
| 681 |
-
else:
|
| 682 |
-
print(f"{bcolors.WARNING}Method {method_name} is not allowed{bcolors.ENDC}")
|
| 683 |
-
await websocket.send(json.dumps({"status": "error", "message": f"Method {method_name} is not allowed"}))
|
| 684 |
-
else:
|
| 685 |
-
print(f"{bcolors.WARNING}Unknown command: {command}{bcolors.ENDC}")
|
| 686 |
-
await websocket.send(json.dumps({"status": "error", "message": f"Unknown command {command}"}))
|
| 687 |
-
except json.JSONDecodeError:
|
| 688 |
-
print(f"{bcolors.WARNING}Received invalid JSON command{bcolors.ENDC}")
|
| 689 |
-
await websocket.send(json.dumps({"status": "error", "message": "Invalid JSON command"}))
|
| 690 |
-
else:
|
| 691 |
-
print(f"{bcolors.WARNING}Received unknown message type on control connection{bcolors.ENDC}")
|
| 692 |
-
except websockets.exceptions.ConnectionClosed as e:
|
| 693 |
-
print(f"{bcolors.WARNING}Control client disconnected: {e}{bcolors.ENDC}")
|
| 694 |
-
finally:
|
| 695 |
-
control_connections.remove(websocket)
|
| 696 |
-
|
| 697 |
-
async def data_handler(websocket):
|
| 698 |
-
global writechunks, wav_file
|
| 699 |
-
print(f"{bcolors.OKGREEN}Data client connected{bcolors.ENDC}")
|
| 700 |
-
data_connections.add(websocket)
|
| 701 |
-
try:
|
| 702 |
-
while True:
|
| 703 |
-
message = await websocket.recv()
|
| 704 |
-
if isinstance(message, bytes):
|
| 705 |
-
if extended_logging:
|
| 706 |
-
debug_print(f"Received audio chunk (size: {len(message)} bytes)")
|
| 707 |
-
elif log_incoming_chunks:
|
| 708 |
-
print(".", end='', flush=True)
|
| 709 |
-
# Handle binary message (audio data)
|
| 710 |
-
metadata_length = int.from_bytes(message[:4], byteorder='little')
|
| 711 |
-
metadata_json = message[4:4+metadata_length].decode('utf-8')
|
| 712 |
-
metadata = json.loads(metadata_json)
|
| 713 |
-
sample_rate = metadata['sampleRate']
|
| 714 |
-
|
| 715 |
-
if 'server_sent_to_stt' in metadata:
|
| 716 |
-
stt_received_ns = time.time_ns()
|
| 717 |
-
metadata["stt_received"] = stt_received_ns
|
| 718 |
-
metadata["stt_received_formatted"] = format_timestamp_ns(stt_received_ns)
|
| 719 |
-
print(f"Server received audio chunk of length {len(message)} bytes, metadata: {metadata}")
|
| 720 |
-
|
| 721 |
-
if extended_logging:
|
| 722 |
-
debug_print(f"Processing audio chunk with sample rate {sample_rate}")
|
| 723 |
-
chunk = message[4+metadata_length:]
|
| 724 |
-
|
| 725 |
-
if writechunks:
|
| 726 |
-
if not wav_file:
|
| 727 |
-
wav_file = wave.open(writechunks, 'wb')
|
| 728 |
-
wav_file.setnchannels(CHANNELS)
|
| 729 |
-
wav_file.setsampwidth(pyaudio.get_sample_size(FORMAT))
|
| 730 |
-
wav_file.setframerate(sample_rate)
|
| 731 |
|
| 732 |
-
|
|
|
|
|
|
|
|
|
|
| 733 |
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
debug_print(f"Resampled chunk size: {len(resampled_chunk)} bytes")
|
| 738 |
-
recorder.feed_audio(resampled_chunk)
|
| 739 |
-
else:
|
| 740 |
-
recorder.feed_audio(chunk)
|
| 741 |
-
else:
|
| 742 |
-
print(f"{bcolors.WARNING}Received non-binary message on data connection{bcolors.ENDC}")
|
| 743 |
-
except websockets.exceptions.ConnectionClosed as e:
|
| 744 |
-
print(f"{bcolors.WARNING}Data client disconnected: {e}{bcolors.ENDC}")
|
| 745 |
-
finally:
|
| 746 |
-
data_connections.remove(websocket)
|
| 747 |
-
recorder.clear_audio_queue() # Ensure audio queue is cleared if client disconnects
|
| 748 |
-
|
| 749 |
-
async def broadcast_audio_messages():
|
| 750 |
-
while True:
|
| 751 |
-
message = await audio_queue.get()
|
| 752 |
-
for conn in list(data_connections):
|
| 753 |
-
try:
|
| 754 |
-
timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
| 755 |
-
|
| 756 |
-
if extended_logging:
|
| 757 |
-
print(f" [{timestamp}] Sending message: {bcolors.OKBLUE}{message}{bcolors.ENDC}\n", flush=True, end="")
|
| 758 |
-
await conn.send(message)
|
| 759 |
-
except websockets.exceptions.ConnectionClosed:
|
| 760 |
-
data_connections.remove(conn)
|
| 761 |
-
|
| 762 |
-
# Helper function to create event loop bound closures for callbacks
|
| 763 |
-
def make_callback(loop, callback):
|
| 764 |
-
def inner_callback(*args, **kwargs):
|
| 765 |
-
callback(*args, **kwargs, loop=loop)
|
| 766 |
-
return inner_callback
|
| 767 |
-
|
| 768 |
-
async def main_async():
|
| 769 |
-
global stop_recorder, recorder_config, global_args
|
| 770 |
-
args = parse_arguments()
|
| 771 |
-
global_args = args
|
| 772 |
-
|
| 773 |
-
# Get the event loop here and pass it to the recorder thread
|
| 774 |
-
loop = asyncio.get_event_loop()
|
| 775 |
-
|
| 776 |
-
recorder_config = {
|
| 777 |
-
'model': args.model,
|
| 778 |
-
'download_root': args.root,
|
| 779 |
-
'realtime_model_type': args.rt_model,
|
| 780 |
-
'language': args.lang,
|
| 781 |
-
'batch_size': args.batch,
|
| 782 |
'init_realtime_after_seconds': args.init_realtime_after_seconds,
|
| 783 |
'realtime_batch_size': args.realtime_batch_size,
|
| 784 |
'initial_prompt_realtime': args.initial_prompt_realtime,
|
| 785 |
-
'input_device_index': args.input_device,
|
| 786 |
-
'
|
| 787 |
-
'silero_use_onnx': args.silero_use_onnx,
|
| 788 |
-
'webrtc_sensitivity': args.webrtc_sensitivity,
|
| 789 |
'post_speech_silence_duration': args.unknown_sentence_detection_pause,
|
| 790 |
'min_length_of_recording': args.min_length_of_recording,
|
| 791 |
'min_gap_between_recordings': args.min_gap_between_recordings,
|
|
@@ -793,10 +394,8 @@ async def main_async():
|
|
| 793 |
'realtime_processing_pause': args.realtime_processing_pause,
|
| 794 |
'silero_deactivity_detection': args.silero_deactivity_detection,
|
| 795 |
'early_transcription_on_silence': args.early_transcription_on_silence,
|
| 796 |
-
'beam_size': args.beam_size,
|
| 797 |
-
'
|
| 798 |
-
'initial_prompt': args.initial_prompt,
|
| 799 |
-
'wake_words': args.wake_words,
|
| 800 |
'wake_words_sensitivity': args.wake_words_sensitivity,
|
| 801 |
'wake_word_timeout': args.wake_word_timeout,
|
| 802 |
'wake_word_activation_delay': args.wake_word_activation_delay,
|
|
@@ -805,88 +404,341 @@ async def main_async():
|
|
| 805 |
'openwakeword_inference_framework': args.openwakeword_inference_framework,
|
| 806 |
'wake_word_buffer_duration': args.wake_word_buffer_duration,
|
| 807 |
'use_main_model_for_realtime': args.use_main_model_for_realtime,
|
| 808 |
-
'spinner': False,
|
| 809 |
-
'
|
| 810 |
-
|
| 811 |
-
'
|
| 812 |
-
'
|
| 813 |
-
'
|
| 814 |
-
'
|
| 815 |
-
'
|
| 816 |
-
'
|
| 817 |
-
'
|
| 818 |
-
'
|
| 819 |
-
'
|
| 820 |
-
'
|
| 821 |
-
'
|
| 822 |
-
|
| 823 |
-
# 'on_recorded_chunk': make_callback(loop, on_recorded_chunk),
|
| 824 |
-
'no_log_file': True, # Disable logging to file
|
| 825 |
-
'use_extended_logging': args.use_extended_logging,
|
| 826 |
-
'level': loglevel,
|
| 827 |
-
'compute_type': args.compute_type,
|
| 828 |
-
'gpu_device_index': args.gpu_device_index,
|
| 829 |
-
'device': args.device,
|
| 830 |
-
'handle_buffer_overflow': args.handle_buffer_overflow,
|
| 831 |
'suppress_tokens': args.suppress_tokens,
|
| 832 |
'allowed_latency_limit': args.allowed_latency_limit,
|
| 833 |
'faster_whisper_vad_filter': args.faster_whisper_vad_filter,
|
| 834 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 835 |
|
| 836 |
-
try:
|
| 837 |
-
# Attempt to start control and data servers
|
| 838 |
-
control_server = await websockets.serve(control_handler, "localhost", args.control)
|
| 839 |
-
data_server = await websockets.serve(data_handler, "localhost", args.data)
|
| 840 |
-
print(f"{bcolors.OKGREEN}Control server started on {bcolors.OKBLUE}ws://localhost:{args.control}{bcolors.ENDC}")
|
| 841 |
-
print(f"{bcolors.OKGREEN}Data server started on {bcolors.OKBLUE}ws://localhost:{args.data}{bcolors.ENDC}")
|
| 842 |
-
|
| 843 |
-
# Start the broadcast and recorder threads
|
| 844 |
-
broadcast_task = asyncio.create_task(broadcast_audio_messages())
|
| 845 |
-
|
| 846 |
-
recorder_thread = threading.Thread(target=_recorder_thread, args=(loop,))
|
| 847 |
-
recorder_thread.start()
|
| 848 |
-
recorder_ready.wait()
|
| 849 |
-
|
| 850 |
-
print(f"{bcolors.OKGREEN}Server started. Press Ctrl+C to stop the server.{bcolors.ENDC}")
|
| 851 |
-
|
| 852 |
-
# Run server tasks
|
| 853 |
-
await asyncio.gather(control_server.wait_closed(), data_server.wait_closed(), broadcast_task)
|
| 854 |
-
except OSError as e:
|
| 855 |
-
print(f"{bcolors.FAIL}Error: Could not start server on specified ports. It’s possible another instance of the server is already running, or the ports are being used by another application.{bcolors.ENDC}")
|
| 856 |
-
except KeyboardInterrupt:
|
| 857 |
-
print(f"{bcolors.WARNING}Server interrupted by user, shutting down...{bcolors.ENDC}")
|
| 858 |
-
finally:
|
| 859 |
-
# Shutdown procedures for recorder and server threads
|
| 860 |
-
await shutdown_procedure()
|
| 861 |
-
print(f"{bcolors.OKGREEN}Server shutdown complete.{bcolors.ENDC}")
|
| 862 |
|
| 863 |
-
|
| 864 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 865 |
if recorder:
|
| 866 |
-
|
| 867 |
-
recorder.abort()
|
| 868 |
-
recorder.
|
| 869 |
-
recorder.
|
| 870 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 871 |
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
|
|
|
|
|
|
| 880 |
|
| 881 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 882 |
|
| 883 |
-
def main():
|
| 884 |
try:
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
FastAPI Speech-to-Text (STT) Server with Real-Time Transcription and WebSocket Interface
|
| 3 |
+
|
| 4 |
+
This server provides real-time speech-to-text (STT) transcription using the RealtimeSTT library.
|
| 5 |
+
It allows clients to connect via WebSocket to send audio data and receive real-time transcription updates.
|
| 6 |
+
The server supports configurable audio recording parameters, voice activity detection (VAD), and wake word detection.
|
| 7 |
+
It is designed to handle continuous transcription as well as post-recording processing,
|
| 8 |
+
enabling real-time feedback with the option to improve final transcription quality after the complete sentence is recognized.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
| 12 |
+
from fastapi.responses import HTMLResponse # Optional: for a simple test page
|
| 13 |
+
import uvicorn
|
| 14 |
+
|
| 15 |
from difflib import SequenceMatcher
|
| 16 |
from collections import deque
|
| 17 |
from datetime import datetime
|
|
|
|
| 20 |
import pyaudio
|
| 21 |
import base64
|
| 22 |
import sys
|
| 23 |
+
import argparse # Keeping argparse for CLI options
|
| 24 |
+
import threading
|
| 25 |
+
import wave
|
| 26 |
+
import json
|
| 27 |
+
import time
|
| 28 |
+
import numpy as np
|
| 29 |
+
from scipy.signal import resample
|
| 30 |
+
from colorama import init, Fore, Style
|
| 31 |
+
init() # Initialize colorama
|
| 32 |
|
| 33 |
+
# Assuming RealtimeSTT is installed and accessible
|
| 34 |
+
from RealtimeSTT import AudioToTextRecorder
|
| 35 |
|
| 36 |
+
# --- Original Global Variables and Helper Functions (adapted or kept) ---
|
| 37 |
+
# (Many of these will remain similar, but their interaction with FastAPI needs care)
|
| 38 |
+
|
| 39 |
+
# Global state (consider encapsulating these in a class or app state for larger apps)
|
| 40 |
+
app_state = {
|
| 41 |
+
"recorder": None,
|
| 42 |
+
"recorder_config": {},
|
| 43 |
+
"recorder_ready": asyncio.Event(), # Use asyncio.Event
|
| 44 |
+
"stop_recorder_flag": False,
|
| 45 |
+
"prev_text": "",
|
| 46 |
+
"control_connections": set(),
|
| 47 |
+
"data_connections": set(),
|
| 48 |
+
"audio_queue": asyncio.Queue(),
|
| 49 |
+
"global_args": None, # To store parsed CLI arguments
|
| 50 |
+
"debug_logging": False,
|
| 51 |
+
"extended_logging": False,
|
| 52 |
+
"log_incoming_chunks": False,
|
| 53 |
+
"silence_timing": False, # Renamed from dynamic_silence_timing for consistency
|
| 54 |
+
"writechunks": None, # Store filename if writing
|
| 55 |
+
"wav_file": None,
|
| 56 |
+
"text_time_deque": deque(),
|
| 57 |
+
"hard_break_even_on_background_noise": 3.0,
|
| 58 |
+
"hard_break_even_on_background_noise_min_texts": 3,
|
| 59 |
+
"hard_break_even_on_background_noise_min_similarity": 0.99,
|
| 60 |
+
"hard_break_even_on_background_noise_min_chars": 15,
|
| 61 |
+
"allowed_methods": [
|
| 62 |
+
'set_microphone', 'abort', 'stop', 'clear_audio_queue', 'wakeup', 'shutdown', 'text',
|
| 63 |
+
],
|
| 64 |
+
"allowed_parameters": [
|
| 65 |
+
'language', 'silero_sensitivity', 'wake_word_activation_delay',
|
| 66 |
+
'post_speech_silence_duration', 'listen_start', 'recording_stop_time',
|
| 67 |
+
'last_transcription_bytes', 'last_transcription_bytes_b64',
|
| 68 |
+
'speech_end_silence_start', 'is_recording', 'use_wake_words',
|
| 69 |
+
],
|
| 70 |
+
"CHANNELS": 1,
|
| 71 |
+
"FORMAT": pyaudio.paInt16,
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# Define ANSI color codes for terminal output (same as original)
|
| 76 |
class bcolors:
|
| 77 |
+
HEADER = '\033[95m'
|
| 78 |
+
OKBLUE = '\033[94m'
|
| 79 |
+
OKCYAN = '\033[96m'
|
| 80 |
+
OKGREEN = '\033[92m'
|
| 81 |
+
WARNING = '\033[93m'
|
| 82 |
+
FAIL = '\033[91m'
|
| 83 |
+
ENDC = '\033[0m'
|
| 84 |
BOLD = '\033[1m'
|
| 85 |
UNDERLINE = '\033[4m'
|
| 86 |
|
| 87 |
+
print(f"{bcolors.BOLD}{bcolors.OKCYAN}Starting FastAPI STT server, please wait...{bcolors.ENDC}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
+
# --- Helper Functions (preprocess_text, debug_print, format_timestamp_ns etc.) ---
|
| 90 |
+
# These functions can be kept largely as they are, ensuring they use app_state for global flags.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
def preprocess_text(text):
|
|
|
|
| 93 |
text = text.lstrip()
|
|
|
|
|
|
|
| 94 |
if text.startswith("..."):
|
| 95 |
text = text[3:]
|
|
|
|
| 96 |
if text.endswith("...'."):
|
| 97 |
text = text[:-1]
|
|
|
|
| 98 |
if text.endswith("...'"):
|
| 99 |
text = text[:-1]
|
|
|
|
|
|
|
| 100 |
text = text.lstrip()
|
|
|
|
|
|
|
| 101 |
if text:
|
| 102 |
text = text[0].upper() + text[1:]
|
|
|
|
| 103 |
return text
|
| 104 |
|
| 105 |
def debug_print(message):
|
| 106 |
+
if app_state["debug_logging"]:
|
| 107 |
timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
|
| 108 |
+
# FastAPI might not run recorder in a named thread in the same way, adjust if needed
|
| 109 |
+
thread_name = threading.current_thread().name
|
| 110 |
print(f"{Fore.CYAN}[DEBUG][{timestamp}][{thread_name}] {message}{Style.RESET_ALL}", file=sys.stderr)
|
| 111 |
|
| 112 |
def format_timestamp_ns(timestamp_ns: int) -> str:
|
|
|
|
| 113 |
seconds = timestamp_ns // 1_000_000_000
|
| 114 |
remainder_ns = timestamp_ns % 1_000_000_000
|
|
|
|
|
|
|
| 115 |
dt = datetime.fromtimestamp(seconds)
|
|
|
|
|
|
|
| 116 |
time_str = dt.strftime("%H:%M:%S")
|
|
|
|
|
|
|
| 117 |
milliseconds = remainder_ns // 1_000_000
|
| 118 |
+
return f"{time_str}.{milliseconds:03d}"
|
| 119 |
|
|
|
|
| 120 |
|
| 121 |
+
# --- Callback Functions for RealtimeSTT (adapted for asyncio Queue) ---
|
| 122 |
+
# The loop argument is no longer needed as we'll use app_state["audio_queue"] which is an asyncio.Queue
|
| 123 |
|
| 124 |
+
def text_detected(text): # Removed loop argument
|
| 125 |
+
app_state["prev_text"] = app_state.get("prev_text", "") # Ensure prev_text exists
|
| 126 |
text = preprocess_text(text)
|
| 127 |
|
| 128 |
+
if app_state["silence_timing"]:
|
| 129 |
+
def ends_with_ellipsis(s: str):
|
| 130 |
+
return s.endswith("...") or (len(s) > 1 and s[:-1].endswith("..."))
|
| 131 |
+
def sentence_end(s: str):
|
| 132 |
+
return s and s[-1] in ['.', '!', '?', '。']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
+
recorder = app_state["recorder"]
|
| 135 |
if ends_with_ellipsis(text):
|
| 136 |
+
recorder.post_speech_silence_duration = app_state["global_args"].mid_sentence_detection_pause
|
| 137 |
+
elif sentence_end(text) and sentence_end(app_state["prev_text"]) and not ends_with_ellipsis(app_state["prev_text"]):
|
| 138 |
+
recorder.post_speech_silence_duration = app_state["global_args"].end_of_sentence_detection_pause
|
| 139 |
else:
|
| 140 |
+
recorder.post_speech_silence_duration = app_state["global_args"].unknown_sentence_detection_pause
|
| 141 |
|
|
|
|
|
|
|
| 142 |
current_time = time.time()
|
| 143 |
+
app_state["text_time_deque"].append((current_time, text))
|
| 144 |
+
while app_state["text_time_deque"] and app_state["text_time_deque"][0][0] < current_time - app_state["hard_break_even_on_background_noise"]:
|
| 145 |
+
app_state["text_time_deque"].popleft()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
+
if len(app_state["text_time_deque"]) >= app_state["hard_break_even_on_background_noise_min_texts"]:
|
| 148 |
+
texts = [t[1] for t in app_state["text_time_deque"]]
|
| 149 |
+
first_text, last_text = texts[0], texts[-1]
|
| 150 |
similarity = SequenceMatcher(None, first_text, last_text).ratio()
|
| 151 |
+
if similarity > app_state["hard_break_even_on_background_noise_min_similarity"] and \
|
| 152 |
+
len(first_text) > app_state["hard_break_even_on_background_noise_min_chars"]:
|
| 153 |
recorder.stop()
|
| 154 |
recorder.clear_audio_queue()
|
| 155 |
+
app_state["prev_text"] = ""
|
| 156 |
+
|
| 157 |
+
app_state["prev_text"] = text
|
| 158 |
+
message = json.dumps({'type': 'realtime', 'text': text})
|
| 159 |
+
try:
|
| 160 |
+
app_state["audio_queue"].put_nowait(message)
|
| 161 |
+
except asyncio.QueueFull:
|
| 162 |
+
logging.warning("Audio queue full, dropping realtime message.")
|
| 163 |
|
|
|
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
| 166 |
+
log_msg = f"\r[{timestamp}] {bcolors.OKCYAN}{text}{bcolors.ENDC}"
|
| 167 |
+
if app_state["extended_logging"]:
|
| 168 |
print(f" [{timestamp}] Realtime text: {bcolors.OKCYAN}{text}{bcolors.ENDC}\n", flush=True, end="")
|
| 169 |
else:
|
| 170 |
+
print(log_msg, flush=True, end='')
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
+
def on_recording_start():
|
| 174 |
+
message = json.dumps({'type': 'recording_start'})
|
| 175 |
+
try:
|
| 176 |
+
app_state["audio_queue"].put_nowait(message)
|
| 177 |
+
except asyncio.QueueFull: logging.warning("Audio queue full, dropping recording_start message.")
|
| 178 |
|
| 179 |
+
def on_recording_stop():
|
| 180 |
+
message = json.dumps({'type': 'recording_stop'})
|
| 181 |
+
try:
|
| 182 |
+
app_state["audio_queue"].put_nowait(message)
|
| 183 |
+
except asyncio.QueueFull: logging.warning("Audio queue full, dropping recording_stop message.")
|
| 184 |
|
| 185 |
+
def on_vad_detect_start():
|
| 186 |
+
message = json.dumps({'type': 'vad_detect_start'})
|
| 187 |
+
try:
|
| 188 |
+
app_state["audio_queue"].put_nowait(message)
|
| 189 |
+
except asyncio.QueueFull: logging.warning("Audio queue full, dropping vad_detect_start message.")
|
| 190 |
+
|
| 191 |
+
def on_vad_detect_stop():
|
| 192 |
+
message = json.dumps({'type': 'vad_detect_stop'})
|
| 193 |
+
try:
|
| 194 |
+
app_state["audio_queue"].put_nowait(message)
|
| 195 |
+
except asyncio.QueueFull: logging.warning("Audio queue full, dropping vad_detect_stop message.")
|
| 196 |
|
| 197 |
+
def on_wakeword_detected():
|
| 198 |
+
message = json.dumps({'type': 'wakeword_detected'})
|
| 199 |
+
try:
|
| 200 |
+
app_state["audio_queue"].put_nowait(message)
|
| 201 |
+
except asyncio.QueueFull: logging.warning("Audio queue full, dropping wakeword_detected message.")
|
| 202 |
|
| 203 |
+
def on_wakeword_detection_start():
|
| 204 |
+
message = json.dumps({'type': 'wakeword_detection_start'})
|
| 205 |
+
try:
|
| 206 |
+
app_state["audio_queue"].put_nowait(message)
|
| 207 |
+
except asyncio.QueueFull: logging.warning("Audio queue full, dropping wakeword_detection_start message.")
|
| 208 |
|
| 209 |
+
def on_wakeword_detection_end():
|
| 210 |
+
message = json.dumps({'type': 'wakeword_detection_end'})
|
| 211 |
+
try:
|
| 212 |
+
app_state["audio_queue"].put_nowait(message)
|
| 213 |
+
except asyncio.QueueFull: logging.warning("Audio queue full, dropping wakeword_detection_end message.")
|
| 214 |
|
| 215 |
+
def on_transcription_start(_audio_bytes):
|
| 216 |
bytes_b64 = base64.b64encode(_audio_bytes.tobytes()).decode('utf-8')
|
| 217 |
message = json.dumps({
|
| 218 |
'type': 'transcription_start',
|
| 219 |
'audio_bytes_base64': bytes_b64
|
| 220 |
})
|
| 221 |
+
try:
|
| 222 |
+
app_state["audio_queue"].put_nowait(message)
|
| 223 |
+
except asyncio.QueueFull: logging.warning("Audio queue full, dropping transcription_start message.")
|
| 224 |
|
| 225 |
+
def on_turn_detection_start():
|
| 226 |
print("&&& stt_server on_turn_detection_start")
|
| 227 |
+
message = json.dumps({'type': 'start_turn_detection'})
|
| 228 |
+
try:
|
| 229 |
+
app_state["audio_queue"].put_nowait(message)
|
| 230 |
+
except asyncio.QueueFull: logging.warning("Audio queue full, dropping start_turn_detection message.")
|
| 231 |
|
| 232 |
+
def on_turn_detection_stop():
|
| 233 |
print("&&& stt_server on_turn_detection_stop")
|
| 234 |
+
message = json.dumps({'type': 'stop_turn_detection'})
|
| 235 |
+
try:
|
| 236 |
+
app_state["audio_queue"].put_nowait(message)
|
| 237 |
+
except asyncio.QueueFull: logging.warning("Audio queue full, dropping stop_turn_detection message.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
+
# --- Argument Parser (same as original, but call it in startup) ---
|
| 240 |
+
def parse_arguments():
    """Parse CLI options for the STT server and stash shared flags in app_state.

    Side effects:
      - Sets app_state["debug_logging"], ["extended_logging"], ["writechunks"],
        ["log_incoming_chunks"], ["silence_timing"], ["global_args"].
      - Adjusts log levels of the 'websockets' and 'uvicorn' loggers.
      - Unescapes literal "\\n" sequences in the initial prompts.

    Returns:
        argparse.Namespace with all parsed options.
    """
    # ... (Keep the original parse_arguments function as is) ...
    # This function will now set app_state["debug_logging"], etc.
    parser = argparse.ArgumentParser(description='Start the Speech-to-Text (STT) server with various configuration options.')
    parser.add_argument('-m', '--model', type=str, default='large-v2', help='Path to the STT model or model size.')
    parser.add_argument('-r', '--rt-model', '--realtime_model_type', type=str, default='tiny', help='Model size for real-time transcription.')
    parser.add_argument('-l', '--lang', '--language', type=str, default='en', help='Language code for STT.')
    parser.add_argument('-i', '--input-device', '--input-device-index', type=int, default=1, help='Audio input device index.')
    # For FastAPI, host and port for HTTP server are set in uvicorn.run()
    # WebSocket ports are part of the path now, e.g. /ws/control
    # We can remove -c and -d from here if they are not used for other purposes
    # parser.add_argument('-c', '--control', '--control_port', type=int, default=8011, help='Control WebSocket port (deprecated for FastAPI).')
    # parser.add_argument('-d', '--data', '--data_port', type=int, default=8012, help='Data WebSocket port (deprecated for FastAPI).')
    parser.add_argument('-w', '--wake_words', type=str, default="", help='Wake word(s).')
    parser.add_argument('-D', '--debug', action='store_true', help='Enable debug logging.')
    parser.add_argument('--debug_websockets', action='store_true', help='Enable debug logging for websockets.')
    parser.add_argument('-W', '--write', metavar='FILE', help='Save received audio to a WAV file.')
    parser.add_argument('-b', '--batch', '--batch_size', type=int, default=16, help='Batch size for inference.')
    parser.add_argument('--root', '--download_root', type=str,default=None, help='Whisper models download root path.')
    # NOTE(review): store_true with default=True means this flag can never be
    # turned off from the CLI — confirm whether that is intended.
    parser.add_argument('-s', '--silence_timing', action='store_true', default=True, help='Enable dynamic silence duration.')
    parser.add_argument('--init_realtime_after_seconds', type=float, default=0.2, help='Initial wait for realtime transcription.')
    parser.add_argument('--realtime_batch_size', type=int, default=16, help='Batch size for real-time model.')
    parser.add_argument('--initial_prompt_realtime', type=str, default="", help='Initial prompt for real-time model.')
    parser.add_argument('--silero_sensitivity', type=float, default=0.05, help='Silero VAD sensitivity (0-1).')
    parser.add_argument('--silero_use_onnx', action='store_true', default=False, help='Use Silero ONNX model.')
    parser.add_argument('--webrtc_sensitivity', type=int, default=3, help='WebRTC VAD sensitivity (0-3).')
    parser.add_argument('--min_length_of_recording', type=float, default=1.1, help='Min recording duration (s).')
    parser.add_argument('--min_gap_between_recordings', type=float, default=0, help='Min time between recordings (s).')
    parser.add_argument('--enable_realtime_transcription', action='store_true', default=True, help='Enable real-time transcription.')
    parser.add_argument('--realtime_processing_pause', type=float, default=0.02, help='Pause between audio chunk processing (s).')
    parser.add_argument('--silero_deactivity_detection', action='store_true', default=True, help='Use Silero for end-of-speech detection.')
    parser.add_argument('--early_transcription_on_silence', type=float, default=0.2, help='Start transcription after silence (s).')
    parser.add_argument('--beam_size', type=int, default=5, help='Beam size for main model.')
    parser.add_argument('--beam_size_realtime', type=int, default=3, help='Beam size for real-time model.')
    parser.add_argument('--initial_prompt', type=str, default="Incomplete thoughts should end with '...'.", help='Initial main transcription prompt.')
    parser.add_argument('--end_of_sentence_detection_pause', type=float, default=0.45, help='Silence for sentence end (s).')
    parser.add_argument('--unknown_sentence_detection_pause', type=float, default=0.7, help='Pause for incomplete sentence (s).')
    parser.add_argument('--mid_sentence_detection_pause', type=float, default=2.0, help='Pause for mid-sentence break (s).')
    parser.add_argument('--wake_words_sensitivity', type=float, default=0.5, help='Wake word detection sensitivity (0-1).')
    parser.add_argument('--wake_word_timeout', type=float, default=5.0, help='Wake word timeout (s).')
    parser.add_argument('--wake_word_activation_delay', type=float, default=0, help='Delay before wake word activation (s).')
    parser.add_argument('--wakeword_backend', type=str, default='none', help='Backend for wake word detection.')
    parser.add_argument('--openwakeword_model_paths', type=str, nargs='*', help='Paths to OpenWakeWord models.')
    parser.add_argument('--openwakeword_inference_framework', type=str, default='tensorflow', help='OpenWakeWord inference framework.')
    parser.add_argument('--wake_word_buffer_duration', type=float, default=1.0, help='Wake word buffer duration (s).')
    parser.add_argument('--use_main_model_for_realtime', action='store_true', help='Use main model for real-time transcription.')
    parser.add_argument('--use_extended_logging', action='store_true', help='Enable extensive log messages.')
    parser.add_argument('--compute_type', type=str, default='default', help='Computation type for CTranslate2.')
    parser.add_argument('--gpu_device_index', type=int, default=0, help='GPU device index.')
    parser.add_argument('--device', type=str, default='cuda', help='Device for model ("cuda" or "cpu").')
    parser.add_argument('--handle_buffer_overflow', action='store_true', help='Handle buffer overflow.')
    parser.add_argument('--suppress_tokens', type=int, default=[-1], nargs='*', help='Suppress tokens.')
    parser.add_argument('--allowed_latency_limit', type=int, default=100, help='Allowed latency limit for real-time.')
    parser.add_argument('--faster_whisper_vad_filter', action='store_true', help='Enable VAD filter for Faster Whisper.')
    parser.add_argument('--logchunks', action='store_true', help='Log incoming audio chunks.')

    # FastAPI specific args (uvicorn will handle host/port for the HTTP server)
    parser.add_argument('--server_host', type=str, default='localhost', help='Host for the FastAPI server.')
    parser.add_argument('--server_port', type=int, default=8000, help='Port for the FastAPI server.')
    args = parser.parse_args()

    # Mirror frequently-read flags into the shared app_state dict so callbacks
    # and endpoints don't have to carry the Namespace around.
    app_state["debug_logging"] = args.debug
    app_state["extended_logging"] = args.use_extended_logging
    app_state["writechunks"] = args.write # Filename or None
    app_state["log_incoming_chunks"] = args.logchunks
    app_state["silence_timing"] = args.silence_timing

    ws_logger = logging.getLogger('websockets') # For uvicorn's websockets
    uvicorn_logger = logging.getLogger('uvicorn')
    if args.debug_websockets:
        ws_logger.setLevel(logging.DEBUG)
        uvicorn_logger.setLevel(logging.DEBUG)
        ws_logger.propagate = True
        uvicorn_logger.propagate = True
    else:
        ws_logger.setLevel(logging.WARNING)
        uvicorn_logger.setLevel(logging.INFO) # Uvicorn access logs are INFO

    # Shell-escaped "\n" in prompts arrives as the two characters '\' + 'n';
    # convert to a real newline before handing to the model.
    if args.initial_prompt:
        args.initial_prompt = args.initial_prompt.replace("\\n", "\n")
    if args.initial_prompt_realtime:
        args.initial_prompt_realtime = args.initial_prompt_realtime.replace("\\n", "\n")

    app_state["global_args"] = args
    return args
|
| 327 |
|
| 328 |
+
|
| 329 |
+
# --- Recorder Thread Function (adapted for asyncio.to_thread) ---
|
| 330 |
+
def recorder_processing_loop():
    """Run the blocking recorder.text() loop on a dedicated worker thread.

    Started from the FastAPI startup hook (daemon thread). Loops until
    app_state["stop_recorder_flag"] is set; each completed utterance is
    pushed onto app_state["audio_queue"] as a 'fullSentence' JSON message.

    NOTE(review): this thread calls put_nowait() on an asyncio.Queue owned by
    the event loop; asyncio queues are not thread-safe, so this should ideally
    go through loop.call_soon_threadsafe — confirm whether races are observed.
    """
    recorder = app_state["recorder"]
    if not recorder:
        logging.error("Recorder not initialized in recorder_processing_loop.")
        return

    def process_text_callback(full_sentence):
        # Invoked by recorder.text() once a full utterance is transcribed.
        app_state["prev_text"] = "" # Reset for full sentence
        full_sentence = preprocess_text(full_sentence)
        message = json.dumps({'type': 'fullSentence', 'text': full_sentence})
        try:
            app_state["audio_queue"].put_nowait(message)
        except asyncio.QueueFull:
            logging.warning("Audio queue full, dropping fullSentence message.")

        timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
        log_msg = f"\r[{timestamp}] {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n"
        if app_state["extended_logging"]:
            print(f"  [{timestamp}] Full text: {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n", flush=True, end="")
        else:
            print(log_msg, end='') # Ensure newline after sentence

    try:
        while not app_state["stop_recorder_flag"]:
            if recorder:
                recorder.text(process_text_callback) # This is a blocking call
            else: # Should not happen if startup logic is correct
                time.sleep(0.1)
    except Exception as e:
        logging.error(f"Error in recorder_processing_loop: {e}")
    finally:
        logging.info("Recorder processing loop finished.")
|
| 363 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
+
def decode_and_resample(audio_data, original_sample_rate, target_sample_rate):
    """Resample raw 16-bit little-endian PCM bytes to a new sample rate.

    Args:
        audio_data: Raw PCM samples as bytes (int16).
        original_sample_rate: Sample rate of ``audio_data`` in Hz.
        target_sample_rate: Desired output sample rate in Hz.

    Returns:
        Resampled int16 PCM bytes. The input is returned unchanged when the
        rates already match; empty input yields ``b""``.
    """
    if original_sample_rate == target_sample_rate:
        return audio_data
    audio_np = np.frombuffer(audio_data, dtype=np.int16)
    num_original_samples = len(audio_np)
    if num_original_samples == 0:
        # Nothing to resample; avoids handing an empty array to scipy.
        return b""
    num_target_samples = int(num_original_samples * target_sample_rate / original_sample_rate)
    resampled_audio = resample(audio_np, num_target_samples)
    # FFT-based resampling can overshoot the int16 range; clip so the
    # narrowing cast saturates instead of wrapping around (audible clicks).
    info = np.iinfo(np.int16)
    clipped = np.clip(resampled_audio, info.min, info.max)
    return clipped.astype(np.int16).tobytes()
|
| 373 |
|
| 374 |
+
# --- FastAPI App and Endpoints ---
|
| 375 |
+
# Module-level ASGI application; WebSocket endpoints are registered on it below.
app = FastAPI()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
|
| 377 |
+
@app.on_event("startup")
|
| 378 |
+
async def startup_event():
|
| 379 |
+
args = parse_arguments() # Parse CLI args and set app_state flags
|
| 380 |
+
# app_state["global_args"] = args # Already set in parse_arguments
|
| 381 |
|
| 382 |
+
app_state["recorder_config"] = {
|
| 383 |
+
'model': args.model, 'download_root': args.root, 'realtime_model_type': args.rt_model,
|
| 384 |
+
'language': args.lang, 'batch_size': args.batch,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
'init_realtime_after_seconds': args.init_realtime_after_seconds,
|
| 386 |
'realtime_batch_size': args.realtime_batch_size,
|
| 387 |
'initial_prompt_realtime': args.initial_prompt_realtime,
|
| 388 |
+
'input_device_index': args.input_device, 'silero_sensitivity': args.silero_sensitivity,
|
| 389 |
+
'silero_use_onnx': args.silero_use_onnx, 'webrtc_sensitivity': args.webrtc_sensitivity,
|
|
|
|
|
|
|
| 390 |
'post_speech_silence_duration': args.unknown_sentence_detection_pause,
|
| 391 |
'min_length_of_recording': args.min_length_of_recording,
|
| 392 |
'min_gap_between_recordings': args.min_gap_between_recordings,
|
|
|
|
| 394 |
'realtime_processing_pause': args.realtime_processing_pause,
|
| 395 |
'silero_deactivity_detection': args.silero_deactivity_detection,
|
| 396 |
'early_transcription_on_silence': args.early_transcription_on_silence,
|
| 397 |
+
'beam_size': args.beam_size, 'beam_size_realtime': args.beam_size_realtime,
|
| 398 |
+
'initial_prompt': args.initial_prompt, 'wake_words': args.wake_words,
|
|
|
|
|
|
|
| 399 |
'wake_words_sensitivity': args.wake_words_sensitivity,
|
| 400 |
'wake_word_timeout': args.wake_word_timeout,
|
| 401 |
'wake_word_activation_delay': args.wake_word_activation_delay,
|
|
|
|
| 404 |
'openwakeword_inference_framework': args.openwakeword_inference_framework,
|
| 405 |
'wake_word_buffer_duration': args.wake_word_buffer_duration,
|
| 406 |
'use_main_model_for_realtime': args.use_main_model_for_realtime,
|
| 407 |
+
'spinner': False, 'use_microphone': False, # Server mode doesn't use mic directly
|
| 408 |
+
'on_realtime_transcription_update': text_detected,
|
| 409 |
+
'on_recording_start': on_recording_start, 'on_recording_stop': on_recording_stop,
|
| 410 |
+
'on_vad_detect_start': on_vad_detect_start, 'on_vad_detect_stop': on_vad_detect_stop,
|
| 411 |
+
'on_wakeword_detected': on_wakeword_detected,
|
| 412 |
+
'on_wakeword_detection_start': on_wakeword_detection_start,
|
| 413 |
+
'on_wakeword_detection_end': on_wakeword_detection_end,
|
| 414 |
+
'on_transcription_start': on_transcription_start,
|
| 415 |
+
'on_turn_detection_start': on_turn_detection_start,
|
| 416 |
+
'on_turn_detection_stop': on_turn_detection_stop,
|
| 417 |
+
'no_log_file': True, 'use_extended_logging': args.use_extended_logging,
|
| 418 |
+
# 'level': logging.WARNING, # Set based on args.debug
|
| 419 |
+
'compute_type': args.compute_type, 'gpu_device_index': args.gpu_device_index,
|
| 420 |
+
'device': args.device, 'handle_buffer_overflow': args.handle_buffer_overflow,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
'suppress_tokens': args.suppress_tokens,
|
| 422 |
'allowed_latency_limit': args.allowed_latency_limit,
|
| 423 |
'faster_whisper_vad_filter': args.faster_whisper_vad_filter,
|
| 424 |
}
|
| 425 |
+
|
| 426 |
+
# Configure logging level based on debug flag
|
| 427 |
+
log_level = logging.DEBUG if args.debug else logging.INFO
|
| 428 |
+
logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 429 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
|
| 431 |
+
print(f"{bcolors.OKGREEN}Initializing RealtimeSTT server with parameters:{bcolors.ENDC}")
|
| 432 |
+
for key, value in app_state["recorder_config"].items():
|
| 433 |
+
# Truncate long values like initial_prompt for display
|
| 434 |
+
display_value = str(value)
|
| 435 |
+
if len(display_value) > 70:
|
| 436 |
+
display_value = display_value[:67] + "..."
|
| 437 |
+
print(f" {bcolors.OKBLUE}{key}{bcolors.ENDC}: {display_value}")
|
| 438 |
+
|
| 439 |
+
try:
|
| 440 |
+
app_state["recorder"] = AudioToTextRecorder(**app_state["recorder_config"])
|
| 441 |
+
print(f"{bcolors.OKGREEN}{bcolors.BOLD}RealtimeSTT initialized{bcolors.ENDC}")
|
| 442 |
+
app_state["recorder_ready"].set()
|
| 443 |
+
|
| 444 |
+
# Start the recorder processing loop in a separate thread
|
| 445 |
+
# This is crucial because recorder.text() is blocking.
|
| 446 |
+
app_state["recorder_thread"] = threading.Thread(target=recorder_processing_loop, daemon=True)
|
| 447 |
+
app_state["recorder_thread"].start()
|
| 448 |
+
|
| 449 |
+
# Start broadcasting messages from the audio_queue
|
| 450 |
+
asyncio.create_task(broadcast_audio_messages_task())
|
| 451 |
+
print(f"{bcolors.OKGREEN}FastAPI STT Server started. Waiting for connections...{bcolors.ENDC}")
|
| 452 |
+
print(f"Control WebSocket: ws://{args.server_host}:{args.server_port}/ws/control")
|
| 453 |
+
print(f"Data WebSocket: ws://{args.server_host}:{args.server_port}/ws/data")
|
| 454 |
+
|
| 455 |
+
except Exception as e:
|
| 456 |
+
print(f"{bcolors.FAIL}Error during RealtimeSTT initialization: {e}{bcolors.ENDC}")
|
| 457 |
+
# Potentially exit or prevent server from fully starting if recorder fails
|
| 458 |
+
# For now, it will proceed but recorder-dependent endpoints will fail.
|
| 459 |
+
# Consider raising an exception here to stop uvicorn if recorder is critical.
|
| 460 |
+
# sys.exit(1) # Or handle more gracefully
|
| 461 |
+
|
| 462 |
+
@app.on_event("shutdown")
|
| 463 |
+
async def shutdown_event():
|
| 464 |
+
print(f"{bcolors.WARNING}Server shutting down...{bcolors.ENDC}")
|
| 465 |
+
app_state["stop_recorder_flag"] = True
|
| 466 |
+
recorder = app_state["recorder"]
|
| 467 |
if recorder:
|
| 468 |
+
print(f"{bcolors.OKBLUE}Aborting recorder...{bcolors.ENDC}")
|
| 469 |
+
recorder.abort() # Non-blocking
|
| 470 |
+
print(f"{bcolors.OKBLUE}Stopping recorder...{bcolors.ENDC}")
|
| 471 |
+
recorder.stop() # Non-blocking, signals the internal loop to stop
|
| 472 |
+
# recorder.shutdown() # This might be blocking or clean up resources
|
| 473 |
+
|
| 474 |
+
recorder_thread = app_state.get("recorder_thread")
|
| 475 |
+
if recorder_thread and recorder_thread.is_alive():
|
| 476 |
+
print(f"{bcolors.OKBLUE}Waiting for recorder thread to finish...{bcolors.ENDC}")
|
| 477 |
+
recorder_thread.join(timeout=5.0) # Wait for the thread
|
| 478 |
+
if recorder_thread.is_alive():
|
| 479 |
+
print(f"{bcolors.WARNING}Recorder thread did not finish in time.{bcolors.ENDC}")
|
| 480 |
+
|
| 481 |
+
if recorder: # Final shutdown call if it exists and is safe
|
| 482 |
+
try:
|
| 483 |
+
print(f"{bcolors.OKBLUE}Finalizing recorder shutdown...{bcolors.ENDC}")
|
| 484 |
+
# The original code had recorder.shutdown() here. Ensure it's safe.
|
| 485 |
+
# If `recorder.text()` in the thread needs to exit first, this is the right place.
|
| 486 |
+
if hasattr(recorder, 'shutdown') and callable(recorder.shutdown):
|
| 487 |
+
recorder.shutdown()
|
| 488 |
+
except Exception as e:
|
| 489 |
+
print(f"{bcolors.FAIL}Error during recorder.shutdown(): {e}{bcolors.ENDC}")
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
# Close WebSocket connections
|
| 493 |
+
for ws in list(app_state["control_connections"]):
|
| 494 |
+
try:
|
| 495 |
+
await ws.close(code=1000)
|
| 496 |
+
except Exception: pass # Ignore errors on close
|
| 497 |
+
for ws in list(app_state["data_connections"]):
|
| 498 |
+
try:
|
| 499 |
+
await ws.close(code=1000)
|
| 500 |
+
except Exception: pass
|
| 501 |
+
|
| 502 |
+
if app_state["wav_file"]:
|
| 503 |
+
app_state["wav_file"].close()
|
| 504 |
+
print(f"{bcolors.OKGREEN}Closed WAV file: {app_state['writechunks']}{bcolors.ENDC}")
|
| 505 |
+
|
| 506 |
+
print(f"{bcolors.OKGREEN}Server shutdown complete.{bcolors.ENDC}")
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
async def broadcast_audio_messages_task():
    """Forward every message on app_state's audio_queue to all data clients."""
    while True:
        try:
            msg = await app_state["audio_queue"].get()
            if app_state["extended_logging"]:
                timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
                print(f"  [{timestamp}] Broadcasting: {bcolors.OKBLUE}{msg[:100]}...{bcolors.ENDC}\n", flush=True, end="")

            # Snapshot the connection set so concurrent joins/leaves during
            # the awaits below can't break iteration.
            for client in list(app_state["data_connections"]):
                try:
                    await client.send_text(msg)
                except WebSocketDisconnect:
                    app_state["data_connections"].discard(client)
                except Exception as e:
                    logging.error(f"Error sending message to data client: {e}")
                    app_state["data_connections"].discard(client)
        except asyncio.CancelledError:
            logging.info("Broadcast task cancelled.")
            break
        except Exception as e:
            logging.error(f"Error in broadcast_audio_messages_task: {e}")
            await asyncio.sleep(1)  # Avoid a tight loop on unexpected errors
|
| 534 |
+
|
| 535 |
+
@app.websocket("/ws/control")
|
| 536 |
+
async def websocket_control_endpoint(websocket: WebSocket):
|
| 537 |
+
await websocket.accept()
|
| 538 |
+
app_state["control_connections"].add(websocket)
|
| 539 |
+
debug_print(f"New control connection from {websocket.client}")
|
| 540 |
+
print(f"{bcolors.OKGREEN}Control client connected: {websocket.client}{bcolors.ENDC}")
|
| 541 |
+
|
| 542 |
+
recorder = app_state["recorder"]
|
| 543 |
+
if not await app_state["recorder_ready"].wait(): # Wait for recorder to be ready
|
| 544 |
+
print(f"{bcolors.WARNING}Recorder not ready, control client may experience issues.{bcolors.ENDC}")
|
| 545 |
+
# Optionally send an error message and close
|
| 546 |
+
|
| 547 |
+
try:
|
| 548 |
+
while True:
|
| 549 |
+
message = await websocket.receive_text()
|
| 550 |
+
debug_print(f"Received control message: {message[:200]}...")
|
| 551 |
+
if not recorder: # Or not app_state["recorder_ready"].is_set()
|
| 552 |
+
print(f"{bcolors.WARNING}Recorder not ready, cannot process command.{bcolors.ENDC}")
|
| 553 |
+
await websocket.send_json({"status": "error", "message": "Recorder not ready"})
|
| 554 |
+
continue
|
| 555 |
|
| 556 |
+
try:
|
| 557 |
+
command_data = json.loads(message)
|
| 558 |
+
command = command_data.get("command")
|
| 559 |
+
|
| 560 |
+
if command == "set_parameter":
|
| 561 |
+
parameter = command_data.get("parameter")
|
| 562 |
+
value = command_data.get("value")
|
| 563 |
+
if parameter in app_state["allowed_parameters"] and hasattr(recorder, parameter):
|
| 564 |
+
setattr(recorder, parameter, value)
|
| 565 |
+
value_formatted = f"{value:.2f}" if isinstance(value, float) else str(value)
|
| 566 |
+
timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
| 567 |
+
if app_state["extended_logging"]:
|
| 568 |
+
print(f" [{timestamp}] {bcolors.OKGREEN}Set recorder.{parameter} to: {bcolors.OKBLUE}{value_formatted}{bcolors.ENDC}")
|
| 569 |
+
await websocket.send_json({"status": "success", "message": f"Parameter {parameter} set to {value}"})
|
| 570 |
+
else:
|
| 571 |
+
errmsg = f"Parameter {parameter} is not allowed or does not exist (set_parameter)"
|
| 572 |
+
print(f"{bcolors.WARNING}{errmsg}{bcolors.ENDC}")
|
| 573 |
+
await websocket.send_json({"status": "error", "message": errmsg})
|
| 574 |
+
|
| 575 |
+
elif command == "get_parameter":
|
| 576 |
+
parameter = command_data.get("parameter")
|
| 577 |
+
request_id = command_data.get("request_id")
|
| 578 |
+
if parameter in app_state["allowed_parameters"] and hasattr(recorder, parameter):
|
| 579 |
+
value = getattr(recorder, parameter)
|
| 580 |
+
value_formatted = f"{value:.2f}" if isinstance(value, float) else str(value)
|
| 581 |
+
value_truncated = value_formatted[:39] + "…" if len(value_formatted) > 40 else value_formatted
|
| 582 |
+
timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
| 583 |
+
if app_state["extended_logging"]:
|
| 584 |
+
print(f" [{timestamp}] {bcolors.OKGREEN}Get recorder.{parameter}: {bcolors.OKBLUE}{value_truncated}{bcolors.ENDC}")
|
| 585 |
+
response = {"status": "success", "parameter": parameter, "value": value}
|
| 586 |
+
if request_id is not None: response["request_id"] = request_id
|
| 587 |
+
await websocket.send_json(response)
|
| 588 |
+
else:
|
| 589 |
+
errmsg = f"Parameter {parameter} is not allowed or does not exist (get_parameter)"
|
| 590 |
+
print(f"{bcolors.WARNING}{errmsg}{bcolors.ENDC}")
|
| 591 |
+
await websocket.send_json({"status": "error", "message": errmsg})
|
| 592 |
+
|
| 593 |
+
elif command == "call_method":
|
| 594 |
+
method_name = command_data.get("method")
|
| 595 |
+
if method_name in app_state["allowed_methods"]:
|
| 596 |
+
method = getattr(recorder, method_name, None)
|
| 597 |
+
if method and callable(method):
|
| 598 |
+
args_list = command_data.get("args", [])
|
| 599 |
+
kwargs_dict = command_data.get("kwargs", {})
|
| 600 |
+
# Consider running blocking methods in a thread if any exist
|
| 601 |
+
method(*args_list, **kwargs_dict)
|
| 602 |
+
timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
| 603 |
+
print(f" [{timestamp}] {bcolors.OKGREEN}Called method recorder.{bcolors.OKBLUE}{method_name}{bcolors.ENDC}")
|
| 604 |
+
await websocket.send_json({"status": "success", "message": f"Method {method_name} called"})
|
| 605 |
+
else:
|
| 606 |
+
print(f"{bcolors.WARNING}Recorder does not have method {method_name}{bcolors.ENDC}")
|
| 607 |
+
await websocket.send_json({"status": "error", "message": f"Recorder does not have method {method_name}"})
|
| 608 |
+
else:
|
| 609 |
+
print(f"{bcolors.WARNING}Method {method_name} is not allowed{bcolors.ENDC}")
|
| 610 |
+
await websocket.send_json({"status": "error", "message": f"Method {method_name} is not allowed"})
|
| 611 |
+
else:
|
| 612 |
+
print(f"{bcolors.WARNING}Unknown command: {command}{bcolors.ENDC}")
|
| 613 |
+
await websocket.send_json({"status": "error", "message": f"Unknown command {command}"})
|
| 614 |
+
except json.JSONDecodeError:
|
| 615 |
+
print(f"{bcolors.WARNING}Received invalid JSON command from control client{bcolors.ENDC}")
|
| 616 |
+
await websocket.send_json({"status": "error", "message": "Invalid JSON command"})
|
| 617 |
+
except Exception as e:
|
| 618 |
+
print(f"{bcolors.FAIL}Error processing control message: {e}{bcolors.ENDC}")
|
| 619 |
+
try:
|
| 620 |
+
await websocket.send_json({"status": "error", "message": f"Server error: {type(e).__name__}"})
|
| 621 |
+
except Exception: pass # Ignore if send fails
|
| 622 |
|
| 623 |
+
except WebSocketDisconnect:
|
| 624 |
+
print(f"{bcolors.WARNING}Control client disconnected: {websocket.client}{bcolors.ENDC}")
|
| 625 |
+
except Exception as e:
|
| 626 |
+
print(f"{bcolors.FAIL}Control WebSocket error for {websocket.client}: {e}{bcolors.ENDC}")
|
| 627 |
+
finally:
|
| 628 |
+
app_state["control_connections"].discard(websocket)
|
| 629 |
|
| 630 |
+
@app.websocket("/ws/data")
|
| 631 |
+
async def websocket_data_endpoint(websocket: WebSocket):
|
| 632 |
+
await websocket.accept()
|
| 633 |
+
app_state["data_connections"].add(websocket)
|
| 634 |
+
print(f"{bcolors.OKGREEN}Data client connected: {websocket.client}{bcolors.ENDC}")
|
| 635 |
+
|
| 636 |
+
recorder = app_state["recorder"]
|
| 637 |
+
if not await app_state["recorder_ready"].wait():
|
| 638 |
+
print(f"{bcolors.WARNING}Recorder not ready, data client may experience issues.{bcolors.ENDC}")
|
| 639 |
+
# Optionally send an error message and close
|
| 640 |
|
|
|
|
| 641 |
try:
|
| 642 |
+
while True:
|
| 643 |
+
message_bytes = await websocket.receive_bytes() # Expecting binary data
|
| 644 |
+
|
| 645 |
+
if app_state["extended_logging"]:
|
| 646 |
+
debug_print(f"Received audio chunk (size: {len(message_bytes)} bytes)")
|
| 647 |
+
elif app_state["log_incoming_chunks"]:
|
| 648 |
+
print(".", end='', flush=True)
|
| 649 |
+
|
| 650 |
+
if not recorder:
|
| 651 |
+
print(f"{bcolors.WARNING}Recorder not ready, cannot process audio data.{bcolors.ENDC}")
|
| 652 |
+
# Maybe send an error message back if the protocol supports it
|
| 653 |
+
continue
|
| 654 |
+
|
| 655 |
+
try:
|
| 656 |
+
# Assuming the same metadata structure as original
|
| 657 |
+
metadata_length = int.from_bytes(message_bytes[:4], byteorder='little')
|
| 658 |
+
metadata_json = message_bytes[4:4+metadata_length].decode('utf-8')
|
| 659 |
+
metadata = json.loads(metadata_json)
|
| 660 |
+
sample_rate = metadata['sampleRate']
|
| 661 |
+
|
| 662 |
+
if 'server_sent_to_stt' in metadata: # Example of adding more metadata
|
| 663 |
+
stt_received_ns = time.time_ns()
|
| 664 |
+
metadata["stt_received"] = stt_received_ns
|
| 665 |
+
metadata["stt_received_formatted"] = format_timestamp_ns(stt_received_ns)
|
| 666 |
+
# print(f"Server received audio chunk of length {len(message_bytes)} bytes, metadata: {metadata}")
|
| 667 |
+
|
| 668 |
+
|
| 669 |
+
if app_state["extended_logging"]:
|
| 670 |
+
debug_print(f"Processing audio chunk with sample rate {sample_rate}")
|
| 671 |
+
|
| 672 |
+
chunk = message_bytes[4+metadata_length:]
|
| 673 |
+
|
| 674 |
+
if app_state["writechunks"]:
|
| 675 |
+
if not app_state["wav_file"]:
|
| 676 |
+
try:
|
| 677 |
+
app_state["wav_file"] = wave.open(app_state["writechunks"], 'wb')
|
| 678 |
+
app_state["wav_file"].setnchannels(app_state["CHANNELS"])
|
| 679 |
+
app_state["wav_file"].setsampwidth(pyaudio.get_sample_size(app_state["FORMAT"]))
|
| 680 |
+
app_state["wav_file"].setframerate(sample_rate) # Use actual sample rate for initial write
|
| 681 |
+
except Exception as e:
|
| 682 |
+
print(f"{bcolors.FAIL}Error opening WAV file {app_state['writechunks']}: {e}{bcolors.ENDC}")
|
| 683 |
+
app_state["writechunks"] = None # Disable further writing attempts
|
| 684 |
+
|
| 685 |
+
if app_state["wav_file"]:
|
| 686 |
+
app_state["wav_file"].writeframes(chunk)
|
| 687 |
+
|
| 688 |
+
|
| 689 |
+
# Resample if necessary and feed to recorder
|
| 690 |
+
# Note: recorder.feed_audio might be blocking or have internal queuing.
|
| 691 |
+
# If it's significantly blocking, consider asyncio.to_thread for this call too.
|
| 692 |
+
# However, RealtimeSTT is designed for this, so it's likely fine.
|
| 693 |
+
if sample_rate != 16000:
|
| 694 |
+
resampled_chunk = decode_and_resample(chunk, sample_rate, 16000)
|
| 695 |
+
if app_state["extended_logging"]:
|
| 696 |
+
debug_print(f"Resampled chunk size: {len(resampled_chunk)} bytes")
|
| 697 |
+
recorder.feed_audio(resampled_chunk)
|
| 698 |
+
else:
|
| 699 |
+
recorder.feed_audio(chunk)
|
| 700 |
+
|
| 701 |
+
except json.JSONDecodeError:
|
| 702 |
+
print(f"{bcolors.WARNING}Invalid metadata JSON in audio message.{bcolors.ENDC}")
|
| 703 |
+
except Exception as e:
|
| 704 |
+
print(f"{bcolors.FAIL}Error processing audio data: {e}{bcolors.ENDC}")
|
| 705 |
+
# Potentially log the problematic message_bytes (or part of it) for debugging
|
| 706 |
+
|
| 707 |
+
except WebSocketDisconnect:
|
| 708 |
+
print(f"{bcolors.WARNING}Data client disconnected: {websocket.client}{bcolors.ENDC}")
|
| 709 |
+
except Exception as e:
|
| 710 |
+
print(f"{bcolors.FAIL}Data WebSocket error for {websocket.client}: {e}{bcolors.ENDC}")
|
| 711 |
+
finally:
|
| 712 |
+
app_state["data_connections"].discard(websocket)
|
| 713 |
+
if recorder:
|
| 714 |
+
recorder.clear_audio_queue() # Clear any pending audio for this session if client drops
|
| 715 |
+
|
| 716 |
+
# Optional: A simple HTML page for testing WebSocket connections
|
| 717 |
+
# (You'd typically have a proper frontend for this)
|
| 718 |
+
# @app.get("/", response_class=HTMLResponse)
|
| 719 |
+
# async def get_test_page():
|
| 720 |
+
# return """
|
| 721 |
+
# <html>
|
| 722 |
+
# <head><title>STT Server Test</title></head>
|
| 723 |
+
# <body>
|
| 724 |
+
# <h1>STT Server WebSocket Test</h1>
|
| 725 |
+
# <p>Open your browser's developer console to interact with WebSockets.</p>
|
| 726 |
+
# <p>Control: <code id="control_ws_url"></code></p>
|
| 727 |
+
# <p>Data: <code id="data_ws_url"></code></p>
|
| 728 |
+
# <script>
|
| 729 |
+
# document.getElementById('control_ws_url').textContent = `ws://${window.location.host}/ws/control`;
|
| 730 |
+
# document.getElementById('data_ws_url').textContent = `ws://${window.location.host}/ws/data`;
|
| 731 |
+
# </script>
|
| 732 |
+
# </body>
|
| 733 |
+
# </html>
|
| 734 |
+
# """
|
| 735 |
+
|
| 736 |
+
if __name__ == "__main__":
|
| 737 |
+
cli_args = parse_arguments() # Parse once to get server_host and server_port for uvicorn
|
| 738 |
+
|
| 739 |
+
# Check if sys.platform is win32 and apply policy
|
| 740 |
+
if sys.platform == 'win32':
|
| 741 |
+
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
| 742 |
+
|
| 743 |
+
print(f"Starting Uvicorn server on {cli_args.server_host}:{cli_args.server_port}")
|
| 744 |
+
uvicorn.run(app, host=cli_args.server_host, port=cli_args.server_port)
|
requirements.txt
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
RealtimeSTT==0.3.104
|
| 2 |
websockets
|
| 3 |
numpy
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
RealtimeSTT==0.3.104
|
| 4 |
websockets
|
| 5 |
numpy
|