SkyNetWalker commited on
Commit
f433461
·
verified ·
1 Parent(s): 9a7f6e8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +837 -0
app.py ADDED
@@ -0,0 +1,837 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import base64
3
+ import os
4
+ import queue
5
+ import threading
6
+ import time
7
+ import uuid
8
+ from typing import AsyncIterator
9
+
10
+ import gradio as gr
11
+ import numpy as np
12
+
13
+ from mistralai import Mistral
14
+ from mistralai.extra.realtime import UnknownRealtimeEvent
15
+ from mistralai.models import (
16
+ AudioFormat,
17
+ RealtimeTranscriptionError,
18
+ RealtimeTranscriptionSessionCreated,
19
+ TranscriptionStreamDone,
20
+ TranscriptionStreamTextDelta,
21
+ )
22
+
23
+ # --- SECURITY ENHANCEMENT ---
24
+ # kkk = Your Mistral API Key
25
+ # ppp = The password you want users to enter
26
+ MISTRAL_PRIVATE_KEY = os.environ.get("kkk", "")
27
+ APP_PASSWORD = os.environ.get("ppp", "")
28
+ # ----------------------------
29
+
30
+ # Load Voxtral icon as base64
31
+ VOXTRAL_ICON_B64 = ""
32
+ icon_path = os.path.join(os.path.dirname(__file__), "assets", "voxtral.png")
33
+ if os.path.exists(icon_path):
34
+ with open(icon_path, "rb") as f:
35
+ VOXTRAL_ICON_B64 = base64.b64encode(f.read()).decode("utf-8")
36
+
37
+ SAMPLE_RATE = 16_000
38
+ WARMUP_DURATION = 2.0 # seconds of silence for warmup
39
+ WPM_WINDOW = 10 # seconds for running mean calculation
40
+ CALIBRATION_PERIOD = 5 # seconds before showing WPM
41
+ SESSION_TIMEOUT = int(os.environ.get("SESSION_TIMEOUT", "35")) # Max 30s per session
42
+ INACTIVITY_TIMEOUT = int(os.environ.get("INACTIVITY_TIMEOUT", "10")) # Close after 10s silence
43
+ MAX_CONCURRENT_SESSIONS = int(os.environ.get("MAX_SESSIONS", "50"))
44
+
45
+ # Global config (shared across users)
46
+ MISTRAL_BASE_URL = "wss://api.mistral.ai"
47
+ MODEL = "voxtral-mini-transcribe-realtime-2602"
48
+
49
+ # Global event loop for all websocket connections (runs in single background thread)
50
+ _event_loop = None
51
+ _loop_thread = None
52
+ _loop_lock = threading.Lock()
53
+
54
+ # Track active sessions for resource management
55
+ _active_sessions = {}
56
+ _sessions_lock = threading.Lock()
57
+
58
+ # Global session registry - sessions are stored here and looked up by ID
59
+ _session_registry = {}
60
+ _registry_lock = threading.Lock()
61
+ _last_cleanup = time.time()
62
+ SESSION_REGISTRY_CLEANUP_INTERVAL = 30 # seconds
63
+ SESSION_MAX_AGE = 30 # 30 seconds - remove sessions older than this
64
+
65
+
66
+ def get_or_create_session(session_id: str = None) -> "UserSession":
67
+ """Get existing session by ID or create a new one."""
68
+ global _last_cleanup
69
+
70
+ # Periodic cleanup of stale sessions
71
+ now = time.time()
72
+ if now - _last_cleanup > SESSION_REGISTRY_CLEANUP_INTERVAL:
73
+ _cleanup_stale_sessions()
74
+ _last_cleanup = now
75
+
76
+ with _registry_lock:
77
+ if session_id and session_id in _session_registry:
78
+ session = _session_registry[session_id]
79
+ # Validate the session is actually a UserSession instance
80
+ if isinstance(session, UserSession):
81
+ session._last_accessed = now
82
+ return session
83
+ else:
84
+ # Corrupted registry entry - remove and create new
85
+ print(f"WARNING: Corrupted session registry entry for {session_id}: {type(session)}")
86
+ del _session_registry[session_id]
87
+
88
+ # Create new session
89
+ session = UserSession()
90
+ session._last_accessed = now
91
+ _session_registry[session.session_id] = session
92
+ return session
93
+
94
+
95
+ def _cleanup_stale_sessions():
96
+ """Remove sessions that haven't been accessed recently."""
97
+ now = time.time()
98
+ to_remove_from_registry = []
99
+ to_remove_from_active = []
100
+
101
+ # Need both locks to safely check both dictionaries
102
+ with _registry_lock:
103
+ with _sessions_lock:
104
+ # Find stale sessions in registry
105
+ for session_id, session in _session_registry.items():
106
+ # NEVER remove if still in active_sessions (websocket still running)
107
+ if session_id in _active_sessions:
108
+ continue
109
+
110
+ last_accessed = getattr(session, '_last_accessed', 0)
111
+ # Remove if: not running AND not active AND old
112
+ if not session.is_running and (now - last_accessed > SESSION_MAX_AGE):
113
+ to_remove_from_registry.append(session_id)
114
+
115
+ # Find orphaned sessions in active_sessions (not in registry anymore)
116
+ for session_id, session in list(_active_sessions.items()):
117
+ if session_id not in _session_registry:
118
+ # Orphaned - mark for removal
119
+ if not session.is_running:
120
+ to_remove_from_active.append(session_id)
121
+
122
+ # Clean up registry
123
+ for session_id in to_remove_from_registry:
124
+ _session_registry.pop(session_id, None)
125
+
126
+ # Clean up orphaned active sessions
127
+ for session_id in to_remove_from_active:
128
+ _active_sessions.pop(session_id, None)
129
+
130
+ active_count = len(_active_sessions)
131
+ registry_count = len(_session_registry)
132
+
133
+ total_cleaned = len(to_remove_from_registry) + len(to_remove_from_active)
134
+ if total_cleaned > 0:
135
+ print(f"Cleaned up {len(to_remove_from_registry)} stale + {len(to_remove_from_active)} orphaned sessions. Registry: {registry_count}, Active: {active_count}")
136
+
137
+
138
+ def cleanup_session(session_id: str):
139
+ """Remove session from registry."""
140
+ with _registry_lock:
141
+ _session_registry.pop(session_id, None)
142
+
143
+
144
+ def kill_all_sessions():
145
+ """Emergency cleanup - kill ALL active sessions to free capacity."""
146
+ killed_count = 0
147
+
148
+ with _sessions_lock:
149
+ sessions_to_kill = list(_active_sessions.values())
150
+
151
+ for session in sessions_to_kill:
152
+ try:
153
+ session.is_running = False
154
+ session._stopped_by_user = True
155
+
156
+ # Signal stop event
157
+ if session._stop_event is not None:
158
+ loop = get_event_loop()
159
+ try:
160
+ asyncio.run_coroutine_threadsafe(
161
+ _set_stop_event_sync(session._stop_event), loop
162
+ )
163
+ except Exception:
164
+ pass
165
+ session._stop_event = None
166
+
167
+ # Cancel the task
168
+ if session._task is not None:
169
+ session._task.cancel()
170
+ session._task = None
171
+
172
+ killed_count += 1
173
+ except Exception as e:
174
+ print(f"Error killing session {session.session_id[:8]}: {e}")
175
+
176
+ # Clear both dictionaries
177
+ with _registry_lock:
178
+ with _sessions_lock:
179
+ _active_sessions.clear()
180
+ _session_registry.clear()
181
+
182
+ print(f"CAPACITY RESET: Killed {killed_count} sessions. All sessions cleared.")
183
+
184
+
185
+ async def _set_stop_event_sync(event):
186
+ """Helper to set asyncio event."""
187
+ event.set()
188
+
189
+
190
+ def get_event_loop():
191
+ """Get or create the shared event loop."""
192
+ global _event_loop, _loop_thread
193
+ with _loop_lock:
194
+ if _event_loop is None or not _event_loop.is_running():
195
+ _event_loop = asyncio.new_event_loop()
196
+ _loop_thread = threading.Thread(target=_run_event_loop, daemon=True)
197
+ _loop_thread.start()
198
+ # Wait for loop to start
199
+ time.sleep(0.1)
200
+ return _event_loop
201
+
202
+
203
+ def _run_event_loop():
204
+ """Run the event loop in background thread."""
205
+ asyncio.set_event_loop(_event_loop)
206
+ _event_loop.run_forever()
207
+
208
+
209
+ class UserSession:
210
+ """Per-user session state."""
211
+ def __init__(self, api_key: str = None):
212
+ self.session_id = str(uuid.uuid4())
213
+ self.api_key = api_key
214
+ # Use a thread-safe queue for cross-thread communication
215
+ self._audio_queue = queue.Queue(maxsize=200)
216
+ self.transcription_text = ""
217
+ self.is_running = False
218
+ self.status_message = "ready"
219
+ self.word_timestamps = []
220
+ self.current_wpm = "Calibrating..."
221
+ self.session_start_time = None
222
+ self.last_audio_time = None
223
+ self._start_lock = threading.Lock()
224
+ self._task = None # Track the async task
225
+ self._stop_event = None # Event to signal stop
226
+ self._stopped_by_user = False # Track if user explicitly stopped
227
+
228
+ @property
229
+ def audio_queue(self):
230
+ """Return the thread-safe queue."""
231
+ return self._audio_queue
232
+
233
+ def reset_queue(self):
234
+ """Reset the audio queue."""
235
+ self._audio_queue = queue.Queue(maxsize=200)
236
+
237
+
238
+ # Load CSS from external file
239
+ css_path = os.path.join(os.path.dirname(__file__), "style.css")
240
+ with open(css_path, "r") as f:
241
+ CUSTOM_CSS = f.read()
242
+
243
+
244
+ def get_header_html() -> str:
245
+ """Generate the header HTML with Voxtral logo."""
246
+ if VOXTRAL_ICON_B64:
247
+ logo_html = f'<img src="data:image/png;base64,{VOXTRAL_ICON_B64}" alt="Voxtral" class="header-logo" />'
248
+ else:
249
+ logo_html = ''
250
+
251
+ return f"""
252
+ <div class="header-card">
253
+ <h1 class="header-title">{logo_html}Real-time Speech Transcription</h1>
254
+ <p class="header-subtitle">Enter the <b>App Password</b> below, then click the microphone to start streaming transcriptions.</p>
255
+ <p class="header-subtitle">Talk naturally. Talk fast. Talk ridiculously fast. I can handle it.</p>
256
+ </div>
257
+ """
258
+
259
+
260
+ def get_status_html(status: str) -> str:
261
+ """Generate status badge HTML based on current status."""
262
+ status_configs = {
263
+ "ready": ("STANDBY", "status-ready", ""),
264
+ "connecting": ("CONNECTING", "status-connecting", "fast"),
265
+ "warming": ("WARMING UP", "status-warming", "fast"),
266
+ "listening": ("LISTENING", "status-listening", "animate"),
267
+ "timeout": ("TIMEOUT", "status-timeout", ""),
268
+ "error": ("ERROR", "status-error", ""),
269
+ }
270
+ label, css_class, dot_class = status_configs.get(status, status_configs["ready"])
271
+ dot_anim = f" {dot_class}" if dot_class else ""
272
+
273
+ return f"""<div class="status-badge {css_class}"><span class="status-dot{dot_anim}"></span><span style="color: inherit !important;">{label}</span></div>"""
274
+
275
+
276
+ def get_transcription_html(transcript: str, status: str, wpm: str = "Calibrating...") -> str:
277
+ """Generate the full transcription card HTML."""
278
+ status_badge = get_status_html(status)
279
+ wpm_badge = f'<div class="wpm-badge"><span style="color: #1E1E1E !important;">{wpm}</span></div>'
280
+
281
+ if transcript:
282
+ cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
283
+ content_html = f"""
284
+ <div class="transcript-text" style="color: #000000 !important;">
285
+ {transcript}{cursor_html}
286
+ </div>
287
+ """
288
+ elif status in ["listening", "warming", "connecting"]:
289
+ content_html = """
290
+ <div class="empty-state">
291
+ <div class="empty-dots">
292
+ <div class="empty-dot"></div>
293
+ <div class="empty-dot"></div>
294
+ <div class="empty-dot"></div>
295
+ </div>
296
+ <p class="empty-text" style="color: #555555 !important;">Listening for audio...</p>
297
+ </div>
298
+ """
299
+ elif status == "timeout":
300
+ content_html = """
301
+ <div class="empty-state">
302
+ <p class="empty-text" style="color: #B30400 !important;">Session timeout (5 minutes)</p>
303
+ <p class="empty-text" style="color: #555555 !important;">Click 'Clear History' and refresh to restart.</p>
304
+ </div>
305
+ """
306
+ else:
307
+ content_html = """
308
+ <div class="empty-state">
309
+ <p class="empty-text" style="color: #555555 !important;">// Awaiting audio input...</p>
310
+ <p class="empty-text" style="color: #555555 !important;">// Click the microphone to start.</p>
311
+ </div>
312
+ """
313
+
314
+ # Use base64 image if available
315
+ if VOXTRAL_ICON_B64:
316
+ icon_html = f'<img src="data:image/png;base64,{VOXTRAL_ICON_B64}" alt="Voxtral" class="voxtral-icon" />'
317
+ else:
318
+ icon_html = '<span style="font-size:20px;">🎙️</span>'
319
+
320
+ return f"""
321
+ <div class="transcription-card">
322
+ <div class="card-header">
323
+ <div class="card-header-left">
324
+ {icon_html}
325
+ <span class="card-title" style="color: #1E1E1E !important;">Transcription Output</span>
326
+ </div>
327
+ <div class="card-header-right">
328
+ {wpm_badge}
329
+ {status_badge}
330
+ </div>
331
+ </div>
332
+ <div class="card-content">
333
+ {content_html}
334
+ </div>
335
+ <div class="card-footer">
336
+ <span style="color: #555555 !important;">Voxtral Mini</span>
337
+ <span style="color: #555555 !important;">Real-time Audio Transcription</span>
338
+ </div>
339
+ </div>
340
+ """
341
+
342
+
343
+ def calculate_wpm(session):
344
+ """Calculate words per minute based on running mean of last WPM_WINDOW seconds."""
345
+ if session.session_start_time is not None:
346
+ elapsed = time.time() - session.session_start_time
347
+ if elapsed < CALIBRATION_PERIOD:
348
+ return "Calibrating..."
349
+
350
+ if len(session.word_timestamps) < 2:
351
+ return "0.0 WPM"
352
+
353
+ current_time = time.time()
354
+ cutoff_time = current_time - WPM_WINDOW
355
+ session.word_timestamps = [ts for ts in session.word_timestamps if ts >= cutoff_time]
356
+
357
+ if len(session.word_timestamps) < 2:
358
+ return "0.0 WPM"
359
+
360
+ time_span = current_time - session.word_timestamps[0]
361
+ if time_span == 0:
362
+ return "0.0 WPM"
363
+
364
+ word_count = len(session.word_timestamps)
365
+ wpm = (word_count / time_span) * 60
366
+ return f"{round(wpm, 1)} WPM"
367
+
368
+
369
+ async def audio_stream_from_queue(session) -> AsyncIterator[bytes]:
370
+ """Async generator that yields audio bytes from the session queue."""
371
+ # First, send silence for warmup
372
+ session.status_message = "warming"
373
+ num_samples = int(SAMPLE_RATE * WARMUP_DURATION)
374
+ silence = np.zeros(num_samples, dtype=np.int16)
375
+ chunk_size = int(SAMPLE_RATE * 0.1) # 100ms chunks
376
+
377
+ for i in range(0, num_samples, chunk_size):
378
+ if not session.is_running:
379
+ return
380
+ chunk = silence[i:i + chunk_size]
381
+ yield chunk.tobytes()
382
+ await asyncio.sleep(0.05)
383
+
384
+ session.status_message = "listening"
385
+
386
+ # Then stream real audio from the queue
387
+ while session.is_running:
388
+ # Check for inactivity timeout
389
+ if session.last_audio_time is not None:
390
+ idle = time.time() - session.last_audio_time
391
+ if idle >= INACTIVITY_TIMEOUT:
392
+ session.is_running = False
393
+ session.status_message = "ready"
394
+ return
395
+
396
+ # Check for session timeout
397
+ if session.session_start_time is not None:
398
+ elapsed = time.time() - session.session_start_time
399
+ if elapsed >= SESSION_TIMEOUT:
400
+ session.is_running = False
401
+ session.status_message = "timeout"
402
+ return
403
+
404
+ # Check if stop was requested
405
+ if session._stop_event and session._stop_event.is_set():
406
+ return
407
+
408
+ # Get audio from queue
409
+ try:
410
+ # The queue contains base64-encoded PCM16 audio
411
+ b64_chunk = session.audio_queue.get_nowait()
412
+ # Decode base64 to raw bytes
413
+ audio_bytes = base64.b64decode(b64_chunk)
414
+ yield audio_bytes
415
+ except queue.Empty:
416
+ # No audio available, yield control briefly
417
+ await asyncio.sleep(0.05)
418
+ continue
419
+
420
+
421
+ async def mistral_transcription_handler(session):
422
+ """Connect to Mistral realtime API and handle transcription."""
423
+ try:
424
+ if not session.api_key:
425
+ session.status_message = "error"
426
+ print(f"Session {session.session_id[:8]}: No API key provided")
427
+ return
428
+
429
+ # Create Mistral client
430
+ client = Mistral(api_key=session.api_key, server_url=MISTRAL_BASE_URL)
431
+ audio_format = AudioFormat(encoding="pcm_s16le", sample_rate=SAMPLE_RATE)
432
+
433
+ session.status_message = "connecting"
434
+
435
+ # Create the audio stream generator
436
+ audio_stream = audio_stream_from_queue(session)
437
+
438
+ print(f"Session {session.session_id[:8]}: Connecting to Mistral realtime API...")
439
+
440
+ async for event in client.audio.realtime.transcribe_stream(
441
+ audio_stream=audio_stream,
442
+ model=MODEL,
443
+ audio_format=audio_format,
444
+ ):
445
+ if not session.is_running:
446
+ break
447
+
448
+ if isinstance(event, RealtimeTranscriptionSessionCreated):
449
+ print(f"Session {session.session_id[:8]}: Connected to Mistral")
450
+ # Status is already set by audio_stream_from_queue
451
+
452
+ elif isinstance(event, TranscriptionStreamTextDelta):
453
+ delta = event.text
454
+ session.transcription_text += delta
455
+
456
+ # Track words for WPM calculation
457
+ words = delta.split()
458
+ for _ in words:
459
+ session.word_timestamps.append(time.time())
460
+
461
+ session.current_wpm = calculate_wpm(session)
462
+
463
+ elif isinstance(event, TranscriptionStreamDone):
464
+ print(f"Session {session.session_id[:8]}: Transcription done")
465
+ break
466
+
467
+ elif isinstance(event, RealtimeTranscriptionError):
468
+ print(f"Session {session.session_id[:8]}: Error - {event.error}")
469
+ session.status_message = "error"
470
+ break
471
+
472
+ elif isinstance(event, UnknownRealtimeEvent):
473
+ continue # Ignore unknown events
474
+
475
+ except asyncio.CancelledError:
476
+ pass # Normal cancellation
477
+ except Exception as e:
478
+ error_msg = str(e) if str(e) else type(e).__name__
479
+ if "ConnectionReset" not in error_msg and "CancelledError" not in error_msg:
480
+ print(f"Session {session.session_id[:8]}: Mistral API error - {error_msg}")
481
+ session.status_message = "error"
482
+ finally:
483
+ session.is_running = False
484
+
485
+ # Only remove and log if not already handled by stop_session
486
+ if not session._stopped_by_user:
487
+ with _sessions_lock:
488
+ removed = _active_sessions.pop(session.session_id, None)
489
+ active_count = len(_active_sessions)
490
+ if removed:
491
+ print(f"Session {session.session_id[:8]} ended. Active sessions: {active_count}")
492
+
493
+
494
+ def start_transcription(session):
495
+ """Start Mistral transcription using the shared event loop."""
496
+ session.is_running = True
497
+ session._stop_event = asyncio.Event()
498
+
499
+ # Register this session
500
+ with _sessions_lock:
501
+ _active_sessions[session.session_id] = session
502
+ active_count = len(_active_sessions)
503
+
504
+ print(f"Starting session {session.session_id[:8]}. Active sessions: {active_count}")
505
+
506
+ # Submit to the shared event loop
507
+ loop = get_event_loop()
508
+ future = asyncio.run_coroutine_threadsafe(mistral_transcription_handler(session), loop)
509
+ session._task = future
510
+
511
+ # Don't block - the coroutine runs in the background
512
+ # Cleanup happens in mistral_transcription_handler's finally block
513
+
514
+
515
+ def ensure_session(session_id):
516
+ """Get or create a valid UserSession from a session_id."""
517
+ # Handle various invalid inputs
518
+ if session_id is None or callable(session_id):
519
+ session = get_or_create_session()
520
+ return session
521
+
522
+ # If it's already a UserSession object (legacy), return it
523
+ if isinstance(session_id, UserSession):
524
+ return session_id
525
+
526
+ # Otherwise treat it as a session_id string
527
+ session = get_or_create_session(str(session_id))
528
+
529
+ # Defensive check - this should never happen but helps debug
530
+ if not isinstance(session, UserSession):
531
+ print(f"WARNING: ensure_session returned non-UserSession: {type(session)}")
532
+ return get_or_create_session()
533
+
534
+ return session
535
+
536
+
537
+ def auto_start_recording(session):
538
+ """Automatically start the transcription service when audio begins."""
539
+ # Protect against startup races: Gradio can call `process_audio` concurrently.
540
+ with session._start_lock:
541
+ if session.is_running:
542
+ return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
543
+
544
+ # Check if API key is set
545
+ if not session.api_key:
546
+ session.status_message = "error"
547
+ return get_transcription_html("Please enter the correct App Password above to start.", "error", "")
548
+
549
+ # Check if we've hit max concurrent sessions - kill all if so
550
+ with _sessions_lock:
551
+ active_at_capacity = len(_active_sessions) >= MAX_CONCURRENT_SESSIONS
552
+ with _registry_lock:
553
+ registry_over = len(_session_registry) > MAX_CONCURRENT_SESSIONS
554
+
555
+ if active_at_capacity or registry_over:
556
+ kill_all_sessions()
557
+ session.status_message = "error"
558
+ return get_transcription_html("Server reset due to capacity. Please click the microphone to restart.", "error", "")
559
+
560
+ session.transcription_text = ""
561
+ session.word_timestamps = []
562
+ session.current_wpm = "Calibrating..."
563
+ session.session_start_time = time.time()
564
+ session.last_audio_time = time.time()
565
+ session.status_message = "connecting"
566
+
567
+ # Start Mistral transcription (now non-blocking, uses shared event loop)
568
+ start_transcription(session)
569
+
570
+ return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
571
+
572
+
573
+ def stop_session(session_id, api_key=None):
574
+ """Stop the transcription and invalidate the session.
575
+
576
+ Returns None for session_id so a fresh session is created on next recording.
577
+ This prevents duplicate session issues when users stop and restart quickly.
578
+ """
579
+ session = ensure_session(session_id)
580
+ old_transcript = session.transcription_text
581
+ old_wpm = session.current_wpm
582
+
583
+ if session.is_running:
584
+ session.is_running = False
585
+ session.last_audio_time = None
586
+ session._stopped_by_user = True # Mark as user-stopped to avoid duplicate logging
587
+
588
+ # Signal the stop event to terminate the audio stream
589
+ if session._stop_event is not None:
590
+ loop = get_event_loop()
591
+ try:
592
+ asyncio.run_coroutine_threadsafe(
593
+ _set_stop_event(session._stop_event), loop
594
+ )
595
+ except Exception:
596
+ pass
597
+ session._stop_event = None
598
+
599
+ # Cancel the running task if any
600
+ if session._task is not None:
601
+ session._task.cancel()
602
+ session._task = None
603
+
604
+ # Remove from active sessions
605
+ with _sessions_lock:
606
+ _active_sessions.pop(session.session_id, None)
607
+ active_count = len(_active_sessions)
608
+
609
+ print(f"Mic stopped - session {session.session_id[:8]} ended. Active sessions: {active_count}")
610
+
611
+ # Remove from registry - the session is done
612
+ cleanup_session(session.session_id)
613
+
614
+ # Return None for session_id - a fresh session will be created on next recording
615
+ # This ensures no duplicate sessions when users stop/start quickly
616
+ return get_transcription_html(old_transcript, "ready", old_wpm), None
617
+
618
+
619
+ async def _set_stop_event(event):
620
+ """Helper to set asyncio event from sync context."""
621
+ event.set()
622
+
623
+
624
+ def clear_history(session_id, api_key=None):
625
+ """Stop the transcription and clear all history."""
626
+ session = ensure_session(session_id)
627
+ session.is_running = False
628
+ session.last_audio_time = None
629
+ session._stopped_by_user = True # Mark as user-stopped
630
+
631
+ # Signal the stop event
632
+ if session._stop_event is not None:
633
+ loop = get_event_loop()
634
+ try:
635
+ asyncio.run_coroutine_threadsafe(
636
+ _set_stop_event(session._stop_event), loop
637
+ )
638
+ except Exception:
639
+ pass
640
+ session._stop_event = None
641
+
642
+ # Cancel the running task if any
643
+ if session._task is not None:
644
+ session._task.cancel()
645
+ session._task = None
646
+
647
+ # Remove from active sessions
648
+ with _sessions_lock:
649
+ _active_sessions.pop(session.session_id, None)
650
+
651
+ # Reset the queue
652
+ session.reset_queue()
653
+
654
+ session.transcription_text = ""
655
+ session.word_timestamps = []
656
+ session.current_wpm = "Calibrating..."
657
+ session.session_start_time = None
658
+ session.status_message = "ready"
659
+
660
+ # Return the session_id to maintain state
661
+ return get_transcription_html("", "ready", "Calibrating..."), None, session.session_id
662
+
663
+
664
+ def process_audio(audio, session_id, api_key_input):
665
+ """Process incoming audio and queue for streaming."""
666
+
667
+ # --- PASSWORD VALIDATION ---
668
+ provided_password = api_key_input.strip() if api_key_input else ""
669
+ if provided_password != APP_PASSWORD:
670
+ return get_transcription_html(
671
+ "Invalid App Password. Access Denied.",
672
+ "error",
673
+ ""
674
+ ), None
675
+
676
+ # If password is correct, use the private key from environment
677
+ actual_api_key = MISTRAL_PRIVATE_KEY
678
+ # ---------------------------
679
+
680
+ # Check capacity - if at or above max, kill ALL sessions to reset
681
+ with _sessions_lock:
682
+ active_count = len(_active_sessions)
683
+ is_active_user = session_id and any(s.session_id == session_id for s in _active_sessions.values())
684
+
685
+ with _registry_lock:
686
+ registry_count = len(_session_registry)
687
+
688
+ # Kill all if:
689
+ # 1. Registry exceeds limit (memory safety)
690
+ # 2. Active sessions exceed limit
691
+ # 3. At active capacity AND new user trying to join
692
+ if registry_count > MAX_CONCURRENT_SESSIONS or active_count > MAX_CONCURRENT_SESSIONS or (active_count >= MAX_CONCURRENT_SESSIONS and not is_active_user):
693
+ kill_all_sessions()
694
+ return get_transcription_html(
695
+ "Server reset due to capacity. Please click the microphone to restart.",
696
+ "error",
697
+ ""
698
+ ), None
699
+
700
+ # Always ensure we have a valid session first
701
+ try:
702
+ session = ensure_session(session_id)
703
+ # Update API key on the session with the PRIVATE KEY
704
+ session.api_key = actual_api_key
705
+ except Exception as e:
706
+ print(f"Error creating session: {e}")
707
+ # Create a fresh session if ensure_session fails
708
+ session = UserSession(api_key=actual_api_key)
709
+ _session_registry[session.session_id] = session
710
+
711
+ # Cache session_id early in case of later errors
712
+ current_session_id = session.session_id
713
+
714
+ try:
715
+ # Quick return if audio is None
716
+ if audio is None:
717
+ wpm = session.current_wpm if session.is_running else "Calibrating..."
718
+ return get_transcription_html(session.transcription_text, session.status_message, wpm), current_session_id
719
+
720
+ # Update last audio time for inactivity tracking
721
+ session.last_audio_time = time.time()
722
+
723
+ # Auto-start if not running
724
+ if not session.is_running and session.status_message not in ["timeout", "error"]:
725
+ auto_start_recording(session)
726
+
727
+ # Skip processing if session stopped
728
+ if not session.is_running:
729
+ return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm), current_session_id
730
+
731
+ sample_rate, audio_data = audio
732
+
733
+ # Convert to mono if stereo
734
+ if len(audio_data.shape) > 1:
735
+ audio_data = audio_data.mean(axis=1)
736
+
737
+ # Normalize to float
738
+ if audio_data.dtype == np.int16:
739
+ audio_float = audio_data.astype(np.float32) / 32767.0
740
+ else:
741
+ audio_float = audio_data.astype(np.float32)
742
+
743
+ # Resample to 16kHz if needed
744
+ if sample_rate != SAMPLE_RATE:
745
+ num_samples = int(len(audio_float) * SAMPLE_RATE / sample_rate)
746
+ audio_float = np.interp(
747
+ np.linspace(0, len(audio_float) - 1, num_samples),
748
+ np.arange(len(audio_float)),
749
+ audio_float,
750
+ )
751
+
752
+ # Convert to PCM16 and base64 encode
753
+ pcm16 = (audio_float * 32767).astype(np.int16)
754
+ b64_chunk = base64.b64encode(pcm16.tobytes()).decode("utf-8")
755
+
756
+ # Put directly into thread-safe queue (no event loop needed)
757
+ try:
758
+ session.audio_queue.put_nowait(b64_chunk)
759
+ except Exception:
760
+ pass # Skip if queue is full
761
+
762
+ return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm), current_session_id
763
+ except Exception as e:
764
+ print(f"Error processing audio: {e}")
765
+ # Return safe defaults - always include session_id to maintain state
766
+ return get_transcription_html("", "error", ""), current_session_id
767
+
768
+
769
+ # Gradio interface
770
+ with gr.Blocks(title="Voxtral Real-time Transcription") as demo:
771
+ # Store just the session_id string - much more reliable than complex objects
772
+ session_state = gr.State(value=None)
773
+
774
+ # Header
775
+ gr.HTML(get_header_html())
776
+
777
+ # Password input (Replaces API Key input)
778
+ with gr.Row():
779
+ api_key_input = gr.Textbox(
780
+ label="App Password",
781
+ placeholder="Enter the password to access this app...",
782
+ type="password",
783
+ elem_id="api-key-input",
784
+ info="This app is private. Please enter the authorized password."
785
+ )
786
+
787
+ # Transcription output
788
+ transcription_display = gr.HTML(
789
+ value=get_transcription_html("", "ready", "Calibrating..."),
790
+ elem_id="transcription-output"
791
+ )
792
+
793
+ # Audio input
794
+ audio_input = gr.Audio(
795
+ sources=["microphone"],
796
+ streaming=True,
797
+ type="numpy",
798
+ format="wav",
799
+ elem_id="audio-input",
800
+ label="Microphone Input"
801
+ )
802
+
803
+ # Clear button
804
+ clear_btn = gr.Button(
805
+ "Clear History",
806
+ elem_classes=["clear-btn"]
807
+ )
808
+
809
+ # Info text
810
+ gr.HTML('<p class="info-text">To start again - click on Clear History AND refresh your website.</p>')
811
+
812
+ # Event handlers
813
+ clear_btn.click(
814
+ clear_history,
815
+ inputs=[session_state, api_key_input],
816
+ outputs=[transcription_display, audio_input, session_state]
817
+ )
818
+
819
+
820
+ audio_input.stop_recording(
821
+ stop_session,
822
+ inputs=[session_state, api_key_input],
823
+ outputs=[transcription_display, session_state]
824
+ )
825
+
826
+ audio_input.stream(
827
+ process_audio,
828
+ inputs=[audio_input, session_state, api_key_input],
829
+ outputs=[transcription_display, session_state],
830
+ show_progress="hidden",
831
+ concurrency_limit=500,
832
+ )
833
+
834
+ get_event_loop()
835
+
836
+ demo.queue(default_concurrency_limit=200)
837
+ demo.launch(css=CUSTOM_CSS, theme=gr.themes.Base(), ssr_mode=False, max_threads=200)