Voxtral-Mini-Realtime

Running

App Files Files Community

Joffrey Thomas committed 12 days ago

Commit

9930cc6

1 Parent(s): f67b151

change app.py

Browse files

Files changed (1) hide show

app.py +97 -25

app.py CHANGED Viewed

@@ -39,6 +39,61 @@ _loop_lock = threading.Lock()
 _active_sessions = {}
 _sessions_lock = threading.Lock()
 def get_event_loop():
     """Get or create the shared event loop."""
@@ -63,7 +118,7 @@ class UserSession:
     """Per-user session state."""
     def __init__(self):
         self.session_id = str(uuid.uuid4())
-        self.audio_queue = asyncio.Queue(maxsize=100)  # Use async queue
         self.transcription_text = ""
         self.is_running = False
         self.status_message = "ready"
@@ -73,6 +128,17 @@ class UserSession:
         self.last_audio_time = None
         self._start_lock = threading.Lock()
         self._task = None  # Track the async task
 # Load CSS from external file
@@ -314,6 +380,7 @@ async def websocket_handler(session):
         # Remove from active sessions
         with _sessions_lock:
             _active_sessions.pop(session.session_id, None)
 def start_websocket(session):
@@ -333,11 +400,18 @@ def start_websocket(session):
     # Cleanup happens in websocket_handler's finally block
-def ensure_session(session):
-    """Ensure we have a valid UserSession instance (not the lambda factory)."""
-    if session is None or callable(session):
-        return UserSession()
-    return session
 def auto_start_recording(session):
@@ -366,9 +440,9 @@ def auto_start_recording(session):
     return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
-def clear_history(session):
     """Stop the websocket connection and clear all history."""
-    session = ensure_session(session)
     session.is_running = False
     session.last_audio_time = None
@@ -381,8 +455,8 @@ def clear_history(session):
     with _sessions_lock:
         _active_sessions.pop(session.session_id, None)
-    # Create a fresh async queue (old one may have items)
-    session.audio_queue = asyncio.Queue(maxsize=100)
     session.transcription_text = ""
     session.word_timestamps = []
@@ -390,17 +464,18 @@ def clear_history(session):
     session.session_start_time = None
     session.status_message = "ready"
-    return get_transcription_html("", "ready", "Calibrating..."), None
-def process_audio(audio, session):
     """Process incoming audio and queue for streaming."""
-    session = ensure_session(session)
     try:
         # Quick return if audio is None
         if audio is None:
             wpm = session.current_wpm if session.is_running else "Calibrating..."
-            return get_transcription_html(session.transcription_text, session.status_message, wpm)
         # Update last audio time for inactivity tracking
         session.last_audio_time = time.time()
@@ -411,7 +486,7 @@ def process_audio(audio, session):
         # Skip processing if session stopped
         if not session.is_running:
-            return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
         sample_rate, audio_data = audio
@@ -445,14 +520,11 @@ def process_audio(audio, session):
         except Exception:
             pass  # Skip if queue is full or loop issues
-        return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
     except Exception as e:
         print(f"Error processing audio: {e}")
-        # Safely get session attributes with fallbacks
-        text = getattr(session, 'transcription_text', '') if not callable(session) else ''
-        status = getattr(session, 'status_message', 'error') if not callable(session) else 'error'
-        wpm = getattr(session, 'current_wpm', '') if not callable(session) else ''
-        return get_transcription_html(text, status, wpm)
 def _safe_queue_put(q, item):
@@ -465,7 +537,8 @@ def _safe_queue_put(q, item):
 # Gradio interface
 with gr.Blocks(title="Voxtral Real-time Transcription") as demo:
-    session_state = gr.State(value=lambda: UserSession())
     # Header
     gr.HTML(get_header_html())
@@ -499,13 +572,13 @@ with gr.Blocks(title="Voxtral Real-time Transcription") as demo:
     clear_btn.click(
         clear_history,
         inputs=[session_state],
-        outputs=[transcription_display, audio_input]
     )
     audio_input.stream(
         process_audio,
         inputs=[audio_input, session_state],
-        outputs=[transcription_display],
         show_progress="hidden",
         concurrency_limit=100,  # Allow many concurrent audio streams
     )
@@ -515,7 +588,6 @@ host = os.environ.get("HOST", "")
 ws_url = f"wss://{host}/v1/realtime"
-# Initialize the shared event loop at startup
 get_event_loop()
 demo.queue(default_concurrency_limit=50)

 _active_sessions = {}
 _sessions_lock = threading.Lock()
+# Global session registry - sessions are stored here and looked up by ID
+_session_registry = {}
+_registry_lock = threading.Lock()
+_last_cleanup = time.time()
+SESSION_REGISTRY_CLEANUP_INTERVAL = 60  # seconds
+SESSION_MAX_AGE = 300  # 5 minutes - remove sessions older than this
+def get_or_create_session(session_id: str = None) -> "UserSession":
+    """Get existing session by ID or create a new one."""
+    global _last_cleanup  # rebound below after each cleanup pass
+    # Periodic cleanup of stale sessions
+    now = time.time()
+    if now - _last_cleanup > SESSION_REGISTRY_CLEANUP_INTERVAL:  # NOTE(review): checked without holding _registry_lock
+        _cleanup_stale_sessions()  # acquires _registry_lock itself, so it has to run before the `with` below
+        _last_cleanup = now
+    with _registry_lock:
+        if session_id and session_id in _session_registry:
+            session = _session_registry[session_id]
+            session._last_accessed = now  # refresh the timestamp that _cleanup_stale_sessions compares against
+            return session
+        # Create new session
+        session = UserSession()  # missing or unknown ID: the new session is registered under its own session_id
+        session._last_accessed = now
+        _session_registry[session.session_id] = session
+        return session
+def _cleanup_stale_sessions():
+    """Remove sessions that haven't been accessed recently."""
+    now = time.time()
+    to_remove = []
+    with _registry_lock:
+        for session_id, session in _session_registry.items():
+            last_accessed = getattr(session, '_last_accessed', 0)  # 0 default: a session never stamped counts as oldest
+            # Remove if: not running AND (no activity for SESSION_MAX_AGE)
+            if not session.is_running and (now - last_accessed > SESSION_MAX_AGE):
+                to_remove.append(session_id)  # collect IDs first; the dict is only modified after the loop
+        for session_id in to_remove:
+            _session_registry.pop(session_id, None)
+    if to_remove:
+        print(f"Cleaned up {len(to_remove)} stale sessions. Active: {len(_session_registry)}")  # NOTE(review): len() is read after the lock is released
+def cleanup_session(session_id: str):
+    """Remove session from registry."""
+    with _registry_lock:
+        _session_registry.pop(session_id, None)  # None default: an unknown ID is ignored instead of raising KeyError
 def get_event_loop():
     """Get or create the shared event loop."""
     """Per-user session state."""
     def __init__(self):
         self.session_id = str(uuid.uuid4())
+        self._audio_queue = None  # Created lazily in the correct event loop
         self.transcription_text = ""
         self.is_running = False
         self.status_message = "ready"
         self.last_audio_time = None
         self._start_lock = threading.Lock()
         self._task = None  # Track the async task
+    @property
+    def audio_queue(self):
+        """Lazily create audio queue to ensure it's in the right event loop."""
+        if self._audio_queue is None:  # still None from __init__ until the first access
+            self._audio_queue = asyncio.Queue(maxsize=100)  # NOTE(review): Python 3.10+ queues do not bind a loop at construction - confirm laziness is still needed
+        return self._audio_queue
+    def reset_queue(self):
+        """Reset the audio queue."""
+        self._audio_queue = asyncio.Queue(maxsize=100)  # rebinds to a new empty queue rather than draining the old one
 # Load CSS from external file
         # Remove from active sessions
         with _sessions_lock:
             _active_sessions.pop(session.session_id, None)
+        # Note: Don't remove from registry here - session might be reused
 def start_websocket(session):
     # Cleanup happens in websocket_handler's finally block
+def ensure_session(session_id):
+    """Get or create a valid UserSession from a session_id."""
+    # Handle various invalid inputs
+    if session_id is None or callable(session_id):  # gr.State now starts as None; callable covers the old lambda-factory default
+        return get_or_create_session()
+    # If it's already a UserSession object (legacy), return it
+    if isinstance(session_id, UserSession):
+        return session_id  # returned as-is, without a registry lookup
+    # Otherwise treat it as a session_id string
+    return get_or_create_session(str(session_id))
 def auto_start_recording(session):
     return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
+def clear_history(session_id):
     """Stop the websocket connection and clear all history."""
+    session = ensure_session(session_id)
     session.is_running = False
     session.last_audio_time = None
     with _sessions_lock:
         _active_sessions.pop(session.session_id, None)
+    # Reset the queue
+    session.reset_queue()
     session.transcription_text = ""
     session.word_timestamps = []
     session.session_start_time = None
     session.status_message = "ready"
+    # Return the session_id to maintain state
+    return get_transcription_html("", "ready", "Calibrating..."), None, session.session_id
+def process_audio(audio, session_id):
     """Process incoming audio and queue for streaming."""
+    session = ensure_session(session_id)
     try:
         # Quick return if audio is None
         if audio is None:
             wpm = session.current_wpm if session.is_running else "Calibrating..."
+            return get_transcription_html(session.transcription_text, session.status_message, wpm), session.session_id
         # Update last audio time for inactivity tracking
         session.last_audio_time = time.time()
         # Skip processing if session stopped
         if not session.is_running:
+            return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm), session.session_id
         sample_rate, audio_data = audio
         except Exception:
             pass  # Skip if queue is full or loop issues
+        return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm), session.session_id
     except Exception as e:
         print(f"Error processing audio: {e}")
+        # Return safe defaults
+        return get_transcription_html("", "error", ""), session.session_id if hasattr(session, 'session_id') else None
 def _safe_queue_put(q, item):
 # Gradio interface
 with gr.Blocks(title="Voxtral Real-time Transcription") as demo:
+    # Store just the session_id string - much more reliable than complex objects
+    session_state = gr.State(value=None)
     # Header
     gr.HTML(get_header_html())
     clear_btn.click(
         clear_history,
         inputs=[session_state],
+        outputs=[transcription_display, audio_input, session_state]
     )
     audio_input.stream(
         process_audio,
         inputs=[audio_input, session_state],
+        outputs=[transcription_display, session_state],
         show_progress="hidden",
         concurrency_limit=100,  # Allow many concurrent audio streams
     )
 ws_url = f"wss://{host}/v1/realtime"
 get_event_loop()
 demo.queue(default_concurrency_limit=50)