Spaces:

mistralai
/

Voxtral-Mini-Realtime

Running

App Files Community

Jofthomas committed 28 days ago

Commit

a6fa7a0

verified ·

1 Parent(s): d54e7c0

Upload 2 files

Browse files

Files changed (2) hide show

app.py +187 -146
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,16 +1,24 @@
 import asyncio
 import base64
-import json
 import os
 import queue
 import threading
 import time
 import uuid
 import gradio as gr
 import numpy as np
-import websockets
 # Load Voxtral icon as base64
 VOXTRAL_ICON_B64 = ""
@@ -28,8 +36,8 @@ INACTIVITY_TIMEOUT = int(os.environ.get("INACTIVITY_TIMEOUT", "10"))  # Close af
 MAX_CONCURRENT_SESSIONS = int(os.environ.get("MAX_SESSIONS", "50"))
 # Global config (shared across users)
-ws_url = ""
-model = ""
 # Global event loop for all websocket connections (runs in single background thread)
 _event_loop = None
@@ -138,14 +146,16 @@ def kill_all_sessions():
             session.is_running = False
             session._stopped_by_user = True
-            # Close websocket immediately
-            if session._websocket is not None:
                 loop = get_event_loop()
                 try:
-                    asyncio.run_coroutine_threadsafe(session._websocket.close(), loop)
                 except Exception:
                     pass
-                session._websocket = None
             # Cancel the task
             if session._task is not None:
@@ -165,6 +175,11 @@ def kill_all_sessions():
     print(f"CAPACITY RESET: Killed {killed_count} sessions. All sessions cleared.")
 def get_event_loop():
     """Get or create the shared event loop."""
     global _event_loop, _loop_thread
@@ -186,8 +201,9 @@ def _run_event_loop():
 class UserSession:
     """Per-user session state."""
-    def __init__(self):
         self.session_id = str(uuid.uuid4())
         # Use a thread-safe queue for cross-thread communication
         self._audio_queue = queue.Queue(maxsize=200)
         self.transcription_text = ""
@@ -199,7 +215,7 @@ class UserSession:
         self.last_audio_time = None
         self._start_lock = threading.Lock()
         self._task = None  # Track the async task
-        self._websocket = None  # Store websocket for forced closure
         self._stopped_by_user = False  # Track if user explicitly stopped
     @property
@@ -228,7 +244,7 @@ def get_header_html() -> str:
     return f"""
     <div class="header-card">
         <h1 class="header-title">{logo_html}Real-time Speech Transcription</h1>
-        <p class="header-subtitle">Click the microphone to start streaming transcriptions. The system will warm up automatically - so there will be a small delay</p>
         <p class="header-subtitle">Talk naturally. Talk fast. Talk ridiculously fast. I can handle it.</p>
     </div>
     """
@@ -343,126 +359,121 @@ def calculate_wpm(session):
     return f"{round(wpm, 1)} WPM"
-async def send_silence(ws, duration=2.0):
-    """Send silence to warm up the model."""
-    num_samples = int(SAMPLE_RATE * duration)
     silence = np.zeros(num_samples, dtype=np.int16)
-    chunk_size = int(SAMPLE_RATE * 0.1)
     for i in range(0, num_samples, chunk_size):
         chunk = silence[i:i + chunk_size]
-        b64_chunk = base64.b64encode(chunk.tobytes()).decode("utf-8")
-        await ws.send(
-            json.dumps(
-                {"type": "input_audio_buffer.append", "audio": b64_chunk}
-            )
-        )
         await asyncio.sleep(0.05)
-async def websocket_handler(session):
-    """Connect to WebSocket and handle audio streaming + transcription."""
-    ws = None
     try:
-        # Add connection timeout to prevent hanging
-        async with asyncio.timeout(10):  # 10 second connection timeout
-            ws = await websockets.connect(ws_url)
-        # Store websocket reference so it can be closed externally
-        session._websocket = ws
-        async with ws:
-            await asyncio.wait_for(ws.recv(), timeout=5)
-            await ws.send(json.dumps({"type": "session.update", "model": model}))
-            session.status_message = "warming"
-            await send_silence(ws, WARMUP_DURATION)
-            await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
-            session.status_message = "listening"
-            async def send_audio():
-                while session.is_running:
-                    try:
-                        # Check for inactivity timeout
-                        if session.last_audio_time is not None:
-                            idle = time.time() - session.last_audio_time
-                            if idle >= INACTIVITY_TIMEOUT:
-                                session.is_running = False
-                                session.status_message = "ready"
-                                break
-                        if session.session_start_time is not None:
-                            elapsed = time.time() - session.session_start_time
-                            if elapsed >= SESSION_TIMEOUT:
-                                session.is_running = False
-                                session.status_message = "timeout"
-                                break
-                        # Use thread-safe queue with non-blocking get + async sleep
-                        try:
-                            chunk = session.audio_queue.get_nowait()
-                            if session.is_running:
-                                await ws.send(
-                                    json.dumps(
-                                        {"type": "input_audio_buffer.append", "audio": chunk}
-                                    )
-                                )
-                        except queue.Empty:
-                            # No audio available, yield control briefly
-                            await asyncio.sleep(0.05)
-                            continue
-                    except Exception as e:
-                        if session.is_running:  # Only log if unexpected
-                            print(f"Error sending audio: {e}")
-                        session.is_running = False
-                        break
-            async def receive_transcription():
-                try:
-                    async for message in ws:
-                        if not session.is_running:
-                            break
-                        if session.session_start_time is not None:
-                            elapsed = time.time() - session.session_start_time
-                            if elapsed >= SESSION_TIMEOUT:
-                                session.status_message = "timeout"
-                                session.is_running = False
-                                break
-                        data = json.loads(message)
-                        if data.get("type") == "transcription.delta":
-                            delta = data["delta"]
-                            session.transcription_text += delta
-                            words = delta.split()
-                            for _ in words:
-                                session.word_timestamps.append(time.time())
-                            session.current_wpm = calculate_wpm(session)
-                except asyncio.CancelledError:
-                    pass  # Normal cancellation
-                except Exception as e:
-                    if session.is_running:
-                        print(f"Error receiving transcription: {e}")
-                    session.is_running = False
-            await asyncio.gather(send_audio(), receive_transcription(), return_exceptions=True)
     except asyncio.CancelledError:
         pass  # Normal cancellation
-    except websockets.exceptions.ConnectionClosed:
-        pass  # Normal closure
-    except asyncio.TimeoutError:
-        print(f"WebSocket connection timeout for session {session.session_id[:8]}")
-        session.status_message = "error"
     except Exception as e:
         error_msg = str(e) if str(e) else type(e).__name__
-        if "ConnectionReset" not in error_msg:  # Suppress common disconnect errors
-            print(f"WebSocket error: {error_msg}")
         session.status_message = "error"
     finally:
         session.is_running = False
-        session._websocket = None
         # Only remove and log if not already handled by stop_session
         if not session._stopped_by_user:
@@ -473,9 +484,10 @@ async def websocket_handler(session):
                 print(f"Session {session.session_id[:8]} ended. Active sessions: {active_count}")
-def start_websocket(session):
-    """Start WebSocket connection using the shared event loop."""
     session.is_running = True
     # Register this session
     with _sessions_lock:
@@ -486,11 +498,11 @@ def start_websocket(session):
     # Submit to the shared event loop
     loop = get_event_loop()
-    future = asyncio.run_coroutine_threadsafe(websocket_handler(session), loop)
     session._task = future
     # Don't block - the coroutine runs in the background
-    # Cleanup happens in websocket_handler's finally block
 def ensure_session(session_id):
@@ -522,6 +534,11 @@ def auto_start_recording(session):
         if session.is_running:
             return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
         # Check if we've hit max concurrent sessions - kill all if so
         with _sessions_lock:
             active_at_capacity = len(_active_sessions) >= MAX_CONCURRENT_SESSIONS
@@ -540,14 +557,14 @@ def auto_start_recording(session):
         session.last_audio_time = time.time()
         session.status_message = "connecting"
-        # Start websocket (now non-blocking, uses shared event loop)
-        start_websocket(session)
     return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
-def stop_session(session_id):
-    """Stop the websocket connection and invalidate the session.
     Returns None for session_id so a fresh session is created on next recording.
     This prevents duplicate session issues when users stop and restart quickly.
@@ -561,14 +578,16 @@ def stop_session(session_id):
         session.last_audio_time = None
         session._stopped_by_user = True  # Mark as user-stopped to avoid duplicate logging
-        # Close the websocket immediately to force cleanup
-        if session._websocket is not None:
             loop = get_event_loop()
             try:
-                asyncio.run_coroutine_threadsafe(session._websocket.close(), loop)
             except Exception:
-                pass  # Ignore errors during close
-            session._websocket = None
         # Cancel the running task if any
         if session._task is not None:
@@ -590,21 +609,28 @@ def stop_session(session_id):
     return get_transcription_html(old_transcript, "ready", old_wpm), None
-def clear_history(session_id):
-    """Stop the websocket connection and clear all history."""
     session = ensure_session(session_id)
     session.is_running = False
     session.last_audio_time = None
     session._stopped_by_user = True  # Mark as user-stopped
-    # Close the websocket immediately
-    if session._websocket is not None:
         loop = get_event_loop()
         try:
-            asyncio.run_coroutine_threadsafe(session._websocket.close(), loop)
         except Exception:
             pass
-        session._websocket = None
     # Cancel the running task if any
     if session._task is not None:
@@ -628,7 +654,7 @@ def clear_history(session_id):
     return get_transcription_html("", "ready", "Calibrating..."), None, session.session_id
-def process_audio(audio, session_id):
     """Process incoming audio and queue for streaming."""
     # Check capacity - if at or above max, kill ALL sessions to reset
     with _sessions_lock:
@@ -650,13 +676,23 @@ def process_audio(audio, session_id):
             ""
         ), None
     # Always ensure we have a valid session first
     try:
         session = ensure_session(session_id)
     except Exception as e:
         print(f"Error creating session: {e}")
         # Create a fresh session if ensure_session fails
-        session = UserSession()
         _session_registry[session.session_id] = session
     # Cache session_id early in case of later errors
@@ -727,6 +763,16 @@ with gr.Blocks(title="Voxtral Real-time Transcription") as demo:
     # Header
     gr.HTML(get_header_html())
     # Transcription output
     transcription_display = gr.HTML(
         value=get_transcription_html("", "ready", "Calibrating..."),
@@ -755,30 +801,25 @@ with gr.Blocks(title="Voxtral Real-time Transcription") as demo:
     # Event handlers
     clear_btn.click(
         clear_history,
-        inputs=[session_state],
         outputs=[transcription_display, audio_input, session_state]
     )
     audio_input.stop_recording(
         stop_session,
-        inputs=[session_state],
         outputs=[transcription_display, session_state]
     )
     audio_input.stream(
         process_audio,
-        inputs=[audio_input, session_state],
         outputs=[transcription_display, session_state],
         show_progress="hidden",
         concurrency_limit=500,
     )
-model = os.environ.get("MODEL", "mistralai/Voxtral-Mini-4B-Realtime-2602")
-host = os.environ.get("HOST", "")
-ws_url = f"wss://{host}/v1/realtime"
 get_event_loop()
 demo.queue(default_concurrency_limit=200)

 import asyncio
 import base64
 import os
 import queue
 import threading
 import time
 import uuid
+from typing import AsyncIterator
 import gradio as gr
 import numpy as np
+from mistralai import Mistral
+from mistralai.extra.realtime import UnknownRealtimeEvent
+from mistralai.models import (
+    AudioFormat,
+    RealtimeTranscriptionError,
+    RealtimeTranscriptionSessionCreated,
+    TranscriptionStreamDone,
+    TranscriptionStreamTextDelta,
+)
 # Load Voxtral icon as base64
 VOXTRAL_ICON_B64 = ""
 MAX_CONCURRENT_SESSIONS = int(os.environ.get("MAX_SESSIONS", "50"))
 # Global config (shared across users)
+MISTRAL_BASE_URL = "wss://api.mistral.ai"
+MODEL = "voxtral-mini-transcribe-realtime-2602"
 # Global event loop for all websocket connections (runs in single background thread)
 _event_loop = None
             session.is_running = False
             session._stopped_by_user = True
+            # Signal stop event
+            if session._stop_event is not None:
                 loop = get_event_loop()
                 try:
+                    asyncio.run_coroutine_threadsafe(
+                        _set_stop_event_sync(session._stop_event), loop
+                    )
                 except Exception:
                     pass
+                session._stop_event = None
             # Cancel the task
             if session._task is not None:
     print(f"CAPACITY RESET: Killed {killed_count} sessions. All sessions cleared.")
+async def _set_stop_event_sync(event):
+    """Helper to set asyncio event."""
+    event.set()
 def get_event_loop():
     """Get or create the shared event loop."""
     global _event_loop, _loop_thread
 class UserSession:
     """Per-user session state."""
+    def __init__(self, api_key: str = None):
         self.session_id = str(uuid.uuid4())
+        self.api_key = api_key
         # Use a thread-safe queue for cross-thread communication
         self._audio_queue = queue.Queue(maxsize=200)
         self.transcription_text = ""
         self.last_audio_time = None
         self._start_lock = threading.Lock()
         self._task = None  # Track the async task
+        self._stop_event = None  # Event to signal stop
         self._stopped_by_user = False  # Track if user explicitly stopped
     @property
     return f"""
     <div class="header-card">
         <h1 class="header-title">{logo_html}Real-time Speech Transcription</h1>
+        <p class="header-subtitle">Enter your Mistral API key below, then click the microphone to start streaming transcriptions.</p>
         <p class="header-subtitle">Talk naturally. Talk fast. Talk ridiculously fast. I can handle it.</p>
     </div>
     """
     return f"{round(wpm, 1)} WPM"
+async def audio_stream_from_queue(session) -> AsyncIterator[bytes]:
+    """Async generator that yields audio bytes from the session queue."""
+    # First, send silence for warmup
+    session.status_message = "warming"
+    num_samples = int(SAMPLE_RATE * WARMUP_DURATION)
     silence = np.zeros(num_samples, dtype=np.int16)
+    chunk_size = int(SAMPLE_RATE * 0.1)  # 100ms chunks
     for i in range(0, num_samples, chunk_size):
+        if not session.is_running:
+            return
         chunk = silence[i:i + chunk_size]
+        yield chunk.tobytes()
         await asyncio.sleep(0.05)
+    session.status_message = "listening"
+    # Then stream real audio from the queue
+    while session.is_running:
+        # Check for inactivity timeout
+        if session.last_audio_time is not None:
+            idle = time.time() - session.last_audio_time
+            if idle >= INACTIVITY_TIMEOUT:
+                session.is_running = False
+                session.status_message = "ready"
+                return
+        # Check for session timeout
+        if session.session_start_time is not None:
+            elapsed = time.time() - session.session_start_time
+            if elapsed >= SESSION_TIMEOUT:
+                session.is_running = False
+                session.status_message = "timeout"
+                return
+        # Check if stop was requested
+        if session._stop_event and session._stop_event.is_set():
+            return
+        # Get audio from queue
+        try:
+            # The queue contains base64-encoded PCM16 audio
+            b64_chunk = session.audio_queue.get_nowait()
+            # Decode base64 to raw bytes
+            audio_bytes = base64.b64decode(b64_chunk)
+            yield audio_bytes
+        except queue.Empty:
+            # No audio available, yield control briefly
+            await asyncio.sleep(0.05)
+            continue
+async def mistral_transcription_handler(session):
+    """Connect to Mistral realtime API and handle transcription."""
     try:
+        if not session.api_key:
+            session.status_message = "error"
+            print(f"Session {session.session_id[:8]}: No API key provided")
+            return
+        # Create Mistral client
+        client = Mistral(api_key=session.api_key, server_url=MISTRAL_BASE_URL)
+        audio_format = AudioFormat(encoding="pcm_s16le", sample_rate=SAMPLE_RATE)
+        session.status_message = "connecting"
+        # Create the audio stream generator
+        audio_stream = audio_stream_from_queue(session)
+        print(f"Session {session.session_id[:8]}: Connecting to Mistral realtime API...")
+        async for event in client.audio.realtime.transcribe_stream(
+            audio_stream=audio_stream,
+            model=MODEL,
+            audio_format=audio_format,
+        ):
+            if not session.is_running:
+                break
+            if isinstance(event, RealtimeTranscriptionSessionCreated):
+                print(f"Session {session.session_id[:8]}: Connected to Mistral")
+                # Status is already set by audio_stream_from_queue
+            elif isinstance(event, TranscriptionStreamTextDelta):
+                delta = event.text
+                session.transcription_text += delta
+                # Track words for WPM calculation
+                words = delta.split()
+                for _ in words:
+                    session.word_timestamps.append(time.time())
+                session.current_wpm = calculate_wpm(session)
+            elif isinstance(event, TranscriptionStreamDone):
+                print(f"Session {session.session_id[:8]}: Transcription done")
+                break
+            elif isinstance(event, RealtimeTranscriptionError):
+                print(f"Session {session.session_id[:8]}: Error - {event.error}")
+                session.status_message = "error"
+                break
+            elif isinstance(event, UnknownRealtimeEvent):
+                continue  # Ignore unknown events
     except asyncio.CancelledError:
         pass  # Normal cancellation
     except Exception as e:
         error_msg = str(e) if str(e) else type(e).__name__
+        if "ConnectionReset" not in error_msg and "CancelledError" not in error_msg:
+            print(f"Session {session.session_id[:8]}: Mistral API error - {error_msg}")
         session.status_message = "error"
     finally:
         session.is_running = False
         # Only remove and log if not already handled by stop_session
         if not session._stopped_by_user:
                 print(f"Session {session.session_id[:8]} ended. Active sessions: {active_count}")
+def start_transcription(session):
+    """Start Mistral transcription using the shared event loop."""
     session.is_running = True
+    session._stop_event = asyncio.Event()
     # Register this session
     with _sessions_lock:
     # Submit to the shared event loop
     loop = get_event_loop()
+    future = asyncio.run_coroutine_threadsafe(mistral_transcription_handler(session), loop)
     session._task = future
     # Don't block - the coroutine runs in the background
+    # Cleanup happens in mistral_transcription_handler's finally block
 def ensure_session(session_id):
         if session.is_running:
             return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
+        # Check if API key is set
+        if not session.api_key:
+            session.status_message = "error"
+            return get_transcription_html("Please enter your Mistral API key above to start transcription.", "error", "")
         # Check if we've hit max concurrent sessions - kill all if so
         with _sessions_lock:
             active_at_capacity = len(_active_sessions) >= MAX_CONCURRENT_SESSIONS
         session.last_audio_time = time.time()
         session.status_message = "connecting"
+        # Start Mistral transcription (now non-blocking, uses shared event loop)
+        start_transcription(session)
     return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
+def stop_session(session_id, api_key=None):
+    """Stop the transcription and invalidate the session.
     Returns None for session_id so a fresh session is created on next recording.
     This prevents duplicate session issues when users stop and restart quickly.
         session.last_audio_time = None
         session._stopped_by_user = True  # Mark as user-stopped to avoid duplicate logging
+        # Signal the stop event to terminate the audio stream
+        if session._stop_event is not None:
             loop = get_event_loop()
             try:
+                asyncio.run_coroutine_threadsafe(
+                    _set_stop_event(session._stop_event), loop
+                )
             except Exception:
+                pass
+            session._stop_event = None
         # Cancel the running task if any
         if session._task is not None:
     return get_transcription_html(old_transcript, "ready", old_wpm), None
+async def _set_stop_event(event):
+    """Helper to set asyncio event from sync context."""
+    event.set()
+def clear_history(session_id, api_key=None):
+    """Stop the transcription and clear all history."""
     session = ensure_session(session_id)
     session.is_running = False
     session.last_audio_time = None
     session._stopped_by_user = True  # Mark as user-stopped
+    # Signal the stop event
+    if session._stop_event is not None:
         loop = get_event_loop()
         try:
+            asyncio.run_coroutine_threadsafe(
+                _set_stop_event(session._stop_event), loop
+            )
         except Exception:
             pass
+        session._stop_event = None
     # Cancel the running task if any
     if session._task is not None:
     return get_transcription_html("", "ready", "Calibrating..."), None, session.session_id
+def process_audio(audio, session_id, api_key):
     """Process incoming audio and queue for streaming."""
     # Check capacity - if at or above max, kill ALL sessions to reset
     with _sessions_lock:
             ""
         ), None
+    # Check if API key is provided
+    if not api_key or not api_key.strip():
+        return get_transcription_html(
+            "Please enter your Mistral API key above to start transcription.",
+            "error",
+            ""
+        ), None
     # Always ensure we have a valid session first
     try:
         session = ensure_session(session_id)
+        # Update API key on the session
+        session.api_key = api_key.strip()
     except Exception as e:
         print(f"Error creating session: {e}")
         # Create a fresh session if ensure_session fails
+        session = UserSession(api_key=api_key.strip())
         _session_registry[session.session_id] = session
     # Cache session_id early in case of later errors
     # Header
     gr.HTML(get_header_html())
+    # API Key input
+    with gr.Row():
+        api_key_input = gr.Textbox(
+            label="Mistral API Key",
+            placeholder="Enter your Mistral API key...",
+            type="password",
+            elem_id="api-key-input",
+            info="Get your API key from console.mistral.ai"
+        )
     # Transcription output
     transcription_display = gr.HTML(
         value=get_transcription_html("", "ready", "Calibrating..."),
     # Event handlers
     clear_btn.click(
         clear_history,
+        inputs=[session_state, api_key_input],
         outputs=[transcription_display, audio_input, session_state]
     )
     audio_input.stop_recording(
         stop_session,
+        inputs=[session_state, api_key_input],
         outputs=[transcription_display, session_state]
     )
     audio_input.stream(
         process_audio,
+        inputs=[audio_input, session_state, api_key_input],
         outputs=[transcription_display, session_state],
         show_progress="hidden",
         concurrency_limit=500,
     )
 get_event_loop()
 demo.queue(default_concurrency_limit=200)

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 gradio>=4.0.0
 websockets
 numpy

 gradio>=4.0.0
 websockets
 numpy
+mistralai[realtime]