Delay support with partial transcript

#4
by pandora-s - opened
Files changed (3) hide show
  1. app.py +498 -80
  2. requirements.txt +1 -1
  3. style.css +73 -1
app.py CHANGED
@@ -6,6 +6,7 @@ import threading
6
  import time
7
  import uuid
8
  from typing import AsyncIterator
 
9
 
10
  import gradio as gr
11
  import numpy as np
@@ -56,6 +57,7 @@ _last_cleanup = time.time()
56
  SESSION_REGISTRY_CLEANUP_INTERVAL = 90 # seconds
57
  SESSION_MAX_AGE = 90 # 90 seconds - remove sessions older than this
58
 
 
59
 
60
  def get_or_create_session(session_id: str = None) -> "UserSession":
61
  """Get existing session by ID or create a new one."""
@@ -202,12 +204,13 @@ def _run_event_loop():
202
 
203
  class UserSession:
204
  """Per-user session state."""
205
- def __init__(self, api_key: str = None):
206
  self.session_id = str(uuid.uuid4())
207
  self.api_key = api_key
 
208
  # Use a thread-safe queue for cross-thread communication
209
  self._audio_queue = queue.Queue(maxsize=200)
210
- self.transcription_text = ""
211
  self.is_running = False
212
  self.status_message = "ready"
213
  self.word_timestamps = []
@@ -218,6 +221,15 @@ class UserSession:
218
  self._task = None # Track the async task
219
  self._stop_event = None # Event to signal stop
220
  self._stopped_by_user = False # Track if user explicitly stopped
 
 
 
 
 
 
 
 
 
221
 
222
  @property
223
  def audio_queue(self):
@@ -227,6 +239,96 @@ class UserSession:
227
  def reset_queue(self):
228
  """Reset the audio queue."""
229
  self._audio_queue = queue.Queue(maxsize=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
 
232
  # Load CSS from external file
@@ -267,18 +369,81 @@ def get_status_html(status: str) -> str:
267
  return f"""<div class="status-badge {css_class}"><span class="status-dot{dot_anim}"></span><span style="color: inherit !important;">{label}</span></div>"""
268
 
269
 
270
- def get_transcription_html(transcript: str, status: str, wpm: str = "Calibrating...") -> str:
271
  """Generate the full transcription card HTML."""
272
  status_badge = get_status_html(status)
273
  wpm_badge = f'<div class="wpm-badge"><span style="color: #1E1E1E !important;">{wpm}</span></div>'
274
 
275
- if transcript:
276
- cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
277
- content_html = f"""
278
- <div class="transcript-text" style="color: #000000 !important;">
279
- {transcript}{cursor_html}
280
- </div>
281
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  elif status in ["listening", "warming", "connecting"]:
283
  content_html = """
284
  <div class="empty-state">
@@ -412,8 +577,101 @@ async def audio_stream_from_queue(session) -> AsyncIterator[bytes]:
412
  continue
413
 
414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
  async def mistral_transcription_handler(session):
416
- """Connect to Mistral realtime API and handle transcription."""
417
  try:
418
  if not session.api_key:
419
  session.status_message = "error"
@@ -426,46 +684,187 @@ async def mistral_transcription_handler(session):
426
 
427
  session.status_message = "connecting"
428
 
429
- # Create the audio stream generator
430
- audio_stream = audio_stream_from_queue(session)
431
-
432
  print(f"Session {session.session_id[:8]}: Connecting to Mistral realtime API...")
433
 
434
- async for event in client.audio.realtime.transcribe_stream(
435
- audio_stream=audio_stream,
436
- model=MODEL,
437
- audio_format=audio_format,
438
- ):
439
- if not session.is_running:
440
- break
441
-
442
- if isinstance(event, RealtimeTranscriptionSessionCreated):
443
- print(f"Session {session.session_id[:8]}: Connected to Mistral")
444
- # Status is already set by audio_stream_from_queue
445
-
446
- elif isinstance(event, TranscriptionStreamTextDelta):
447
- delta = event.text
448
- session.transcription_text += delta
 
 
 
 
449
 
450
- # Track words for WPM calculation
451
- words = delta.split()
452
- for _ in words:
453
- session.word_timestamps.append(time.time())
454
 
455
- session.current_wpm = calculate_wpm(session)
456
-
457
- elif isinstance(event, TranscriptionStreamDone):
458
- print(f"Session {session.session_id[:8]}: Transcription done")
459
- break
460
-
461
- elif isinstance(event, RealtimeTranscriptionError):
462
- print(f"Session {session.session_id[:8]}: Error - {event.error}")
463
- session.status_message = "error"
464
- break
465
-
466
- elif isinstance(event, UnknownRealtimeEvent):
467
- continue # Ignore unknown events
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  except asyncio.CancelledError:
470
  pass # Normal cancellation
471
  except Exception as e:
@@ -533,12 +932,12 @@ def auto_start_recording(session):
533
  # Protect against startup races: Gradio can call `process_audio` concurrently.
534
  with session._start_lock:
535
  if session.is_running:
536
- return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
537
 
538
  # Check if API key is set
539
  if not session.api_key:
540
  session.status_message = "error"
541
- return get_transcription_html("Please enter your Mistral API key above to start transcription.", "error", "")
542
 
543
  # Check if we've hit max concurrent sessions - kill all if so
544
  with _sessions_lock:
@@ -549,29 +948,32 @@ def auto_start_recording(session):
549
  if active_at_capacity or registry_over:
550
  kill_all_sessions()
551
  session.status_message = "error"
552
- return get_transcription_html("Server reset due to capacity. Please click the microphone to restart.", "error", "")
553
 
554
- session.transcription_text = ""
555
  session.word_timestamps = []
556
  session.current_wpm = "Calibrating..."
557
  session.session_start_time = time.time()
558
  session.last_audio_time = time.time()
559
  session.status_message = "connecting"
 
 
 
 
560
 
561
  # Start Mistral transcription (now non-blocking, uses shared event loop)
562
  start_transcription(session)
563
 
564
- return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm)
565
 
566
 
567
- def stop_session(session_id, api_key=None):
568
  """Stop the transcription and invalidate the session.
569
 
570
  Returns None for session_id so a fresh session is created on next recording.
571
  This prevents duplicate session issues when users stop and restart quickly.
572
  """
573
  session = ensure_session(session_id)
574
- old_transcript = session.transcription_text
575
  old_wpm = session.current_wpm
576
 
577
  if session.is_running:
@@ -607,7 +1009,7 @@ def stop_session(session_id, api_key=None):
607
 
608
  # Return None for session_id - a fresh session will be created on next recording
609
  # This ensures no duplicate sessions when users stop/start quickly
610
- return get_transcription_html(old_transcript, "ready", old_wpm), None
611
 
612
 
613
  async def _set_stop_event(event):
@@ -615,7 +1017,7 @@ async def _set_stop_event(event):
615
  event.set()
616
 
617
 
618
- def clear_history(session_id, api_key=None):
619
  """Stop the transcription and clear all history."""
620
  session = ensure_session(session_id)
621
  session.is_running = False
@@ -645,17 +1047,23 @@ def clear_history(session_id, api_key=None):
645
  # Reset the queue
646
  session.reset_queue()
647
 
648
- session.transcription_text = ""
 
 
649
  session.word_timestamps = []
650
  session.current_wpm = "Calibrating..."
651
  session.session_start_time = None
652
  session.status_message = "ready"
 
 
 
 
653
 
654
  # Return the session_id to maintain state
655
- return get_transcription_html("", "ready", "Calibrating..."), None, session.session_id
656
 
657
 
658
- def process_audio(audio, session_id, api_key):
659
  """Process incoming audio and queue for streaming."""
660
  # Check capacity - if at or above max, kill ALL sessions to reset
661
  with _sessions_lock:
@@ -672,28 +1080,33 @@ def process_audio(audio, session_id, api_key):
672
  if registry_count > MAX_CONCURRENT_SESSIONS or active_count > MAX_CONCURRENT_SESSIONS or (active_count >= MAX_CONCURRENT_SESSIONS and not is_active_user):
673
  kill_all_sessions()
674
  return get_transcription_html(
675
- "Server reset due to capacity. Please click the microphone to restart.",
676
  "error",
677
- ""
 
678
  ), None
679
 
680
  # Check if API key is provided
681
  if not api_key or not api_key.strip():
682
- return get_transcription_html(
683
- "Please enter your Mistral API key above to start transcription.",
684
- "error",
685
- ""
686
- ), None
 
687
 
688
  # Always ensure we have a valid session first
689
  try:
690
  session = ensure_session(session_id)
691
  # Update API key on the session
692
  session.api_key = api_key.strip()
 
 
693
  except Exception as e:
694
  print(f"Error creating session: {e}")
695
  # Create a fresh session if ensure_session fails
696
  session = UserSession(api_key=api_key.strip())
 
697
  _session_registry[session.session_id] = session
698
 
699
  # Cache session_id early in case of later errors
@@ -703,7 +1116,7 @@ def process_audio(audio, session_id, api_key):
703
  # Quick return if audio is None
704
  if audio is None:
705
  wpm = session.current_wpm if session.is_running else "Calibrating..."
706
- return get_transcription_html(session.transcription_text, session.status_message, wpm), current_session_id
707
 
708
  # Update last audio time for inactivity tracking
709
  session.last_audio_time = time.time()
@@ -714,7 +1127,7 @@ def process_audio(audio, session_id, api_key):
714
 
715
  # Skip processing if session stopped
716
  if not session.is_running:
717
- return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm), current_session_id
718
 
719
  sample_rate, audio_data = audio
720
 
@@ -747,14 +1160,11 @@ def process_audio(audio, session_id, api_key):
747
  except Exception:
748
  pass # Skip if queue is full
749
 
750
- return get_transcription_html(session.transcription_text, session.status_message, session.current_wpm), current_session_id
751
  except Exception as e:
752
  print(f"Error processing audio: {e}")
753
  # Return safe defaults - always include session_id to maintain state
754
- return get_transcription_html("", "error", ""), current_session_id
755
-
756
-
757
-
758
 
759
  # Gradio interface
760
  with gr.Blocks(title="Voxtral Real-time Transcription") as demo:
@@ -764,19 +1174,27 @@ with gr.Blocks(title="Voxtral Real-time Transcription") as demo:
764
  # Header
765
  gr.HTML(get_header_html())
766
 
767
- # API Key input
768
  with gr.Row():
769
  api_key_input = gr.Textbox(
770
- label="Mistral API Key",
771
- placeholder="Enter your Mistral API key...",
772
  type="password",
773
  elem_id="api-key-input",
774
- info="Get your API key from console.mistral.ai"
 
 
 
 
 
 
 
 
775
  )
776
 
777
  # Transcription output
778
  transcription_display = gr.HTML(
779
- value=get_transcription_html("", "ready", "Calibrating..."),
780
  elem_id="transcription-output"
781
  )
782
 
@@ -802,20 +1220,20 @@ with gr.Blocks(title="Voxtral Real-time Transcription") as demo:
802
  # Event handlers
803
  clear_btn.click(
804
  clear_history,
805
- inputs=[session_state, api_key_input],
806
  outputs=[transcription_display, audio_input, session_state]
807
  )
808
 
809
 
810
  audio_input.stop_recording(
811
  stop_session,
812
- inputs=[session_state, api_key_input],
813
  outputs=[transcription_display, session_state]
814
  )
815
 
816
  audio_input.stream(
817
  process_audio,
818
- inputs=[audio_input, session_state, api_key_input],
819
  outputs=[transcription_display, session_state],
820
  show_progress="hidden",
821
  concurrency_limit=500,
 
6
  import time
7
  import uuid
8
  from typing import AsyncIterator
9
+ import difflib
10
 
11
  import gradio as gr
12
  import numpy as np
 
57
  SESSION_REGISTRY_CLEANUP_INTERVAL = 90 # seconds
58
  SESSION_MAX_AGE = 90 # 90 seconds - remove sessions older than this
59
 
60
+ DEFAULT_API_KEY = os.environ.get("DEFAULT_API_KEY", "")
61
 
62
  def get_or_create_session(session_id: str = None) -> "UserSession":
63
  """Get existing session by ID or create a new one."""
 
204
 
205
  class UserSession:
206
  """Per-user session state."""
207
+ def __init__(self, api_key: str = ""):
208
  self.session_id = str(uuid.uuid4())
209
  self.api_key = api_key
210
+ self.partial_transcript_enabled = False # Default to disabled
211
  # Use a thread-safe queue for cross-thread communication
212
  self._audio_queue = queue.Queue(maxsize=200)
213
+ self.transcription_tuple = ("", "", "") # For 3 streams
214
  self.is_running = False
215
  self.status_message = "ready"
216
  self.word_timestamps = []
 
221
  self._task = None # Track the async task
222
  self._stop_event = None # Event to signal stop
223
  self._stopped_by_user = False # Track if user explicitly stopped
224
+ self.new_color_open = '<span style="color: #FFA500";>'
225
+ self.new_color_close = "</span>"
226
+
227
+ # Enhanced event tracking
228
+ self.stream_events = {
229
+ 'stream_1': [], # List of (timestamp, event_type, event_data) tuples
230
+ 'stream_2': [] # List of (timestamp, event_type, event_data) tuples
231
+ }
232
+ self.last_event_timestamp = None
233
 
234
  @property
235
  def audio_queue(self):
 
239
  def reset_queue(self):
240
  """Reset the audio queue."""
241
  self._audio_queue = queue.Queue(maxsize=200)
242
+
243
def get_event_summary(self):
    """Return per-stream event summaries plus aggregate statistics.

    Each summarized event carries its timestamp, its type, and every other
    recorded field grouped under 'data'.
    """
    def _describe(event):
        # Everything that is not the timestamp/type goes into 'data'.
        extras = {key: value for key, value in event.items()
                  if key not in ('timestamp', 'type')}
        return {
            'timestamp': event.get('timestamp', 0),
            'type': event.get('type', 'unknown'),
            'data': extras,
        }

    events_1 = self.stream_events['stream_1']
    events_2 = self.stream_events['stream_2']
    return {
        'stream_1': [_describe(ev) for ev in events_1],
        'stream_2': [_describe(ev) for ev in events_2],
        'stats': {
            'stream_1_count': len(events_1),
            'stream_2_count': len(events_2),
            'last_event_time': self.last_event_timestamp,
            'total_events': len(events_1) + len(events_2),
        },
    }
265
+
266
def clear_events(self):
    """Drop all recorded stream events and reset the merged transcript state."""
    self.stream_events = {'stream_1': [], 'stream_2': []}
    self.last_event_timestamp = None
    # Empty triple matches the initial (fast, slow, merged) shape.
    self.transcription_tuple = ("", "", "")
274
+
275
+ @staticmethod
276
+ def _normalize_word(word: str) -> str:
277
+ return word.strip(".,!?;:\"'()[]{}").lower()
278
+
279
+ def _compute_display_texts(self, slow_text, fast_text) -> tuple[str, str]:
280
+ slow_words = slow_text.split()
281
+ fast_words = fast_text.split()
282
+
283
+ if not slow_words:
284
+ partial_text = f" {fast_text}".rstrip()
285
+ return "", partial_text
286
+
287
+ slow_norm = [self._normalize_word(word) for word in slow_words]
288
+ fast_norm = [self._normalize_word(word) for word in fast_words]
289
+
290
+ matcher = difflib.SequenceMatcher(None, slow_norm, fast_norm)
291
+ last_fast_index = 0
292
+ slow_progress = 0
293
+ for block in matcher.get_matching_blocks():
294
+ if block.size == 0:
295
+ continue
296
+ slow_end = block.a + block.size
297
+ if slow_end > slow_progress:
298
+ slow_progress = slow_end
299
+ last_fast_index = block.b + block.size
300
+
301
+ if last_fast_index < len(fast_words):
302
+ ahead_words = fast_words[last_fast_index:]
303
+ partial_text = " " + " ".join(ahead_words) if ahead_words else ""
304
+ else:
305
+ partial_text = ""
306
+
307
+ return slow_text, partial_text
308
+
309
def reconstruct_transcription(self):
    """Rebuild the (fast, slow, merged) transcript tuple from stored events.

    Stream 1 is the low-latency preview, stream 2 the delayed/final text;
    the third element is the final text with the not-yet-confirmed preview
    suffix wrapped in the highlight color span.
    """
    def _joined(stream_name):
        # Concatenate every text_delta recorded for the stream, in order.
        return "".join(
            ev.get('text', '')
            for ev in self.stream_events[stream_name]
            if ev.get('type') == 'text_delta'
        )

    preview_text = _joined('stream_1')
    final_text = _joined('stream_2')

    # Slow stream is authoritative; the preview only contributes its tail.
    confirmed, pending = self._compute_display_texts(final_text, preview_text)
    merged = f"{confirmed}{self.new_color_open}{pending}{self.new_color_close}"

    return (preview_text, final_text, merged)
332
 
333
 
334
  # Load CSS from external file
 
369
  return f"""<div class="status-badge {css_class}"><span class="status-dot{dot_anim}"></span><span style="color: inherit !important;">{label}</span></div>"""
370
 
371
 
372
+ def get_transcription_html(transcripts: tuple, status: str, wpm: str = "Calibrating...", partial_transcript_enabled: bool = False) -> str:
373
  """Generate the full transcription card HTML."""
374
  status_badge = get_status_html(status)
375
  wpm_badge = f'<div class="wpm-badge"><span style="color: #1E1E1E !important;">{wpm}</span></div>'
376
 
377
+ if transcripts:
378
+ # Check if partial transcript is enabled and we have 3 streams
379
+ if partial_transcript_enabled and len(transcripts) >= 3 and transcripts[0] and transcripts[1] and transcripts[2]:
380
+ # Split into three streams
381
+ stream1_content, stream2_content, stream3_content = transcripts
382
+
383
+ cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
384
+ content_html = f"""
385
+ <div class="triple-stream-container">
386
+ <div class="stream-box">
387
+ <div class="stream-label">Stream 1 (Preview - 240ms)</div>
388
+ <div class="transcript-text" style="color: #000000 !important;">
389
+ {stream1_content}{cursor_html}
390
+ </div>
391
+ </div>
392
+ <div class="stream-box">
393
+ <div class="stream-label">Stream 2 (Final - 2.4s)</div>
394
+ <div class="transcript-text" style="color: #000000 !important;">
395
+ {stream2_content}{cursor_html}
396
+ </div>
397
+ </div>
398
+ <div class="stream-box">
399
+ <div class="stream-label">Stream 3 (Merged)</div>
400
+ <div class="transcript-text" style="color: #000000 !important;">
401
+ {stream3_content}{cursor_html}
402
+ </div>
403
+ </div>
404
+ </div>
405
+ """
406
+ # Check if we have 3 streams (backward compatibility for when partial transcript is disabled)
407
+ elif len(transcripts) >= 3 and transcripts[0] and transcripts[1] and transcripts[2]:
408
+ # Show only the merged stream when partial transcript is disabled
409
+ stream3_content = transcripts[2]
410
+
411
+ cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
412
+ content_html = f"""
413
+ <div class="transcript-text" style="color: #000000 !important;">
414
+ {stream3_content}{cursor_html}
415
+ </div>
416
+ """
417
+ # Check if transcript contains both streams (backward compatibility)
418
+ elif transcripts[0] and transcripts[1]:
419
+ # Split the transcript into two streams
420
+ stream1_content, stream2_content = transcripts
421
+
422
+ cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
423
+ content_html = f"""
424
+ <div class="dual-stream-container">
425
+ <div class="stream-box">
426
+ <div class="stream-label">Stream 1</div>
427
+ <div class="transcript-text" style="color: #000000 !important;">
428
+ {stream1_content}{cursor_html}
429
+ </div>
430
+ </div>
431
+ <div class="stream-box">
432
+ <div class="stream-label">Stream 2</div>
433
+ <div class="transcript-text" style="color: #000000 !important;">
434
+ {stream2_content}{cursor_html}
435
+ </div>
436
+ </div>
437
+ </div>
438
+ """
439
+ else:
440
+ # Single stream (backward compatibility)
441
+ cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
442
+ content_html = f"""
443
+ <div class="transcript-text" style="color: #000000 !important;">
444
+ {transcripts[0]}{cursor_html}
445
+ </div>
446
+ """
447
  elif status in ["listening", "warming", "connecting"]:
448
  content_html = """
449
  <div class="empty-state">
 
577
  continue
578
 
579
 
580
class AudioStreamDuplicator:
    """Fan one session audio stream out to multiple independent consumers.

    Chunks are appended to a shared buffer (by an external filler task) and
    each registered consumer keeps its own read position, so every consumer
    sees every chunk.

    NOTE(review): `buffer` grows for the whole session lifetime — chunks are
    never evicted once all consumers have read them. Acceptable for short
    sessions; revisit if sessions can run long.
    """

    def __init__(self, session):
        self.session = session
        self.consumers = []           # registered consumer ids
        self.buffer = []              # shared chunk buffer (bytes per entry)
        self.consumer_positions = {}  # consumer_id -> next buffer index to read
        self.lock = asyncio.Lock()    # guards buffer/consumer_positions

    async def add_consumer(self):
        """Register a new consumer and return its async audio-bytes stream."""
        consumer_id = len(self.consumers)
        self.consumers.append(consumer_id)
        self.consumer_positions[consumer_id] = 0  # start from the beginning
        return self._create_consumer_stream(consumer_id)

    async def _create_consumer_stream(self, consumer_id):
        """Yield warmup silence, then every buffered chunk, until the session stops."""
        # Warmup: stream WARMUP_DURATION seconds of PCM16 silence in 100ms chunks.
        num_samples = int(SAMPLE_RATE * WARMUP_DURATION)
        silence = np.zeros(num_samples, dtype=np.int16)
        chunk_size = int(SAMPLE_RATE * 0.1)  # 100ms chunks

        for i in range(0, num_samples, chunk_size):
            if not self.session.is_running:
                return
            yield silence[i:i + chunk_size].tobytes()
            await asyncio.sleep(0.05)

        # Main loop: drain the shared buffer from this consumer's position.
        while self.session.is_running:
            # Inactivity timeout: no audio received for too long.
            if self.session.last_audio_time is not None:
                idle = time.time() - self.session.last_audio_time
                if idle >= INACTIVITY_TIMEOUT:
                    self.session.is_running = False
                    self.session.status_message = "ready"
                    return

            # Hard cap on total session duration.
            if self.session.session_start_time is not None:
                elapsed = time.time() - self.session.session_start_time
                if elapsed >= SESSION_TIMEOUT:
                    self.session.is_running = False
                    self.session.status_message = "timeout"
                    return

            # Explicit stop requested by the user.
            if self.session._stop_event and self.session._stop_event.is_set():
                return

            # Read under the lock, but yield/sleep OUTSIDE it. The previous
            # version held the lock across `yield` (suspending the generator
            # while locked) and across the 50ms sleep, starving the buffer
            # filler task and the other consumer.
            audio_bytes = None
            async with self.lock:
                position = self.consumer_positions[consumer_id]
                if position < len(self.buffer):
                    audio_bytes = self.buffer[position]
                    self.consumer_positions[consumer_id] = position + 1

            if audio_bytes is not None:
                yield audio_bytes
            else:
                # No new audio yet; back off briefly.
                await asyncio.sleep(0.05)
644
+
645
+
646
async def audio_stream_duplicator_from_queue(session):
    """Create an AudioStreamDuplicator fed from the session's audio queue.

    Spawns a background task that drains the session's thread-safe queue
    (base64-encoded PCM16 chunks) into the duplicator's shared buffer, from
    which each consumer stream reads independently.
    """
    duplicator = AudioStreamDuplicator(session)

    async def fill_buffer():
        # Drain the cross-thread queue into the shared buffer until the
        # session stops.
        while session.is_running:
            try:
                b64_chunk = session.audio_queue.get_nowait()
            except queue.Empty:
                # No audio available; yield control briefly.
                await asyncio.sleep(0.05)
                continue
            # Decode base64 to raw PCM16 bytes; all consumers see this chunk.
            audio_bytes = base64.b64decode(b64_chunk)
            async with duplicator.lock:
                duplicator.buffer.append(audio_bytes)

    # Keep a strong reference to the task: the event loop holds only weak
    # references to tasks, so a bare create_task() result can be garbage
    # collected mid-flight and silently stop filling the buffer.
    duplicator._fill_task = asyncio.create_task(fill_buffer())

    return duplicator
671
+
672
+
673
  async def mistral_transcription_handler(session):
674
+ """Connect to Mistral realtime API and handle transcription with 2 parallel streams."""
675
  try:
676
  if not session.api_key:
677
  session.status_message = "error"
 
684
 
685
  session.status_message = "connecting"
686
 
 
 
 
687
  print(f"Session {session.session_id[:8]}: Connecting to Mistral realtime API...")
688
 
689
+ # Create a duplicator that can serve multiple audio streams
690
+ duplicator = await audio_stream_duplicator_from_queue(session)
691
+ print(f"Session {session.session_id[:8]}: Created audio stream duplicator for parallel processing")
692
+
693
+ # Create separate audio streams from the duplicator
694
+ audio_stream_1 = await duplicator.add_consumer()
695
+ audio_stream_2 = await duplicator.add_consumer()
696
+ print(f"Session {session.session_id[:8]}: Created 2 separate audio streams from duplicator")
697
+
698
+ # Create tasks for both transcription streams
699
+ async def process_stream_1():
700
+ async for event_1 in client.audio.realtime.transcribe_stream(
701
+ audio_stream=audio_stream_1,
702
+ model=MODEL,
703
+ audio_format=audio_format,
704
+ target_streaming_delay_ms=240
705
+ ):
706
+ if not session.is_running:
707
+ break
708
 
709
+ current_time = time.time()
 
 
 
710
 
711
+ if isinstance(event_1, RealtimeTranscriptionSessionCreated):
712
+ event_data = {
713
+ 'type': 'session_created',
714
+ 'timestamp': current_time,
715
+ 'session_id': event_1.session_id if hasattr(event_1, 'session_id') else None
716
+ }
717
+ session.stream_events['stream_1'].append(event_data)
718
+ session.last_event_timestamp = current_time
719
+ print(f"Session {session.session_id[:8]}: Stream 1 connected to Mistral - {current_time:.3f}")
720
+
721
+ elif isinstance(event_1, TranscriptionStreamTextDelta):
722
+ delta = event_1.text
723
+
724
+ # Get current full text by reconstructing from events
725
+ current_full_text = ""
726
+ for e in session.stream_events['stream_1']:
727
+ if e.get('type') == 'text_delta':
728
+ current_full_text += e.get('text', '')
729
+ current_full_text += delta
730
+
731
+ event_data = {
732
+ 'type': 'text_delta',
733
+ 'timestamp': current_time,
734
+ 'text': delta,
735
+ 'full_text': current_full_text
736
+ }
737
+ session.stream_events['stream_1'].append(event_data)
738
+ session.last_event_timestamp = current_time
739
+ print(f'1 [{current_time:.3f}]', delta, end="", flush=True)
740
+
741
+ words = delta.split()
742
+ for _ in words:
743
+ session.word_timestamps.append(time.time())
744
+
745
+ session.current_wpm = calculate_wpm(session)
746
+
747
+ elif isinstance(event_1, TranscriptionStreamDone):
748
+ event_data = {
749
+ 'type': 'stream_done',
750
+ 'timestamp': current_time
751
+ }
752
+ session.stream_events['stream_1'].append(event_data)
753
+ session.last_event_timestamp = current_time
754
+ print(f"Session {session.session_id[:8]}: Stream 1 transcription done - {current_time:.3f}")
755
+ break
756
+
757
+ elif isinstance(event_1, RealtimeTranscriptionError):
758
+ event_data = {
759
+ 'type': 'error',
760
+ 'timestamp': current_time,
761
+ 'error': str(event_1.error)
762
+ }
763
+ session.stream_events['stream_1'].append(event_data)
764
+ session.last_event_timestamp = current_time
765
+ print(f"Session {session.session_id[:8]}: Stream 1 error - {event_1.error} - {current_time:.3f}")
766
+ break
767
+
768
+ elif isinstance(event_1, UnknownRealtimeEvent):
769
+ event_data = {
770
+ 'type': 'unknown_event',
771
+ 'timestamp': current_time,
772
+ 'event': str(event_1)
773
+ }
774
+ session.stream_events['stream_1'].append(event_data)
775
+ session.last_event_timestamp = current_time
776
+ continue # Ignore unknown events
777
+
778
+ async def process_stream_2():
779
+ async for event_2 in client.audio.realtime.transcribe_stream(
780
+ audio_stream=audio_stream_2,
781
+ model=MODEL,
782
+ audio_format=audio_format,
783
+ target_streaming_delay_ms=2400
784
+ ):
785
+ if not session.is_running:
786
+ break
787
+
788
+ current_time = time.time()
789
 
790
+ if isinstance(event_2, RealtimeTranscriptionSessionCreated):
791
+ event_data = {
792
+ 'type': 'session_created',
793
+ 'timestamp': current_time,
794
+ 'session_id': event_2.session_id if hasattr(event_2, 'session_id') else None
795
+ }
796
+ session.stream_events['stream_2'].append(event_data)
797
+ session.last_event_timestamp = current_time
798
+ print(f"Session {session.session_id[:8]}: Stream 2 connected to Mistral - {current_time:.3f}")
799
+
800
+ elif isinstance(event_2, TranscriptionStreamTextDelta):
801
+ delta = event_2.text
802
+
803
+ # Get current full text by reconstructing from events
804
+ current_full_text = ""
805
+ for e in session.stream_events['stream_2']:
806
+ if e.get('type') == 'text_delta':
807
+ current_full_text += e.get('text', '')
808
+ current_full_text += delta
809
+
810
+ event_data = {
811
+ 'type': 'text_delta',
812
+ 'timestamp': current_time,
813
+ 'text': delta,
814
+ 'full_text': current_full_text
815
+ }
816
+ session.stream_events['stream_2'].append(event_data)
817
+ session.last_event_timestamp = current_time
818
+ print(f'2 [{current_time:.3f}]', delta, end="", flush=True)
819
+
820
+ session.current_wpm = calculate_wpm(session)
821
+
822
+ elif isinstance(event_2, TranscriptionStreamDone):
823
+ event_data = {
824
+ 'type': 'stream_done',
825
+ 'timestamp': current_time
826
+ }
827
+ session.stream_events['stream_2'].append(event_data)
828
+ session.last_event_timestamp = current_time
829
+ print(f"Session {session.session_id[:8]}: Stream 2 transcription done - {current_time:.3f}")
830
+ break
831
+
832
+ elif isinstance(event_2, RealtimeTranscriptionError):
833
+ event_data = {
834
+ 'type': 'error',
835
+ 'timestamp': current_time,
836
+ 'error': str(event_2.error)
837
+ }
838
+ session.stream_events['stream_2'].append(event_data)
839
+ session.last_event_timestamp = current_time
840
+ print(f"Session {session.session_id[:8]}: Stream 2 error - {event_2.error} - {current_time:.3f}")
841
+ break
842
+
843
+ elif isinstance(event_2, UnknownRealtimeEvent):
844
+ event_data = {
845
+ 'type': 'unknown_event',
846
+ 'timestamp': current_time,
847
+ 'event': str(event_2)
848
+ }
849
+ session.stream_events['stream_2'].append(event_data)
850
+ session.last_event_timestamp = current_time
851
+ continue # Ignore unknown events
852
+
853
+ # Run both streams in parallel
854
+ stream1_task = asyncio.create_task(process_stream_1())
855
+ stream2_task = asyncio.create_task(process_stream_2())
856
+
857
+ # Wait for both streams to complete
858
+ await asyncio.gather(stream1_task, stream2_task)
859
+
860
+ # Final transcription is already reconstructed from events
861
+ # Just add stats to the display
862
+ event_summary = session.get_event_summary()
863
+ stats_text = f"Events: {event_summary['stats']['total_events']} (S1: {event_summary['stats']['stream_1_count']}, S2: {event_summary['stats']['stream_2_count']})"
864
+
865
+ # Store the reconstructed transcription as tuple
866
+ session.transcription_tuple = session.reconstruct_transcription()
867
+
868
  except asyncio.CancelledError:
869
  pass # Normal cancellation
870
  except Exception as e:
 
932
  # Protect against startup races: Gradio can call `process_audio` concurrently.
933
  with session._start_lock:
934
  if session.is_running:
935
+ return get_transcription_html(session.reconstruct_transcription(), session.status_message, session.current_wpm, session.partial_transcript_enabled)
936
 
937
  # Check if API key is set
938
  if not session.api_key:
939
  session.status_message = "error"
940
+ return get_transcription_html(("Please enter your Mistral API key above to start transcription.","",""), "error", "", False)
941
 
942
  # Check if we've hit max concurrent sessions - kill all if so
943
  with _sessions_lock:
 
948
  if active_at_capacity or registry_over:
949
  kill_all_sessions()
950
  session.status_message = "error"
951
+ return get_transcription_html(("Server reset due to capacity. Please click the microphone to restart.","",""), "error", "", False)
952
 
 
953
  session.word_timestamps = []
954
  session.current_wpm = "Calibrating..."
955
  session.session_start_time = time.time()
956
  session.last_audio_time = time.time()
957
  session.status_message = "connecting"
958
+ session.stream_events = {
959
+ 'stream_1': [],
960
+ 'stream_2': []
961
+ }
962
 
963
  # Start Mistral transcription (now non-blocking, uses shared event loop)
964
  start_transcription(session)
965
 
966
+ return get_transcription_html(session.reconstruct_transcription(), session.status_message, session.current_wpm, session.partial_transcript_enabled)
967
 
968
 
969
+ def stop_session(session_id, api_key=None, partial_transcript=False):
970
  """Stop the transcription and invalidate the session.
971
 
972
  Returns None for session_id so a fresh session is created on next recording.
973
  This prevents duplicate session issues when users stop and restart quickly.
974
  """
975
  session = ensure_session(session_id)
976
+ old_transcripts = session.reconstruct_transcription()
977
  old_wpm = session.current_wpm
978
 
979
  if session.is_running:
 
1009
 
1010
  # Return None for session_id - a fresh session will be created on next recording
1011
  # This ensures no duplicate sessions when users stop/start quickly
1012
+ return get_transcription_html(old_transcripts, "ready", old_wpm, partial_transcript), None
1013
 
1014
 
1015
  async def _set_stop_event(event):
 
1017
  event.set()
1018
 
1019
 
1020
+ def clear_history(session_id, api_key=None, partial_transcript=False):
1021
  """Stop the transcription and clear all history."""
1022
  session = ensure_session(session_id)
1023
  session.is_running = False
 
1047
  # Reset the queue
1048
  session.reset_queue()
1049
 
1050
+ # Clear event history
1051
+ session.clear_events()
1052
+
1053
  session.word_timestamps = []
1054
  session.current_wpm = "Calibrating..."
1055
  session.session_start_time = None
1056
  session.status_message = "ready"
1057
+ session.stream_events = {
1058
+ 'stream_1': [],
1059
+ 'stream_2': []
1060
+ }
1061
 
1062
  # Return the session_id to maintain state
1063
+ return get_transcription_html(("",), "ready", "Calibrating...", False), None, session.session_id
1064
 
1065
 
1066
+ def process_audio(audio, session_id, api_key, partial_transcript=False):
1067
  """Process incoming audio and queue for streaming."""
1068
  # Check capacity - if at or above max, kill ALL sessions to reset
1069
  with _sessions_lock:
 
1080
  if registry_count > MAX_CONCURRENT_SESSIONS or active_count > MAX_CONCURRENT_SESSIONS or (active_count >= MAX_CONCURRENT_SESSIONS and not is_active_user):
1081
  kill_all_sessions()
1082
  return get_transcription_html(
1083
+ ("Server reset due to capacity. Please click the microphone to restart.","",""),
1084
  "error",
1085
+ "",
1086
+ False
1087
  ), None
1088
 
1089
  # Check if API key is provided
1090
  if not api_key or not api_key.strip():
1091
+ # return get_transcription_html(
1092
+ # ("Please enter your Mistral API key above to start transcription.","",""),
1093
+ # "error",
1094
+ # ""
1095
+ # ), None
1096
+ api_key = DEFAULT_API_KEY
1097
 
1098
  # Always ensure we have a valid session first
1099
  try:
1100
  session = ensure_session(session_id)
1101
  # Update API key on the session
1102
  session.api_key = api_key.strip()
1103
+ # Store partial transcript preference on the session
1104
+ session.partial_transcript_enabled = partial_transcript
1105
  except Exception as e:
1106
  print(f"Error creating session: {e}")
1107
  # Create a fresh session if ensure_session fails
1108
  session = UserSession(api_key=api_key.strip())
1109
+ session.partial_transcript_enabled = partial_transcript
1110
  _session_registry[session.session_id] = session
1111
 
1112
  # Cache session_id early in case of later errors
 
1116
  # Quick return if audio is None
1117
  if audio is None:
1118
  wpm = session.current_wpm if session.is_running else "Calibrating..."
1119
+ return get_transcription_html(session.reconstruct_transcription(), session.status_message, wpm, session.partial_transcript_enabled), current_session_id
1120
 
1121
  # Update last audio time for inactivity tracking
1122
  session.last_audio_time = time.time()
 
1127
 
1128
  # Skip processing if session stopped
1129
  if not session.is_running:
1130
+ return get_transcription_html(session.reconstruct_transcription(), session.status_message, session.current_wpm, session.partial_transcript_enabled), current_session_id
1131
 
1132
  sample_rate, audio_data = audio
1133
 
 
1160
  except Exception:
1161
  pass # Skip if queue is full
1162
 
1163
+ return get_transcription_html(session.reconstruct_transcription(), session.status_message, session.current_wpm, session.partial_transcript_enabled), current_session_id
1164
  except Exception as e:
1165
  print(f"Error processing audio: {e}")
1166
  # Return safe defaults - always include session_id to maintain state
1167
+ return get_transcription_html(("",), "error", "", False), current_session_id
 
 
 
1168
 
1169
  # Gradio interface
1170
  with gr.Blocks(title="Voxtral Real-time Transcription") as demo:
 
1174
  # Header
1175
  gr.HTML(get_header_html())
1176
 
1177
+ # API Key input with partial transcript checkbox
1178
  with gr.Row():
1179
  api_key_input = gr.Textbox(
1180
+ label="Mistral API Key (optional)",
1181
+ placeholder="Enter your own Mistral API key if you encounter issues.",
1182
  type="password",
1183
  elem_id="api-key-input",
1184
+ info="Get your API key from console.mistral.ai",
1185
+ scale=4
1186
+ )
1187
+ partial_transcript_checkbox = gr.Checkbox(
1188
+ label="Partial Transcript",
1189
+ info="Enable to show 2 streams + merged output",
1190
+ value=False,
1191
+ elem_id="partial-transcript-checkbox",
1192
+ scale=1
1193
  )
1194
 
1195
  # Transcription output
1196
  transcription_display = gr.HTML(
1197
+ value=get_transcription_html(("","",""), "ready", "Calibrating...", False),
1198
  elem_id="transcription-output"
1199
  )
1200
 
 
1220
  # Event handlers
1221
  clear_btn.click(
1222
  clear_history,
1223
+ inputs=[session_state, api_key_input, partial_transcript_checkbox],
1224
  outputs=[transcription_display, audio_input, session_state]
1225
  )
1226
 
1227
 
1228
  audio_input.stop_recording(
1229
  stop_session,
1230
+ inputs=[session_state, api_key_input, partial_transcript_checkbox],
1231
  outputs=[transcription_display, session_state]
1232
  )
1233
 
1234
  audio_input.stream(
1235
  process_audio,
1236
+ inputs=[audio_input, session_state, api_key_input, partial_transcript_checkbox],
1237
  outputs=[transcription_display, session_state],
1238
  show_progress="hidden",
1239
  concurrency_limit=500,
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  gradio>=4.0.0
2
  websockets
3
  numpy
4
- mistralai[realtime]
 
1
  gradio>=4.0.0
2
  websockets
3
  numpy
4
+ mistralai[realtime]>=1.12.3
style.css CHANGED
@@ -191,6 +191,57 @@ body, .gradio-container {
191
  color: #000000 !important;
192
  white-space: pre-wrap;
193
  word-break: break-word;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  }
195
 
196
  .transcript-cursor {
@@ -283,6 +334,27 @@ footer {
283
  display: none !important;
284
  }
285
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  .gradio-container .prose {
287
  max-width: none !important;
288
  }
@@ -294,4 +366,4 @@ footer {
294
  font-style: italic;
295
  text-align: center;
296
  margin-top: 1rem;
297
- }
 
191
  color: #000000 !important;
192
  white-space: pre-wrap;
193
  word-break: break-word;
194
+ text-align: left !important;
195
+ margin: 0 !important;
196
+ padding: 0 !important;
197
+ }
198
+
199
+ /* Fix for first text chunk alignment */
200
+ .transcript-text:first-letter {
201
+ margin-left: 0 !important;
202
+ }
203
+
204
+ .transcript-text::first-line {
205
+ text-indent: 0 !important;
206
+ }
207
+
208
+ .dual-stream-container {
209
+ display: grid;
210
+ grid-template-columns: 1fr 1fr;
211
+ gap: 1rem;
212
+ height: 100%;
213
+ text-align: left !important;
214
+ }
215
+
216
+ .triple-stream-container {
217
+ display: grid;
218
+ grid-template-columns: 1fr 1fr 1fr;
219
+ gap: 1rem;
220
+ height: 100%;
221
+ text-align: left !important;
222
+ }
223
+
224
+ .stream-box {
225
+ background: rgba(255, 255, 255, 0.6) !important;
226
+ border: 1px solid #E9E2CB;
227
+ border-radius: 4px;
228
+ padding: 0.75rem;
229
+ height: 100%;
230
+ overflow-y: auto;
231
+ text-align: left !important;
232
+ }
233
+
234
+ .stream-label {
235
+ font-family: 'JetBrains Mono', monospace !important;
236
+ font-size: 0.75rem !important;
237
+ font-weight: 700 !important;
238
+ color: #FF8205 !important;
239
+ text-transform: uppercase;
240
+ letter-spacing: 0.05em;
241
+ margin-bottom: 0.5rem;
242
+ padding-bottom: 0.25rem;
243
+ border-bottom: 1px solid #FF8205;
244
+ text-align: left !important;
245
  }
246
 
247
  .transcript-cursor {
 
334
  display: none !important;
335
  }
336
 
337
+ /* Partial transcript checkbox styling */
338
+ #partial-transcript-checkbox {
339
+ display: flex;
340
+ align-items: center;
341
+ justify-content: center;
342
+ margin-left: 1rem;
343
+ }
344
+
345
+ #partial-transcript-checkbox .gradio-checkbox {
346
+ transform: scale(1.2);
347
+ }
348
+
349
+ #partial-transcript-checkbox label {
350
+ font-family: 'JetBrains Mono', monospace !important;
351
+ font-size: 0.85rem !important;
352
+ font-weight: 600 !important;
353
+ color: #1E1E1E !important;
354
+ text-transform: uppercase;
355
+ letter-spacing: 0.05em;
356
+ }
357
+
358
  .gradio-container .prose {
359
  max-width: none !important;
360
  }
 
366
  font-style: italic;
367
  text-align: center;
368
  margin-top: 1rem;
369
+ }