Spaces:

mistralai
/

Voxtral-Mini-Realtime

Paused

App Files Community

Handle no partial

by pandora-s - opened Feb 17

base: refs/heads/main

←

from: refs/pr/5

Discussion Files changed

+136

-109

Files changed (1) hide show

app.py +136 -109

app.py CHANGED Viewed

@@ -310,27 +310,32 @@ class UserSession:
         """Reconstruct transcription text from stream events."""
         stream1_text = ""
         stream2_text = ""
         # Reconstruct from text_delta events
         for event in self.stream_events['stream_1']:
             if event.get('type') == 'text_delta':
                 stream1_text += event.get('text', '')
-        for event in self.stream_events['stream_2']:
-            if event.get('type') == 'text_delta':
-                stream2_text += event.get('text', '')
-        # Stream 3
         stream3_final = stream2_text
         stream3_preview = stream1_text
         stream3_final, stream3_preview = self._compute_display_texts(stream3_final, stream3_preview)
         stream3_text = stream3_final + self.new_color_open + stream3_preview + self.new_color_close
         # Return as tuple for compatibility with HTML function
         return (stream1_text, stream2_text, stream3_text)
 # Load CSS from external file
 css_path = os.path.join(os.path.dirname(__file__), "style.css")
 with open(css_path, "r") as f:
@@ -373,77 +378,85 @@ def get_transcription_html(transcripts: tuple, status: str, wpm: str = "Calibrat
     """Generate the full transcription card HTML."""
     status_badge = get_status_html(status)
     wpm_badge = f'<div class="wpm-badge"><span style="color: #1E1E1E !important;">{wpm}</span></div>'
     if transcripts:
-        # Check if partial transcript is enabled and we have 3 streams
-        if partial_transcript_enabled and len(transcripts) >= 3 and transcripts[0] and transcripts[1] and transcripts[2]:
-            # Split into three streams
-            stream1_content, stream2_content, stream3_content = transcripts
             cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
             content_html = f"""
-            <div class="triple-stream-container">
-                <div class="stream-box">
-                    <div class="stream-label">Stream 1 (Preview - 240ms)</div>
-                    <div class="transcript-text" style="color: #000000 !important;">
 {stream1_content}{cursor_html}
                     </div>
-                </div>
-                <div class="stream-box">
-                    <div class="stream-label">Stream 2 (Final - 2.4s)</div>
-                    <div class="transcript-text" style="color: #000000 !important;">
 {stream2_content}{cursor_html}
                     </div>
-                </div>
-                <div class="stream-box">
-                    <div class="stream-label">Stream 3 (Merged)</div>
-                    <div class="transcript-text" style="color: #000000 !important;">
 {stream3_content}{cursor_html}
                     </div>
                 </div>
-            </div>
-            """
-        # Check if we have 3 streams (backward compatibility for when partial transcript is disabled)
-        elif len(transcripts) >= 3 and transcripts[0] and transcripts[1] and transcripts[2]:
-            # Show only the merged stream when partial transcript is disabled
-            stream3_content = transcripts[2]
-            cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
-            content_html = f"""
-            <div class="transcript-text" style="color: #000000 !important;">
 {stream3_content}{cursor_html}
-            </div>
-            """
-        # Check if transcript contains both streams (backward compatibility)
-        elif transcripts[0] and transcripts[1]:
-            # Split the transcript into two streams
-            stream1_content, stream2_content = transcripts
-            cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
-            content_html = f"""
-            <div class="dual-stream-container">
-                <div class="stream-box">
-                    <div class="stream-label">Stream 1</div>
-                    <div class="transcript-text" style="color: #000000 !important;">
 {stream1_content}{cursor_html}
                     </div>
-                </div>
-                <div class="stream-box">
-                    <div class="stream-label">Stream 2</div>
-                    <div class="transcript-text" style="color: #000000 !important;">
 {stream2_content}{cursor_html}
                     </div>
                 </div>
-            </div>
-            """
-        else:
-            # Single stream (backward compatibility)
-            cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
-            content_html = f"""
-            <div class="transcript-text" style="color: #000000 !important;">
 {transcripts[0]}{cursor_html}
-            </div>
-            """
     elif status in ["listening", "warming", "connecting"]:
         content_html = """
         <div class="empty-state">
@@ -469,13 +482,13 @@ def get_transcription_html(transcripts: tuple, status: str, wpm: str = "Calibrat
             <p class="empty-text" style="color: #555555 !important;">// Click the microphone to start.</p>
         </div>
         """
     # Use base64 image if available
     if VOXTRAL_ICON_B64:
         icon_html = f'<img src="data:image/png;base64,{VOXTRAL_ICON_B64}" alt="Voxtral" class="voxtral-icon" />'
     else:
         icon_html = '<span style="font-size:20px;">🎙️</span>'
     return f"""
     <div class="transcription-card">
         <div class="card-header">
@@ -669,45 +682,49 @@ async def audio_stream_duplicator_from_queue(session):
     return duplicator
 async def mistral_transcription_handler(session):
-    """Connect to Mistral realtime API and handle transcription with 2 parallel streams."""
     try:
         if not session.api_key:
             session.status_message = "error"
             print(f"Session {session.session_id[:8]}: No API key provided")
             return
         # Create Mistral client
         client = Mistral(api_key=session.api_key, server_url=MISTRAL_BASE_URL)
         audio_format = AudioFormat(encoding="pcm_s16le", sample_rate=SAMPLE_RATE)
         session.status_message = "connecting"
         print(f"Session {session.session_id[:8]}: Connecting to Mistral realtime API...")
         # Create a duplicator that can serve multiple audio streams
         duplicator = await audio_stream_duplicator_from_queue(session)
         print(f"Session {session.session_id[:8]}: Created audio stream duplicator for parallel processing")
-        # Create separate audio streams from the duplicator
         audio_stream_1 = await duplicator.add_consumer()
-        audio_stream_2 = await duplicator.add_consumer()
-        print(f"Session {session.session_id[:8]}: Created 2 separate audio streams from duplicator")
-        # Create tasks for both transcription streams
         async def process_stream_1():
             async for event_1 in client.audio.realtime.transcribe_stream(
                 audio_stream=audio_stream_1,
                 model=MODEL,
                 audio_format=audio_format,
-                target_streaming_delay_ms=240
             ):
                 if not session.is_running:
                     break
                 current_time = time.time()
                 if isinstance(event_1, RealtimeTranscriptionSessionCreated):
                     event_data = {
                         'type': 'session_created',
@@ -717,17 +734,17 @@ async def mistral_transcription_handler(session):
                     session.stream_events['stream_1'].append(event_data)
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 1 connected to Mistral - {current_time:.3f}")
                 elif isinstance(event_1, TranscriptionStreamTextDelta):
                     delta = event_1.text
                     # Get current full text by reconstructing from events
                     current_full_text = ""
                     for e in session.stream_events['stream_1']:
                         if e.get('type') == 'text_delta':
                             current_full_text += e.get('text', '')
                     current_full_text += delta
                     event_data = {
                         'type': 'text_delta',
                         'timestamp': current_time,
@@ -737,13 +754,13 @@ async def mistral_transcription_handler(session):
                     session.stream_events['stream_1'].append(event_data)
                     session.last_event_timestamp = current_time
                     print(f'1 [{current_time:.3f}]', delta, end="", flush=True)
                     words = delta.split()
                     for _ in words:
                         session.word_timestamps.append(time.time())
                     session.current_wpm = calculate_wpm(session)
                 elif isinstance(event_1, TranscriptionStreamDone):
                     event_data = {
                         'type': 'stream_done',
@@ -753,7 +770,7 @@ async def mistral_transcription_handler(session):
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 1 transcription done - {current_time:.3f}")
                     break
                 elif isinstance(event_1, RealtimeTranscriptionError):
                     event_data = {
                         'type': 'error',
@@ -764,7 +781,7 @@ async def mistral_transcription_handler(session):
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 1 error - {event_1.error} - {current_time:.3f}")
                     break
                 elif isinstance(event_1, UnknownRealtimeEvent):
                     event_data = {
                         'type': 'unknown_event',
@@ -774,8 +791,12 @@ async def mistral_transcription_handler(session):
                     session.stream_events['stream_1'].append(event_data)
                     session.last_event_timestamp = current_time
                     continue  # Ignore unknown events
         async def process_stream_2():
             async for event_2 in client.audio.realtime.transcribe_stream(
                 audio_stream=audio_stream_2,
                 model=MODEL,
@@ -784,9 +805,9 @@ async def mistral_transcription_handler(session):
             ):
                 if not session.is_running:
                     break
                 current_time = time.time()
                 if isinstance(event_2, RealtimeTranscriptionSessionCreated):
                     event_data = {
                         'type': 'session_created',
@@ -796,17 +817,17 @@ async def mistral_transcription_handler(session):
                     session.stream_events['stream_2'].append(event_data)
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 2 connected to Mistral - {current_time:.3f}")
                 elif isinstance(event_2, TranscriptionStreamTextDelta):
                     delta = event_2.text
                     # Get current full text by reconstructing from events
                     current_full_text = ""
                     for e in session.stream_events['stream_2']:
                         if e.get('type') == 'text_delta':
                             current_full_text += e.get('text', '')
                     current_full_text += delta
                     event_data = {
                         'type': 'text_delta',
                         'timestamp': current_time,
@@ -816,9 +837,9 @@ async def mistral_transcription_handler(session):
                     session.stream_events['stream_2'].append(event_data)
                     session.last_event_timestamp = current_time
                     print(f'2 [{current_time:.3f}]', delta, end="", flush=True)
                     session.current_wpm = calculate_wpm(session)
                 elif isinstance(event_2, TranscriptionStreamDone):
                     event_data = {
                         'type': 'stream_done',
@@ -828,7 +849,7 @@ async def mistral_transcription_handler(session):
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 2 transcription done - {current_time:.3f}")
                     break
                 elif isinstance(event_2, RealtimeTranscriptionError):
                     event_data = {
                         'type': 'error',
@@ -839,7 +860,7 @@ async def mistral_transcription_handler(session):
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 2 error - {event_2.error} - {current_time:.3f}")
                     break
                 elif isinstance(event_2, UnknownRealtimeEvent):
                     event_data = {
                         'type': 'unknown_event',
@@ -849,19 +870,26 @@ async def mistral_transcription_handler(session):
                     session.stream_events['stream_2'].append(event_data)
                     session.last_event_timestamp = current_time
                     continue  # Ignore unknown events
-        # Run both streams in parallel
         stream1_task = asyncio.create_task(process_stream_1())
-        stream2_task = asyncio.create_task(process_stream_2())
-        # Wait for both streams to complete
-        await asyncio.gather(stream1_task, stream2_task)
         # Final transcription is already reconstructed from events
         # Just add stats to the display
         event_summary = session.get_event_summary()
         stats_text = f"Events: {event_summary['stats']['total_events']} (S1: {event_summary['stats']['stream_1_count']}, S2: {event_summary['stats']['stream_2_count']})"
         # Store the reconstructed transcription as tuple
         session.transcription_tuple = session.reconstruct_transcription()
@@ -874,7 +902,7 @@ async def mistral_transcription_handler(session):
         session.status_message = "error"
     finally:
         session.is_running = False
         # Only remove and log if not already handled by stop_session
         if not session._stopped_by_user:
             with _sessions_lock:
@@ -883,7 +911,6 @@ async def mistral_transcription_handler(session):
             if removed:
                 print(f"Session {session.session_id[:8]} ended. Active sessions: {active_count}")
 def start_transcription(session):
     """Start Mistral transcription using the shared event loop."""
     session.is_running = True

         """Reconstruct transcription text from stream events."""
         stream1_text = ""
         stream2_text = ""
         # Reconstruct from text_delta events
         for event in self.stream_events['stream_1']:
             if event.get('type') == 'text_delta':
                 stream1_text += event.get('text', '')
+        # Only reconstruct Stream 2 if partial_transcript_enabled is True
+        if self.partial_transcript_enabled:
+            for event in self.stream_events['stream_2']:
+                if event.get('type') == 'text_delta':
+                    stream2_text += event.get('text', '')
+        # If partial_transcript_enabled is False, skip merging: Stream 1 is returned as both the first and the merged text, and Stream 2 stays empty
+        if not self.partial_transcript_enabled:
+            return (stream1_text, "", stream1_text)
+        # Stream 3 (merged)
         stream3_final = stream2_text
         stream3_preview = stream1_text
         stream3_final, stream3_preview = self._compute_display_texts(stream3_final, stream3_preview)
         stream3_text = stream3_final + self.new_color_open + stream3_preview + self.new_color_close
         # Return as tuple for compatibility with HTML function
         return (stream1_text, stream2_text, stream3_text)
 # Load CSS from external file
 css_path = os.path.join(os.path.dirname(__file__), "style.css")
 with open(css_path, "r") as f:
     """Generate the full transcription card HTML."""
     status_badge = get_status_html(status)
     wpm_badge = f'<div class="wpm-badge"><span style="color: #1E1E1E !important;">{wpm}</span></div>'
     if transcripts:
+        # If partial_transcript_enabled is False, only show Stream 1
+        if not partial_transcript_enabled:
+            stream1_content = transcripts[0]
             cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
             content_html = f"""
+            <div class="transcript-text" style="color: #000000 !important;">
+{stream1_content}{cursor_html}
+            </div>
+            """
+        else:
+            # Show all streams if partial_transcript_enabled is True
+            if len(transcripts) >= 3 and transcripts[0] and transcripts[1] and transcripts[2]:
+                # Split into three streams
+                stream1_content, stream2_content, stream3_content = transcripts
+                cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
+                content_html = f"""
+                <div class="triple-stream-container">
+                    <div class="stream-box">
+                        <div class="stream-label">Stream 1 (Preview - 240ms)</div>
+                        <div class="transcript-text" style="color: #000000 !important;">
 {stream1_content}{cursor_html}
+                        </div>
                     </div>
+                    <div class="stream-box">
+                        <div class="stream-label">Stream 2 (Final - 2.4s)</div>
+                        <div class="transcript-text" style="color: #000000 !important;">
 {stream2_content}{cursor_html}
+                        </div>
                     </div>
+                    <div class="stream-box">
+                        <div class="stream-label">Stream 3 (Merged)</div>
+                        <div class="transcript-text" style="color: #000000 !important;">
 {stream3_content}{cursor_html}
+                        </div>
                     </div>
                 </div>
+                """
+            elif len(transcripts) >= 3 and transcripts[0] and transcripts[1] and transcripts[2]:
+                # NOTE(review): unreachable — this elif repeats the condition of the if above, and this whole else-block only runs when partial transcript is enabled
+                stream3_content = transcripts[2]
+                cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
+                content_html = f"""
+                <div class="transcript-text" style="color: #000000 !important;">
 {stream3_content}{cursor_html}
+                </div>
+                """
+            elif transcripts[0] and transcripts[1]:
+                # Split the transcript into two streams
+                stream1_content, stream2_content = transcripts
+                cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
+                content_html = f"""
+                <div class="dual-stream-container">
+                    <div class="stream-box">
+                        <div class="stream-label">Stream 1</div>
+                        <div class="transcript-text" style="color: #000000 !important;">
 {stream1_content}{cursor_html}
+                        </div>
                     </div>
+                    <div class="stream-box">
+                        <div class="stream-label">Stream 2</div>
+                        <div class="transcript-text" style="color: #000000 !important;">
 {stream2_content}{cursor_html}
+                        </div>
                     </div>
                 </div>
+                """
+            else:
+                # Single stream (backward compatibility)
+                cursor_html = '<span class="transcript-cursor"></span>' if status == "listening" else ""
+                content_html = f"""
+                <div class="transcript-text" style="color: #000000 !important;">
 {transcripts[0]}{cursor_html}
+                </div>
+                """
     elif status in ["listening", "warming", "connecting"]:
         content_html = """
         <div class="empty-state">
             <p class="empty-text" style="color: #555555 !important;">// Click the microphone to start.</p>
         </div>
         """
     # Use base64 image if available
     if VOXTRAL_ICON_B64:
         icon_html = f'<img src="data:image/png;base64,{VOXTRAL_ICON_B64}" alt="Voxtral" class="voxtral-icon" />'
     else:
         icon_html = '<span style="font-size:20px;">🎙️</span>'
     return f"""
     <div class="transcription-card">
         <div class="card-header">
     return duplicator
 async def mistral_transcription_handler(session):
+    """Connect to Mistral realtime API and handle transcription with 1 or 2 parallel streams."""
     try:
         if not session.api_key:
             session.status_message = "error"
             print(f"Session {session.session_id[:8]}: No API key provided")
             return
         # Create Mistral client
         client = Mistral(api_key=session.api_key, server_url=MISTRAL_BASE_URL)
         audio_format = AudioFormat(encoding="pcm_s16le", sample_rate=SAMPLE_RATE)
         session.status_message = "connecting"
         print(f"Session {session.session_id[:8]}: Connecting to Mistral realtime API...")
         # Create a duplicator that can serve multiple audio streams
         duplicator = await audio_stream_duplicator_from_queue(session)
         print(f"Session {session.session_id[:8]}: Created audio stream duplicator for parallel processing")
+        # Always create Stream 1 (240ms delay when partial transcripts are enabled, 480ms otherwise)
         audio_stream_1 = await duplicator.add_consumer()
+        print(f"Session {session.session_id[:8]}: Created Stream 1 (240ms delay)")
+        # Only create Stream 2 if partial_transcript_enabled is True
+        audio_stream_2 = None
+        if session.partial_transcript_enabled:
+            audio_stream_2 = await duplicator.add_consumer()
+            print(f"Session {session.session_id[:8]}: Created Stream 2 (2400ms delay)")
+        # Create tasks for transcription streams
         async def process_stream_1():
             async for event_1 in client.audio.realtime.transcribe_stream(
                 audio_stream=audio_stream_1,
                 model=MODEL,
                 audio_format=audio_format,
+                target_streaming_delay_ms=240 if session.partial_transcript_enabled else 480
             ):
                 if not session.is_running:
                     break
                 current_time = time.time()
                 if isinstance(event_1, RealtimeTranscriptionSessionCreated):
                     event_data = {
                         'type': 'session_created',
                     session.stream_events['stream_1'].append(event_data)
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 1 connected to Mistral - {current_time:.3f}")
                 elif isinstance(event_1, TranscriptionStreamTextDelta):
                     delta = event_1.text
                     # Get current full text by reconstructing from events
                     current_full_text = ""
                     for e in session.stream_events['stream_1']:
                         if e.get('type') == 'text_delta':
                             current_full_text += e.get('text', '')
                     current_full_text += delta
                     event_data = {
                         'type': 'text_delta',
                         'timestamp': current_time,
                     session.stream_events['stream_1'].append(event_data)
                     session.last_event_timestamp = current_time
                     print(f'1 [{current_time:.3f}]', delta, end="", flush=True)
                     words = delta.split()
                     for _ in words:
                         session.word_timestamps.append(time.time())
                     session.current_wpm = calculate_wpm(session)
                 elif isinstance(event_1, TranscriptionStreamDone):
                     event_data = {
                         'type': 'stream_done',
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 1 transcription done - {current_time:.3f}")
                     break
                 elif isinstance(event_1, RealtimeTranscriptionError):
                     event_data = {
                         'type': 'error',
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 1 error - {event_1.error} - {current_time:.3f}")
                     break
                 elif isinstance(event_1, UnknownRealtimeEvent):
                     event_data = {
                         'type': 'unknown_event',
                     session.stream_events['stream_1'].append(event_data)
                     session.last_event_timestamp = current_time
                     continue  # Ignore unknown events
         async def process_stream_2():
+            # Only process Stream 2 if it exists and partial_transcript_enabled is True
+            if not session.partial_transcript_enabled or audio_stream_2 is None:
+                return
             async for event_2 in client.audio.realtime.transcribe_stream(
                 audio_stream=audio_stream_2,
                 model=MODEL,
             ):
                 if not session.is_running:
                     break
                 current_time = time.time()
                 if isinstance(event_2, RealtimeTranscriptionSessionCreated):
                     event_data = {
                         'type': 'session_created',
                     session.stream_events['stream_2'].append(event_data)
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 2 connected to Mistral - {current_time:.3f}")
                 elif isinstance(event_2, TranscriptionStreamTextDelta):
                     delta = event_2.text
                     # Get current full text by reconstructing from events
                     current_full_text = ""
                     for e in session.stream_events['stream_2']:
                         if e.get('type') == 'text_delta':
                             current_full_text += e.get('text', '')
                     current_full_text += delta
                     event_data = {
                         'type': 'text_delta',
                         'timestamp': current_time,
                     session.stream_events['stream_2'].append(event_data)
                     session.last_event_timestamp = current_time
                     print(f'2 [{current_time:.3f}]', delta, end="", flush=True)
                     session.current_wpm = calculate_wpm(session)
                 elif isinstance(event_2, TranscriptionStreamDone):
                     event_data = {
                         'type': 'stream_done',
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 2 transcription done - {current_time:.3f}")
                     break
                 elif isinstance(event_2, RealtimeTranscriptionError):
                     event_data = {
                         'type': 'error',
                     session.last_event_timestamp = current_time
                     print(f"Session {session.session_id[:8]}: Stream 2 error - {event_2.error} - {current_time:.3f}")
                     break
                 elif isinstance(event_2, UnknownRealtimeEvent):
                     event_data = {
                         'type': 'unknown_event',
                     session.stream_events['stream_2'].append(event_data)
                     session.last_event_timestamp = current_time
                     continue  # Ignore unknown events
+        # Run Stream 1 always
         stream1_task = asyncio.create_task(process_stream_1())
+        # Run Stream 2 only if partial_transcript_enabled is True
+        stream2_task = None
+        if session.partial_transcript_enabled:
+            stream2_task = asyncio.create_task(process_stream_2())
+        # Wait for streams to complete
+        if stream2_task:
+            await asyncio.gather(stream1_task, stream2_task)
+        else:
+            await stream1_task
         # Final transcription is already reconstructed from events
         # Just add stats to the display
         event_summary = session.get_event_summary()
         stats_text = f"Events: {event_summary['stats']['total_events']} (S1: {event_summary['stats']['stream_1_count']}, S2: {event_summary['stats']['stream_2_count']})"
         # Store the reconstructed transcription as tuple
         session.transcription_tuple = session.reconstruct_transcription()
         session.status_message = "error"
     finally:
         session.is_running = False
         # Only remove and log if not already handled by stop_session
         if not session._stopped_by_user:
             with _sessions_lock:
             if removed:
                 print(f"Session {session.session_id[:8]} ended. Active sessions: {active_count}")
 def start_transcription(session):
     """Start Mistral transcription using the shared event loop."""
     session.is_running = True