VoxSum

Sleeping

App Files Files Community

Luigi committed on Sep 10, 2025

Commit

06b55be

1 Parent(s): d3dce49

re-implement tab3 (ui for transcription and summarization)

Browse files

Files changed (1) hide show

src/streamlit_app.py +234 -135

src/streamlit_app.py CHANGED Viewed

@@ -79,38 +79,64 @@ with tab2:
         st.session_state.audio_path = temp_audio_path
 with tab3:
-    st.subheader("Transcription & Summary")
     st.markdown("---")
-    # Debug console
-    debug_expander = st.expander("Debug Console", expanded=False)
     with debug_expander:
         debug_placeholder = st.empty()
-    def debug_log(message):
-        """Helper function for debug messages"""
         with debug_expander:
-            debug_placeholder.markdown(f"`{datetime.now().strftime('%H:%M:%S')}`: {message}")
-        print(f"[DEBUG] {message}")
-    # Initialize audio player
     if st.session_state.audio_path and not st.session_state.audio_base64:
-        debug_log("Initializing audio player...")
-        with open(st.session_state.audio_path, "rb") as f:
-            audio_bytes = f.read()
-        st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
-        debug_log(f"Audio loaded: {len(audio_bytes)} bytes")
     if st.session_state.audio_base64:
-        # Unified audio player with enhanced messaging
-        debug_log("Rendering audio player...")
         audio_html = f"""
         <audio id="audioPlayer" controls ontimeupdate="updateTime(this)">
             <source src="data:audio/mp3;base64,{st.session_state.audio_base64}" type="audio/mp3">
         </audio>
         <script>
             const player = document.getElementById('audioPlayer');
-            console.log('[AUDIO] Player initialized');
             function seekAudio(time) {{
                 console.log('[AUDIO] Seeking to: ' + time);
@@ -119,9 +145,13 @@ with tab3:
             }}
             function updateTime(audio) {{
-                const time = audio.currentTime;
-                console.log('[AUDIO] Time update: ' + time.toFixed(2));
-                window.parent.postMessage({{type: "currentTimeUpdate", time: time}}, "*");
             }}
             // Unified message handling
@@ -134,181 +164,250 @@ with tab3:
         </script>
         """
         st.markdown(audio_html, unsafe_allow_html=True)
-    # Create persistent display containers
-    status_placeholder = st.empty()
-    transcript_container = st.container()
-    summary_container = st.container()
-    # Display existing summary if available
-    if st.session_state.get('summary'):
-        debug_log(f"Displaying existing summary ({len(st.session_state.summary)} chars)")
-        with summary_container:
-            st.markdown("### Summary")
-            st.markdown(st.session_state.summary)
-    # Streamlit Bridge for time updates
-    def streamlit_bridge():
-        debug_log("Initializing time bridge...")
         time_js = """
         <script>
-        console.log('[TIME BRIDGE] Initialized');
-        // Listen for time updates from anywhere
         window.addEventListener('message', (event) => {
             if (event.data.type === 'currentTimeUpdate') {
-                console.log('[TIME BRIDGE] Received time update: ' + event.data.time);
                 Streamlit.setComponentValue(event.data.time);
             }
         });
         </script>
         """
-        return st.components.v1.html(time_js, height=0)
-    # Get current time via bridge
-    current_time = streamlit_bridge()
-    if current_time is not None:
-        debug_log(f"Time bridge component initialized")
-        st.session_state.current_time = current_time
-    # Updated transcript generation and rendering code
-    def generate_transcript_html(utterances):
         if not utterances:
-            return ""
-        current_time = st.session_state.get("current_time", 0.0)
         html = """
         <div id="transcript-container" style="max-height:500px;overflow-y:auto;">
         """
-        for utterance in utterances:
             if len(utterance) != 3:
                 continue
             start, end, text = utterance
-            if start < 0 or end < 0:
-                html += f'<div class="utterance">{text}</div>'
-                continue
             try:
-                # Convert to floats to ensure numerical comparison
                 start_f = float(start)
                 end_f = float(end)
-                current_f = float(current_time)
-                is_current = start_f <= current_f < end_f
             except Exception as e:
                 is_current = False
-            # Format time display
             start_time = time.strftime('%M:%S', time.gmtime(start_f))
             end_time = time.strftime('%M:%S', time.gmtime(end_f))
-            # Create safe text display (escape quotes)
-            safe_text = text.replace('"', '&quot;').replace("'", "&apos;")
             html += f"""
             <div class="utterance {'current-utterance' if is_current else ''}"
-                onclick="window.parent.postMessage({{type: 'seekToTime', time: {start_f}}}, '*')">
                 <b>[{start_time}-{end_time}]</b> {safe_text}
             </div>
             """
         html += "</div>"
         return html
-    # Simplified transcript rendering function
-    def render_transcript():
-        if "utterances" in st.session_state and st.session_state.utterances:
-            with transcript_container:
-                html = generate_transcript_html(st.session_state.utterances)
-                st.components.v1.html(html, height=500)
-    # Add CSS for styling
     st.markdown("""
     <style>
     .utterance {
-        padding: 8px;
-        margin: 4px 0;
-        border-radius: 4px;
         cursor: pointer;
-        transition: background 0.2s;
-        line-height: 1.5;
     }
-    .utterance:hover { background-color: #f0f0f0; }
     .current-utterance {
-        background-color: #ffebee;
-        border-left: 3px solid #f44336;
-        font-weight: 500;
     }
     </style>
     """, unsafe_allow_html=True)
-    # Transcribe button
-    # Update the transcription button logic
-    if st.button("🎙️ Transcribe", key="transcribe_button_tab3"):
         if st.session_state.audio_path:
-            debug_log("Transcription started...")
-            status_placeholder.text("Transcribing...")
             st.session_state.utterances = []
             st.session_state.transcript = ""
-            # Process ASR output
-            for current_utt, all_utts in transcribe_file(
-                st.session_state.audio_path, vad_threshold, model_names[model_name]
-            ):
-                st.session_state.utterances = all_utts
-                st.session_state.transcript = "\n".join(
-                    f"[{start:.1f}-{end:.1f}] {text}"
-                    for start, end, text in all_utts
                 )
-                # Update transcript display
-                render_transcript()
-            # Final update after transcription
-            debug_log(f"Transcription completed: {len(st.session_state.utterances)} utterances")
-            render_transcript()
-            status_placeholder.empty()
-    # Always render transcript when available
-    render_transcript()
-    # Real-time transcript highlighting
     if "utterances" in st.session_state and st.session_state.utterances:
-        debug_log(f"Rendering transcript with {len(st.session_state.utterances)} utterances")
-        with transcript_container:
-            st.markdown(
-                generate_transcript_html(
-                    st.session_state.utterances,
-                    st.session_state.get('current_time', 0.0)
-                ),
-                unsafe_allow_html=True
             )
-    # Summarize button
-    if st.button("📝 Summarize", key="summarize_button_tab3"):
         if st.session_state.transcript:
-            debug_log("Summarization started...")
-            status_placeholder.text("Summarizing...")
             st.session_state.summary = ""
-            # Create temporary progress display
-            progress_placeholder = st.empty()
-            for accumulated_summary in summarize_transcript(
-                st.session_state.transcript, llm_model, prompt_input
-            ):
-                st.session_state.summary = accumulated_summary
-                progress_placeholder.markdown(accumulated_summary)
-                debug_log(f"Summary update: {len(accumulated_summary)} chars")
-            # Update persistent summary display
-            debug_log(f"Final summary: {len(st.session_state.summary)} chars")
-            with summary_container:
-                st.markdown("### Summary")
-                st.markdown(st.session_state.summary)
-            # Cleanup progress display
-            progress_placeholder.empty()
-            status_placeholder.empty()
-            debug_log("Summarization completed")

         st.session_state.audio_path = temp_audio_path
 with tab3:
+    st.subheader("🎤 Transcription & Summary")
     st.markdown("---")
+    # ===== Enhanced Debug Console =====
+    debug_expander = st.expander("🛠️ Debug Console", expanded=False)
     with debug_expander:
         debug_placeholder = st.empty()
+    # Initialize debug logs with startup message
+    if 'debug_logs' not in st.session_state:
+        st.session_state.debug_logs = ["`System`: Tab3 initialized"]
+    def debug_log(message, category="INFO"):
+        """Append a timestamped, color-coded entry to the debug console.
+
+        The entry is stored in ``st.session_state.debug_logs``, the last 15
+        entries are re-rendered inside the debug expander, and the raw
+        message is also printed to stdout.
+
+        Args:
+            message: Text to log. It is converted with ``str()`` and
+                HTML-escaped before being displayed.
+            category: Label shown in brackets; it selects the entry color
+                (categories missing from the color map render gray).
+        """
+        # Local import so this block does not rely on a module-level
+        # `import html` that is not visible from here.
+        from html import escape
+        # Only the last 15 entries are ever displayed below; cap what is
+        # retained so the list does not grow on every Streamlit rerun.
+        max_entries = 500
+        color_map = {
+            "INFO": "blue",
+            "WARN": "orange",
+            "ERROR": "red",
+            "AUDIO": "purple",
+            "STATE": "green"
+        }
+        timestamp = datetime.now().strftime('%H:%M:%S')
+        # Entries are rendered with unsafe_allow_html=True and callers pass
+        # exception text and transcript snippets, so escape before embedding.
+        safe_message = escape(str(message))
+        safe_category = escape(str(category))
+        log_entry = f"<span style='color:{color_map.get(category, 'gray')}'>`{timestamp}` [{safe_category}]: {safe_message}</span>"
+        logs = st.session_state.debug_logs
+        logs.append(log_entry)
+        # No-op while the list is within the cap; otherwise drops the oldest.
+        del logs[:-max_entries]
+        # Display last 15 logs with scroll
         with debug_expander:
+            debug_html = "<div style='max-height:300px; overflow-y:auto; font-family: monospace;'>" + \
+                         "<br>".join(logs[-15:]) + "</div>"
+            debug_placeholder.markdown(debug_html, unsafe_allow_html=True)
+        print(f"[DEBUG] {category}: {message}")
+    debug_log("Tab3 rendering started", "STATE")
+    # ===== Audio Player Initialization =====
     if st.session_state.audio_path and not st.session_state.audio_base64:
+        debug_log("Initializing audio player...", "AUDIO")
+        try:
+            with open(st.session_state.audio_path, "rb") as f:
+                audio_bytes = f.read()
+            st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+            debug_log(f"Audio loaded: {len(audio_bytes)} bytes | MIME: audio/mp3", "AUDIO")
+        except Exception as e:
+            debug_log(f"Audio loading failed: {str(e)}", "ERROR")
+            st.error(f"Audio loading error: {str(e)}")
+    # ===== Unified Audio Player =====
     if st.session_state.audio_base64:
+        debug_log("Rendering audio player component", "AUDIO")
         audio_html = f"""
         <audio id="audioPlayer" controls ontimeupdate="updateTime(this)">
             <source src="data:audio/mp3;base64,{st.session_state.audio_base64}" type="audio/mp3">
         </audio>
         <script>
             const player = document.getElementById('audioPlayer');
+            let lastUpdate = 0;
+            const THROTTLE_MS = 200;
             function seekAudio(time) {{
                 console.log('[AUDIO] Seeking to: ' + time);
             }}
             function updateTime(audio) {{
+                const now = Date.now();
+                if (now - lastUpdate > THROTTLE_MS) {{
+                    const time = audio.currentTime;
+                    console.log('[AUDIO] Time update: ' + time.toFixed(2));
+                    window.parent.postMessage({{type: "currentTimeUpdate", time: time}}, "*");
+                    lastUpdate = now;
+                }}
             }}
             // Unified message handling
         </script>
         """
         st.markdown(audio_html, unsafe_allow_html=True)
+        debug_log("Audio player rendered successfully", "AUDIO")
+    # ===== Time Synchronization Bridge =====
+    if 'current_time' not in st.session_state:
+        st.session_state.current_time = 0.0
+        debug_log("Current_time state initialized to 0.0", "STATE")
+    # Initialize bridge only once
+    if 'time_bridge_initialized' not in st.session_state:
+        debug_log("Initializing time synchronization bridge", "STATE")
         time_js = """
         <script>
+        // Listen for time updates from audio player
         window.addEventListener('message', (event) => {
             if (event.data.type === 'currentTimeUpdate') {
+                // Update Streamlit component value
                 Streamlit.setComponentValue(event.data.time);
+                console.log('[TIME BRIDGE] Sent time to Python: ' + event.data.time);
             }
         });
         </script>
         """
+        st.components.v1.html(time_js, height=0)
+        st.session_state.time_bridge_initialized = True
+        debug_log("Time bridge initialized successfully", "STATE")
+    # Create dummy component to receive time updates
+    current_time = st.session_state.current_time
+    debug_log(f"Current playback time: {current_time:.2f} seconds", "STATE")
+    # ===== Enhanced Transcript Generation =====
+    def generate_transcript_html(utterances, current_time=0.0):
+        """Generate interactive transcript HTML with current time highlighting.
+
+        Args:
+            utterances: Sequence of ``(start, end, text)`` triples. Entries of
+                any other length are skipped with a warning.
+            current_time: Playback position in seconds. The utterance whose
+                ``[start, end)`` interval contains it gets the
+                ``current-utterance`` class. Values that cannot be converted
+                to float fall back to 0.0.
+
+        Returns:
+            An HTML string: a scrollable container with one clickable div per
+            utterance, followed by a script that scrolls the highlighted
+            utterance into view when there is one. For empty input a
+            placeholder div is returned.
+        """
+        # Imported under an alias: this function used to bind a local named
+        # `html`, which shadowed the stdlib module, so `html.escape(text)`
+        # was looked up on a str and raised AttributeError.
+        from html import escape as escape_html
+        # Guard the count so a None argument does not fail inside len().
+        debug_log(f"Generating transcript HTML for {len(utterances) if utterances else 0} utterances", "STATE")
         if not utterances:
+            return "<div>No transcript available</div>"
+        try:
+            current_time = float(current_time)
+        except (TypeError, ValueError, OverflowError):
+            current_time = 0.0
+            debug_log("Invalid current_time, defaulting to 0.0", "WARN")
+        transcript_html = """
         <div id="transcript-container" style="max-height:500px;overflow-y:auto;">
         """
+        current_highlighted = -1
+        for idx, utterance in enumerate(utterances):
             if len(utterance) != 3:
+                debug_log(f"Skipping invalid utterance: {utterance}", "WARN")
                 continue
             start, end, text = utterance
             try:
                 start_f = float(start)
                 end_f = float(end)
+                is_current = start_f <= current_time < end_f
+                if is_current:
+                    current_highlighted = idx
             except Exception as e:
+                debug_log(f"Utterance time conversion error: {e}", "ERROR")
                 is_current = False
+                # Fall back to 00:00 so the timestamp label can still be built.
+                start_f = 0
+                end_f = 0
             start_time = time.strftime('%M:%S', time.gmtime(start_f))
             end_time = time.strftime('%M:%S', time.gmtime(end_f))
+            # Escape HTML special characters for security; str() tolerates
+            # non-string utterance text.
+            safe_text = escape_html(str(text))
+            transcript_html += f"""
             <div class="utterance {'current-utterance' if is_current else ''}"
+                onclick="window.parent.postMessage({{type: 'seekToTime', time: {start_f}}}, '*')"
+                data-start="{start_f}" data-end="{end_f}" data-idx="{idx}">
                 <b>[{start_time}-{end_time}]</b> {safe_text}
             </div>
             """
+        transcript_html += "</div>"
+        if current_highlighted >= 0:
+            transcript_html += f"""
+            <script>
+            // Auto-scroll to current utterance
+            try {{
+                const currentElement = document.querySelector('[data-idx="{current_highlighted}"]');
+                if (currentElement) {{
+                    currentElement.scrollIntoView({{behavior: 'smooth', block: 'center'}});
+                }}
+            }} catch(e) {{ console.error('Scroll error:', e); }}
+            </script>
+            """
+        debug_log(f"Transcript HTML generated ({len(transcript_html)} chars)", "STATE")
+        return transcript_html
+    # ===== UI Elements Setup =====
+    status_placeholder = st.empty()
+    transcript_placeholder = st.empty()
+    summary_container = st.container()
+    debug_log("UI placeholders initialized", "STATE")
+    # Display existing summary if available
+    if st.session_state.get('summary'):
+        summary_len = len(st.session_state.summary)
+        debug_log(f"Displaying existing summary ({summary_len} chars)", "STATE")
+        with summary_container:
+            st.markdown("### 📝 Summary")
+            st.markdown(st.session_state.summary)
+    # ===== Enhanced CSS Styling =====
     st.markdown("""
     <style>
     .utterance {
+        padding: 10px;
+        margin: 5px 0;
+        border-radius: 6px;
         cursor: pointer;
+        transition: all 0.3s;
+        line-height: 1.6;
+        border: 1px solid #e0e0e0;
+    }
+    .utterance:hover {
+        background-color: #f5f5f5;
+        transform: translateX(3px);
+        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
     }
     .current-utterance {
+        background-color: #fff3e0;
+        border-left: 4px solid #ff9800;
+        font-weight: 600;
+        box-shadow: 0 3px 8px rgba(255,152,0,0.2);
     }
     </style>
     """, unsafe_allow_html=True)
+    debug_log("CSS styles applied", "STATE")
+    # ===== Transcription Process =====
+    if st.button("🎙️ Transcribe Audio", key="transcribe_button_tab3", help="Convert audio to timestamped text"):
         if st.session_state.audio_path:
+            debug_log("=== TRANSCRIPTION STARTED ===", "INFO")
+            status_placeholder.info("🔊 Transcribing audio...")
             st.session_state.utterances = []
             st.session_state.transcript = ""
+            try:
+                transcription_gen = transcribe_file(
+                    st.session_state.audio_path,
+                    vad_threshold,
+                    model_names[model_name]
                 )
+                last_update = 0
+                update_frequency = 3  # Update every 3 utterances
+                for idx, (current_utt, all_utts) in enumerate(transcription_gen):
+                    st.session_state.utterances = all_utts
+                    st.session_state.transcript = "\n".join(
+                        f"[{start:.1f}-{end:.1f}] {text}"
+                        for start, end, text in all_utts
+                    )
+                    # Update UI at defined frequency or last item
+                    if idx % update_frequency == 0 or not current_utt:
+                        with transcript_placeholder.container():
+                            st.empty()  # Clear previous content
+                            html = generate_transcript_html(
+                                st.session_state.utterances,
+                                st.session_state.current_time
+                            )
+                            st.components.v1.html(html, height=500)
+                        debug_log(f"Transcript updated: {len(all_utts)} utterances", "STATE")
+                    if current_utt:
+                        start, end, text = current_utt
+                        debug_log(f"New utterance: [{start:.1f}-{end:.1f}] {text[:30]}...", "INFO")
+                debug_log(f"Transcription completed: {len(st.session_state.utterances)} utterances", "INFO")
+                status_placeholder.success("✅ Transcription completed!")
+            except Exception as e:
+                debug_log(f"Transcription failed: {str(e)}", "ERROR")
+                status_placeholder.error(f"Transcription error: {str(e)}")
+        else:
+            debug_log("Transcription attempted without audio file", "ERROR")
+            status_placeholder.warning("⚠️ No audio file available")
+    # ===== Render Existing Transcript =====
     if "utterances" in st.session_state and st.session_state.utterances:
+        debug_log("Rendering existing transcript", "STATE")
+        with transcript_placeholder.container():
+            html = generate_transcript_html(
+                st.session_state.utterances,
+                st.session_state.current_time
             )
+            st.components.v1.html(html, height=500)
+    else:
+        with transcript_placeholder.container():
+            st.info("No transcript available. Click 'Transcribe Audio' to generate one.")
+    # ===== Summarization Process =====
+    if st.button("📝 Generate Summary", key="summarize_button_tab3", help="Create summary from transcript"):
         if st.session_state.transcript:
+            debug_log("=== SUMMARIZATION STARTED ===", "INFO")
+            status_placeholder.info("🧠 Generating summary...")
             st.session_state.summary = ""
+            try:
+                progress_placeholder = st.empty()
+                summary_gen = summarize_transcript(
+                    st.session_state.transcript,
+                    llm_model,
+                    prompt_input
+                )
+                start_time = time.time()
+                char_count = 0
+                for accumulated_summary in summary_gen:
+                    st.session_state.summary = accumulated_summary
+                    progress_placeholder.markdown(accumulated_summary)
+                    new_chars = len(accumulated_summary) - char_count
+                    char_count = len(accumulated_summary)
+                    debug_log(f"Summary update: +{new_chars} chars", "INFO")
+                duration = time.time() - start_time
+                debug_log(f"Summary completed: {char_count} chars in {duration:.1f}s", "INFO")
+                with summary_container:
+                    st.markdown("### 📝 Final Summary")
+                    st.markdown(st.session_state.summary)
+                progress_placeholder.empty()
+                status_placeholder.success(f"✅ Summary generated ({char_count} chars)")
+            except Exception as e:
+                debug_log(f"Summarization failed: {str(e)}", "ERROR")
+                status_placeholder.error(f"Summary error: {str(e)}")
+        else:
+            debug_log("Summarization attempted without transcript", "ERROR")
+            status_placeholder.warning("⚠️ No transcript available")
+    debug_log("Tab3 rendering completed", "STATE")