Luigi committed
Commit 228a065 · 1 Parent(s): ca590d4

Clean up legacy Streamlit implementation


- Remove src/streamlit_app.py (legacy Streamlit application)
- Remove src/editing_sync.py (Streamlit editing helper)
- Update README.md to remove references to deleted files
- Update improved_diarization.py comment to remove Streamlit reference

README.md CHANGED
@@ -59,11 +59,9 @@ voxsum-studio/
 │ ├── __init__.py # Makes src a Python package
 │ ├── asr.py # Logic for Automatic Speech Recognition (ASR) transcription
 │ ├── diarization.py # Speaker diarization functionality
-│ ├── editing_sync.py # Audio editing and synchronization
 │ ├── export_utils.py # Utilities for exporting transcripts and summaries
 │ ├── improved_diarization.py # Enhanced diarization features
 │ ├── podcast.py # Functions for podcast search, episode fetching, and audio downloading
-│ ├── streamlit_app.py # Legacy Streamlit application (for reference)
 │ ├── summarization.py # Logic for generating summaries using LLMs
 │ ├── utils.py # Utility functions and model configurations
 │ ├── server/ # FastAPI backend

src/editing_sync.py DELETED
@@ -1,65 +0,0 @@
-"""
-Helper script to handle inline editing communication with Streamlit
-"""
-
-import streamlit as st
-import json
-
-def init_editing_communication():
-    """Initialize communication channel for inline editing"""
-
-    # Check for updates from JavaScript
-    if 'editing_updates' not in st.session_state:
-        st.session_state.editing_updates = {}
-
-    # Add JavaScript to handle communication
-    js_code = """
-    <script>
-    // Listen for utterance updates
-    window.addEventListener('utteranceUpdate', function(event) {
-        const detail = event.detail;
-        console.log('📝 Utterance update received:', detail);
-
-        // Send update to Streamlit via session state
-        // Note: This is a demonstration - in production, you'd use st.components for two-way communication
-        // For now, we rely on localStorage and manual sync
-    });
-
-    // Function to get all edits for sync with Streamlit
-    window.getEditedUtterances = function(playerId) {
-        const editKey = 'voxsum_edits_' + playerId;
-        return JSON.parse(localStorage.getItem(editKey) || '{}');
-    };
-
-    // Function to clear edits after sync
-    window.clearEditedUtterances = function(playerId) {
-        const editKey = 'voxsum_edits_' + playerId;
-        localStorage.removeItem(editKey);
-    };
-    </script>
-    """
-
-    st.components.v1.html(js_code, height=0)
-
-def check_for_editing_updates():
-    """Check if there are any editing updates and apply them"""
-
-    # This is a placeholder - in a real implementation, you'd need
-    # a proper communication channel between JavaScript and Streamlit
-    # For now, we show how the system would work
-
-    if st.button("🔄 Sync edits from transcript", help="Click to apply any edits made in the interactive transcript"):
-        # In a real implementation, this would:
-        # 1. Get edits from JavaScript via st.components
-        # 2. Apply them to session state
-        # 3. Update the utterances
-
-        st.info("Edits would be synchronized here. For demonstration purposes, the localStorage-based editing is working in the transcript viewer.")
-
-        # For now, show current state
-        if st.session_state.utterances:
-            st.write(f"Current utterances: {len(st.session_state.utterances)}")
-
-        return True
-
-    return False
 
src/improved_diarization.py CHANGED
@@ -1,6 +1,6 @@
 """
 Improved Diarization with Adaptive Clustering and Quality Validation
-Vendored copy so the module is importable when running Streamlit from `src/`.
+Vendored copy for importability from src/.
 """
 
 import numpy as np

src/streamlit_app.py DELETED
@@ -1,1444 +0,0 @@
-# frontend.py
-import streamlit as st
-from asr import transcribe_file
-from summarization import summarize_transcript
-from podcast import search_podcast_series, fetch_episodes, download_podcast_audio, fetch_audio
-from utils import model_names, sensevoice_models, available_gguf_llms
-from diarization import (
-    init_speaker_embedding_extractor, perform_speaker_diarization_on_utterances,
-    merge_transcription_with_diarization, merge_consecutive_utterances, format_speaker_transcript,
-    get_diarization_stats, get_speaker_color
-)
-from export_utils import (
-    SUBTITLE_FORMATS, TRANSCRIPT_FORMATS, SUMMARY_FORMATS,
-    export_to_srt, export_to_vtt, export_to_ass, export_to_transcript_json,
-    export_to_elan_eaf, export_plain_text, export_summary_markdown, export_summary_plain_text
-)
-import base64
-import json
-import hashlib
-import os
-import shutil
-import uuid
-import math
-from pathlib import Path
-from datetime import datetime
-
-# === 1. Session State Initialization ===
-def init_session_state():
-    defaults = {
-        "transcript": "",
-        "summary": "",
-        "status": "Ready",
-        "audio_path": None,
-        "utterances": [],
-        "utterances_with_speakers": [],  # New: for diarization results
-        "audio_base64": None,
-        "prev_audio_path": None,
-        "transcribing": False,
-        "series_list": [],
-        "episodes": [],
-        "backend": "sensevoice",  # New: default backend
-        "sensevoice_model": list(sensevoice_models.keys())[0],  # New: default SenseVoice model
-        "language": "auto",  # New: language setting for SenseVoice
-        "textnorm": "withitn",  # New: text normalization for SenseVoice
-        "current_page": 1,  # New: for pagination
-        "utterances_per_page": 100,  # New: pagination size
-        "static_audio_url": None,  # New: for static audio serving
-        # Speaker Diarization Settings
-        "enable_diarization": False,  # New: diarization toggle
-        "num_speakers": -1,  # New: number of speakers (-1 = auto)
-        "cluster_threshold": 0.5,  # New: clustering threshold
-        "diarization_stats": {},  # New: speaker statistics
-        "utterances_with_speakers": [],  # New: diarized utterances
-    }
-    for key, value in defaults.items():
-        if key not in st.session_state:
-            st.session_state[key] = value
-
-# === 1.1. Static Audio File Management ===
-def cleanup_old_static_files():
-    """Clean up old static audio files to prevent disk space issues on HF Spaces"""
-    try:
-        static_dir = Path("static")
-        if not static_dir.exists():
-            return
-
-        # Get all audio files with their modification times
-        audio_files = []
-        for pattern in ["*.mp3", "*.wav", "*.m4a"]:
-            audio_files.extend(static_dir.glob(pattern))
-
-        # If more than 10 files, remove oldest ones
-        if len(audio_files) > 10:
-            audio_files.sort(key=lambda f: f.stat().st_mtime)
-            for old_file in audio_files[:-10]:  # Keep only 10 newest
-                try:
-                    old_file.unlink()
-                    print(f"🧹 Cleaned up old audio file: {old_file.name}")
-                except:
-                    pass
-    except Exception as e:
-        print(f"⚠️ Cleanup warning: {e}")
-
-def setup_static_audio(audio_path):
-    """
-    Copy audio file to static directory and return URL for serving.
-    This eliminates the need for base64 encoding.
-    """
-    try:
-        # Clean up old files first (important for HF Spaces)
-        cleanup_old_static_files()
-
-        # Use Streamlit's static directory structure
-        static_dir = Path("static")
-        static_dir.mkdir(exist_ok=True)
-
-        # Generate unique filename
-        audio_id = str(uuid.uuid4())[:8]
-        file_extension = Path(audio_path).suffix or '.mp3'
-        static_filename = f"audio_{audio_id}{file_extension}"
-        static_path = static_dir / static_filename
-
-        # Copy audio file
-        shutil.copy2(audio_path, static_path)
-
-        # Return relative URL that Streamlit can serve
-        return f"./static/{static_filename}"
-    except PermissionError:
-        st.warning("⚠️ Cannot access static directory. Using fallback method.")
-        return None
-    except Exception as e:
-        st.warning(f"Static file setup failed: {e}. Using fallback method.")
-        return None
-
-# === 2. UI Components ===
-# In render_settings_sidebar function
-def render_settings_sidebar():
-    with st.sidebar:
-        st.header("⚙️ Settings")
-
-        # Backend selection
-        st.session_state.backend = st.radio(
-            "ASR Backend",
-            ["moonshine", "sensevoice"],
-            index=0 if st.session_state.backend == "moonshine" else 1
-        )
-
-        # Model selection based on backend
-        if st.session_state.backend == "moonshine":
-            model_name = st.selectbox("Moonshine Model", model_names.keys())
-        else:
-            st.session_state.sensevoice_model = st.selectbox(
-                "SenseVoice Model",
-                sensevoice_models.keys(),
-                index=list(sensevoice_models.keys()).index(st.session_state.sensevoice_model) if st.session_state.sensevoice_model in sensevoice_models else 0
-            )
-            model_name = st.session_state.sensevoice_model
-
-            # SenseVoice specific settings
-            st.session_state.language = st.selectbox(
-                "Language",
-                ["auto", "zh", "en", "ja", "ko", "yue"],
-                index=["auto", "zh", "en", "ja", "ko", "yue"].index(st.session_state.language) if st.session_state.language in ["auto", "zh", "en", "ja", "ko", "yue"] else 0
-            )
-            st.session_state.textnorm = st.radio(
-                "Text Normalization",
-                ["withitn", "noitn"],
-                index=0 if st.session_state.textnorm == "withitn" else 1
-            )
-
-        # Speaker Diarization Settings
-        st.divider()
-        st.subheader("🎭 Speaker Diarization")
-        st.session_state.enable_diarization = st.checkbox(
-            "Enable Speaker Diarization",
-            value=st.session_state.enable_diarization,
-            help="⚠️ This feature is time-consuming and will significantly increase processing time"
-        )
-
-        if st.session_state.enable_diarization:
-            col1, col2 = st.columns(2)
-            with col1:
-                st.session_state.num_speakers = st.number_input(
-                    "Number of Speakers",
-                    min_value=-1,
-                    max_value=10,
-                    value=st.session_state.num_speakers,
-                    help="-1 for auto-detection"
-                )
-            with col2:
-                st.session_state.cluster_threshold = st.slider(
-                    "Clustering Threshold",
-                    min_value=0.1,
-                    max_value=1.0,
-                    value=st.session_state.cluster_threshold,
-                    step=0.05,
-                    help="Lower = more speakers detected"
-                )
-
-            st.info("📝 **Note:** Speaker diarization requires downloading ~200MB of models on first use")
-
-        return {
-            "vad_threshold": st.slider("VAD Threshold", 0.1, 0.9, 0.5),
-            "model_name": model_name,
-            "llm_model": st.selectbox("LLM for Summarization", list(available_gguf_llms.keys())),
-            "prompt_input": st.text_area("Custom Prompt", value="Summarize the transcript below."),
-            "utterances_per_page": st.number_input("Utterances per page", min_value=20, max_value=500, value=st.session_state.utterances_per_page, step=20, help="For large transcripts, adjust pagination size")
-        }
-
-
-def render_podcast_tab():
-    st.subheader("Search Podcast")
-    query = st.text_input("Enter podcast name")
-
-    col1, col2 = st.columns(2)
-    with col1:
-        if st.button("Search Series") and query:
-            st.session_state.series_list = search_podcast_series(query)
-
-    if st.session_state.series_list:
-        series_titles = [f"{s['title']} by {s['artist']}" for s in st.session_state.series_list]
-        selected_title = st.selectbox("Select Series", series_titles)
-        series = next((s for s in st.session_state.series_list if f"{s['title']} by {s['artist']}" == selected_title), None)
-
-        if series:
-            col1, col2 = st.columns([1, 3])
-            with col1:
-                st.image(series["thumbnail"], width=150)
-            with col2:
-                st.text_area("Series Info", value=f"Title: {series['title']}\nArtist: {series['artist']}\nEpisodes: {series['episode_count']}", disabled=True)
-
-            if st.button("Load Episodes"):
-                st.session_state.episodes = fetch_episodes(series["feed_url"])
-
-            if st.session_state.episodes:
-                episode_titles = [e["title"] for e in st.session_state.episodes]
-                selected_episode = st.selectbox("Select Episode", episode_titles)
-                episode = next((e for e in st.session_state.episodes if e["title"] == selected_episode), None)
-
-                if episode:
-                    st.text_area("Episode Info", value=f"Title: {episode['title']}\nPublished: {episode['published']}\nDuration: {episode['duration']}", disabled=True)
-                    if st.button("Download Episode"):
-                        audio_path, status = download_podcast_audio(episode["audio_url"], episode["title"], st.session_state.status)
-                        st.session_state.audio_path = audio_path
-                        st.session_state.status = status
-
-def render_audio_tab():
-    st.subheader("Upload or Fetch Audio")
-
-    # YouTube Section
-    youtube_url = st.text_input("YouTube URL")
-    if st.button("Fetch from YouTube") and youtube_url:
-        audio_path, status = fetch_audio(youtube_url, st.session_state.status)
-        st.session_state.audio_path = audio_path
-        st.session_state.audio_base64 = None
-        st.session_state.status = status
-
-    # File Upload Section
-    uploaded_file = st.file_uploader("Upload Audio", type=["mp3", "wav"])
-    if uploaded_file:
-        import tempfile
-        try:
-            tmp = tempfile.NamedTemporaryFile(prefix="voxsum_", suffix=".mp3", delete=False)
-            tmp.write(uploaded_file.getbuffer())
-            tmp.flush()
-            tmp.close()
-            st.session_state.audio_path = tmp.name
-            st.session_state.audio_base64 = None
-        except Exception as e:
-            st.error(f"Failed to save uploaded file: {e}")
-
-def create_efficient_sync_player(audio_path, utterances, utterances_with_speakers=None):
-    """
-    Ultra-optimized player with inline editing for large audio files and long transcripts:
-    1. Base64 encoding with intelligent size limits
-    2. Virtual scrolling for 1000+ utterances
-    3. Binary search for O(log n) synchronization
-    4. Efficient DOM management
-    5. Debounced updates
-    6. Speaker color coding for diarization
-    7. Inline editing with auto-save to session state
-    """
-
-    # Use speaker-aware utterances if available
-    display_utterances = utterances_with_speakers if utterances_with_speakers else utterances
-    has_speakers = utterances_with_speakers is not None
-
-    print(f"🎭 DEBUG Player: has_speakers={has_speakers}, display_utterances count={len(display_utterances)}")
-    if has_speakers and len(display_utterances) > 0:
-        sample = display_utterances[0]
-        print(f"🎭 DEBUG Player: Sample utterance format: {len(sample)} elements = {sample}")
-
-    file_size = os.path.getsize(audio_path)
-
-    # For now, use base64 for all files with intelligent limits
-    # TODO: Implement proper static file serving for production
-    if file_size > 100 * 1024 * 1024:  # 100MB absolute limit
-        return f"""
-        <div style="padding: 20px; text-align: center; color: #d32f2f; background: #ffebee; border-radius: 8px;">
-            ⚠️ Audio file too large ({file_size / 1024 / 1024:.1f}MB) for browser playback.
-            <br>Please use a smaller file (< 100MB) for optimal performance.
-            <br><small>Large file support requires production deployment.</small>
-        </div>
-        """
-
-    # Read and encode file as base64 - most reliable method
-    try:
-        with open(audio_path, "rb") as f:
-            audio_bytes = f.read()
-
-        # Check if base64 will be too large for DOM
-        base64_size = len(audio_bytes) * 4 // 3  # Approximate base64 size
-        if base64_size > 100 * 1024 * 1024:  # 100MB base64 limit
-            return f"""
-            <div style="padding: 20px; text-align: center; color: #d32f2f; background: #ffebee; border-radius: 8px;">
-                ⚠️ Audio file creates {base64_size / 1024 / 1024:.1f}MB base64 string - too large for DOM.
-                <br>Please use a smaller file (< 75MB original size).
-            </div>
-            """
-
-        audio_url = f"data:audio/mp3;base64,{base64.b64encode(audio_bytes).decode('utf-8')}"
-
-        # Warning for larger files
-        audio_warning = ""
-        if file_size > 10 * 1024 * 1024:  # > 10MB
-            audio_warning = f"""
-            <div style="padding: 8px; background: #fff3e0; border-left: 4px solid #ff9800; margin-bottom: 10px; border-radius: 4px;">
-                📡 Loading {file_size / 1024 / 1024:.1f}MB file ({base64_size / 1024 / 1024:.1f}MB encoded)... This may take a moment.
-            </div>
-            """
-    except Exception as e:
-        return f"""
-        <div style="padding: 20px; text-align: center; color: #d32f2f;">
-            ❌ Failed to load audio file: {str(e)}
-        </div>
-        """
-
-    # Generate unique ID for this player instance
-    player_id = hashlib.md5((audio_path + str(len(display_utterances))).encode()).hexdigest()[:8]
-
-    # Determine if we need virtualization
-    use_virtualization = len(display_utterances) > 200
-    max_visible_items = 50 if use_virtualization else len(display_utterances)
-
-    # Prepare utterances data and speaker colors
-    utterances_json = json.dumps(display_utterances)
-
-    # Generate speaker color mapping for JavaScript
-    speaker_colors = {}
-    if has_speakers:
-        unique_speakers = set()
-        for utt in display_utterances:
-            if len(utt) >= 4:  # (start, end, text, speaker_id)
-                unique_speakers.add(utt[3])
-        for speaker_id in unique_speakers:
-            speaker_colors[speaker_id] = get_speaker_color(speaker_id)
-
-    speaker_colors_json = json.dumps(speaker_colors)
-
-    html_content = f"""
-    <!DOCTYPE html>
-    <html>
-    <head>
-        <meta charset="UTF-8">
-        <style>
-            body {{
-                font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
-                margin: 0; padding: 10px; background: #fafafa;
-            }}
-            #audio-container-{player_id} {{
-                margin-bottom: 15px;
-                background: white;
-                border-radius: 8px;
-                padding: 10px;
-                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
-            }}
-            #transcript-container-{player_id} {{
-                max-height: 600px;
-                overflow-y: auto;
-                border: 1px solid #e0e0e0;
-                border-radius: 8px;
-                background: white;
-                position: relative;
-            }}
-            #virtual-content-{player_id} {{
-                padding: 8px;
-                position: relative;
-            }}
-            .utterance-{player_id} {{
-                padding: 8px 12px;
-                margin: 2px 0;
-                border-radius: 6px;
-                cursor: pointer;
-                transition: all 0.15s ease;
-                border-left: 3px solid transparent;
-                font-size: 0.95em;
-                line-height: 1.5;
-                background: #fdfdfd;
-            }}
-            .utterance-{player_id}:hover {{
-                background-color: #f0f8ff;
-                transform: translateX(3px);
-                box-shadow: 0 2px 8px rgba(33, 150, 243, 0.2);
-            }}
-            .current-{player_id} {{
-                background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%) !important;
-                border-left: 3px solid #2196f3 !important;
-                font-weight: 500;
-                box-shadow: 0 3px 12px rgba(33, 150, 243, 0.3);
-                transform: translateX(3px);
-            }}
-            .timestamp-{player_id} {{
-                font-size: 0.8em;
-                color: #666;
-                margin-right: 8px;
-                font-weight: 600;
-                background: #f5f5f5;
-                padding: 2px 6px;
-                border-radius: 3px;
-            }}
-            .pagination-{player_id} {{
-                display: flex;
-                justify-content: center;
-                align-items: center;
-                padding: 10px;
-                background: #f8f9fa;
-                border-top: 1px solid #e0e0e0;
-                gap: 10px;
-            }}
-            .pagination-{player_id} button {{
-                padding: 6px 12px;
-                border: 1px solid #ddd;
-                background: white;
-                border-radius: 4px;
-                cursor: pointer;
-                transition: all 0.2s;
-            }}
-            .pagination-{player_id} button:hover {{
-                background: #e3f2fd;
-                border-color: #2196f3;
-            }}
-            .pagination-{player_id} button:disabled {{
-                opacity: 0.5;
-                cursor: not-allowed;
-            }}
-            .stats-{player_id} {{
-                font-size: 0.85em;
-                color: #666;
-                text-align: center;
-                padding: 5px;
-                background: #f8f9fa;
-            }}
-
-            /* Inline editing styles */
-            .edit-mode-{player_id} {{
-                background: #fff8e1 !important;
-                border: 2px solid #ff9800 !important;
-                border-radius: 8px;
-            }}
-
-            .edit-controls-{player_id} {{
-                display: flex;
-                align-items: center;
-                gap: 8px;
-                margin-top: 8px;
-                padding-top: 8px;
-                border-top: 1px solid #e0e0e0;
-            }}
-
-            .edit-textarea-{player_id} {{
-                width: 100%;
-                border: 1px solid #ddd;
-                border-radius: 4px;
-                padding: 8px;
-                font-size: 0.95em;
-                line-height: 1.5;
-                resize: vertical;
-                min-height: 60px;
-                font-family: inherit;
-            }}
-
-            .edit-btn-{player_id} {{
-                padding: 4px 8px;
-                border: 1px solid #ddd;
-                border-radius: 4px;
-                background: white;
-                cursor: pointer;
-                font-size: 0.8em;
-                transition: all 0.2s;
-            }}
-
-            .edit-btn-{player_id}.save {{
-                background: #4caf50;
-                color: white;
-                border-color: #4caf50;
-            }}
-
-            .edit-btn-{player_id}.cancel {{
-                background: #f44336;
-                color: white;
-                border-color: #f44336;
-            }}
-
-            .edit-btn-{player_id}:hover {{
-                opacity: 0.8;
-            }}
-
-            .edit-icon-{player_id} {{
-                position: absolute;
-                top: 8px;
-                right: 8px;
-                background: rgba(255, 152, 0, 0.1);
-                border: 1px solid #ff9800;
-                border-radius: 50%;
-                width: 24px;
-                height: 24px;
-                display: flex;
-                align-items: center;
-                justify-content: center;
-                cursor: pointer;
-                font-size: 12px;
-                opacity: 0;
-                transition: opacity 0.2s;
-            }}
-
-            .utterance-{player_id}:hover .edit-icon-{player_id} {{
-                opacity: 1;
-            }}
-
-            .utterance-text-{player_id} {{
-                position: relative;
-                padding-right: 30px;
-            }}
-        </style>
-    </head>
-    <body>
-        {audio_warning}
-        <div id="audio-container-{player_id}">
-            <audio id="audio-{player_id}" controls preload="auto" style="width: 100%;">
-                <source src="{audio_url}" type="audio/mp3">
-                <source src="{audio_url}" type="audio/mpeg">
-                <source src="{audio_url}" type="audio/wav">
-                Your browser does not support the audio element.
-            </audio>
-        </div>
-
-        <div class="stats-{player_id}">
-            📊 {len(display_utterances)} utterances • ⏱️ {display_utterances[-1][1]:.1f}s duration
-            {' • 🔄 Virtual scrolling enabled' if use_virtualization else ''}
-            {' • 🎭 Speaker diarization active' if has_speakers else ''}
-        </div>
-
-        <div id="transcript-container-{player_id}">
-            <div id="virtual-content-{player_id}"></div>
-        </div>
-
-        {"<div class='pagination-" + player_id + "' id='pagination-" + player_id + "'></div>" if use_virtualization else ""}
-
-        <script>
-        (function() {{
-            const playerId = '{player_id}';
-            const player = document.getElementById('audio-' + playerId);
-            const container = document.getElementById('transcript-container-' + playerId);
-            const virtualContent = document.getElementById('virtual-content-' + playerId);
-            const utterances = {utterances_json};
-            const useVirtualization = {str(use_virtualization).lower()};
-            const maxVisibleItems = {max_visible_items};
-            const hasSpeakers = {str(has_speakers).lower()};
-            const speakerColors = {speaker_colors_json};
-
-            let currentHighlight = null;
-            let isSeeking = false;
-            let lastUpdateTime = 0;
-            let currentPage = 1;
-            let itemsPerPage = maxVisibleItems;
-            let totalPages = Math.ceil(utterances.length / itemsPerPage);
-
-            // Binary search for efficient utterance finding - O(log n)
-            function findActiveUtterance(currentTime) {{
-                let left = 0, right = utterances.length - 1;
-                let result = -1;
-
-                while (left <= right) {{
-                    const mid = Math.floor((left + right) / 2);
-                    const [start, end] = utterances[mid];
-
-                    if (currentTime >= start && currentTime < end) {{
-                        return mid;
-                    }} else if (currentTime < start) {{
-                        right = mid - 1;
-                    }} else {{
-                        left = mid + 1;
-                        if (currentTime >= start) result = mid; // Keep track of closest
-                    }}
-                }}
-                return result;
-            }}
-
-            // Efficient DOM builder with virtual scrolling
-            function buildTranscript(page = 1) {{
-                virtualContent.innerHTML = '';
-
-                let startIdx, endIdx;
-                if (useVirtualization) {{
-                    startIdx = (page - 1) * itemsPerPage;
-                    endIdx = Math.min(startIdx + itemsPerPage, utterances.length);
-                }} else {{
-                    startIdx = 0;
-                    endIdx = utterances.length;
-                }}
-
-                // Create document fragment for efficient DOM insertion
-                const fragment = document.createDocumentFragment();
-
-                for (let i = startIdx; i < endIdx; i++) {{
-                    const utt = utterances[i];
-                    if (utt.length < 3) continue;
-
-                    const [start, end, text] = utt;
-                    const speakerId = hasSpeakers && utt.length >= 4 ? utt[3] : null;
-
-                    const div = document.createElement('div');
-                    div.className = 'utterance-' + playerId;
-                    div.dataset.start = start;
-                    div.dataset.end = end;
-                    div.dataset.index = i;
-
-                    // Apply speaker color if available
-                    if (speakerId !== null && speakerColors[speakerId]) {{
-                        div.style.borderLeftColor = speakerColors[speakerId];
-                        div.style.backgroundColor = speakerColors[speakerId] + '15'; // 15% opacity
-                    }}
-
-                    const minutes = Math.floor(start / 60);
-                    const seconds = Math.floor(start % 60).toString().padStart(2, '0');
-
-                    // Build content with optional speaker label and edit controls
-                    let content = `<span class="timestamp-${{playerId}}">[${{minutes}}:${{seconds}}]</span>`;
-                    if (speakerId !== null) {{
-                        content += ` <span class="speaker-label-${{playerId}}" style="background: ${{speakerColors[speakerId] || '#ccc'}}; color: white; padding: 2px 6px; border-radius: 3px; font-size: 0.8em; margin-right: 6px;">S${{speakerId + 1}}</span>`;
-                    }}
-
-                    // Wrap text in a container for editing
-                    content += `<div class="utterance-text-${{playerId}}">
-                        <span class="text-display-${{playerId}}">${{text}}</span>
-                        <div class="edit-icon-${{playerId}}" onclick="startEdit(${{i}})" title="Edit this utterance">✏️</div>
-                        <div class="edit-mode-container-${{playerId}}" style="display: none;">
-                            <textarea class="edit-textarea-${{playerId}}">${{text}}</textarea>
-                            <div class="edit-controls-${{playerId}}">
-                                <button class="edit-btn-${{playerId}} save" onclick="saveEdit(${{i}})">💾 Save</button>
-                                <button class="edit-btn-${{playerId}} cancel" onclick="cancelEdit(${{i}})">❌ Cancel</button>
-                            </div>
-                        </div>
-                    </div>`;
-
-                    div.innerHTML = content;
-
-                    // Optimized click handler
-                    div.addEventListener('click', (e) => {{
-                        e.stopPropagation();
-                        isSeeking = true;
-                        player.currentTime = start;
-                        player.play().catch(() => {{}});
-                        setTimeout(() => isSeeking = false, 150);
-                    }});
-
-                    fragment.appendChild(div);
-                }}
-
-                virtualContent.appendChild(fragment);
-                updatePagination();
-            }}
-
-            // Pagination controls
-            function updatePagination() {{
-                if (!useVirtualization) return;
-
-                const pagination = document.getElementById('pagination-' + playerId);
-                if (!pagination) return;
-
-                pagination.innerHTML = `
-                    <button onclick="window.transcriptPlayers_${{playerId}}.goToPage(1)"
-                            ${{currentPage === 1 ? 'disabled' : ''}}>⏮️</button>
-                    <button onclick="window.transcriptPlayers_${{playerId}}.goToPage(${{Math.max(1, currentPage - 1)}})"
-                            ${{currentPage === 1 ? 'disabled' : ''}}>⏪</button>
-                    <span>Page ${{currentPage}} of ${{totalPages}}</span>
-                    <button onclick="window.transcriptPlayers_${{playerId}}.goToPage(${{Math.min(totalPages, currentPage + 1)}})"
-                            ${{currentPage === totalPages ? 'disabled' : ''}}>⏩</button>
-                    <button onclick="window.transcriptPlayers_${{playerId}}.goToPage(${{totalPages}})"
-                            ${{currentPage === totalPages ? 'disabled' : ''}}>⏭️</button>
-                `;
-            }}
-
-            // Page navigation
-            function goToPage(page) {{
-                if (page < 1 || page > totalPages) return;
-                currentPage = page;
-                buildTranscript(currentPage);
-            }}
-
-            // Auto-navigate to page containing active utterance
-            function navigateToActiveUtterance(utteranceIndex) {{
-                if (!useVirtualization || utteranceIndex === -1) return;
-
-                const targetPage = Math.ceil((utteranceIndex + 1) / itemsPerPage);
-                if (targetPage !== currentPage) {{
-                    currentPage = targetPage;
-                    buildTranscript(currentPage);
-                }}
-            }}
-
-            // Optimized highlighting with debouncing - max 20fps for better performance
-            function updateHighlight() {{
-                const now = Date.now();
-                if (now - lastUpdateTime < 50) return; // 20fps max
-                lastUpdateTime = now;
-
-                if (isSeeking) return;
-
-                const time = player.currentTime;
-                const activeUtteranceIndex = findActiveUtterance(time);
-
-                // Auto-navigate to correct page if needed
-                navigateToActiveUtterance(activeUtteranceIndex);
-
-                // Find active div in current page
-                const divs = virtualContent.querySelectorAll('.utterance-' + playerId);
-                let activeDiv = null;
-
-                for (const div of divs) {{
-                    const index = parseInt(div.dataset.index);
-                    if (index === activeUtteranceIndex) {{
-                        activeDiv = div;
-                        break;
-                    }}
-                }}
-
-                // Update highlight with smooth transition
-                if (activeDiv !== currentHighlight) {{
-                    if (currentHighlight) {{
-                        currentHighlight.classList.remove('current-' + playerId);
-                    }}
-                    if (activeDiv) {{
-                        activeDiv.classList.add('current-' + playerId);
-                        // Smooth scroll with animation
-                        activeDiv.scrollIntoView({{
-                            behavior: 'smooth',
-                            block: 'center',
-                            inline: 'nearest'
-                        }});
-                    }}
-                    currentHighlight = activeDiv;
-                }}
-            }}
-
-            // Global API for pagination
-            window.transcriptPlayers_{player_id} = {{ goToPage }};
-
-            // Initialize
-            buildTranscript(1);
-            player.addEventListener('timeupdate', updateHighlight);
-
-            // Enhanced audio loading diagnostics with UI feedback
-            player.addEventListener('loadstart', () => {{
-                console.log('🔄 Audio loading started');
-                const container = document.getElementById('audio-container-' + playerId);
-                const statusDiv = document.createElement('div');
-                statusDiv.id = 'loading-status-' + playerId;
-                statusDiv.style.cssText = 'padding: 5px; background: #e3f2fd; color: #1976d2; border-radius: 4px; margin-top: 5px; font-size: 0.9em;';
-                statusDiv.innerHTML = '🔄 Loading audio...';
-                container.appendChild(statusDiv);
-            }});
-
-            player.addEventListener('loadedmetadata', () => {{
-                console.log('✅ Audio metadata loaded');
-                const statusDiv = document.getElementById('loading-status-' + playerId);
-                if (statusDiv) statusDiv.innerHTML = '✅ Metadata loaded';
-            }});
-
-            player.addEventListener('loadeddata', () => {{
-                console.log('✅ Audio data loaded');
-                const statusDiv = document.getElementById('loading-status-' + playerId);
-                if (statusDiv) statusDiv.innerHTML = '✅ Audio data ready';
-            }});
-
-            player.addEventListener('canplay', () => {{
-                console.log('▶️ Audio can start playing');
-                const statusDiv = document.getElementById('loading-status-' + playerId);
-                if (statusDiv) {{
-                    statusDiv.innerHTML = '🎵 Ready to play';
-                    setTimeout(() => statusDiv.remove(), 2000);
-                }}
-            }});
-
-            player.addEventListener('canplaythrough', () => {{
-                console.log('🚀 Audio can play through');
-            }});
-
-            player.addEventListener('error', (e) => {{
-                console.error('❌ Audio error:', e, player.error);
-                const statusDiv = document.getElementById('loading-status-' + playerId);
-                if (statusDiv) statusDiv.remove();
-
-                const errorDiv = document.createElement('div');
-                errorDiv.style.cssText = 'padding: 10px; background: #ffebee; color: #c62828; border-radius: 4px; margin-top: 10px; border-left: 4px solid #f44336;';
-
-                let errorMessage = '❌ Audio loading failed. ';
-                if (player.error) {{
-                    switch(player.error.code) {{
-                        case 1: errorMessage += 'Network error - check your connection.'; break;
-                        case 2: errorMessage += 'File format not supported.'; break;
-                        case 3: errorMessage += 'Audio decoding failed.'; break;
-                        case 4: errorMessage += 'Audio source not usable.'; break;
-                        default: errorMessage += 'Unknown error occurred.';
-                    }}
-                }} else {{
-                    errorMessage += 'Please check the file format and try again.';
-                }}
-
-                errorDiv.innerHTML = errorMessage;
-                document.getElementById('audio-container-' + playerId).appendChild(errorDiv);
-            }});
-
-            // Timeout fallback - if no canplay event after 30 seconds
-            setTimeout(() => {{
-                if (player.readyState === 0) {{
-                    console.warn('⚠️ Audio loading timeout');
-                    const container = document.getElementById('audio-container-' + playerId);
-                    const timeoutDiv = document.createElement('div');
-                    timeoutDiv.style.cssText = 'padding: 8px; background: #fff3e0; color: #f57c00; border-radius: 4px; margin-top: 5px;';
-                    timeoutDiv.innerHTML = '⚠️ Audio loading is taking longer than expected. Large file or slow connection?';
-                    container.appendChild(timeoutDiv);
-                }}
-            }}, 30000);
-
-            // Handle seek events
-            player.addEventListener('seeking', () => isSeeking = true);
-            player.addEventListener('seeked', () => {{
-                setTimeout(() => isSeeking = false, 100);
-            }});
-
-            // Keyboard navigation
-            document.addEventListener('keydown', (e) => {{
-                if (!useVirtualization) return;
-                if (e.target.tagName === 'INPUT' || e.target.tagName === 'TEXTAREA') return;
-
-                if (e.key === 'ArrowLeft' && currentPage > 1) {{
-                    e.preventDefault();
-                    goToPage(currentPage - 1);
-                }} else if (e.key === 'ArrowRight' && currentPage < totalPages) {{
-                    e.preventDefault();
-                    goToPage(currentPage + 1);
-                }}
-            }});
-
-            // Inline editing functions
-            window.startEdit = function(index) {{
-                const div = document.querySelector(`[data-index="${{index}}"]`);
-                if (!div) return;
-
-                const textDisplay = div.querySelector('.text-display-' + playerId);
-                const editContainer = div.querySelector('.edit-mode-container-' + playerId);
-                const textarea = div.querySelector('.edit-textarea-' + playerId);
-
-                if (!textDisplay || !editContainer || !textarea) return;
-
-                // Store original text for cancel
-                textarea.dataset.originalText = textDisplay.textContent;
-
-                // Switch to edit mode
-                textDisplay.style.display = 'none';
-                editContainer.style.display = 'block';
-                div.classList.add('edit-mode-' + playerId);
-
-                // Focus and select all text
-                textarea.focus();
-                textarea.select();
-            }};
-
-            window.saveEdit = function(index) {{
-                const div = document.querySelector(`[data-index="${{index}}"]`);
-                if (!div) return;
-
-                const textDisplay = div.querySelector('.text-display-' + playerId);
-                const editContainer = div.querySelector('.edit-mode-container-' + playerId);
-                const textarea = div.querySelector('.edit-textarea-' + playerId);
-
-                if (!textDisplay || !editContainer || !textarea) return;
-
-                const newText = textarea.value.trim();
-                if (!newText) {{
-                    alert('Text cannot be empty');
-                    return;
-                }}
-
-                // Update display text
-                textDisplay.textContent = newText;
-
-                // Update utterances data
-                utterances[index][2] = newText;
-
-                // Send update to Streamlit (via session state simulation)
-                try {{
-                    // Create a custom event to notify Streamlit about the change
-                    const updateEvent = new CustomEvent('utteranceUpdate', {{
-                        detail: {{
-                            index: index,
-                            text: newText,
-                            playerId: playerId
-                        }}
-                    }});
-                    window.dispatchEvent(updateEvent);
-
-                    // Store in localStorage as backup
-                    const editKey = 'voxsum_edits_' + playerId;
-                    let edits = JSON.parse(localStorage.getItem(editKey) || '{{}}');
-                    edits[index] = newText;
-                    localStorage.setItem(editKey, JSON.stringify(edits));
-
-                    console.log('💾 Utterance updated:', index, newText);
-                }} catch (e) {{
-                    console.warn('⚠️ Could not save to session state:', e);
-                }}
-
-                // Exit edit mode
-                cancelEdit(index, false);
-
-                // Show success feedback
-                showSuccessMessage(div, 'Saved!');
-            }};
-
-            window.cancelEdit = function(index, restoreText = true) {{
-                const div = document.querySelector(`[data-index="${{index}}"]`);
-                if (!div) return;
-
-                const textDisplay = div.querySelector('.text-display-' + playerId);
-                const editContainer = div.querySelector('.edit-mode-container-' + playerId);
-                const textarea = div.querySelector('.edit-textarea-' + playerId);
-
-                if (!textDisplay || !editContainer || !textarea) return;
-
-                // Restore original text if cancelling
-                if (restoreText && textarea.dataset.originalText) {{
-                    textarea.value = textarea.dataset.originalText;
-                }}
-
-                // Exit edit mode
-                textDisplay.style.display = 'inline';
-                editContainer.style.display = 'none';
-                div.classList.remove('edit-mode-' + playerId);
-            }};
-
-            // Helper function to show success message
-            function showSuccessMessage(div, message) {{
-                const successDiv = document.createElement('div');
-                successDiv.style.cssText = `
-                    position: absolute;
-                    top: -30px;
-                    right: 10px;
-                    background: #4caf50;
-                    color: white;
-                    padding: 4px 8px;
-                    border-radius: 4px;
-                    font-size: 0.8em;
-                    pointer-events: none;
-                    z-index: 1000;
-                `;
-                successDiv.textContent = message;
-
-                div.style.position = 'relative';
-                div.appendChild(successDiv);
-
-                setTimeout(() => {{
-                    if (successDiv.parentNode) {{
-                        successDiv.parentNode.removeChild(successDiv);
-                    }}
-                }}, 2000);
-            }}
-
-            // Load saved edits from localStorage
-            const editKey = 'voxsum_edits_' + playerId;
-            const savedEdits = JSON.parse(localStorage.getItem(editKey) || '{{}}');
-            for (const [index, text] of Object.entries(savedEdits)) {{
-                if (utterances[index]) {{
-                    utterances[index][2] = text;
-                }}
-            }}
-        }})();
-        </script>
-    </body>
-    </html>
-    """
-    return html_content
-
-def create_export_interface():
-    """Create interface for exporting transcripts and summaries"""
-    if not st.session_state.utterances and not st.session_state.summary:
-        return
-
-    st.markdown("### 📥 Export Options")
-
-    export_tab1, export_tab2 = st.tabs(["📝 Transcript", "📄 Summary"])
-
-    with export_tab1:
-        if st.session_state.utterances:
-            # Choose format based on speaker diarization
-            if st.session_state.utterances_with_speakers:
-                st.markdown("**Speaker diarization detected - Transcript formats available:**")
-                format_options = TRANSCRIPT_FORMATS
-            else:
-                st.markdown("**No speaker diarization - Subtitle formats available:**")
-                format_options = SUBTITLE_FORMATS
-
-            # Format selection
-            format_name = st.selectbox(
-                "Export format",
-                list(format_options.keys()),
-                key="transcript_export_format"
-            )
-
-            format_info = format_options[format_name]
-
-            # Export button and download
-            if st.button(f"📥 Export as {format_name}", key="export_transcript"):
-                # Prepare data - use available utterances (with or without speakers)
-                if st.session_state.utterances_with_speakers:
-                    utterances_data = st.session_state.utterances_with_speakers
-                else:
-                    utterances_data = [(start, end, text, 0) for start, end, text in st.session_state.utterances]
-
-                # Generate content
-                try:
-                    if format_name in SUBTITLE_FORMATS:
-                        # For subtitle formats, use regular utterances
-                        regular_utterances = [(start, end, text) for start, end, text, _ in utterances_data]
-                        content = format_info["function"](regular_utterances, utterances_data if st.session_state.utterances_with_speakers else None)
-                    else:
-                        # For transcript formats, pass speaker-aware data
-                        content = format_info["function"](
-                            [(start, end, text) for start, end, text, _ in utterances_data],
-                            utterances_data if st.session_state.utterances_with_speakers else None
-                        )
-
-                    # Create download button
-                    filename = f"transcript_{datetime.now().strftime('%Y%m%d_%H%M%S')}{format_info['extension']}"
-                    st.download_button(
-                        label=f"💾 Download {filename}",
-                        data=content,
-                        file_name=filename,
-                        mime=format_info["mime_type"]
-                    )
-
-                except Exception as e:
-                    st.error(f"Export failed: {str(e)}")
-        else:
-            st.info("No transcript available for export")
-
-    with export_tab2:
-        if st.session_state.summary:
-            # Summary export formats
-            format_name = st.selectbox(
-                "Summary format",
-                list(SUMMARY_FORMATS.keys()),
-                key="summary_export_format"
-            )
-
-            format_info = SUMMARY_FORMATS[format_name]
-
-            # Metadata for summary
-            with st.expander("📋 Add metadata (optional)"):
-                metadata = {}
-                metadata["title"] = st.text_input("Title", key="summary_title")
-                metadata["date"] = st.date_input("Date", value=datetime.now().date(), key="summary_date").isoformat()
-                if st.session_state.utterances_with_speakers:
-                    num_speakers = len(set(speaker for _, _, _, speaker in st.session_state.utterances_with_speakers))
-                    metadata["speakers"] = f"{num_speakers} speakers detected"
-                if st.session_state.audio_path:
-                    # Calculate duration if possible
-                    try:
-                        if st.session_state.utterances:
-                            last_utterance = st.session_state.utterances[-1]
-                            duration_sec = last_utterance[1]  # end time
-                            duration_min = int(duration_sec // 60)
-                            duration_sec_remainder = int(duration_sec % 60)
-                            metadata["duration"] = f"{duration_min}m {duration_sec_remainder}s"
-                    except:
-                        pass
-
-            # Clean empty metadata
-            metadata = {k: v for k, v in metadata.items() if v}
-
-            # Export button
-            if st.button(f"📥 Export summary as {format_name}", key="export_summary"):
-                try:
-                    content = format_info["function"](st.session_state.summary, metadata if metadata else None)
-
-                    filename = f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}{format_info['extension']}"
-                    st.download_button(
-                        label=f"💾 Download {filename}",
-                        data=content,
-                        file_name=filename,
-                        mime=format_info["mime_type"]
-                    )
-
-                except Exception as e:
-                    st.error(f"Export failed: {str(e)}")
-        else:
-            st.info("No summary available for export")
-
-def render_results_tab(settings):
-    st.subheader("🎤 Transcription & Summary")
-    status_placeholder = st.empty()
-    transcript_display = st.empty()
-    summary_container = st.container()
-
-    # Update pagination settings
-    st.session_state.utterances_per_page = settings.get("utterances_per_page", 100)
-
-    # Handle audio base64 encoding
-    if (st.session_state.audio_path and
-            st.session_state.get("prev_audio_path") != st.session_state.audio_path):
-        st.session_state.audio_base64 = None
-        st.session_state.prev_audio_path = st.session_state.audio_path
-        st.session_state.static_audio_url = None  # Reset static URL
-
-    # Transcription Process
-    if st.button("🎙️ Transcribe Audio"):
-        if st.session_state.audio_path:
-            status_placeholder.info("🔊 Transcribing audio... Please wait.")
-            st.session_state.utterances = []
-            st.session_state.transcript = ""
-            st.session_state.transcribing = True
-
-            with transcript_display.container():
-                st.markdown("### 📝 Live Transcript (Streaming)")
-                live_placeholder = st.empty()
-                progress_bar = st.progress(0)
-                utterance_counter = st.empty()
-
-            try:
-                # Determine model name and backend-specific parameters
-                if st.session_state.backend == "moonshine":
-                    model_key = model_names[settings["model_name"]]
-                else:
-                    model_key = sensevoice_models[settings["model_name"]]
-
-                gen = transcribe_file(
-                    st.session_state.audio_path,
-                    settings["vad_threshold"],
-                    model_key,
-                    backend=st.session_state.backend,
-                    language=st.session_state.language if st.session_state.backend == "sensevoice" else "auto",
-                    textnorm=st.session_state.textnorm if st.session_state.backend == "sensevoice" else "withitn"
-                )
-
-                # Estimate total duration for progress
-                try:
-                    import soundfile as sf
-                    audio_info = sf.info(st.session_state.audio_path)
-                    total_duration = audio_info.duration
-                except:
-                    total_duration = None
-
-                utterance_count = 0
-                for current_utterance, all_utts in gen:
-                    st.session_state.utterances = list(all_utts) if all_utts else []
-                    utterance_count = len(st.session_state.utterances)
-
-                    # Update progress if we have duration info
-                    if total_duration and current_utterance:
-                        progress = min(1.0, current_utterance[1] / total_duration)
-                        progress_bar.progress(progress)
-
-                    # Efficient transcript display for streaming
-                    if utterance_count <= 200:
-                        # For smaller transcripts, show full text
-                        st.session_state.transcript = "\n".join(
-                            text for start, end, text in st.session_state.utterances
-                        )
-                        live_placeholder.markdown(st.session_state.transcript)
-                    else:
-                        # For large transcripts, show last few utterances only
-                        recent_utterances = st.session_state.utterances[-10:]
-                        recent_text = "\n".join(
-                            f"[{int(start//60)}:{int(start%60):02d}] {text}"
-                            for start, end, text in recent_utterances
-                        )
-                        live_placeholder.markdown(f"**Recent utterances (last 10):**\n{recent_text}")
-
-                    utterance_counter.info(f"📊 {utterance_count} utterances processed")
-
-                st.session_state.transcribing = False
-                progress_bar.progress(1.0)
-                status_placeholder.success(f"✅ Transcription completed! {utterance_count} utterances generated.")
-
-                # Perform speaker diarization if enabled
-                print(f"🔍 DEBUG Diarization Check: enable_diarization={st.session_state.enable_diarization}, utterances_count={len(st.session_state.utterances)}")
-                if st.session_state.enable_diarization and st.session_state.utterances:
-                    print("✅ DEBUG: Starting diarization process...")
-                    status_placeholder.info("🎭 Performing speaker diarization... This may take a few minutes.")
-                    diarization_progress = st.progress(0)
-
-                    try:
-                        # Initialize embedding extractor (lighter than full diarization system)
-                        print("🔍 DEBUG: Initializing embedding extractor...")
-                        extractor_result = init_speaker_embedding_extractor(
-                            cluster_threshold=st.session_state.cluster_threshold,
-                            num_speakers=st.session_state.num_speakers
-                        )
-
-                        if extractor_result:
-                            print("✅ DEBUG: Embedding extractor initialized successfully")
-                            embedding_extractor, config_dict = extractor_result
-
-                            # Load audio for diarization (needs to be 16kHz)
-                            import soundfile as sf
-                            import scipy.signal
-
-                            audio, sample_rate = sf.read(st.session_state.audio_path, dtype='float32')
-
-                            # Resample to 16kHz if needed (reusing existing resampling logic)
-                            if sample_rate != 16000:
-                                audio = scipy.signal.resample(audio, int(len(audio) * 16000 / sample_rate))
-                                sample_rate = 16000
-
-                            # Ensure mono
-                            if len(audio.shape) > 1:
-                                audio = audio.mean(axis=1)
-
-                            # Progress callback for diarization
-                            def diarization_progress_callback(progress):
-                                diarization_progress.progress(min(1.0, progress))
-
-                            # Perform diarization using existing ASR utterance segments
-                            print(f"🔍 DEBUG: Starting diarization with {len(st.session_state.utterances)} utterances")
-                            diarization_result = perform_speaker_diarization_on_utterances(
-                                audio, sample_rate, st.session_state.utterances,
-                                embedding_extractor, config_dict, diarization_progress_callback
-                            )
-                            print(f"🔍 DEBUG: Diarization returned {len(diarization_result) if diarization_result else 0} results")
-
-                            if diarization_result:
-                                print("✅ DEBUG: Merging transcription with diarization...")
-                                # Merge transcription with diarization
-                                merged_utterances = merge_transcription_with_diarization(
-                                    st.session_state.utterances, diarization_result
-                                )
-
-                                # Merge consecutive utterances from the same speaker
-                                st.session_state.utterances_with_speakers = merge_consecutive_utterances(
-                                    merged_utterances, max_gap=1.0
-                                )
-                                print(f"✅ DEBUG: Merged result has {len(st.session_state.utterances_with_speakers)} utterances with speakers")
-
-                                # Calculate statistics
-                                st.session_state.diarization_stats = get_diarization_stats(
-                                    st.session_state.utterances_with_speakers
-                                )
-
-                                diarization_progress.progress(1.0)
-                                num_speakers = st.session_state.diarization_stats.get("total_speakers", 0)
-                                status_placeholder.success(f"✅ Speaker diarization completed! {num_speakers} speakers detected.")
-                            else:
-                                print("❌ DEBUG: Diarization returned empty result")
-                                status_placeholder.error("❌ Speaker diarization failed.")
-                                st.session_state.utterances_with_speakers = []
-                        else:
-                            print("❌ DEBUG: Failed to initialize embedding extractor")
-                            status_placeholder.error("❌ Failed to initialize speaker diarization.")
-                            st.session_state.utterances_with_speakers = []
-
-                    except Exception as e:
-                        print(f"❌ DEBUG: Exception in diarization: {str(e)}")
-                        status_placeholder.error(f"❌ Speaker diarization error: {str(e)}")
-                        st.session_state.utterances_with_speakers = []
-                else:
-                    # No diarization requested - clear previous results
-                    print(f"❌ DEBUG: Diarization not executed - enable_diarization={st.session_state.enable_diarization}, has_utterances={bool(st.session_state.utterances)}")
-                    st.session_state.utterances_with_speakers = []
-                    st.session_state.diarization_stats = {}
-
-                st.rerun()
-            except Exception as e:
-                status_placeholder.error(f"Transcription error: {str(e)}")
-                st.session_state.transcribing = False
-        else:
-            status_placeholder.warning("⚠️ No audio file available")
-
-    # Summarization Process
-    if st.button("📝 Generate Summary"):
-        if st.session_state.transcript:
-            status_placeholder.info("🧠 Generating summary...")
-            st.session_state.summary = ""
-            summary_container.empty()
-
-            # Show transcript during summarization
-            with transcript_display.container():
-                if st.session_state.audio_path and st.session_state.utterances:
-                    # Use efficient player for summarization view with speaker colors if available
-                    utterances_display = st.session_state.utterances_with_speakers if st.session_state.utterances_with_speakers else None
-                    html = create_efficient_sync_player(
-                        st.session_state.audio_path,
-                        st.session_state.utterances,
-                        utterances_display
-                    )
-                    # Dynamic height calculation with better scaling - increased for more visibility
-                    base_height = 300
-                    content_height = min(800, max(base_height, len(st.session_state.utterances) * 15 + 200))
-                    st.components.v1.html(html, height=content_height, scrolling=True)
-                elif st.session_state.utterances:
-                    st.markdown("### 📝 Transcript")
-                    # For very long transcripts, show summary info
-                    if len(st.session_state.utterances) > 500:
-                        st.info(f"📊 Large transcript: {len(st.session_state.utterances)} utterances")
-                        with st.expander("View full transcript"):
-                            st.markdown(st.session_state.transcript)
-                    else:
-                        st.markdown(st.session_state.transcript)
-                else:
-                    st.info("No transcript available.")
-
-            # Live summary display
-            live_summary_area = st.empty()
-            with live_summary_area.container():
-                st.markdown("### 📝 Live Summary (In Progress)")
-                progress_placeholder = st.empty()
-
-                summary_gen = summarize_transcript(
-                    st.session_state.transcript,
-                    settings["llm_model"],
-                    settings["prompt_input"]
-                )
-
-                for accumulated_summary in summary_gen:
-                    st.session_state.summary = accumulated_summary
-                    progress_placeholder.markdown(accumulated_summary)
-
-            live_summary_area.empty()
-            st.rerun()
-        else:
-            status_placeholder.warning("⚠️ No transcript available")
-
-    # Display final results
-    if st.session_state.audio_path and st.session_state.utterances and not st.session_state.transcribing:
-        # Show speaker diarization statistics if available
-        if st.session_state.diarization_stats and st.session_state.diarization_stats.get("total_speakers", 0) > 0:
-            st.markdown("### 🎭 Speaker Analysis")
-            stats = st.session_state.diarization_stats
-
-            col1, col2 = st.columns([2, 1])
-            with col1:
-                # Speaker breakdown
-                speaker_data = []
-                for speaker_id, speaker_stats in stats["speakers"].items():
-                    speaker_data.append({
-                        "Speaker": f"Speaker {speaker_id + 1}",
-                        "Speaking Time": f"{speaker_stats['speaking_time']:.1f}s",
-                        "Percentage": f"{speaker_stats['percentage']:.1f}%",
-                        "Utterances": speaker_stats['utterances'],
-                        "Avg Length": f"{speaker_stats['avg_utterance_length']:.1f}s"
-                    })
-
-                import pandas as pd
-                df = pd.DataFrame(speaker_data)
-                st.dataframe(df, use_container_width=True)
-
-            with col2:
-                st.metric("Total Speakers", stats["total_speakers"])
-                st.metric("Total Duration", f"{stats['total_duration']:.1f}s")
-
-        # Performance optimization: show stats for large transcripts
-        if len(st.session_state.utterances) > 100:
-            col1, col2, col3 = st.columns(3)
-            with col1:
-                st.metric("📊 Utterances", len(st.session_state.utterances))
-            with col2:
-                duration = st.session_state.utterances[-1][1] if st.session_state.utterances else 0
-                st.metric("⏱️ Duration", f"{duration/60:.1f} min")
-            with col3:
-                avg_length = sum(len(text) for _, _, text in st.session_state.utterances) / len(st.session_state.utterances)
-                st.metric("📝 Avg Length", f"{avg_length:.0f} chars")
-
-        # Use efficient player for final results with speaker colors if available
-        utterances_display = st.session_state.utterances_with_speakers if st.session_state.utterances_with_speakers else None
-
-        # DEBUG: Print information about diarization
-        if utterances_display:
-            print(f"🎭 DEBUG: Using diarized utterances - {len(utterances_display)} segments with speakers")
-            for i, (start, end, text, speaker) in enumerate(utterances_display[:3]):  # Show first 3
-                print(f" Sample {i+1}: [{start:.1f}-{end:.1f}s] Speaker {speaker}: '{text[:30]}...'")
-        else:
-            print(f"📝 DEBUG: Using regular utterances - {len(st.session_state.utterances)} segments without speakers")
-
-        html = create_efficient_sync_player(
-            st.session_state.audio_path,
-            st.session_state.utterances,
-            utterances_display
-        )
-        # Improved height calculation for better UX - increased for more transcript visibility
-        base_height = 350
-        content_height = min(900, max(base_height, len(st.session_state.utterances) * 12 + 250))
-
-        with transcript_display.container():
-            st.components.v1.html(html, height=content_height, scrolling=True)
-
-        # Show formatted transcript with speakers if diarization was performed
-        if st.session_state.utterances_with_speakers:
-            with st.expander("📄 Speaker-Labeled Transcript", expanded=False):
-                formatted_transcript = format_speaker_transcript(st.session_state.utterances_with_speakers)
-                st.markdown(formatted_transcript)
-
-        # Add export interface (editing is now inline)
-        st.markdown("---")
-        create_export_interface()
-
-    elif not st.session_state.utterances and not st.session_state.transcribing:
-        with transcript_display.container():
-            st.info("No transcript available. Click 'Transcribe Audio' to generate one.")
-
-    if st.session_state.summary:
-        with summary_container:
-            st.markdown("### 📝 Final Summary")
-            st.markdown(st.session_state.summary)
-
-# === 3. Main App ===
-def main():
-    init_session_state()
-
-    # Optimized page config for HF Spaces and large files
-    st.set_page_config(
-        page_title="🎙️ ASR + LLM",
-        layout="wide",
-        initial_sidebar_state="expanded",
-        menu_items={
-            'Get Help': 'https://github.com/your-repo/issues',
-            'Report a bug': 'https://github.com/your-repo/issues',
-            'About': "VoxSum Studio - Optimized for large audio files"
-        }
-    )
-
-    # HF Spaces specific optimizations
-    if os.environ.get('SPACE_ID'):
-        st.markdown("""
-        <div style='background: linear-gradient(90deg, #1f77b4, #ff7f0e); padding: 8px; border-radius: 6px; margin-bottom: 15px;'>
-            <p style='color: white; margin: 0; text-align: center; font-weight: 500;'>
-                🚀 Running on Hugging Face Spaces - Optimized for large audio files
-            </p>
-        </div>
-        """, unsafe_allow_html=True)
-
-    st.title("🎙️ Speech Summarization with Moonshine & SenseVoice ASR")
-
-    settings = render_settings_sidebar()
-    tab1, tab2, tab3 = st.tabs(["📻 Podcast", "🎵 Audio Input", "📄 Results"])
-
-    with tab1:
-        render_podcast_tab()
-    with tab2:
-        render_audio_tab()
-    with tab3:
-        render_results_tab(settings)
-
-if __name__ == "__main__":
-    main()