Luigi committed
Commit 228a065 · 1 Parent(s): ca590d4

Clean up legacy Streamlit implementation


- Remove src/streamlit_app.py (legacy Streamlit application)
- Remove src/editing_sync.py (Streamlit editing helper)
- Update README.md to remove references to deleted files
- Update improved_diarization.py comment to remove Streamlit reference

README.md CHANGED
@@ -59,11 +59,9 @@ voxsum-studio/
 │ ├── __init__.py # Makes src a Python package
 │ ├── asr.py # Logic for Automatic Speech Recognition (ASR) transcription
 │ ├── diarization.py # Speaker diarization functionality
-│ ├── editing_sync.py # Audio editing and synchronization
 │ ├── export_utils.py # Utilities for exporting transcripts and summaries
 │ ├── improved_diarization.py # Enhanced diarization features
 │ ├── podcast.py # Functions for podcast search, episode fetching, and audio downloading
-│ ├── streamlit_app.py # Legacy Streamlit application (for reference)
 │ ├── summarization.py # Logic for generating summaries using LLMs
 │ ├── utils.py # Utility functions and model configurations
 │ ├── server/ # FastAPI backend

src/editing_sync.py DELETED
@@ -1,65 +0,0 @@
-"""
-Helper script to handle inline editing communication with Streamlit
-"""
-
-import streamlit as st
-import json
-
-def init_editing_communication():
-    """Initialize communication channel for inline editing"""
-
-    # Check for updates from JavaScript
-    if 'editing_updates' not in st.session_state:
-        st.session_state.editing_updates = {}
-
-    # Add JavaScript to handle communication
-    js_code = """
-    <script>
-    // Listen for utterance updates
-    window.addEventListener('utteranceUpdate', function(event) {
-        const detail = event.detail;
-        console.log('📝 Utterance update received:', detail);
-
-        // Send update to Streamlit via session state
-        // Note: This is a demonstration - in production, you'd use st.components for two-way communication
-        // For now, we rely on localStorage and manual sync
-    });
-
-    // Function to get all edits for sync with Streamlit
-    window.getEditedUtterances = function(playerId) {
-        const editKey = 'voxsum_edits_' + playerId;
-        return JSON.parse(localStorage.getItem(editKey) || '{}');
-    };
-
-    // Function to clear edits after sync
-    window.clearEditedUtterances = function(playerId) {
-        const editKey = 'voxsum_edits_' + playerId;
-        localStorage.removeItem(editKey);
-    };
-    </script>
-    """
-
-    st.components.v1.html(js_code, height=0)
-
-def check_for_editing_updates():
-    """Check if there are any editing updates and apply them"""
-
-    # This is a placeholder - in a real implementation, you'd need
-    # a proper communication channel between JavaScript and Streamlit
-    # For now, we show how the system would work
-
-    if st.button("🔄 Sync edits from transcript", help="Click to apply any edits made in the interactive transcript"):
-        # In a real implementation, this would:
-        # 1. Get edits from JavaScript via st.components
-        # 2. Apply them to session state
-        # 3. Update the utterances
-
-        st.info("Edits would be synchronized here. For demonstration purposes, the localStorage-based editing is working in the transcript viewer.")
-
-        # For now, show current state
-        if st.session_state.utterances:
-            st.write(f"Current utterances: {len(st.session_state.utterances)}")
-
-        return True
-
-    return False
 
src/improved_diarization.py CHANGED
@@ -1,6 +1,6 @@
 """
 Improved Diarization with Adaptive Clustering and Quality Validation
-Vendored copy so the module is importable when running Streamlit from `src/`.
+Vendored copy for importability from src/.
 """
 
 import numpy as np

src/streamlit_app.py DELETED
@@ -1,1444 +0,0 @@
-# frontend.py
-import streamlit as st
-from asr import transcribe_file
-from summarization import summarize_transcript
-from podcast import search_podcast_series, fetch_episodes, download_podcast_audio, fetch_audio
-from utils import model_names, sensevoice_models, available_gguf_llms
-from diarization import (
-    init_speaker_embedding_extractor, perform_speaker_diarization_on_utterances,
-    merge_transcription_with_diarization, merge_consecutive_utterances, format_speaker_transcript,
-    get_diarization_stats, get_speaker_color
-)
-from export_utils import (
-    SUBTITLE_FORMATS, TRANSCRIPT_FORMATS, SUMMARY_FORMATS,
-    export_to_srt, export_to_vtt, export_to_ass, export_to_transcript_json,
-    export_to_elan_eaf, export_plain_text, export_summary_markdown, export_summary_plain_text
-)
-import base64
-import json
-import hashlib
-import os
-import shutil
-import uuid
-import math
-from pathlib import Path
-from datetime import datetime
-
-# === 1. Session State Initialization ===
-def init_session_state():
-    defaults = {
-        "transcript": "",
-        "summary": "",
-        "status": "Ready",
-        "audio_path": None,
-        "utterances": [],
-        "utterances_with_speakers": [],  # New: for diarization results
-        "audio_base64": None,
-        "prev_audio_path": None,
-        "transcribing": False,
-        "series_list": [],
-        "episodes": [],
-        "backend": "sensevoice",  # New: default backend
-        "sensevoice_model": list(sensevoice_models.keys())[0],  # New: default SenseVoice model
-        "language": "auto",  # New: language setting for SenseVoice
-        "textnorm": "withitn",  # New: text normalization for SenseVoice
-        "current_page": 1,  # New: for pagination
-        "utterances_per_page": 100,  # New: pagination size
-        "static_audio_url": None,  # New: for static audio serving
-        # Speaker Diarization Settings
-        "enable_diarization": False,  # New: diarization toggle
-        "num_speakers": -1,  # New: number of speakers (-1 = auto)
-        "cluster_threshold": 0.5,  # New: clustering threshold
-        "diarization_stats": {},  # New: speaker statistics
-        "utterances_with_speakers": [],  # New: diarized utterances
-    }
-    for key, value in defaults.items():
-        if key not in st.session_state:
-            st.session_state[key] = value
-
-# === 1.1. Static Audio File Management ===
-def cleanup_old_static_files():
-    """Clean up old static audio files to prevent disk space issues on HF Spaces"""
-    try:
-        static_dir = Path("static")
-        if not static_dir.exists():
-            return
-
-        # Get all audio files with their modification times
-        audio_files = []
-        for pattern in ["*.mp3", "*.wav", "*.m4a"]:
-            audio_files.extend(static_dir.glob(pattern))
-
-        # If more than 10 files, remove oldest ones
-        if len(audio_files) > 10:
-            audio_files.sort(key=lambda f: f.stat().st_mtime)
-            for old_file in audio_files[:-10]:  # Keep only 10 newest
-                try:
-                    old_file.unlink()
-                    print(f"🧹 Cleaned up old audio file: {old_file.name}")
-                except:
-                    pass
-    except Exception as e:
-        print(f"⚠️ Cleanup warning: {e}")
-
-def setup_static_audio(audio_path):
-    """
-    Copy audio file to static directory and return URL for serving.
-    This eliminates the need for base64 encoding.
-    """
-    try:
-        # Clean up old files first (important for HF Spaces)
-        cleanup_old_static_files()
-
-        # Use Streamlit's static directory structure
-        static_dir = Path("static")
-        static_dir.mkdir(exist_ok=True)
-
-        # Generate unique filename
-        audio_id = str(uuid.uuid4())[:8]
-        file_extension = Path(audio_path).suffix or '.mp3'
-        static_filename = f"audio_{audio_id}{file_extension}"
-        static_path = static_dir / static_filename
-
-        # Copy audio file
-        shutil.copy2(audio_path, static_path)
-
-        # Return relative URL that Streamlit can serve
-        return f"./static/{static_filename}"
-    except PermissionError:
-        st.warning("⚠️ Cannot access static directory. Using fallback method.")
-        return None
-    except Exception as e:
-        st.warning(f"Static file setup failed: {e}. Using fallback method.")
-        return None
-
-# === 2. UI Components ===
-# In render_settings_sidebar function
-def render_settings_sidebar():
-    with st.sidebar:
-        st.header("⚙️ Settings")
-
-        # Backend selection
-        st.session_state.backend = st.radio(
-            "ASR Backend",
-            ["moonshine", "sensevoice"],
-            index=0 if st.session_state.backend == "moonshine" else 1
-        )
-
-        # Model selection based on backend
-        if st.session_state.backend == "moonshine":
-            model_name = st.selectbox("Moonshine Model", model_names.keys())
-        else:
-            st.session_state.sensevoice_model = st.selectbox(
-                "SenseVoice Model",
-                sensevoice_models.keys(),
-                index=list(sensevoice_models.keys()).index(st.session_state.sensevoice_model) if st.session_state.sensevoice_model in sensevoice_models else 0
-            )
-            model_name = st.session_state.sensevoice_model
-
-            # SenseVoice specific settings
-            st.session_state.language = st.selectbox(
-                "Language",
-                ["auto", "zh", "en", "ja", "ko", "yue"],
-                index=["auto", "zh", "en", "ja", "ko", "yue"].index(st.session_state.language) if st.session_state.language in ["auto", "zh", "en", "ja", "ko", "yue"] else 0
-            )
-            st.session_state.textnorm = st.radio(
-                "Text Normalization",
-                ["withitn", "noitn"],
-                index=0 if st.session_state.textnorm == "withitn" else 1
-            )
-
-        # Speaker Diarization Settings
-        st.divider()
-        st.subheader("🎭 Speaker Diarization")
-        st.session_state.enable_diarization = st.checkbox(
-            "Enable Speaker Diarization",
-            value=st.session_state.enable_diarization,
-            help="⚠️ This feature is time-consuming and will significantly increase processing time"
-        )
-
-        if st.session_state.enable_diarization:
-            col1, col2 = st.columns(2)
-            with col1:
-                st.session_state.num_speakers = st.number_input(
-                    "Number of Speakers",
-                    min_value=-1,
-                    max_value=10,
-                    value=st.session_state.num_speakers,
-                    help="-1 for auto-detection"
-                )
-            with col2:
-                st.session_state.cluster_threshold = st.slider(
-                    "Clustering Threshold",
-                    min_value=0.1,
-                    max_value=1.0,
-                    value=st.session_state.cluster_threshold,
-                    step=0.05,
-                    help="Lower = more speakers detected"
-                )
-
-            st.info("📝 **Note:** Speaker diarization requires downloading ~200MB of models on first use")
-
-        return {
-            "vad_threshold": st.slider("VAD Threshold", 0.1, 0.9, 0.5),
-            "model_name": model_name,
-            "llm_model": st.selectbox("LLM for Summarization", list(available_gguf_llms.keys())),
-            "prompt_input": st.text_area("Custom Prompt", value="Summarize the transcript below."),
-            "utterances_per_page": st.number_input("Utterances per page", min_value=20, max_value=500, value=st.session_state.utterances_per_page, step=20, help="For large transcripts, adjust pagination size")
-        }
-
-
-def render_podcast_tab():
-    st.subheader("Search Podcast")
-    query = st.text_input("Enter podcast name")
-
-    col1, col2 = st.columns(2)
-    with col1:
-        if st.button("Search Series") and query:
-            st.session_state.series_list = search_podcast_series(query)
-
-    if st.session_state.series_list:
-        series_titles = [f"{s['title']} by {s['artist']}" for s in st.session_state.series_list]
-        selected_title = st.selectbox("Select Series", series_titles)
-        series = next((s for s in st.session_state.series_list if f"{s['title']} by {s['artist']}" == selected_title), None)
-
-        if series:
-            col1, col2 = st.columns([1, 3])
-            with col1:
-                st.image(series["thumbnail"], width=150)
-            with col2:
-                st.text_area("Series Info", value=f"Title: {series['title']}\nArtist: {series['artist']}\nEpisodes: {series['episode_count']}", disabled=True)
-
-            if st.button("Load Episodes"):
-                st.session_state.episodes = fetch_episodes(series["feed_url"])
-
-            if st.session_state.episodes:
-                episode_titles = [e["title"] for e in st.session_state.episodes]
-                selected_episode = st.selectbox("Select Episode", episode_titles)
-                episode = next((e for e in st.session_state.episodes if e["title"] == selected_episode), None)
-
-                if episode:
-                    st.text_area("Episode Info", value=f"Title: {episode['title']}\nPublished: {episode['published']}\nDuration: {episode['duration']}", disabled=True)
-                    if st.button("Download Episode"):
-                        audio_path, status = download_podcast_audio(episode["audio_url"], episode["title"], st.session_state.status)
-                        st.session_state.audio_path = audio_path
-                        st.session_state.status = status
-
-def render_audio_tab():
-    st.subheader("Upload or Fetch Audio")
-
-    # YouTube Section
-    youtube_url = st.text_input("YouTube URL")
-    if st.button("Fetch from YouTube") and youtube_url:
-        audio_path, status = fetch_audio(youtube_url, st.session_state.status)
-        st.session_state.audio_path = audio_path
-        st.session_state.audio_base64 = None
-        st.session_state.status = status
-
-    # File Upload Section
-    uploaded_file = st.file_uploader("Upload Audio", type=["mp3", "wav"])
-    if uploaded_file:
-        import tempfile
-        try:
-            tmp = tempfile.NamedTemporaryFile(prefix="voxsum_", suffix=".mp3", delete=False)
-            tmp.write(uploaded_file.getbuffer())
-            tmp.flush()
-            tmp.close()
-            st.session_state.audio_path = tmp.name
-            st.session_state.audio_base64 = None
-        except Exception as e:
-            st.error(f"Failed to save uploaded file: {e}")
-
-def create_efficient_sync_player(audio_path, utterances, utterances_with_speakers=None):
-    """
-    Ultra-optimized player with inline editing for large audio files and long transcripts:
-    1. Base64 encoding with intelligent size limits
-    2. Virtual scrolling for 1000+ utterances
-    3. Binary search for O(log n) synchronization
-    4. Efficient DOM management
-    5. Debounced updates
-    6. Speaker color coding for diarization
-    7. Inline editing with auto-save to session state
-    """
-
-    # Use speaker-aware utterances if available
-    display_utterances = utterances_with_speakers if utterances_with_speakers else utterances
-    has_speakers = utterances_with_speakers is not None
-
-    print(f"🎭 DEBUG Player: has_speakers={has_speakers}, display_utterances count={len(display_utterances)}")
-    if has_speakers and len(display_utterances) > 0:
-        sample = display_utterances[0]
-        print(f"🎭 DEBUG Player: Sample utterance format: {len(sample)} elements = {sample}")
-
-    file_size = os.path.getsize(audio_path)
-
-    # For now, use base64 for all files with intelligent limits
-    # TODO: Implement proper static file serving for production
-    if file_size > 100 * 1024 * 1024:  # 100MB absolute limit
-        return f"""
-        <div style="padding: 20px; text-align: center; color: #d32f2f; background: #ffebee; border-radius: 8px;">
-            ⚠️ Audio file too large ({file_size / 1024 / 1024:.1f}MB) for browser playback.
-            <br>Please use a smaller file (< 100MB) for optimal performance.
-            <br><small>Large file support requires production deployment.</small>
-        </div>
-        """
-
-    # Read and encode file as base64 - most reliable method
-    try:
-        with open(audio_path, "rb") as f:
-            audio_bytes = f.read()
-
-        # Check if base64 will be too large for DOM
-        base64_size = len(audio_bytes) * 4 // 3  # Approximate base64 size
-        if base64_size > 100 * 1024 * 1024:  # 100MB base64 limit
-            return f"""
-            <div style="padding: 20px; text-align: center; color: #d32f2f; background: #ffebee; border-radius: 8px;">
-                ⚠️ Audio file creates {base64_size / 1024 / 1024:.1f}MB base64 string - too large for DOM.
-                <br>Please use a smaller file (< 75MB original size).
-            </div>
-            """
-
-        audio_url = f"data:audio/mp3;base64,{base64.b64encode(audio_bytes).decode('utf-8')}"
-
-        # Warning for larger files
-        audio_warning = ""
-        if file_size > 10 * 1024 * 1024:  # > 10MB
-            audio_warning = f"""
-            <div style="padding: 8px; background: #fff3e0; border-left: 4px solid #ff9800; margin-bottom: 10px; border-radius: 4px;">
-                📡 Loading {file_size / 1024 / 1024:.1f}MB file ({base64_size / 1024 / 1024:.1f}MB encoded)... This may take a moment.
-            </div>
-            """
-    except Exception as e:
-        return f"""
-        <div style="padding: 20px; text-align: center; color: #d32f2f;">
-            ❌ Failed to load audio file: {str(e)}
-        </div>
-        """
-
-    # Generate unique ID for this player instance
-    player_id = hashlib.md5((audio_path + str(len(display_utterances))).encode()).hexdigest()[:8]
-
-    # Determine if we need virtualization
-    use_virtualization = len(display_utterances) > 200
-    max_visible_items = 50 if use_virtualization else len(display_utterances)
-
-    # Prepare utterances data and speaker colors
-    utterances_json = json.dumps(display_utterances)
-
-    # Generate speaker color mapping for JavaScript
-    speaker_colors = {}
-    if has_speakers:
-        unique_speakers = set()
-        for utt in display_utterances:
-            if len(utt) >= 4:  # (start, end, text, speaker_id)
-                unique_speakers.add(utt[3])
-        for speaker_id in unique_speakers:
-            speaker_colors[speaker_id] = get_speaker_color(speaker_id)
-
-    speaker_colors_json = json.dumps(speaker_colors)
-
-    html_content = f"""
-    <!DOCTYPE html>
-    <html>
-    <head>
-        <meta charset="UTF-8">
-        <style>
-            body {{
-                font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
-                margin: 0; padding: 10px; background: #fafafa;
-            }}
-            #audio-container-{player_id} {{
-                margin-bottom: 15px;
-                background: white;
-                border-radius: 8px;
-                padding: 10px;
-                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
-            }}
-            #transcript-container-{player_id} {{
-                max-height: 600px;
-                overflow-y: auto;
-                border: 1px solid #e0e0e0;
-                border-radius: 8px;
-                background: white;
-                position: relative;
-            }}
-            #virtual-content-{player_id} {{
-                padding: 8px;
-                position: relative;
-            }}
-            .utterance-{player_id} {{
-                padding: 8px 12px;
-                margin: 2px 0;
-                border-radius: 6px;
-                cursor: pointer;
-                transition: all 0.15s ease;
-                border-left: 3px solid transparent;
-                font-size: 0.95em;
-                line-height: 1.5;
-                background: #fdfdfd;
-            }}
-            .utterance-{player_id}:hover {{
-                background-color: #f0f8ff;
-                transform: translateX(3px);
-                box-shadow: 0 2px 8px rgba(33, 150, 243, 0.2);
-            }}
-            .current-{player_id} {{
-                background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%) !important;
-                border-left: 3px solid #2196f3 !important;
-                font-weight: 500;
-                box-shadow: 0 3px 12px rgba(33, 150, 243, 0.3);
-                transform: translateX(3px);
-            }}
-            .timestamp-{player_id} {{
-                font-size: 0.8em;
-                color: #666;
-                margin-right: 8px;
-                font-weight: 600;
-                background: #f5f5f5;
-                padding: 2px 6px;
-                border-radius: 3px;
-            }}
-            .pagination-{player_id} {{
-                display: flex;
-                justify-content: center;
-                align-items: center;
-                padding: 10px;
-                background: #f8f9fa;
-                border-top: 1px solid #e0e0e0;
-                gap: 10px;
-            }}
-            .pagination-{player_id} button {{
-                padding: 6px 12px;
-                border: 1px solid #ddd;
-                background: white;
-                border-radius: 4px;
-                cursor: pointer;
-                transition: all 0.2s;
-            }}
-            .pagination-{player_id} button:hover {{
-                background: #e3f2fd;
-                border-color: #2196f3;
-            }}
-            .pagination-{player_id} button:disabled {{
-                opacity: 0.5;
-                cursor: not-allowed;
-            }}
-            .stats-{player_id} {{
-                font-size: 0.85em;
-                color: #666;
-                text-align: center;
-                padding: 5px;
-                background: #f8f9fa;
-            }}
-
-            /* Inline editing styles */
-            .edit-mode-{player_id} {{
-                background: #fff8e1 !important;
-                border: 2px solid #ff9800 !important;
-                border-radius: 8px;
-            }}
-
-            .edit-controls-{player_id} {{
-                display: flex;
-                align-items: center;
-                gap: 8px;
-                margin-top: 8px;
-                padding-top: 8px;
-                border-top: 1px solid #e0e0e0;
-            }}
-
-            .edit-textarea-{player_id} {{
-                width: 100%;
-                border: 1px solid #ddd;
-                border-radius: 4px;
-                padding: 8px;
-                font-size: 0.95em;
-                line-height: 1.5;
-                resize: vertical;
-                min-height: 60px;
-                font-family: inherit;
-            }}
-
-            .edit-btn-{player_id} {{
-                padding: 4px 8px;
-                border: 1px solid #ddd;
-                border-radius: 4px;
-                background: white;
-                cursor: pointer;
-                font-size: 0.8em;
-                transition: all 0.2s;
-            }}
-
-            .edit-btn-{player_id}.save {{
-                background: #4caf50;
-                color: white;
-                border-color: #4caf50;
-            }}
-
-            .edit-btn-{player_id}.cancel {{
-                background: #f44336;
-                color: white;
-                border-color: #f44336;
-            }}
-
-            .edit-btn-{player_id}:hover {{
-                opacity: 0.8;
-            }}
-
-            .edit-icon-{player_id} {{
-                position: absolute;
-                top: 8px;
-                right: 8px;
-                background: rgba(255, 152, 0, 0.1);
-                border: 1px solid #ff9800;
-                border-radius: 50%;
-                width: 24px;
-                height: 24px;
-                display: flex;
-                align-items: center;
-                justify-content: center;
-                cursor: pointer;
-                font-size: 12px;
-                opacity: 0;
-                transition: opacity 0.2s;
-            }}
-
-            .utterance-{player_id}:hover .edit-icon-{player_id} {{
-                opacity: 1;
-            }}
-
-            .utterance-text-{player_id} {{
-                position: relative;
-                padding-right: 30px;
-            }}
-        </style>
-    </head>
-    <body>
-        {audio_warning}
-        <div id="audio-container-{player_id}">
-            <audio id="audio-{player_id}" controls preload="auto" style="width: 100%;">
-                <source src="{audio_url}" type="audio/mp3">
-                <source src="{audio_url}" type="audio/mpeg">
-                <source src="{audio_url}" type="audio/wav">
-                Your browser does not support the audio element.
-            </audio>
-        </div>
-
-        <div class="stats-{player_id}">
-            📊 {len(display_utterances)} utterances • ⏱️ {display_utterances[-1][1]:.1f}s duration
-            {' • 🔄 Virtual scrolling enabled' if use_virtualization else ''}
-            {' • 🎭 Speaker diarization active' if has_speakers else ''}
-        </div>
-
-        <div id="transcript-container-{player_id}">
-            <div id="virtual-content-{player_id}"></div>
-        </div>
-
-        {"<div class='pagination-" + player_id + "' id='pagination-" + player_id + "'></div>" if use_virtualization else ""}
-
-        <script>
-        (function() {{
-            const playerId = '{player_id}';
-            const player = document.getElementById('audio-' + playerId);
-            const container = document.getElementById('transcript-container-' + playerId);
-            const virtualContent = document.getElementById('virtual-content-' + playerId);
-            const utterances = {utterances_json};
-            const useVirtualization = {str(use_virtualization).lower()};
-            const maxVisibleItems = {max_visible_items};
-            const hasSpeakers = {str(has_speakers).lower()};
-            const speakerColors = {speaker_colors_json};
-
-            let currentHighlight = null;
-            let isSeeking = false;
-            let lastUpdateTime = 0;
-            let currentPage = 1;
-            let itemsPerPage = maxVisibleItems;
-            let totalPages = Math.ceil(utterances.length / itemsPerPage);
-
-            // Binary search for efficient utterance finding - O(log n)
-            function findActiveUtterance(currentTime) {{
-                let left = 0, right = utterances.length - 1;
-                let result = -1;
-
-                while (left <= right) {{
-                    const mid = Math.floor((left + right) / 2);
-                    const [start, end] = utterances[mid];
-
-                    if (currentTime >= start && currentTime < end) {{
-                        return mid;
-                    }} else if (currentTime < start) {{
-                        right = mid - 1;
-                    }} else {{
-                        left = mid + 1;
-                        if (currentTime >= start) result = mid; // Keep track of closest
-                    }}
-                }}
-                return result;
-            }}
-
-            // Efficient DOM builder with virtual scrolling
-            function buildTranscript(page = 1) {{
-                virtualContent.innerHTML = '';
-
-                let startIdx, endIdx;
-                if (useVirtualization) {{
-                    startIdx = (page - 1) * itemsPerPage;
-                    endIdx = Math.min(startIdx + itemsPerPage, utterances.length);
-                }} else {{
-                    startIdx = 0;
-                    endIdx = utterances.length;
-                }}
-
-                // Create document fragment for efficient DOM insertion
-                const fragment = document.createDocumentFragment();
-
-                for (let i = startIdx; i < endIdx; i++) {{
-                    const utt = utterances[i];
-                    if (utt.length < 3) continue;
-
-                    const [start, end, text] = utt;
-                    const speakerId = hasSpeakers && utt.length >= 4 ? utt[3] : null;
-
-                    const div = document.createElement('div');
-                    div.className = 'utterance-' + playerId;
-                    div.dataset.start = start;
-                    div.dataset.end = end;
-                    div.dataset.index = i;
-
-                    // Apply speaker color if available
-                    if (speakerId !== null && speakerColors[speakerId]) {{
-                        div.style.borderLeftColor = speakerColors[speakerId];
-                        div.style.backgroundColor = speakerColors[speakerId] + '15'; // 15% opacity
-                    }}
-
-                    const minutes = Math.floor(start / 60);
-                    const seconds = Math.floor(start % 60).toString().padStart(2, '0');
-
-                    // Build content with optional speaker label and edit controls
-                    let content = `<span class="timestamp-${{playerId}}">[${{minutes}}:${{seconds}}]</span>`;
-                    if (speakerId !== null) {{
-                        content += ` <span class="speaker-label-${{playerId}}" style="background: ${{speakerColors[speakerId] || '#ccc'}}; color: white; padding: 2px 6px; border-radius: 3px; font-size: 0.8em; margin-right: 6px;">S${{speakerId + 1}}</span>`;
-                    }}
-
-                    // Wrap text in a container for editing
-                    content += `<div class="utterance-text-${{playerId}}">
-                        <span class="text-display-${{playerId}}">${{text}}</span>
-                        <div class="edit-icon-${{playerId}}" onclick="startEdit(${{i}})" title="Edit this utterance">✏️</div>
-                        <div class="edit-mode-container-${{playerId}}" style="display: none;">
-                            <textarea class="edit-textarea-${{playerId}}">${{text}}</textarea>
-                            <div class="edit-controls-${{playerId}}">
-                                <button class="edit-btn-${{playerId}} save" onclick="saveEdit(${{i}})">💾 Save</button>
-                                <button class="edit-btn-${{playerId}} cancel" onclick="cancelEdit(${{i}})">❌ Cancel</button>
-                            </div>
-                        </div>
-                    </div>`;
-
-                    div.innerHTML = content;
-
-                    // Optimized click handler
-                    div.addEventListener('click', (e) => {{
-                        e.stopPropagation();
-                        isSeeking = true;
-                        player.currentTime = start;
-                        player.play().catch(() => {{}});
-                        setTimeout(() => isSeeking = false, 150);
-                    }});
-
-                    fragment.appendChild(div);
-                }}
-
-                virtualContent.appendChild(fragment);
-                updatePagination();
-            }}
-
-            // Pagination controls
-            function updatePagination() {{
-                if (!useVirtualization) return;
-
-                const pagination = document.getElementById('pagination-' + playerId);
-                if (!pagination) return;
-
-                pagination.innerHTML = `
-                    <button onclick="window.transcriptPlayers_${{playerId}}.goToPage(1)"
-                            ${{currentPage === 1 ? 'disabled' : ''}}>⏮️</button>
-                    <button onclick="window.transcriptPlayers_${{playerId}}.goToPage(${{Math.max(1, currentPage - 1)}})"
-                            ${{currentPage === 1 ? 'disabled' : ''}}>⏪</button>
-                    <span>Page ${{currentPage}} of ${{totalPages}}</span>
-                    <button onclick="window.transcriptPlayers_${{playerId}}.goToPage(${{Math.min(totalPages, currentPage + 1)}})"
-                            ${{currentPage === totalPages ? 'disabled' : ''}}>⏩</button>
-                    <button onclick="window.transcriptPlayers_${{playerId}}.goToPage(${{totalPages}})"
-                            ${{currentPage === totalPages ? 'disabled' : ''}}>⏭️</button>
-                `;
-            }}
-
-            // Page navigation
-            function goToPage(page) {{
-                if (page < 1 || page > totalPages) return;
-                currentPage = page;
-                buildTranscript(currentPage);
-            }}
-
-            // Auto-navigate to page containing active utterance
-            function navigateToActiveUtterance(utteranceIndex) {{
-                if (!useVirtualization || utteranceIndex === -1) return;
-
-                const targetPage = Math.ceil((utteranceIndex + 1) / itemsPerPage);
-                if (targetPage !== currentPage) {{
-                    currentPage = targetPage;
-                    buildTranscript(currentPage);
-                }}
-            }}
-
-            // Optimized highlighting with debouncing - max 20fps for better performance
-            function updateHighlight() {{
-                const now = Date.now();
-                if (now - lastUpdateTime < 50) return; // 20fps max
-                lastUpdateTime = now;
-
-                if (isSeeking) return;
-
-                const time = player.currentTime;
-                const activeUtteranceIndex = findActiveUtterance(time);
-
-                // Auto-navigate to correct page if needed
-                navigateToActiveUtterance(activeUtteranceIndex);
-
-                // Find active div in current page
-                const divs = virtualContent.querySelectorAll('.utterance-' + playerId);
-                let activeDiv = null;
-
-                for (const div of divs) {{
-                    const index = parseInt(div.dataset.index);
-                    if (index === activeUtteranceIndex) {{
-                        activeDiv = div;
-                        break;
-                    }}
-                }}
-
-                // Update highlight with smooth transition
-                if (activeDiv !== currentHighlight) {{
-                    if (currentHighlight) {{
-                        currentHighlight.classList.remove('current-' + playerId);
-                    }}
-                    if (activeDiv) {{
-                        activeDiv.classList.add('current-' + playerId);
-                        // Smooth scroll with animation
-                        activeDiv.scrollIntoView({{
-                            behavior: 'smooth',
-                            block: 'center',
-                            inline: 'nearest'
-                        }});
-                    }}
-                    currentHighlight = activeDiv;
-                }}
-            }}
-
-            // Global API for pagination
-            window.transcriptPlayers_{player_id} = {{ goToPage }};
-
-            // Initialize
-            buildTranscript(1);
-            player.addEventListener('timeupdate', updateHighlight);
-
-            // Enhanced audio loading diagnostics with UI feedback
-            player.addEventListener('loadstart', () => {{
-                console.log('🔄 Audio loading started');
-                const container = document.getElementById('audio-container-' + playerId);
-                const statusDiv = document.createElement('div');
-                statusDiv.id = 'loading-status-' + playerId;
-                statusDiv.style.cssText = 'padding: 5px; background: #e3f2fd; color: #1976d2; border-radius: 4px; margin-top: 5px; font-size: 0.9em;';
-                statusDiv.innerHTML = '🔄 Loading audio...';
-                container.appendChild(statusDiv);
-            }});
-
-            player.addEventListener('loadedmetadata', () => {{
-                console.log('✅ Audio metadata loaded');
-                const statusDiv = document.getElementById('loading-status-' + playerId);
-                if (statusDiv) statusDiv.innerHTML = '✅ Metadata loaded';
-            }});
-
-            player.addEventListener('loadeddata', () => {{
-                console.log('✅ Audio data loaded');
-                const statusDiv = document.getElementById('loading-status-' + playerId);
-                if (statusDiv) statusDiv.innerHTML = '✅ Audio data ready';
-            }});
-
-            player.addEventListener('canplay', () => {{
-                console.log('▶️ Audio can start playing');
-                const statusDiv = document.getElementById('loading-status-' + playerId);
-                if (statusDiv) {{
-                    statusDiv.innerHTML = '🎵 Ready to play';
-                    setTimeout(() => statusDiv.remove(), 2000);
-                }}
-            }});
-
-            player.addEventListener('canplaythrough', () => {{
-                console.log('🚀 Audio can play through');
-            }});
-
-            player.addEventListener('error', (e) => {{
-                console.error('❌ Audio error:', e, player.error);
-                const statusDiv = document.getElementById('loading-status-' + playerId);
-                if (statusDiv) statusDiv.remove();
-
-                const errorDiv = document.createElement('div');
-                errorDiv.style.cssText = 'padding: 10px; background: #ffebee; color: #c62828; border-radius: 4px; margin-top: 10px; border-left: 4px solid #f44336;';
-
-                let errorMessage = '❌ Audio loading failed. ';
-                if (player.error) {{
-                    switch(player.error.code) {{
-                        case 1: errorMessage += 'Network error - check your connection.'; break;
-                        case 2: errorMessage += 'File format not supported.'; break;
-                        case 3: errorMessage += 'Audio decoding failed.'; break;
-                        case 4: errorMessage += 'Audio source not usable.'; break;
-                        default: errorMessage += 'Unknown error occurred.';
-                    }}
-                }} else {{
-                    errorMessage += 'Please check the file format and try again.';
-                }}
-
-                errorDiv.innerHTML = errorMessage;
-                document.getElementById('audio-container-' + playerId).appendChild(errorDiv);
-            }});
-
-            // Timeout fallback - if no canplay event after 30 seconds
-            setTimeout(() => {{
-                if (player.readyState === 0) {{
-                    console.warn('⚠️ Audio loading timeout');
-                    const container = document.getElementById('audio-container-' + playerId);
-                    const timeoutDiv = document.createElement('div');
-                    timeoutDiv.style.cssText = 'padding: 8px; background: #fff3e0; color: #f57c00; border-radius: 4px; margin-top: 5px;';
-                    timeoutDiv.innerHTML = '⚠️ Audio loading is taking longer than expected. Large file or slow connection?';
-                    container.appendChild(timeoutDiv);
-                }}
-            }}, 30000);
-
-            // Handle seek events
-            player.addEventListener('seeking', () => isSeeking = true);
-            player.addEventListener('seeked', () => {{
-                setTimeout(() => isSeeking = false, 100);
-            }});
-
-            // Keyboard navigation
-            document.addEventListener('keydown', (e) => {{
-                if (!useVirtualization) return;
-                if (e.target.tagName === 'INPUT' || e.target.tagName === 'TEXTAREA') return;
-
-                if (e.key === 'ArrowLeft' && currentPage > 1) {{
-                    e.preventDefault();
-                    goToPage(currentPage - 1);
-                }} else if (e.key === 'ArrowRight' && currentPage < totalPages) {{
-                    e.preventDefault();
-                    goToPage(currentPage + 1);
-                }}
-            }});
-
-            // Inline editing functions
-            window.startEdit = function(index) {{
-                const div = document.querySelector(`[data-index="${{index}}"]`);
-                if (!div) return;
-
-                const textDisplay = div.querySelector('.text-display-' + playerId);
-                const editContainer = div.querySelector('.edit-mode-container-' + playerId);
-                const textarea = div.querySelector('.edit-textarea-' + playerId);
-
-                if (!textDisplay || !editContainer || !textarea) return;
-
-                // Store original text for cancel
-                textarea.dataset.originalText = textDisplay.textContent;
-
-                // Switch to edit mode
-                textDisplay.style.display = 'none';
-                editContainer.style.display = 'block';
-                div.classList.add('edit-mode-' + playerId);
-
-                // Focus and select all text
-                textarea.focus();
-                textarea.select();
-            }};
-
-            window.saveEdit = function(index) {{
-                const div = document.querySelector(`[data-index="${{index}}"]`);
-                if (!div) return;
-
-                const textDisplay = div.querySelector('.text-display-' + playerId);
-                const editContainer = div.querySelector('.edit-mode-container-' + playerId);
-                const textarea = div.querySelector('.edit-textarea-' + playerId);
-
-                if (!textDisplay || !editContainer || !textarea) return;
-
-                const newText = textarea.value.trim();
-                if (!newText) {{
-                    alert('Text cannot be empty');
-                    return;
-                }}
-
-                // Update display text
-                textDisplay.textContent = newText;
-
-                // Update utterances data
-                utterances[index][2] = newText;
-
-                // Send update to Streamlit (via session state simulation)
-                try {{
-                    // Create a custom event to notify Streamlit about the change
-                    const updateEvent = new CustomEvent('utteranceUpdate', {{
-                        detail: {{
-                            index: index,
-                            text: newText,
-                            playerId: playerId
-                        }}
-                    }});
-                    window.dispatchEvent(updateEvent);
-
-                    // Store in localStorage as backup
-                    const editKey = 'voxsum_edits_' + playerId;
-                    let edits = JSON.parse(localStorage.getItem(editKey) || '{{}}');
-                    edits[index] = newText;
-                    localStorage.setItem(editKey, JSON.stringify(edits));
-
-                    console.log('💾 Utterance updated:', index, newText);
-                }} catch (e) {{
-                    console.warn('⚠️ Could not save to session state:', e);
-                }}
-
-                // Exit edit mode
-                cancelEdit(index, false);
-
-                // Show success feedback
-                showSuccessMessage(div, 'Saved!');
-            }};
-
-            window.cancelEdit = function(index, restoreText = true) {{
-                const div = document.querySelector(`[data-index="${{index}}"]`);
-                if (!div) return;
-
-                const textDisplay = div.querySelector('.text-display-' + playerId);
-                const editContainer = div.querySelector('.edit-mode-container-' + playerId);
-                const textarea = div.querySelector('.edit-textarea-' + playerId);
-
-                if (!textDisplay || !editContainer || !textarea) return;
-
-                // Restore original text if cancelling
-                if (restoreText && textarea.dataset.originalText) {{
-                    textarea.value = textarea.dataset.originalText;
-                }}
-
-                // Exit edit mode
-                textDisplay.style.display = 'inline';
-                editContainer.style.display = 'none';
-                div.classList.remove('edit-mode-' + playerId);
-            }};
-
-            // Helper function to show success message
-            function showSuccessMessage(div, message) {{
-                const successDiv = document.createElement('div');
-                successDiv.style.cssText = `
-                    position: absolute;
-                    top: -30px;
-                    right: 10px;
-                    background: #4caf50;
-                    color: white;
-                    padding: 4px 8px;
-                    border-radius: 4px;
-                    font-size: 0.8em;
-                    pointer-events: none;
-                    z-index: 1000;
-                `;
-                successDiv.textContent = message;
-
-                div.style.position = 'relative';
-                div.appendChild(successDiv);
-
-                setTimeout(() => {{
-                    if (successDiv.parentNode) {{
-                        successDiv.parentNode.removeChild(successDiv);
-                    }}
-                }}, 2000);
-            }}
-
-            // Load saved edits from localStorage
-            const editKey = 'voxsum_edits_' + playerId;
-            const savedEdits = JSON.parse(localStorage.getItem(editKey) || '{{}}');
-            for (const [index, text] of Object.entries(savedEdits)) {{
-                if (utterances[index]) {{
-                    utterances[index][2] = text;
-                }}
-            }}
-        }})();
-        </script>
-    </body>
-    </html>
-    """
-    return html_content
-
-def create_export_interface():
-    """Create interface for exporting transcripts and summaries"""
-    if not st.session_state.utterances and not st.session_state.summary:
-        return
-
-    st.markdown("### 📥 Export Options")
-
-    export_tab1, export_tab2 = st.tabs(["📝 Transcript", "📄 Summary"])
-
-    with export_tab1:
-        if st.session_state.utterances:
-            # Choose format based on speaker diarization
-            if st.session_state.utterances_with_speakers:
-                st.markdown("**Speaker diarization detected - Transcript formats available:**")
-                format_options = TRANSCRIPT_FORMATS
-            else:
-                st.markdown("**No speaker diarization - Subtitle formats available:**")
-                format_options = SUBTITLE_FORMATS
-
-            # Format selection
-            format_name = st.selectbox(
-                "Export format",
-                list(format_options.keys()),
-                key="transcript_export_format"
-            )
-
-            format_info = format_options[format_name]
-
-            # Export button and download
-            if st.button(f"📥 Export as {format_name}", key="export_transcript"):
-                # Prepare data - use available utterances (with or without speakers)
-                if st.session_state.utterances_with_speakers:
-                    utterances_data = st.session_state.utterances_with_speakers
-                else:
-                    utterances_data = [(start, end, text, 0) for start, end, text in st.session_state.utterances]
-
-                # Generate content
-                try:
-                    if format_name in SUBTITLE_FORMATS:
-                        # For subtitle formats, use regular utterances
-                        regular_utterances = [(start, end, text) for start, end, text, _ in utterances_data]
-                        content = format_info["function"](regular_utterances, utterances_data if st.session_state.utterances_with_speakers else None)
-                    else:
-                        # For transcript formats, pass speaker-aware data
-                        content = format_info["function"](
-                            [(start, end, text) for start, end, text, _ in utterances_data],
-                            utterances_data if st.session_state.utterances_with_speakers else None
-                        )
-
-                    # Create download button
-                    filename = f"transcript_{datetime.now().strftime('%Y%m%d_%H%M%S')}{format_info['extension']}"
-                    st.download_button(
-                        label=f"💾 Download {filename}",
-                        data=content,
-                        file_name=filename,
-                        mime=format_info["mime_type"]
-                    )
-
-                except Exception as e:
-                    st.error(f"Export failed: {str(e)}")
-        else:
-            st.info("No transcript available for export")
-
-    with export_tab2:
-        if st.session_state.summary:
-            # Summary export formats
-            format_name = st.selectbox(
-                "Summary format",
-                list(SUMMARY_FORMATS.keys()),
-                key="summary_export_format"
-            )
-
-            format_info = SUMMARY_FORMATS[format_name]
-
-            # Metadata for summary
-            with st.expander("📋 Add metadata (optional)"):
-                metadata = {}
-                metadata["title"] = st.text_input("Title", key="summary_title")
-                metadata["date"] = st.date_input("Date", value=datetime.now().date(), key="summary_date").isoformat()
-                if st.session_state.utterances_with_speakers:
-                    num_speakers = len(set(speaker for _, _, _, speaker in st.session_state.utterances_with_speakers))
-                    metadata["speakers"] = f"{num_speakers} speakers detected"
-                if st.session_state.audio_path:
-                    # Calculate duration if possible
-                    try:
-                        if st.session_state.utterances:
-                            last_utterance = st.session_state.utterances[-1]
-                            duration_sec = last_utterance[1]  # end time
-                            duration_min = int(duration_sec // 60)
-                            duration_sec_remainder = int(duration_sec % 60)
-                            metadata["duration"] = f"{duration_min}m {duration_sec_remainder}s"
-                    except:
-                        pass
-
-            # Clean empty metadata
-            metadata = {k: v for k, v in metadata.items() if v}
-
-            # Export button
-            if st.button(f"📥 Export summary as {format_name}", key="export_summary"):
-                try:
-                    content = format_info["function"](st.session_state.summary, metadata if metadata else None)
-
-                    filename = f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}{format_info['extension']}"
-                    st.download_button(
-                        label=f"💾 Download {filename}",
-                        data=content,
-                        file_name=filename,
-                        mime=format_info["mime_type"]
-                    )
-
-                except Exception as e:
-                    st.error(f"Export failed: {str(e)}")
-        else:
-            st.info("No summary available for export")
-
-def render_results_tab(settings):
-    st.subheader("🎤 Transcription & Summary")
-    status_placeholder = st.empty()
-    transcript_display = st.empty()
-    summary_container = st.container()
-
-    # Update pagination settings
-    st.session_state.utterances_per_page = settings.get("utterances_per_page", 100)
-
-    # Handle audio base64 encoding
-    if (st.session_state.audio_path and
-            st.session_state.get("prev_audio_path") != st.session_state.audio_path):
-        st.session_state.audio_base64 = None
-        st.session_state.prev_audio_path = st.session_state.audio_path
-        st.session_state.static_audio_url = None  # Reset static URL
-
-    # Transcription Process
-    if st.button("🎙️ Transcribe Audio"):
-        if st.session_state.audio_path:
-            status_placeholder.info("🔊 Transcribing audio... Please wait.")
-            st.session_state.utterances = []
-            st.session_state.transcript = ""
-            st.session_state.transcribing = True
-
-            with transcript_display.container():
-                st.markdown("### 📝 Live Transcript (Streaming)")
-                live_placeholder = st.empty()
-                progress_bar = st.progress(0)
-                utterance_counter = st.empty()
-
-            try:
-                # Determine model name and backend-specific parameters
-                if st.session_state.backend == "moonshine":
-                    model_key = model_names[settings["model_name"]]
-                else:
-                    model_key = sensevoice_models[settings["model_name"]]
-
-                gen = transcribe_file(
-                    st.session_state.audio_path,
-                    settings["vad_threshold"],
-                    model_key,
-                    backend=st.session_state.backend,
-                    language=st.session_state.language if st.session_state.backend == "sensevoice" else "auto",
-                    textnorm=st.session_state.textnorm if st.session_state.backend == "sensevoice" else "withitn"
-                )
-
-                # Estimate total duration for progress
-                try:
-                    import soundfile as sf
-                    audio_info = sf.info(st.session_state.audio_path)
-                    total_duration = audio_info.duration
-                except:
-                    total_duration = None
-
-                utterance_count = 0
-                for current_utterance, all_utts in gen:
-                    st.session_state.utterances = list(all_utts) if all_utts else []
-                    utterance_count = len(st.session_state.utterances)
-
-                    # Update progress if we have duration info
-                    if total_duration and current_utterance:
-                        progress = min(1.0, current_utterance[1] / total_duration)
-                        progress_bar.progress(progress)
-
-                    # Efficient transcript display for streaming
-                    if utterance_count <= 200:
-                        # For smaller transcripts, show full text
-                        st.session_state.transcript = "\n".join(
-                            text for start, end, text in st.session_state.utterances
-                        )
-                        live_placeholder.markdown(st.session_state.transcript)
-                    else:
-                        # For large transcripts, show last few utterances only
-                        recent_utterances = st.session_state.utterances[-10:]
-                        recent_text = "\n".join(
-                            f"[{int(start//60)}:{int(start%60):02d}] {text}"
-                            for start, end, text in recent_utterances
-                        )
-                        live_placeholder.markdown(f"**Recent utterances (last 10):**\n{recent_text}")
-
-                    utterance_counter.info(f"📊 {utterance_count} utterances processed")
-
-                st.session_state.transcribing = False
-                progress_bar.progress(1.0)
-                status_placeholder.success(f"✅ Transcription completed! {utterance_count} utterances generated.")
-
-                # Perform speaker diarization if enabled
-                print(f"🔍 DEBUG Diarization Check: enable_diarization={st.session_state.enable_diarization}, utterances_count={len(st.session_state.utterances)}")
-                if st.session_state.enable_diarization and st.session_state.utterances:
-                    print("✅ DEBUG: Starting diarization process...")
-                    status_placeholder.info("🎭 Performing speaker diarization... This may take a few minutes.")
-                    diarization_progress = st.progress(0)
-
-                    try:
-                        # Initialize embedding extractor (lighter than full diarization system)
-                        print("🔍 DEBUG: Initializing embedding extractor...")
-                        extractor_result = init_speaker_embedding_extractor(
-                            cluster_threshold=st.session_state.cluster_threshold,
-                            num_speakers=st.session_state.num_speakers
-                        )
-
-                        if extractor_result:
-                            print("✅ DEBUG: Embedding extractor initialized successfully")
-                            embedding_extractor, config_dict = extractor_result
-
-                            # Load audio for diarization (needs to be 16kHz)
-                            import soundfile as sf
-                            import scipy.signal
-
-                            audio, sample_rate = sf.read(st.session_state.audio_path, dtype='float32')
-
-                            # Resample to 16kHz if needed (reusing existing resampling logic)
-                            if sample_rate != 16000:
-                                audio = scipy.signal.resample(audio, int(len(audio) * 16000 / sample_rate))
-                                sample_rate = 16000
-
-                            # Ensure mono
-                            if len(audio.shape) > 1:
-                                audio = audio.mean(axis=1)
-
-                            # Progress callback for diarization
-                            def diarization_progress_callback(progress):
-                                diarization_progress.progress(min(1.0, progress))
-
-                            # Perform diarization using existing ASR utterance segments
-                            print(f"🔍 DEBUG: Starting diarization with {len(st.session_state.utterances)} utterances")
-                            diarization_result = perform_speaker_diarization_on_utterances(
-                                audio, sample_rate, st.session_state.utterances,
-                                embedding_extractor, config_dict, diarization_progress_callback
-                            )
-                            print(f"🔍 DEBUG: Diarization returned {len(diarization_result) if diarization_result else 0} results")
-
-                            if diarization_result:
-                                print("✅ DEBUG: Merging transcription with diarization...")
-                                # Merge transcription with diarization
-                                merged_utterances = merge_transcription_with_diarization(
-                                    st.session_state.utterances, diarization_result
-                                )
-
-                                # Merge consecutive utterances from the same speaker
-                                st.session_state.utterances_with_speakers = merge_consecutive_utterances(
-                                    merged_utterances, max_gap=1.0
-                                )
-                                print(f"✅ DEBUG: Merged result has {len(st.session_state.utterances_with_speakers)} utterances with speakers")
-
-                                # Calculate statistics
-                                st.session_state.diarization_stats = get_diarization_stats(
-                                    st.session_state.utterances_with_speakers
-                                )
-
-                                diarization_progress.progress(1.0)
-                                num_speakers = st.session_state.diarization_stats.get("total_speakers", 0)
-                                status_placeholder.success(f"✅ Speaker diarization completed! {num_speakers} speakers detected.")
-                            else:
-                                print("❌ DEBUG: Diarization returned empty result")
-                                status_placeholder.error("❌ Speaker diarization failed.")
-                                st.session_state.utterances_with_speakers = []
-                        else:
-                            print("❌ DEBUG: Failed to initialize embedding extractor")
-                            status_placeholder.error("❌ Failed to initialize speaker diarization.")
-                            st.session_state.utterances_with_speakers = []
-
-                    except Exception as e:
-                        print(f"❌ DEBUG: Exception in diarization: {str(e)}")
-                        status_placeholder.error(f"❌ Speaker diarization error: {str(e)}")
-                        st.session_state.utterances_with_speakers = []
-                else:
-                    # No diarization requested - clear previous results
-                    print(f"❌ DEBUG: Diarization not executed - enable_diarization={st.session_state.enable_diarization}, has_utterances={bool(st.session_state.utterances)}")
-                    st.session_state.utterances_with_speakers = []
-                    st.session_state.diarization_stats = {}
-
-                st.rerun()
-            except Exception as e:
-                status_placeholder.error(f"Transcription error: {str(e)}")
-                st.session_state.transcribing = False
-        else:
-            status_placeholder.warning("⚠️ No audio file available")
-
-    # Summarization Process
-    if st.button("📝 Generate Summary"):
-        if st.session_state.transcript:
-            status_placeholder.info("🧠 Generating summary...")
-            st.session_state.summary = ""
-            summary_container.empty()
-
-            # Show transcript during summarization
-            with transcript_display.container():
-                if st.session_state.audio_path and st.session_state.utterances:
-                    # Use efficient player for summarization view with speaker colors if available
-                    utterances_display = st.session_state.utterances_with_speakers if st.session_state.utterances_with_speakers else None
-                    html = create_efficient_sync_player(
-                        st.session_state.audio_path,
-                        st.session_state.utterances,
-                        utterances_display
-                    )
-                    # Dynamic height calculation with better scaling - increased for more visibility
-                    base_height = 300
-                    content_height = min(800, max(base_height, len(st.session_state.utterances) * 15 + 200))
-                    st.components.v1.html(html, height=content_height, scrolling=True)
-                elif st.session_state.utterances:
-                    st.markdown("### 📝 Transcript")
-                    # For very long transcripts, show summary info
-                    if len(st.session_state.utterances) > 500:
-                        st.info(f"📊 Large transcript: {len(st.session_state.utterances)} utterances")
-                        with st.expander("View full transcript"):
-                            st.markdown(st.session_state.transcript)
-                    else:
-                        st.markdown(st.session_state.transcript)
-                else:
-                    st.info("No transcript available.")
-
-            # Live summary display
-            live_summary_area = st.empty()
-            with live_summary_area.container():
-                st.markdown("### 📝 Live Summary (In Progress)")
-                progress_placeholder = st.empty()
-
-                summary_gen = summarize_transcript(
-                    st.session_state.transcript,
-                    settings["llm_model"],
-                    settings["prompt_input"]
-                )
-
-                for accumulated_summary in summary_gen:
-                    st.session_state.summary = accumulated_summary
-                    progress_placeholder.markdown(accumulated_summary)
-
-            live_summary_area.empty()
-            st.rerun()
-        else:
-            status_placeholder.warning("⚠️ No transcript available")
-
-    # Display final results
-    if st.session_state.audio_path and st.session_state.utterances and not st.session_state.transcribing:
-        # Show speaker diarization statistics if available
-        if st.session_state.diarization_stats and st.session_state.diarization_stats.get("total_speakers", 0) > 0:
-            st.markdown("### 🎭 Speaker Analysis")
-            stats = st.session_state.diarization_stats
-
-            col1, col2 = st.columns([2, 1])
-            with col1:
-                # Speaker breakdown
-                speaker_data = []
-                for speaker_id, speaker_stats in stats["speakers"].items():
-                    speaker_data.append({
-                        "Speaker": f"Speaker {speaker_id + 1}",
-                        "Speaking Time": f"{speaker_stats['speaking_time']:.1f}s",
-                        "Percentage": f"{speaker_stats['percentage']:.1f}%",
-                        "Utterances": speaker_stats['utterances'],
-                        "Avg Length": f"{speaker_stats['avg_utterance_length']:.1f}s"
-                    })
-
-                import pandas as pd
-                df = pd.DataFrame(speaker_data)
-                st.dataframe(df, use_container_width=True)
-
-            with col2:
-                st.metric("Total Speakers", stats["total_speakers"])
-                st.metric("Total Duration", f"{stats['total_duration']:.1f}s")
-
-        # Performance optimization: show stats for large transcripts
-        if len(st.session_state.utterances) > 100:
-            col1, col2, col3 = st.columns(3)
-            with col1:
-                st.metric("📊 Utterances", len(st.session_state.utterances))
-            with col2:
-                duration = st.session_state.utterances[-1][1] if st.session_state.utterances else 0
-                st.metric("⏱️ Duration", f"{duration/60:.1f} min")
-            with col3:
-                avg_length = sum(len(text) for _, _, text in st.session_state.utterances) / len(st.session_state.utterances)
-                st.metric("📝 Avg Length", f"{avg_length:.0f} chars")
-
-        # Use efficient player for final results with speaker colors if available
-        utterances_display = st.session_state.utterances_with_speakers if st.session_state.utterances_with_speakers else None
-
-        # DEBUG: Print information about diarization
-        if utterances_display:
-            print(f"🎭 DEBUG: Using diarized utterances - {len(utterances_display)} segments with speakers")
-            for i, (start, end, text, speaker) in enumerate(utterances_display[:3]):  # Show first 3
-                print(f" Sample {i+1}: [{start:.1f}-{end:.1f}s] Speaker {speaker}: '{text[:30]}...'")
-        else:
-            print(f"📝 DEBUG: Using regular utterances - {len(st.session_state.utterances)} segments without speakers")
-
-        html = create_efficient_sync_player(
-            st.session_state.audio_path,
-            st.session_state.utterances,
-            utterances_display
-        )
-        # Improved height calculation for better UX - increased for more transcript visibility
-        base_height = 350
-        content_height = min(900, max(base_height, len(st.session_state.utterances) * 12 + 250))
-
-        with transcript_display.container():
-            st.components.v1.html(html, height=content_height, scrolling=True)
-
-        # Show formatted transcript with speakers if diarization was performed
-        if st.session_state.utterances_with_speakers:
-            with st.expander("📄 Speaker-Labeled Transcript", expanded=False):
-                formatted_transcript = format_speaker_transcript(st.session_state.utterances_with_speakers)
-                st.markdown(formatted_transcript)
-
-        # Add export interface (editing is now inline)
-        st.markdown("---")
-        create_export_interface()
-
-    elif not st.session_state.utterances and not st.session_state.transcribing:
-        with transcript_display.container():
-            st.info("No transcript available. Click 'Transcribe Audio' to generate one.")
-
-    if st.session_state.summary:
-        with summary_container:
-            st.markdown("### 📝 Final Summary")
-            st.markdown(st.session_state.summary)
-
-# === 3. Main App ===
-def main():
-    init_session_state()
-
-    # Optimized page config for HF Spaces and large files
-    st.set_page_config(
-        page_title="🎙️ ASR + LLM",
-        layout="wide",
-        initial_sidebar_state="expanded",
-        menu_items={
-            'Get Help': 'https://github.com/your-repo/issues',
-            'Report a bug': 'https://github.com/your-repo/issues',
-            'About': "VoxSum Studio - Optimized for large audio files"
-        }
-    )
-
-    # HF Spaces specific optimizations
-    if os.environ.get('SPACE_ID'):
-        st.markdown("""
-        <div style='background: linear-gradient(90deg, #1f77b4, #ff7f0e); padding: 8px; border-radius: 6px; margin-bottom: 15px;'>
-            <p style='color: white; margin: 0; text-align: center; font-weight: 500;'>
-                🚀 Running on Hugging Face Spaces - Optimized for large audio files
-            </p>
-        </div>
-        """, unsafe_allow_html=True)
-
-    st.title("🎙️ Speech Summarization with Moonshine & SenseVoice ASR")
-
-    settings = render_settings_sidebar()
-    tab1, tab2, tab3 = st.tabs(["📻 Podcast", "🎵 Audio Input", "📄 Results"])
-
-    with tab1:
-        render_podcast_tab()
-    with tab2:
-        render_audio_tab()
-    with tab3:
-        render_results_tab(settings)
-
-if __name__ == "__main__":
-    main()