hlevring commited on
Commit
3f632a7
·
1 Parent(s): baaace4

Enhance word timestamp logic and graphing

Browse files
Files changed (3) hide show
  1. app.py +538 -58
  2. packages.txt +2 -1
  3. requirements.txt +2 -1
app.py CHANGED
@@ -8,6 +8,7 @@ import numpy as np
8
  import base64
9
  import io
10
  import json
 
11
  import matplotlib
12
  matplotlib.use('Agg')
13
  import matplotlib.pyplot as plt
@@ -20,11 +21,170 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
  punct_fixer = PunctFixer(language="da", device=device)
21
 
22
 
23
- def transcribe_audio(audio):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  try:
25
  # Check if audio is provided
26
  if audio is None:
27
- return "No audio provided. Please record or upload audio first.", [], None
28
 
29
  # Preprocess audio: convert to mono if stereo
30
  audio_data, sample_rate = sf.read(audio)
@@ -33,7 +193,18 @@ def transcribe_audio(audio):
33
  if len(audio_data.shape) > 1 and audio_data.shape[1] == 2:
34
  audio_data = np.mean(audio_data, axis=1)
35
 
36
- # Save as temporary mono file
 
 
 
 
 
 
 
 
 
 
 
37
  import tempfile
38
  import os
39
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
@@ -118,6 +289,68 @@ def transcribe_audio(audio):
118
  import traceback
119
  print(f"Timestamp extraction failed: {str(e)}\n{traceback.format_exc()}")
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  # Calculate audio duration
122
  audio_duration = len(audio_data) / sample_rate
123
 
@@ -131,31 +364,55 @@ def transcribe_audio(audio):
131
  'frame_duration': 0.08
132
  }
133
 
134
- # Return text, timestamps, audio data, raw_text, and export metadata
135
- return punctuated_text, timestamps_data, (audio_data, sample_rate), raw_text, export_metadata
136
- return "No transcription available.", [], None, "", {}
137
 
138
  except Exception as e:
139
  import traceback
140
- return f"Error during transcription: {str(e)}\n{traceback.format_exc()}", [], None, "", {}
141
 
142
 
143
- def extract_audio_segment(audio_state, start_time, end_time):
144
- """Fast audio extraction from memory with waveform visualization."""
 
 
 
 
 
 
 
 
 
 
145
  # Wrapper to ensure controls never collapse
146
- def wrap_output(content):
147
- return f'<div style="min-height: 200px;">{content}</div>'
148
 
149
  try:
150
  if audio_state is None:
151
  return wrap_output("<p style='color: red; padding: 20px;'>No audio loaded. Please transcribe audio first.</p>")
152
 
153
  audio_data, sample_rate = audio_state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
- # Add 150ms padding for VISUALIZATION only
156
- padding = 0.15
157
- padded_start = max(0, start_time - padding)
158
- padded_end = min(len(audio_data) / sample_rate, end_time + padding)
159
 
160
  # Extract padded segment for waveform visualization
161
  start_sample_padded = int(padded_start * sample_rate)
@@ -200,7 +457,18 @@ def extract_audio_segment(audio_state, start_time, end_time):
200
 
201
  ax.set_xlabel('Time (seconds)', fontsize=10)
202
  ax.set_ylabel('Amplitude', fontsize=10)
203
- ax.set_title(f'Audio Segment: {start_time:.2f}s - {end_time:.2f}s (±150ms context)', fontsize=11)
 
 
 
 
 
 
 
 
 
 
 
204
  ax.legend(fontsize=9)
205
  ax.grid(True, alpha=0.3)
206
 
@@ -223,6 +491,16 @@ def extract_audio_segment(audio_state, start_time, end_time):
223
  import time
224
  unique_id = int(time.time() * 1000)
225
 
 
 
 
 
 
 
 
 
 
 
226
  # Create HTML with waveform and native audio controls
227
  html_output = f'''
228
  <div style="margin: 10px 0;" data-render-id="{unique_id}">
@@ -236,20 +514,22 @@ def extract_audio_segment(audio_state, start_time, end_time):
236
 
237
  <div style="margin-top: 8px; text-align: center;">
238
  <span style="font-size: 14px; font-weight: bold; color: #333;">
239
- Segment: {start_time:.2f}s - {end_time:.2f}s
240
  </span>
241
  <span style="font-size: 12px; color: #666; margin-left: 15px;">
242
- Duration: {end_time - start_time:.2f}s | Context shown: ±150ms
243
  </span>
244
  </div>
245
  </div>
246
  '''
247
 
248
- return wrap_output(html_output)
 
 
249
 
250
  except Exception as e:
251
  import traceback
252
- return wrap_output(f"<pre style='padding: 20px;'>Error: {str(e)}\n{traceback.format_exc()}</pre>")
253
 
254
 
255
  with gr.Blocks() as demo:
@@ -267,6 +547,30 @@ with gr.Blocks() as demo:
267
  sources=["microphone", "upload"],
268
  format="wav"
269
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  transcribe_button = gr.Button("Transcribe")
271
  transcription_output = gr.Textbox(label="Transcription", lines=5)
272
 
@@ -290,24 +594,47 @@ with gr.Blocks() as demo:
290
  # Track last played interval for smart replay
291
  last_interval_state = gr.State("")
292
 
 
 
 
293
  # Waveform player - below interval controls
294
  waveform_player = gr.HTML(label="Segment Player")
295
 
296
- def transcribe_and_setup_audio(audio):
297
- text, timestamps_data, audio_data, raw_text, export_metadata = transcribe_audio(audio)
 
 
298
 
299
- # Build word data as JSON for the iframe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  words_json = json.dumps([{
301
  'word': item['word'],
302
- 'start': round(item['start'], 2),
303
- 'end': round(item['end'], 2)
304
  } for item in timestamps_data])
305
 
306
  # Pre-generate full export JSON
307
  segments = [{
308
  'word': item['word'],
309
- 'start': round(item['start'], 2),
310
- 'end': round(item['end'], 2),
311
  'word_index': i
312
  } for i, item in enumerate(timestamps_data)]
313
 
@@ -343,50 +670,202 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
343
  }}
344
  .word-btn:hover {{ background: #c5e5f5; }}
345
  .word-btn.selected {{ background: #4CAF50; color: white; border-color: #3a9; }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  .time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
 
 
347
  .word {{ margin-left: 4px; }}
348
  </style>
349
  </head>
350
  <body>
351
  <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
352
- <h3 style="margin: 0;">Word Timestamps</h3>
 
 
 
 
 
 
353
  <a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
354
  </div>
355
  <script>var exportJsonStr = {json.dumps(export_json_str)};</script>
356
- <p class="help"><b>Click</b> = select word &nbsp;|&nbsp; <b>Ctrl+Click</b> = add to selection</p>
357
  <div class="container" id="words"></div>
358
  <script>
359
- var words = {words_json};
360
  var container = document.getElementById('words');
361
 
362
- words.forEach(function(w, i) {{
363
- var btn = document.createElement('span');
364
- btn.className = 'word-btn';
365
- btn.dataset.s = w.start;
366
- btn.dataset.e = w.end;
367
- btn.innerHTML = '<span class="time">[' + w.start.toFixed(2) + '-' + w.end.toFixed(2) + 's]</span><span class="word">' + w.word + '</span>';
368
- btn.onclick = function(e) {{
369
- var all = document.querySelectorAll('.word-btn');
370
- if (e.ctrlKey) {{
371
- this.classList.toggle('selected');
 
 
 
 
 
372
  }} else {{
373
- all.forEach(function(b) {{ b.classList.remove('selected'); }});
374
- this.classList.add('selected');
 
 
 
 
375
  }}
376
- updateInterval();
377
- }};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  container.appendChild(btn);
379
  }});
380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  function updateInterval() {{
382
- var sel = document.querySelectorAll('.word-btn.selected');
383
  if (sel.length === 0) return;
384
  var minS = Infinity, maxE = 0;
385
  sel.forEach(function(b) {{
386
  minS = Math.min(minS, parseFloat(b.dataset.s));
387
  maxE = Math.max(maxE, parseFloat(b.dataset.e));
388
  }});
389
- var interval = minS.toFixed(2) + '-' + maxE.toFixed(2);
390
  // Find the textbox in parent and update it
391
  try {{
392
  var boxes = parent.document.querySelectorAll('input[data-testid="textbox"], textarea');
@@ -399,7 +878,7 @@ function updateInterval() {{
399
  }} catch(err) {{ console.log('Could not update parent:', err); }}
400
  }}
401
 
402
- // Highlight words that overlap with manually entered interval (>50% of word must be in interval)
403
  function highlightFromInterval(intervalStr) {{
404
  if (!intervalStr) return;
405
  var parts = intervalStr.replace(',', '-').split('-');
@@ -410,13 +889,13 @@ function highlightFromInterval(intervalStr) {{
410
  document.querySelectorAll('.word-btn').forEach(function(btn) {{
411
  var ws = parseFloat(btn.dataset.s);
412
  var we = parseFloat(btn.dataset.e);
413
- var wordDuration = we - ws;
414
- // Calculate overlap between word and interval
415
  var overlapStart = Math.max(ws, s);
416
  var overlapEnd = Math.min(we, e);
417
  var overlap = Math.max(0, overlapEnd - overlapStart);
418
- // Highlight only if >50% of word is in interval
419
- if (wordDuration > 0 && (overlap / wordDuration) > 0.5) {{
420
  btn.classList.add('selected');
421
  }} else {{
422
  btn.classList.remove('selected');
@@ -481,10 +960,10 @@ document.getElementById('download-json').onclick = function(e) {{
481
 
482
  return text, audio, timestamps_html, audio_data, timestamps_data, initial_player
483
 
484
- def play_time_interval_fast(audio_state, time_interval, last_interval):
485
  """Fast extraction using preloaded audio from memory."""
486
  def wrap_error(msg):
487
- return f'<div style="min-height: 150px; padding: 20px; text-align: center; background: #f5f5f5; border-radius: 8px;"><p style="color: #666;">{msg}</p></div>', last_interval
488
 
489
  try:
490
  if not time_interval or not audio_state:
@@ -503,8 +982,9 @@ document.getElementById('download-json').onclick = function(e) {{
503
  return wrap_error("Start time must be before end time.")
504
 
505
  # Load/reload audio segment (autoplay will replay even if same interval)
506
- result = extract_audio_segment(audio_state, start_time, end_time)
507
- return result, time_interval
 
508
 
509
  except Exception as e:
510
  import traceback
@@ -512,15 +992,15 @@ document.getElementById('download-json').onclick = function(e) {{
512
 
513
  transcribe_button.click(
514
  fn=transcribe_and_setup_audio,
515
- inputs=audio_input,
516
  outputs=[transcription_output, audio_output, timestamps_output, audio_state, timestamps_state, waveform_player]
517
  )
518
 
519
  # Play interval button
520
  play_interval_button.click(
521
  fn=play_time_interval_fast,
522
- inputs=[audio_state, time_input, last_interval_state],
523
- outputs=[waveform_player, last_interval_state]
524
  )
525
 
526
  demo.launch()
 
8
  import base64
9
  import io
10
  import json
11
+ from ten_vad import TenVad
12
  import matplotlib
13
  matplotlib.use('Agg')
14
  import matplotlib.pyplot as plt
 
21
  punct_fixer = PunctFixer(language="da", device=device)
22
 
23
 
24
def detect_silence_periods(audio_data, sample_rate, prob_threshold=0.5, frame_rep_threshold=2):
    """Run TEN VAD to detect silence periods in audio.

    Args:
        audio_data: numpy array of audio samples (float, mono, 16kHz)
        sample_rate: sample rate (must be 16000)
        prob_threshold: VAD probability threshold (0.0-1.0), higher = less sensitive
        frame_rep_threshold: Number of consecutive frames required before state change

    Returns:
        List of dicts with 'start' and 'end' times (seconds, rounded to 3 decimals)
        for each silence period.
    """
    TARGET_SR = 16000  # TEN VAD requires 16kHz
    HOP_SIZE = 256     # 16ms at 16kHz

    print(f"[VAD] Settings: prob_threshold={prob_threshold}, frame_rep_threshold={frame_rep_threshold}")

    if sample_rate != TARGET_SR:
        print(f"[VAD] Warning: Expected 16kHz audio, got {sample_rate}Hz")

    # Convert float audio to int16 (TEN VAD expects int16).
    # FIX: clip to [-1.0, 1.0] first — scaling out-of-range float samples and
    # casting to int16 wraps around (e.g. 1.5 -> large negative value), which
    # would feed garbage frames to the VAD. Clipping makes the cast safe.
    if audio_data.dtype in (np.float32, np.float64):
        audio_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
    else:
        audio_int16 = audio_data.astype(np.int16)

    # Create VAD instance
    vad = TenVad(hop_size=HOP_SIZE, threshold=prob_threshold)

    silence_periods = []
    frame_duration = HOP_SIZE / TARGET_SR  # 0.016s = 16ms

    # Process frame by frame and collect raw speech/silence flags
    num_frames = len(audio_int16) // HOP_SIZE
    raw_flags = []

    for i in range(num_frames):
        frame_start = i * HOP_SIZE
        frame = audio_int16[frame_start:frame_start + HOP_SIZE]

        result = vad.process(frame)
        # TEN VAD returns tuple: (probability, flag) or has .flag attribute
        if isinstance(result, tuple):
            flag = result[1]  # (probability, flag)
        else:
            flag = result.flag
        raw_flags.append(flag)

    # Apply frame repetition threshold smoothing:
    # only switch state after seeing frame_rep_threshold consecutive frames
    # of the opposite state. This suppresses single-frame VAD flickers.
    in_silence = False
    silence_start = 0.0
    consecutive_count = 0
    pending_state = None  # The state we're potentially switching to
    potential_silence_start = 0.0  # Time the candidate silence run began

    for i, flag in enumerate(raw_flags):
        current_time = i * frame_duration
        is_silence = (flag == 0)

        if in_silence:
            # Currently in silence, looking for speech
            if not is_silence:
                # Potential speech detected
                if pending_state != 'speech':
                    pending_state = 'speech'
                    consecutive_count = 1
                else:
                    consecutive_count += 1

                if consecutive_count >= frame_rep_threshold:
                    # Confirmed speech - end silence period.
                    # Roll the end time back to when speech actually started
                    # (the first frame of the confirmed run).
                    actual_end = current_time - (consecutive_count - 1) * frame_duration
                    silence_periods.append({
                        'start': round(silence_start, 3),
                        'end': round(actual_end, 3)
                    })
                    in_silence = False
                    pending_state = None
                    consecutive_count = 0
            else:
                # Still silence, reset any pending speech detection
                pending_state = None
                consecutive_count = 0
        else:
            # Currently in speech, looking for silence
            if is_silence:
                # Potential silence detected
                if pending_state != 'silence':
                    pending_state = 'silence'
                    consecutive_count = 1
                    potential_silence_start = current_time
                else:
                    consecutive_count += 1

                if consecutive_count >= frame_rep_threshold:
                    # Confirmed silence - start silence period from the time
                    # the candidate silence run actually began.
                    silence_start = potential_silence_start
                    in_silence = True
                    pending_state = None
                    consecutive_count = 0
            else:
                # Still speech, reset any pending silence detection
                pending_state = None
                consecutive_count = 0

    # Handle case where audio ends in silence
    if in_silence:
        silence_periods.append({
            'start': round(silence_start, 3),
            'end': round(num_frames * frame_duration, 3)
        })

    return silence_periods
139
+
140
+
141
def print_speech_silence_log(timestamps_data, silence_periods):
    """Print interleaved speech and silence log sorted by start time."""

    # Merge word timestamps and silence periods into one chronological timeline.
    timeline = [
        {'type': 'speech', 'start': w['start'], 'end': w['end'], 'word': w['word']}
        for w in timestamps_data
    ]
    timeline.extend(
        {'type': 'silence', 'start': s['start'], 'end': s['end']}
        for s in silence_periods
    )
    timeline.sort(key=lambda item: item['start'])

    # Emit one log line per timeline entry.
    print("\n=== SPEECH & SILENCE LOG ===")
    for item in timeline:
        if item['type'] == 'speech':
            print(f"[Speech] [{item['start']:.3f}-{item['end']:.3f}] {item['word']}")
        else:
            duration_ms = int((item['end'] - item['start']) * 1000)
            print(f"[Silence] [{item['start']:.3f}-{item['end']:.3f}] [{duration_ms}ms]")

    # Summary: counts plus accumulated silence duration.
    total_silence = sum(s['end'] - s['start'] for s in silence_periods)
    print(f"\n=== SUMMARY ===")
    print(f"Words: {len(timestamps_data)}, Silence periods: {len(silence_periods)}, Total silence: {total_silence:.2f}s")
    print("=" * 30 + "\n")
181
+
182
+
183
+ def transcribe_audio(audio, prob_threshold=0.5, frame_rep_threshold=2):
184
  try:
185
  # Check if audio is provided
186
  if audio is None:
187
+ return "No audio provided. Please record or upload audio first.", [], None, "", {}
188
 
189
  # Preprocess audio: convert to mono if stereo
190
  audio_data, sample_rate = sf.read(audio)
 
193
  if len(audio_data.shape) > 1 and audio_data.shape[1] == 2:
194
  audio_data = np.mean(audio_data, axis=1)
195
 
196
+ # Resample to 16kHz if needed (required by both Parakeet and TEN VAD)
197
+ TARGET_SR = 16000
198
+ if sample_rate != TARGET_SR:
199
+ duration = len(audio_data) / sample_rate
200
+ new_length = int(duration * TARGET_SR)
201
+ x_old = np.linspace(0, duration, len(audio_data), endpoint=False)
202
+ x_new = np.linspace(0, duration, new_length, endpoint=False)
203
+ audio_data = np.interp(x_new, x_old, audio_data).astype(np.float32)
204
+ print(f"[AUDIO] Resampled from {sample_rate}Hz to {TARGET_SR}Hz")
205
+ sample_rate = TARGET_SR
206
+
207
+ # Save as temporary mono 16kHz file
208
  import tempfile
209
  import os
210
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
 
289
  import traceback
290
  print(f"Timestamp extraction failed: {str(e)}\n{traceback.format_exc()}")
291
 
292
+ # Map punctuated words back to timestamps_data
293
+ # -----------------------------------------------------------------
294
+ # The ASR model outputs raw lowercase text without punctuation.
295
+ # PunctFixer adds punctuation and capitalization, but may occasionally:
296
+ # - Merge words (e.g., "i morgen" → "imorgen")
297
+ # - Split contractions differently
298
+ # - Result in different word counts than the raw output
299
+ #
300
+ # We handle this by:
301
+ # 1. If word counts match: direct position-based mapping (common case)
302
+ # 2. If counts differ: fuzzy matching with lookahead to realign
303
+ # -----------------------------------------------------------------
304
+ try:
305
+ import re
306
+ # Split punctuated text into words, keeping punctuation attached
307
+ punctuated_words = punctuated_text.split()
308
+
309
+ # Helper to strip punctuation for comparison (normalize for matching)
310
+ def strip_punct(s):
311
+ return re.sub(r'[^\w]', '', s).lower()
312
+
313
+ # Align punctuated words to raw words
314
+ if len(punctuated_words) == len(timestamps_data):
315
+ # Same word count - direct mapping (most common case)
316
+ # Verify base word matches before replacing to catch any edge cases
317
+ for i, pw in enumerate(punctuated_words):
318
+ if strip_punct(pw) == strip_punct(timestamps_data[i]['word']):
319
+ timestamps_data[i]['word'] = pw
320
+ else:
321
+ # Different word counts - PunctFixer may have merged/split words
322
+ # Use two-pointer approach with lookahead for realignment
323
+ pi = 0 # punctuated index
324
+ for ti in range(len(timestamps_data)):
325
+ if pi >= len(punctuated_words):
326
+ break
327
+ raw_word = strip_punct(timestamps_data[ti]['word'])
328
+ punct_word = strip_punct(punctuated_words[pi])
329
+ if raw_word == punct_word:
330
+ timestamps_data[ti]['word'] = punctuated_words[pi]
331
+ pi += 1
332
+ else:
333
+ # Words don't match - try lookahead to find alignment
334
+ # This handles cases where PunctFixer inserted/removed words
335
+ for look_ahead in range(1, min(3, len(punctuated_words) - pi)):
336
+ if strip_punct(punctuated_words[pi + look_ahead]) == raw_word:
337
+ pi += look_ahead
338
+ timestamps_data[ti]['word'] = punctuated_words[pi]
339
+ pi += 1
340
+ break
341
+ except Exception as e:
342
+ # Graceful fallback: keep original raw words if mapping fails
343
+ print(f"Punctuation mapping failed: {str(e)}")
344
+
345
+ # Run VAD to detect silence periods
346
+ silence_periods = []
347
+ try:
348
+ silence_periods = detect_silence_periods(audio_data, sample_rate, prob_threshold, frame_rep_threshold)
349
+ print_speech_silence_log(timestamps_data, silence_periods)
350
+ except Exception as e:
351
+ import traceback
352
+ print(f"[VAD] Error during silence detection: {str(e)}\n{traceback.format_exc()}")
353
+
354
  # Calculate audio duration
355
  audio_duration = len(audio_data) / sample_rate
356
 
 
364
  'frame_duration': 0.08
365
  }
366
 
367
+ # Return text, timestamps, audio data, raw_text, export metadata, and silence periods
368
+ return punctuated_text, timestamps_data, (audio_data, sample_rate), raw_text, export_metadata, silence_periods
369
+ return "No transcription available.", [], None, "", {}, []
370
 
371
  except Exception as e:
372
  import traceback
373
+ return f"Error during transcription: {str(e)}\n{traceback.format_exc()}", [], None, "", {}, []
374
 
375
 
376
+ def extract_audio_segment(audio_state, start_time, end_time, current_window=None):
377
+ """Fast audio extraction from memory with waveform visualization.
378
+
379
+ Args:
380
+ audio_state: Tuple of (audio_data, sample_rate)
381
+ start_time: Start time of the interval to play
382
+ end_time: End time of the interval to play
383
+ current_window: Dict with 'start' and 'end' of current waveform window, or None
384
+
385
+ Returns:
386
+ Tuple of (html_output, new_window_state)
387
+ """
388
  # Wrapper to ensure controls never collapse
389
+ def wrap_output(content, window_state=None):
390
+ return f'<div style="min-height: 200px;">{content}</div>', window_state
391
 
392
  try:
393
  if audio_state is None:
394
  return wrap_output("<p style='color: red; padding: 20px;'>No audio loaded. Please transcribe audio first.</p>")
395
 
396
  audio_data, sample_rate = audio_state
397
+ audio_duration = len(audio_data) / sample_rate
398
+
399
+ # Default context padding is 160ms
400
+ DEFAULT_PADDING = 0.16
401
+
402
+ # Determine if we need to redraw the waveform or just update the shaded area
403
+ need_redraw = True
404
+ if current_window is not None:
405
+ # Check if the new interval fits within the current window
406
+ if start_time >= current_window['start'] and end_time <= current_window['end']:
407
+ need_redraw = False
408
+ # Reuse the current window boundaries
409
+ padded_start = current_window['start']
410
+ padded_end = current_window['end']
411
 
412
+ if need_redraw:
413
+ # Calculate new window with ±160ms padding
414
+ padded_start = max(0, start_time - DEFAULT_PADDING)
415
+ padded_end = min(audio_duration, end_time + DEFAULT_PADDING)
416
 
417
  # Extract padded segment for waveform visualization
418
  start_sample_padded = int(padded_start * sample_rate)
 
457
 
458
  ax.set_xlabel('Time (seconds)', fontsize=10)
459
  ax.set_ylabel('Amplitude', fontsize=10)
460
+
461
+ # Calculate context on each side in ms
462
+ left_context_ms = int((start_time - padded_start) * 1000)
463
+ right_context_ms = int((padded_end - end_time) * 1000)
464
+
465
+ # Format context string - symmetric or asymmetric
466
+ if left_context_ms == right_context_ms:
467
+ context_str = f'(±{left_context_ms}ms context)'
468
+ else:
469
+ context_str = f'(-{left_context_ms}ms context +{right_context_ms}ms context)'
470
+
471
+ ax.set_title(f'Audio Segment: {start_time:.3f}s – {end_time:.3f}s {context_str}', fontsize=11)
472
  ax.legend(fontsize=9)
473
  ax.grid(True, alpha=0.3)
474
 
 
491
  import time
492
  unique_id = int(time.time() * 1000)
493
 
494
+ # Calculate context on each side in ms for the info text
495
+ left_context_ms = int((start_time - padded_start) * 1000)
496
+ right_context_ms = int((padded_end - end_time) * 1000)
497
+
498
+ # Format context string - symmetric or asymmetric
499
+ if left_context_ms == right_context_ms:
500
+ context_info = f'±{left_context_ms}ms'
501
+ else:
502
+ context_info = f'-{left_context_ms}ms / +{right_context_ms}ms'
503
+
504
  # Create HTML with waveform and native audio controls
505
  html_output = f'''
506
  <div style="margin: 10px 0;" data-render-id="{unique_id}">
 
514
 
515
  <div style="margin-top: 8px; text-align: center;">
516
  <span style="font-size: 14px; font-weight: bold; color: #333;">
517
+ Segment: {start_time:.3f}s {end_time:.3f}s
518
  </span>
519
  <span style="font-size: 12px; color: #666; margin-left: 15px;">
520
+ Duration: {(end_time - start_time)*1000:.0f}ms | Context shown: {context_info}
521
  </span>
522
  </div>
523
  </div>
524
  '''
525
 
526
+ # Return HTML and new window state
527
+ new_window = {'start': padded_start, 'end': padded_end}
528
+ return wrap_output(html_output, new_window)
529
 
530
  except Exception as e:
531
  import traceback
532
+ return wrap_output(f"<pre style='padding: 20px;'>Error: {str(e)}\n{traceback.format_exc()}</pre>", current_window)
533
 
534
 
535
  with gr.Blocks() as demo:
 
547
  sources=["microphone", "upload"],
548
  format="wav"
549
  )
550
+
551
+ # VAD Controls - inline labels with number inputs
552
+ with gr.Row():
553
+ gr.Markdown("**VAD: Probability Threshold**")
554
+ vad_prob_threshold = gr.Number(
555
+ show_label=False,
556
+ value=0.5,
557
+ minimum=0.0,
558
+ maximum=1.0,
559
+ step=0.05,
560
+ scale=0,
561
+ min_width=80
562
+ )
563
+ gr.Markdown("**VAD: Frame Repetition Threshold**")
564
+ vad_frame_rep = gr.Number(
565
+ show_label=False,
566
+ value=2,
567
+ minimum=1,
568
+ maximum=10,
569
+ step=1,
570
+ scale=0,
571
+ min_width=80
572
+ )
573
+
574
  transcribe_button = gr.Button("Transcribe")
575
  transcription_output = gr.Textbox(label="Transcription", lines=5)
576
 
 
594
  # Track last played interval for smart replay
595
  last_interval_state = gr.State("")
596
 
597
+ # Track current waveform window boundaries for smart redraw
598
+ waveform_window_state = gr.State(None)
599
+
600
  # Waveform player - below interval controls
601
  waveform_player = gr.HTML(label="Segment Player")
602
 
603
+ def transcribe_and_setup_audio(audio, prob_threshold, frame_rep_threshold):
604
+ text, timestamps_data, audio_data, raw_text, export_metadata, silence_periods = transcribe_audio(
605
+ audio, prob_threshold, int(frame_rep_threshold)
606
+ )
607
 
608
+ # Build combined entries (words + silence) sorted by start time
609
+ entries = []
610
+ for item in timestamps_data:
611
+ entries.append({
612
+ 'type': 'word',
613
+ 'word': item['word'],
614
+ 'start': round(item['start'], 3),
615
+ 'end': round(item['end'], 3)
616
+ })
617
+ for item in silence_periods:
618
+ entries.append({
619
+ 'type': 'silence',
620
+ 'start': round(item['start'], 3),
621
+ 'end': round(item['end'], 3)
622
+ })
623
+ entries.sort(key=lambda x: x['start'])
624
+ entries_json = json.dumps(entries)
625
+
626
+ # Build word data as JSON for the iframe (kept for backward compat)
627
  words_json = json.dumps([{
628
  'word': item['word'],
629
+ 'start': round(item['start'], 3),
630
+ 'end': round(item['end'], 3)
631
  } for item in timestamps_data])
632
 
633
  # Pre-generate full export JSON
634
  segments = [{
635
  'word': item['word'],
636
+ 'start': round(item['start'], 3),
637
+ 'end': round(item['end'], 3),
638
  'word_index': i
639
  } for i, item in enumerate(timestamps_data)]
640
 
 
670
  }}
671
  .word-btn:hover {{ background: #c5e5f5; }}
672
  .word-btn.selected {{ background: #4CAF50; color: white; border-color: #3a9; }}
673
+ .silence-btn {{
674
+ display: inline-block;
675
+ background: #ffe4c4;
676
+ padding: 5px 8px;
677
+ margin: 3px;
678
+ border-radius: 4px;
679
+ cursor: pointer;
680
+ border: 1px solid #dca;
681
+ font-size: 11px;
682
+ transition: all 0.15s;
683
+ }}
684
+ .silence-btn:hover {{ background: #ffd4a4; }}
685
+ .silence-btn.selected {{ background: #ff9800; color: white; border-color: #e68a00; }}
686
+ .checkbox-container {{
687
+ display: inline-flex;
688
+ align-items: center;
689
+ margin-left: 15px;
690
+ font-size: 12px;
691
+ cursor: pointer;
692
+ }}
693
+ .checkbox-container input {{
694
+ margin-right: 5px;
695
+ cursor: pointer;
696
+ }}
697
+ .checkbox-container:hover {{
698
+ color: #0066cc;
699
+ }}
700
  .time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
701
+ .silence-time {{ color: #996600; font-size: 10px; font-weight: bold; }}
702
+ .duration {{ color: #666; font-size: 10px; margin-left: 3px; }}
703
  .word {{ margin-left: 4px; }}
704
  </style>
705
  </head>
706
  <body>
707
  <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
708
+ <div style="display: flex; align-items: center;">
709
+ <h3 style="margin: 0;">Word Timestamps</h3>
710
+ <label class="checkbox-container" title="Extends word end times toward midpoint of gap to next word (max 120ms). Helps capture word endings that may be cut off.">
711
+ <input type="checkbox" id="adjust-intervals">
712
+ Apply Time Interval Adjustment
713
+ </label>
714
+ </div>
715
  <a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
716
  </div>
717
  <script>var exportJsonStr = {json.dumps(export_json_str)};</script>
718
+ <p class="help"><b>Click</b> = select &nbsp;|&nbsp; <b>Ctrl+Click</b> = toggle &nbsp;|&nbsp; <b>Shift+Click</b> = range &nbsp;&nbsp;&nbsp; <span style="background: #ffe4c4; padding: 2px 8px; border-radius: 3px; border: 1px solid #dca;"></span> = detected non speech</p>
719
  <div class="container" id="words"></div>
720
  <script>
721
+ var entries = {entries_json};
722
  var container = document.getElementById('words');
723
 
724
+ // Merge consecutive silence periods (no word between them)
725
+ // This keeps the raw data in Python logs but shows cleaner UI
726
+ function mergeConsecutiveSilences(entryList) {{
727
+ var merged = [];
728
+ var pendingSilence = null;
729
+
730
+ entryList.forEach(function(entry) {{
731
+ if (entry.type === 'silence') {{
732
+ if (pendingSilence === null) {{
733
+ // Start a new pending silence
734
+ pendingSilence = {{ type: 'silence', start: entry.start, end: entry.end }};
735
+ }} else {{
736
+ // Extend the pending silence
737
+ pendingSilence.end = entry.end;
738
+ }}
739
  }} else {{
740
+ // It's a word - flush any pending silence first
741
+ if (pendingSilence !== null) {{
742
+ merged.push(pendingSilence);
743
+ pendingSilence = null;
744
+ }}
745
+ merged.push(entry);
746
  }}
747
+ }});
748
+
749
+ // Don't forget trailing silence
750
+ if (pendingSilence !== null) {{
751
+ merged.push(pendingSilence);
752
+ }}
753
+
754
+ return merged;
755
+ }}
756
+
757
+ // Apply merging to entries for display
758
+ entries = mergeConsecutiveSilences(entries);
759
+
760
+ // Separate words and silence for adjustment calculations
761
+ var words = entries.filter(function(e) {{ return e.type === 'word'; }});
762
+ var silences = entries.filter(function(e) {{ return e.type === 'silence'; }});
763
+
764
+ // Calculate adjusted end times for words (simple: extend to midpoint, max 120ms)
765
+ function calculateAdjustedEnd(wordIndex) {{
766
+ var word = words[wordIndex];
767
+ var nextWord = words[wordIndex + 1];
768
+
769
+ if (!nextWord) return word.end; // Last word, no adjustment
770
+
771
+ var gap = nextWord.start - word.end;
772
+ var extension = Math.min(gap / 2, 0.12); // max 120ms, never beyond midpoint
773
+
774
+ return word.end + extension;
775
+ }}
776
+
777
+ // Store adjusted ends for each word
778
+ var adjustedEnds = words.map(function(w, i) {{ return calculateAdjustedEnd(i); }});
779
+
780
+ // Track last clicked item index for Shift+Click range selection
781
+ // (-1 means nothing was clicked yet, so Shift+Click falls through to a
+ // plain click)
+ var lastClickedIndex = -1;
782
+
783
+ // Get all clickable buttons in order
784
+ // Word and silence buttons in document order; indexes into this array
+ // define the Shift+Click selection range.
+ function getAllButtons() {{
785
+ return Array.from(container.querySelectorAll('.word-btn, .silence-btn'));
786
+ }}
787
+
788
+ // Handle click with modifiers
+ //   plain click - select only the clicked item
+ //   Ctrl+Click  - toggle the clicked item in the selection
+ //   Shift+Click - select the range from the previously clicked item
+ // NOTE(review): Shift+Click adds the range without clearing the prior
+ // selection, and macOS Cmd (metaKey) is not treated like Ctrl — confirm
+ // both behaviours are intended.
789
+ function handleItemClick(btn, e) {{
790
+ var allBtns = getAllButtons();
791
+ var clickedIndex = allBtns.indexOf(btn);
792
+
793
+ if (e.shiftKey && lastClickedIndex >= 0) {{
794
+ // Shift+Click: select range between lastClickedIndex and clickedIndex
795
+ var start = Math.min(lastClickedIndex, clickedIndex);
796
+ var end = Math.max(lastClickedIndex, clickedIndex);
797
+ allBtns.forEach(function(b, i) {{
798
+ if (i >= start && i <= end) {{
799
+ b.classList.add('selected');
800
+ }}
801
+ }});
802
+ }} else if (e.ctrlKey) {{
803
+ // Ctrl+Click: toggle selection
804
+ btn.classList.toggle('selected');
805
+ }} else {{
806
+ // Regular click: select only this item
807
+ allBtns.forEach(function(b) {{ b.classList.remove('selected'); }});
808
+ btn.classList.add('selected');
809
+ }}
810
+
811
+ lastClickedIndex = clickedIndex;
812
+ // Push the new selection's time span to the interval textbox.
+ updateInterval();
813
+ }}
814
+
815
+ // Render all entries
816
+ // Each entry becomes a clickable span. Word buttons carry both the
+ // original timestamps (origS/origE) and the precomputed adjusted end
+ // (adjE) in data attributes, so the adjust-intervals toggle can relabel
+ // them without re-rendering; s/e are the values the selection logic
+ // reads and start out as the originals.
+ var wordIndex = 0;
817
+ entries.forEach(function(entry, i) {{
818
+ var btn = document.createElement('span');
819
+
820
+ if (entry.type === 'word') {{
821
+ var wi = wordIndex;
822
+ btn.className = 'word-btn';
823
+ btn.dataset.origS = entry.start;
824
+ btn.dataset.origE = entry.end;
825
+ btn.dataset.adjE = adjustedEnds[wi];
826
+ btn.dataset.s = entry.start;
827
+ btn.dataset.e = entry.end;
828
+ btn.dataset.word = entry.word;
829
+ btn.innerHTML = '<span class="time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="word"> ' + entry.word + '</span>';
830
+
831
+ btn.onclick = function(e) {{ handleItemClick(this, e); }};
832
+ wordIndex++;
833
+ }} else {{
834
+ btn.className = 'silence-btn';
835
+ btn.dataset.s = entry.start;
836
+ btn.dataset.e = entry.end;
837
+ // Label silences with their duration in whole milliseconds.
+ var durationMs = Math.round((entry.end - entry.start) * 1000);
838
+ btn.innerHTML = '<span class="silence-time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="duration">' + durationMs + 'ms</span>';
839
+ btn.onclick = function(e) {{ handleItemClick(this, e); }};
840
+ }}
841
+
842
  container.appendChild(btn);
843
  }});
844
 
845
+ // Toggle adjustment checkbox handler
846
+ // Switches every word button between its original and adjusted end time
+ // (both stored in data attributes at render time), rewrites the visible
+ // label, then refreshes the interval textbox. Rewriting innerHTML only
+ // replaces the button's children, so the onclick handler on the button
+ // itself survives.
+ // NOTE(review): this queries document-wide while rendering queries
+ // 'container' — confirm there is only one such widget per page.
+ function updateWordLabels() {{
847
+ var adjusted = document.getElementById('adjust-intervals').checked;
848
+ document.querySelectorAll('.word-btn').forEach(function(btn) {{
849
+ var s = parseFloat(btn.dataset.origS);
850
+ var e = adjusted ? parseFloat(btn.dataset.adjE) : parseFloat(btn.dataset.origE);
851
+ btn.dataset.s = s;
852
+ btn.dataset.e = e;
853
+ btn.innerHTML = '<span class="time">[' + s.toFixed(3) + '-' + e.toFixed(3) + 's]</span><span class="word"> ' + btn.dataset.word + '</span>';
854
+ }});
855
+ updateInterval();
856
+ }}
857
+
858
+ document.getElementById('adjust-intervals').addEventListener('change', updateWordLabels);
859
+
860
  function updateInterval() {{
861
+ var sel = document.querySelectorAll('.word-btn.selected, .silence-btn.selected');
862
  if (sel.length === 0) return;
863
  var minS = Infinity, maxE = 0;
864
  sel.forEach(function(b) {{
865
  minS = Math.min(minS, parseFloat(b.dataset.s));
866
  maxE = Math.max(maxE, parseFloat(b.dataset.e));
867
  }});
868
+ var interval = minS.toFixed(3) + '-' + maxE.toFixed(3);
869
  // Find the textbox in parent and update it
870
  try {{
871
  var boxes = parent.document.querySelectorAll('input[data-testid="textbox"], textarea');
 
878
  }} catch(err) {{ console.log('Could not update parent:', err); }}
879
  }}
880
 
881
+ // Highlight words that overlap with manually entered interval (>50% must be in interval)
882
  function highlightFromInterval(intervalStr) {{
883
  if (!intervalStr) return;
884
  var parts = intervalStr.replace(',', '-').split('-');
 
889
  document.querySelectorAll('.word-btn').forEach(function(btn) {{
890
  var ws = parseFloat(btn.dataset.s);
891
  var we = parseFloat(btn.dataset.e);
892
+ var itemDuration = we - ws;
893
+ // Calculate overlap between item and interval
894
  var overlapStart = Math.max(ws, s);
895
  var overlapEnd = Math.min(we, e);
896
  var overlap = Math.max(0, overlapEnd - overlapStart);
897
+ // Highlight only if >50% of item is in interval
898
+ if (itemDuration > 0 && (overlap / itemDuration) > 0.5) {{
899
  btn.classList.add('selected');
900
  }} else {{
901
  btn.classList.remove('selected');
 
960
 
961
  return text, audio, timestamps_html, audio_data, timestamps_data, initial_player
962
 
963
+ def play_time_interval_fast(audio_state, time_interval, last_interval, current_window):
964
  """Fast extraction using preloaded audio from memory."""
965
  def wrap_error(msg):
966
+ return f'<div style="min-height: 150px; padding: 20px; text-align: center; background: #f5f5f5; border-radius: 8px;"><p style="color: #666;">{msg}</p></div>', last_interval, current_window
967
 
968
  try:
969
  if not time_interval or not audio_state:
 
982
  return wrap_error("Start time must be before end time.")
983
 
984
  # Load/reload audio segment (autoplay will replay even if same interval)
985
+ # Pass current window state for smart redraw logic
986
+ result_html, new_window = extract_audio_segment(audio_state, start_time, end_time, current_window)
987
+ return result_html, time_interval, new_window
988
 
989
  except Exception as e:
990
  import traceback
 
992
 
993
  # Wire the transcribe button: runs ASR + VAD setup and populates the
  # transcript, audio player, timestamp view, cached state, and waveform.
  transcribe_button.click(
994
  fn=transcribe_and_setup_audio,
995
+ inputs=[audio_input, vad_prob_threshold, vad_frame_rep],
996
  outputs=[transcription_output, audio_output, timestamps_output, audio_state, timestamps_state, waveform_player]
997
  )
998

999
  # Play interval button
1000
  # Extracts the requested time span from the cached audio; also threads
  # through the last-interval and waveform-window state for smart redraws.
  play_interval_button.click(
1001
  fn=play_time_interval_fast,
1002
+ inputs=[audio_state, time_input, last_interval_state, waveform_window_state],
1003
  outputs=[waveform_player, last_interval_state, waveform_window_state]
1004
  )
1005

1006
  demo.launch()
packages.txt CHANGED
@@ -1 +1,2 @@
1
- ffmpeg
 
 
1
+ ffmpeg
2
+ libc++1
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  nemo-toolkit[asr]>=1.23.0
2
  punctfix==0.11.1
3
  soundfile
4
- matplotlib
 
 
1
  nemo-toolkit[asr]>=1.23.0
2
  punctfix==0.11.1
3
  soundfile
4
+ matplotlib
5
+ ten-vad