hlevring committed on
Commit
ff764ef
·
1 Parent(s): 44ae928

Detect untranscribed speech and allow replacing the transcript

Browse files
Files changed (1) hide show
  1. app.py +70 -36
app.py CHANGED
@@ -262,7 +262,7 @@ def parse_transcript_file(file_path):
262
  raise ValueError("Unrecognized transcript format. Expected segments[].words[], words[], or segments[] with {start, end, text}")
263
 
264
 
265
- def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, min_on_ms=64):
266
  """Load external transcript and run VAD on audio.
267
 
268
  Args:
@@ -271,15 +271,12 @@ def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, m
271
  prob_threshold: VAD probability threshold
272
  min_off_ms: Minimum silence duration in ms
273
  min_on_ms: Minimum voice duration in ms
 
274
 
275
  Returns:
276
  Tuple of (text, timestamps_data, audio_data_tuple, raw_text, export_metadata, silence_periods)
277
  """
278
  try:
279
- # Check if audio is provided
280
- if audio is None:
281
- return "No audio provided. Please upload audio first.", [], None, "", {}, []
282
-
283
  # Check if transcript file is provided
284
  if transcript_file is None:
285
  return "No transcript file provided.", [], None, "", {}, []
@@ -291,23 +288,29 @@ def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, m
291
  except Exception as e:
292
  return f"Error parsing transcript file: {str(e)}", [], None, "", {}, []
293
 
294
- # Preprocess audio: convert to mono if stereo
295
- audio_data, sample_rate = sf.read(audio)
296
-
297
- # Convert stereo to mono by averaging channels
298
- if len(audio_data.shape) > 1 and audio_data.shape[1] == 2:
299
- audio_data = np.mean(audio_data, axis=1)
300
-
301
- # Resample to 16kHz if needed (required by TEN VAD)
302
- TARGET_SR = 16000
303
- if sample_rate != TARGET_SR:
304
- duration = len(audio_data) / sample_rate
305
- new_length = int(duration * TARGET_SR)
306
- x_old = np.linspace(0, duration, len(audio_data), endpoint=False)
307
- x_new = np.linspace(0, duration, new_length, endpoint=False)
308
- audio_data = np.interp(x_new, x_old, audio_data).astype(np.float32)
309
- print(f"[AUDIO] Resampled from {sample_rate}Hz to {TARGET_SR}Hz")
310
- sample_rate = TARGET_SR
 
 
 
 
 
 
311
 
312
  # Run VAD to detect silence periods
313
  silence_periods = []
@@ -578,6 +581,19 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
578
  }}
579
  .silence-btn:hover {{ background: #ffd4a4; }}
580
  .silence-btn.selected {{ background: #ff9800; color: white; border-color: #e68a00; }}
 
 
 
 
 
 
 
 
 
 
 
 
 
581
  .checkbox-container {{
582
  display: inline-flex;
583
  align-items: center;
@@ -594,6 +610,7 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
594
  }}
595
  .time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
596
  .silence-time {{ color: #996600; font-size: 10px; font-weight: bold; }}
 
597
  .duration {{ color: #666; font-size: 10px; margin-left: 3px; }}
598
  .word {{ margin-left: 4px; }}
599
  </style>
@@ -610,7 +627,7 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
610
  <a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
611
  </div>
612
  <script>var exportJsonStr = {json.dumps(export_json_str)};</script>
613
- <p class="help"><b>Click</b> = select &nbsp;|&nbsp; <b>Ctrl+Click</b> = toggle &nbsp;|&nbsp; <b>Shift+Click</b> = range &nbsp;&nbsp;&nbsp; <span style="background: #ffe4c4; padding: 2px 8px; border-radius: 3px; border: 1px solid #dca;"></span> = detected non speech</p>
614
  <div class="container" id="words"></div>
615
  <script>
616
  var entries = {entries_json};
@@ -666,7 +683,7 @@ var adjustedEnds = words.map(function(w, i) {{ return calculateAdjustedEnd(i); }
666
  var lastClickedIndex = -1;
667
 
668
  function getAllButtons() {{
669
- return Array.from(container.querySelectorAll('.word-btn, .silence-btn'));
670
  }}
671
 
672
  function handleItemClick(btn, e) {{
@@ -709,13 +726,20 @@ entries.forEach(function(entry, i) {{
709
 
710
  btn.onclick = function(e) {{ handleItemClick(this, e); }};
711
  wordIndex++;
712
- }} else {{
713
  btn.className = 'silence-btn';
714
  btn.dataset.s = entry.start;
715
  btn.dataset.e = entry.end;
716
  var durationMs = Math.round((entry.end - entry.start) * 1000);
717
  btn.innerHTML = '<span class="silence-time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="duration">' + durationMs + 'ms</span>';
718
  btn.onclick = function(e) {{ handleItemClick(this, e); }};
 
 
 
 
 
 
 
719
  }}
720
 
721
  container.appendChild(btn);
@@ -746,7 +770,7 @@ function updateWordLabels() {{
746
  document.getElementById('adjust-intervals').addEventListener('change', updateWordLabels);
747
 
748
  function updateInterval() {{
749
- var sel = Array.from(document.querySelectorAll('.word-btn.selected, .silence-btn.selected'));
750
  if (sel.length === 0) return;
751
 
752
  // Sort selected items by start time
@@ -969,14 +993,15 @@ with gr.Blocks() as demo:
969
  # Waveform player - below interval controls
970
  waveform_player = gr.HTML(label="Segment Player")
971
 
972
- def load_transcript_and_setup(audio, transcript_file, prob_threshold, min_off_ms, min_on_ms):
973
  """Load external transcript and setup UI."""
974
  if transcript_file is None:
975
  # Return empty/unchanged outputs if no file selected
976
  return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
977
 
978
  text, timestamps_data, audio_data, raw_text, export_metadata, silence_periods = load_transcript(
979
- audio, transcript_file, prob_threshold, int(min_off_ms), int(min_on_ms)
 
980
  )
981
 
982
  # Check for errors
@@ -1000,14 +1025,23 @@ with gr.Blocks() as demo:
1000
  'end': round(item['end'], 3)
1001
  })
1002
  entries.sort(key=lambda x: x['start'])
1003
- entries_json = json.dumps(entries)
1004
 
1005
- # Build word data as JSON for the iframe
1006
- words_json = json.dumps([{
1007
- 'word': item['word'],
1008
- 'start': round(item['start'], 3),
1009
- 'end': round(item['end'], 3)
1010
- } for item in timestamps_data])
 
 
 
 
 
 
 
 
 
 
1011
 
1012
  # Pre-generate full export JSON
1013
  segments = [{
@@ -1077,7 +1111,7 @@ with gr.Blocks() as demo:
1077
  # Load transcript file input
1078
  transcript_file_input.change(
1079
  fn=load_transcript_and_setup,
1080
- inputs=[audio_input, transcript_file_input, vad_prob_threshold, vad_min_off, vad_min_on],
1081
  outputs=[transcription_output, timestamps_output, audio_state, timestamps_state, waveform_player]
1082
  )
1083
 
 
262
  raise ValueError("Unrecognized transcript format. Expected segments[].words[], words[], or segments[] with {start, end, text}")
263
 
264
 
265
+ def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, min_on_ms=64, existing_audio_state=None):
266
  """Load external transcript and run VAD on audio.
267
 
268
  Args:
 
271
  prob_threshold: VAD probability threshold
272
  min_off_ms: Minimum silence duration in ms
273
  min_on_ms: Minimum voice duration in ms
274
+ existing_audio_state: Optional (audio_data, sample_rate) tuple to reuse
275
 
276
  Returns:
277
  Tuple of (text, timestamps_data, audio_data_tuple, raw_text, export_metadata, silence_periods)
278
  """
279
  try:
 
 
 
 
280
  # Check if transcript file is provided
281
  if transcript_file is None:
282
  return "No transcript file provided.", [], None, "", {}, []
 
288
  except Exception as e:
289
  return f"Error parsing transcript file: {str(e)}", [], None, "", {}, []
290
 
291
+ # Get audio data: reuse existing state or load from file
292
+ if existing_audio_state is not None:
293
+ audio_data, sample_rate = existing_audio_state
294
+ print("[AUDIO] Reusing audio from memory")
295
+ elif audio is not None:
296
+ audio_data, sample_rate = sf.read(audio)
297
+
298
+ # Convert stereo to mono by averaging channels
299
+ if len(audio_data.shape) > 1 and audio_data.shape[1] == 2:
300
+ audio_data = np.mean(audio_data, axis=1)
301
+
302
+ # Resample to 16kHz if needed (required by TEN VAD)
303
+ TARGET_SR = 16000
304
+ if sample_rate != TARGET_SR:
305
+ duration = len(audio_data) / sample_rate
306
+ new_length = int(duration * TARGET_SR)
307
+ x_old = np.linspace(0, duration, len(audio_data), endpoint=False)
308
+ x_new = np.linspace(0, duration, new_length, endpoint=False)
309
+ audio_data = np.interp(x_new, x_old, audio_data).astype(np.float32)
310
+ print(f"[AUDIO] Resampled from {sample_rate}Hz to {TARGET_SR}Hz")
311
+ sample_rate = TARGET_SR
312
+ else:
313
+ return "No audio provided. Please upload audio first.", [], None, "", {}, []
314
 
315
  # Run VAD to detect silence periods
316
  silence_periods = []
 
581
  }}
582
  .silence-btn:hover {{ background: #ffd4a4; }}
583
  .silence-btn.selected {{ background: #ff9800; color: white; border-color: #e68a00; }}
584
+ .untranscribed-btn {{
585
+ display: inline-block;
586
+ background: #ffcccc;
587
+ padding: 5px 8px;
588
+ margin: 3px;
589
+ border-radius: 4px;
590
+ cursor: pointer;
591
+ border: 1px solid #c88;
592
+ font-size: 11px;
593
+ transition: all 0.15s;
594
+ }}
595
+ .untranscribed-btn:hover {{ background: #ffaaaa; }}
596
+ .untranscribed-btn.selected {{ background: #e53935; color: white; border-color: #c62828; }}
597
  .checkbox-container {{
598
  display: inline-flex;
599
  align-items: center;
 
610
  }}
611
  .time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
612
  .silence-time {{ color: #996600; font-size: 10px; font-weight: bold; }}
613
+ .untranscribed-time {{ color: #b71c1c; font-size: 10px; font-weight: bold; }}
614
  .duration {{ color: #666; font-size: 10px; margin-left: 3px; }}
615
  .word {{ margin-left: 4px; }}
616
  </style>
 
627
  <a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
628
  </div>
629
  <script>var exportJsonStr = {json.dumps(export_json_str)};</script>
630
+ <p class="help"><b>Click</b> = select &nbsp;|&nbsp; <b>Ctrl+Click</b> = toggle &nbsp;|&nbsp; <b>Shift+Click</b> = range &nbsp;&nbsp;&nbsp; <span style="background: #ffe4c4; padding: 2px 8px; border-radius: 3px; border: 1px solid #dca;"></span> = detected non speech &nbsp; <span style="background: #ffcccc; padding: 2px 8px; border-radius: 3px; border: 1px solid #c88;"></span> = speech without transcript</p>
631
  <div class="container" id="words"></div>
632
  <script>
633
  var entries = {entries_json};
 
683
  var lastClickedIndex = -1;
684
 
685
  function getAllButtons() {{
686
+ return Array.from(container.querySelectorAll('.word-btn, .silence-btn, .untranscribed-btn'));
687
  }}
688
 
689
  function handleItemClick(btn, e) {{
 
726
 
727
  btn.onclick = function(e) {{ handleItemClick(this, e); }};
728
  wordIndex++;
729
+ }} else if (entry.type === 'silence') {{
730
  btn.className = 'silence-btn';
731
  btn.dataset.s = entry.start;
732
  btn.dataset.e = entry.end;
733
  var durationMs = Math.round((entry.end - entry.start) * 1000);
734
  btn.innerHTML = '<span class="silence-time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="duration">' + durationMs + 'ms</span>';
735
  btn.onclick = function(e) {{ handleItemClick(this, e); }};
736
+ }} else if (entry.type === 'untranscribed_speech') {{
737
+ btn.className = 'untranscribed-btn';
738
+ btn.dataset.s = entry.start;
739
+ btn.dataset.e = entry.end;
740
+ var durationMs = Math.round((entry.end - entry.start) * 1000);
741
+ btn.innerHTML = '<span class="untranscribed-time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="duration"> XXXXX ' + durationMs + 'ms</span>';
742
+ btn.onclick = function(e) {{ handleItemClick(this, e); }};
743
  }}
744
 
745
  container.appendChild(btn);
 
770
  document.getElementById('adjust-intervals').addEventListener('change', updateWordLabels);
771
 
772
  function updateInterval() {{
773
+ var sel = Array.from(document.querySelectorAll('.word-btn.selected, .silence-btn.selected, .untranscribed-btn.selected'));
774
  if (sel.length === 0) return;
775
 
776
  // Sort selected items by start time
 
993
  # Waveform player - below interval controls
994
  waveform_player = gr.HTML(label="Segment Player")
995
 
996
+ def load_transcript_and_setup(audio, transcript_file, prob_threshold, min_off_ms, min_on_ms, existing_audio_state):
997
  """Load external transcript and setup UI."""
998
  if transcript_file is None:
999
  # Return empty/unchanged outputs if no file selected
1000
  return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
1001
 
1002
  text, timestamps_data, audio_data, raw_text, export_metadata, silence_periods = load_transcript(
1003
+ audio, transcript_file, prob_threshold, int(min_off_ms), int(min_on_ms),
1004
+ existing_audio_state=existing_audio_state
1005
  )
1006
 
1007
  # Check for errors
 
1025
  'end': round(item['end'], 3)
1026
  })
1027
  entries.sort(key=lambda x: x['start'])
 
1028
 
1029
+ # Detect untranscribed speech: gaps between consecutive silence entries
1030
+ # where VAD detected voice but the transcript has no words
1031
+ augmented_entries = []
1032
+ for i, entry in enumerate(entries):
1033
+ augmented_entries.append(entry)
1034
+ if entry['type'] == 'silence' and i + 1 < len(entries) and entries[i + 1]['type'] == 'silence':
1035
+ gap_start = entry['end']
1036
+ gap_end = entries[i + 1]['start']
1037
+ if gap_end - gap_start > 0.01:
1038
+ augmented_entries.append({
1039
+ 'type': 'untranscribed_speech',
1040
+ 'start': round(gap_start, 3),
1041
+ 'end': round(gap_end, 3)
1042
+ })
1043
+ entries = augmented_entries
1044
+ entries_json = json.dumps(entries)
1045
 
1046
  # Pre-generate full export JSON
1047
  segments = [{
 
1111
  # Load transcript file input
1112
  transcript_file_input.change(
1113
  fn=load_transcript_and_setup,
1114
+ inputs=[audio_input, transcript_file_input, vad_prob_threshold, vad_min_off, vad_min_on, audio_state],
1115
  outputs=[transcription_output, timestamps_output, audio_state, timestamps_state, waveform_player]
1116
  )
1117