Spaces:

hlevring
/

Timestamps-Tester

Running

App Files Community

hlevring committed on Mar 19

Commit

ff764ef

1 Parent(s): 44ae928

Detect untranscribed speech and allow replacing the transcript

Browse files

Files changed (1) hide show

app.py +70 -36

app.py CHANGED Viewed

@@ -262,7 +262,7 @@ def parse_transcript_file(file_path):
     raise ValueError("Unrecognized transcript format. Expected segments[].words[], words[], or segments[] with {start, end, text}")
-def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, min_on_ms=64):
     """Load external transcript and run VAD on audio.
     Args:
@@ -271,15 +271,12 @@ def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, m
         prob_threshold: VAD probability threshold
         min_off_ms: Minimum silence duration in ms
         min_on_ms: Minimum voice duration in ms
     Returns:
         Tuple of (text, timestamps_data, audio_data_tuple, raw_text, export_metadata, silence_periods)
     """
     try:
-        # Check if audio is provided
-        if audio is None:
-            return "No audio provided. Please upload audio first.", [], None, "", {}, []
         # Check if transcript file is provided
         if transcript_file is None:
             return "No transcript file provided.", [], None, "", {}, []
@@ -291,23 +288,29 @@ def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, m
         except Exception as e:
             return f"Error parsing transcript file: {str(e)}", [], None, "", {}, []
-        # Preprocess audio: convert to mono if stereo
-        audio_data, sample_rate = sf.read(audio)
-        # Convert stereo to mono by averaging channels
-        if len(audio_data.shape) > 1 and audio_data.shape[1] == 2:
-            audio_data = np.mean(audio_data, axis=1)
-        # Resample to 16kHz if needed (required by TEN VAD)
-        TARGET_SR = 16000
-        if sample_rate != TARGET_SR:
-            duration = len(audio_data) / sample_rate
-            new_length = int(duration * TARGET_SR)
-            x_old = np.linspace(0, duration, len(audio_data), endpoint=False)
-            x_new = np.linspace(0, duration, new_length, endpoint=False)
-            audio_data = np.interp(x_new, x_old, audio_data).astype(np.float32)
-            print(f"[AUDIO] Resampled from {sample_rate}Hz to {TARGET_SR}Hz")
-            sample_rate = TARGET_SR
         # Run VAD to detect silence periods
         silence_periods = []
@@ -578,6 +581,19 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
 }}
 .silence-btn:hover {{ background: #ffd4a4; }}
 .silence-btn.selected {{ background: #ff9800; color: white; border-color: #e68a00; }}
 .checkbox-container {{
     display: inline-flex;
     align-items: center;
@@ -594,6 +610,7 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
 }}
 .time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
 .silence-time {{ color: #996600; font-size: 10px; font-weight: bold; }}
 .duration {{ color: #666; font-size: 10px; margin-left: 3px; }}
 .word {{ margin-left: 4px; }}
 </style>
@@ -610,7 +627,7 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
     <a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
 </div>
 <script>var exportJsonStr = {json.dumps(export_json_str)};</script>
-<p class="help"><b>Click</b> = select &nbsp;|&nbsp; <b>Ctrl+Click</b> = toggle &nbsp;|&nbsp; <b>Shift+Click</b> = range &nbsp;&nbsp;&nbsp; <span style="background: #ffe4c4; padding: 2px 8px; border-radius: 3px; border: 1px solid #dca;"></span> = detected non speech</p>
 <div class="container" id="words"></div>
 <script>
 var entries = {entries_json};
@@ -666,7 +683,7 @@ var adjustedEnds = words.map(function(w, i) {{ return calculateAdjustedEnd(i); }
 var lastClickedIndex = -1;
 function getAllButtons() {{
-    return Array.from(container.querySelectorAll('.word-btn, .silence-btn'));
 }}
 function handleItemClick(btn, e) {{
@@ -709,13 +726,20 @@ entries.forEach(function(entry, i) {{
         btn.onclick = function(e) {{ handleItemClick(this, e); }};
         wordIndex++;
-    }} else {{
         btn.className = 'silence-btn';
         btn.dataset.s = entry.start;
         btn.dataset.e = entry.end;
         var durationMs = Math.round((entry.end - entry.start) * 1000);
         btn.innerHTML = '<span class="silence-time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="duration">' + durationMs + 'ms</span>';
         btn.onclick = function(e) {{ handleItemClick(this, e); }};
     }}
     container.appendChild(btn);
@@ -746,7 +770,7 @@ function updateWordLabels() {{
 document.getElementById('adjust-intervals').addEventListener('change', updateWordLabels);
 function updateInterval() {{
-    var sel = Array.from(document.querySelectorAll('.word-btn.selected, .silence-btn.selected'));
     if (sel.length === 0) return;
     // Sort selected items by start time
@@ -969,14 +993,15 @@ with gr.Blocks() as demo:
     # Waveform player - below interval controls
     waveform_player = gr.HTML(label="Segment Player")
-    def load_transcript_and_setup(audio, transcript_file, prob_threshold, min_off_ms, min_on_ms):
         """Load external transcript and setup UI."""
         if transcript_file is None:
             # Return empty/unchanged outputs if no file selected
             return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
         text, timestamps_data, audio_data, raw_text, export_metadata, silence_periods = load_transcript(
-            audio, transcript_file, prob_threshold, int(min_off_ms), int(min_on_ms)
         )
         # Check for errors
@@ -1000,14 +1025,23 @@ with gr.Blocks() as demo:
                 'end': round(item['end'], 3)
             })
         entries.sort(key=lambda x: x['start'])
-        entries_json = json.dumps(entries)
-        # Build word data as JSON for the iframe
-        words_json = json.dumps([{
-            'word': item['word'],
-            'start': round(item['start'], 3),
-            'end': round(item['end'], 3)
-        } for item in timestamps_data])
         # Pre-generate full export JSON
         segments = [{
@@ -1077,7 +1111,7 @@ with gr.Blocks() as demo:
     # Load transcript file input
     transcript_file_input.change(
         fn=load_transcript_and_setup,
-        inputs=[audio_input, transcript_file_input, vad_prob_threshold, vad_min_off, vad_min_on],
         outputs=[transcription_output, timestamps_output, audio_state, timestamps_state, waveform_player]
     )

     raise ValueError("Unrecognized transcript format. Expected segments[].words[], words[], or segments[] with {start, end, text}")
+def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, min_on_ms=64, existing_audio_state=None):
     """Load external transcript and run VAD on audio.
     Args:
         prob_threshold: VAD probability threshold
         min_off_ms: Minimum silence duration in ms
         min_on_ms: Minimum voice duration in ms
+        existing_audio_state: Optional (audio_data, sample_rate) tuple to reuse
     Returns:
         Tuple of (text, timestamps_data, audio_data_tuple, raw_text, export_metadata, silence_periods)
     """
     try:
         # Check if transcript file is provided
         if transcript_file is None:
             return "No transcript file provided.", [], None, "", {}, []
         except Exception as e:
             return f"Error parsing transcript file: {str(e)}", [], None, "", {}, []
+        # Get audio data: reuse existing state or load from file
+        if existing_audio_state is not None:
+            audio_data, sample_rate = existing_audio_state
+            print("[AUDIO] Reusing audio from memory")
+        elif audio is not None:
+            audio_data, sample_rate = sf.read(audio)
+            # Convert stereo to mono by averaging channels
+            if len(audio_data.shape) > 1 and audio_data.shape[1] == 2:
+                audio_data = np.mean(audio_data, axis=1)
+            # Resample to 16kHz if needed (required by TEN VAD)
+            TARGET_SR = 16000
+            if sample_rate != TARGET_SR:
+                duration = len(audio_data) / sample_rate
+                new_length = int(duration * TARGET_SR)
+                x_old = np.linspace(0, duration, len(audio_data), endpoint=False)
+                x_new = np.linspace(0, duration, new_length, endpoint=False)
+                audio_data = np.interp(x_new, x_old, audio_data).astype(np.float32)
+                print(f"[AUDIO] Resampled from {sample_rate}Hz to {TARGET_SR}Hz")
+                sample_rate = TARGET_SR
+        else:
+            return "No audio provided. Please upload audio first.", [], None, "", {}, []
         # Run VAD to detect silence periods
         silence_periods = []
 }}
 .silence-btn:hover {{ background: #ffd4a4; }}
 .silence-btn.selected {{ background: #ff9800; color: white; border-color: #e68a00; }}
+.untranscribed-btn {{
+    display: inline-block;
+    background: #ffcccc;
+    padding: 5px 8px;
+    margin: 3px;
+    border-radius: 4px;
+    cursor: pointer;
+    border: 1px solid #c88;
+    font-size: 11px;
+    transition: all 0.15s;
+}}
+.untranscribed-btn:hover {{ background: #ffaaaa; }}
+.untranscribed-btn.selected {{ background: #e53935; color: white; border-color: #c62828; }}
 .checkbox-container {{
     display: inline-flex;
     align-items: center;
 }}
 .time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
 .silence-time {{ color: #996600; font-size: 10px; font-weight: bold; }}
+.untranscribed-time {{ color: #b71c1c; font-size: 10px; font-weight: bold; }}
 .duration {{ color: #666; font-size: 10px; margin-left: 3px; }}
 .word {{ margin-left: 4px; }}
 </style>
     <a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
 </div>
 <script>var exportJsonStr = {json.dumps(export_json_str)};</script>
+<p class="help"><b>Click</b> = select &nbsp;|&nbsp; <b>Ctrl+Click</b> = toggle &nbsp;|&nbsp; <b>Shift+Click</b> = range &nbsp;&nbsp;&nbsp; <span style="background: #ffe4c4; padding: 2px 8px; border-radius: 3px; border: 1px solid #dca;"></span> = detected non speech &nbsp; <span style="background: #ffcccc; padding: 2px 8px; border-radius: 3px; border: 1px solid #c88;"></span> = speech without transcript</p>
 <div class="container" id="words"></div>
 <script>
 var entries = {entries_json};
 var lastClickedIndex = -1;
 function getAllButtons() {{
+    return Array.from(container.querySelectorAll('.word-btn, .silence-btn, .untranscribed-btn'));
 }}
 function handleItemClick(btn, e) {{
         btn.onclick = function(e) {{ handleItemClick(this, e); }};
         wordIndex++;
+    }} else if (entry.type === 'silence') {{
         btn.className = 'silence-btn';
         btn.dataset.s = entry.start;
         btn.dataset.e = entry.end;
         var durationMs = Math.round((entry.end - entry.start) * 1000);
         btn.innerHTML = '<span class="silence-time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="duration">' + durationMs + 'ms</span>';
         btn.onclick = function(e) {{ handleItemClick(this, e); }};
+    }} else if (entry.type === 'untranscribed_speech') {{
+        btn.className = 'untranscribed-btn';
+        btn.dataset.s = entry.start;
+        btn.dataset.e = entry.end;
+        var durationMs = Math.round((entry.end - entry.start) * 1000);
+        btn.innerHTML = '<span class="untranscribed-time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="duration"> XXXXX ' + durationMs + 'ms</span>';
+        btn.onclick = function(e) {{ handleItemClick(this, e); }};
     }}
     container.appendChild(btn);
 document.getElementById('adjust-intervals').addEventListener('change', updateWordLabels);
 function updateInterval() {{
+    var sel = Array.from(document.querySelectorAll('.word-btn.selected, .silence-btn.selected, .untranscribed-btn.selected'));
     if (sel.length === 0) return;
     // Sort selected items by start time
     # Waveform player - below interval controls
     waveform_player = gr.HTML(label="Segment Player")
+    def load_transcript_and_setup(audio, transcript_file, prob_threshold, min_off_ms, min_on_ms, existing_audio_state):
         """Load external transcript and setup UI."""
         if transcript_file is None:
             # Return empty/unchanged outputs if no file selected
             return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
         text, timestamps_data, audio_data, raw_text, export_metadata, silence_periods = load_transcript(
+            audio, transcript_file, prob_threshold, int(min_off_ms), int(min_on_ms),
+            existing_audio_state=existing_audio_state
         )
         # Check for errors
                 'end': round(item['end'], 3)
             })
         entries.sort(key=lambda x: x['start'])
+        # Detect untranscribed speech: gaps between consecutive silence entries
+        # where VAD detected voice but the transcript has no words
+        augmented_entries = []
+        for i, entry in enumerate(entries):
+            augmented_entries.append(entry)
+            if entry['type'] == 'silence' and i + 1 < len(entries) and entries[i + 1]['type'] == 'silence':
+                gap_start = entry['end']
+                gap_end = entries[i + 1]['start']
+                if gap_end - gap_start > 0.01:
+                    augmented_entries.append({
+                        'type': 'untranscribed_speech',
+                        'start': round(gap_start, 3),
+                        'end': round(gap_end, 3)
+                    })
+        entries = augmented_entries
+        entries_json = json.dumps(entries)
         # Pre-generate full export JSON
         segments = [{
     # Load transcript file input
     transcript_file_input.change(
         fn=load_transcript_and_setup,
+        inputs=[audio_input, transcript_file_input, vad_prob_threshold, vad_min_off, vad_min_on, audio_state],
         outputs=[transcription_output, timestamps_output, audio_state, timestamps_state, waveform_player]
     )