hlevring commited on
Commit
3f632a7
·
1 Parent(s): baaace4

Enhance word timestamp logic and graphing

Browse files
Files changed (3) hide show
  1. app.py +538 -58
  2. packages.txt +2 -1
  3. requirements.txt +2 -1
app.py CHANGED
@@ -8,6 +8,7 @@ import numpy as np
8
  import base64
9
  import io
10
  import json
 
11
  import matplotlib
12
  matplotlib.use('Agg')
13
  import matplotlib.pyplot as plt
@@ -20,11 +21,170 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
  punct_fixer = PunctFixer(language="da", device=device)
21
 
22
 
23
- def transcribe_audio(audio):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  try:
25
  # Check if audio is provided
26
  if audio is None:
27
- return "No audio provided. Please record or upload audio first.", [], None
28
 
29
  # Preprocess audio: convert to mono if stereo
30
  audio_data, sample_rate = sf.read(audio)
@@ -33,7 +193,18 @@ def transcribe_audio(audio):
33
  if len(audio_data.shape) > 1 and audio_data.shape[1] == 2:
34
  audio_data = np.mean(audio_data, axis=1)
35
 
36
- # Save as temporary mono file
 
 
 
 
 
 
 
 
 
 
 
37
  import tempfile
38
  import os
39
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
@@ -118,6 +289,68 @@ def transcribe_audio(audio):
118
  import traceback
119
  print(f"Timestamp extraction failed: {str(e)}\n{traceback.format_exc()}")
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  # Calculate audio duration
122
  audio_duration = len(audio_data) / sample_rate
123
 
@@ -131,31 +364,55 @@ def transcribe_audio(audio):
131
  'frame_duration': 0.08
132
  }
133
 
134
- # Return text, timestamps, audio data, raw_text, and export metadata
135
- return punctuated_text, timestamps_data, (audio_data, sample_rate), raw_text, export_metadata
136
- return "No transcription available.", [], None, "", {}
137
 
138
  except Exception as e:
139
  import traceback
140
- return f"Error during transcription: {str(e)}\n{traceback.format_exc()}", [], None, "", {}
141
 
142
 
143
- def extract_audio_segment(audio_state, start_time, end_time):
144
- """Fast audio extraction from memory with waveform visualization."""
 
 
 
 
 
 
 
 
 
 
145
  # Wrapper to ensure controls never collapse
146
- def wrap_output(content):
147
- return f'<div style="min-height: 200px;">{content}</div>'
148
 
149
  try:
150
  if audio_state is None:
151
  return wrap_output("<p style='color: red; padding: 20px;'>No audio loaded. Please transcribe audio first.</p>")
152
 
153
  audio_data, sample_rate = audio_state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
- # Add 150ms padding for VISUALIZATION only
156
- padding = 0.15
157
- padded_start = max(0, start_time - padding)
158
- padded_end = min(len(audio_data) / sample_rate, end_time + padding)
159
 
160
  # Extract padded segment for waveform visualization
161
  start_sample_padded = int(padded_start * sample_rate)
@@ -200,7 +457,18 @@ def extract_audio_segment(audio_state, start_time, end_time):
200
 
201
  ax.set_xlabel('Time (seconds)', fontsize=10)
202
  ax.set_ylabel('Amplitude', fontsize=10)
203
- ax.set_title(f'Audio Segment: {start_time:.2f}s - {end_time:.2f}s (±150ms context)', fontsize=11)
 
 
 
 
 
 
 
 
 
 
 
204
  ax.legend(fontsize=9)
205
  ax.grid(True, alpha=0.3)
206
 
@@ -223,6 +491,16 @@ def extract_audio_segment(audio_state, start_time, end_time):
223
  import time
224
  unique_id = int(time.time() * 1000)
225
 
 
 
 
 
 
 
 
 
 
 
226
  # Create HTML with waveform and native audio controls
227
  html_output = f'''
228
  <div style="margin: 10px 0;" data-render-id="{unique_id}">
@@ -236,20 +514,22 @@ def extract_audio_segment(audio_state, start_time, end_time):
236
 
237
  <div style="margin-top: 8px; text-align: center;">
238
  <span style="font-size: 14px; font-weight: bold; color: #333;">
239
- Segment: {start_time:.2f}s - {end_time:.2f}s
240
  </span>
241
  <span style="font-size: 12px; color: #666; margin-left: 15px;">
242
- Duration: {end_time - start_time:.2f}s | Context shown: ±150ms
243
  </span>
244
  </div>
245
  </div>
246
  '''
247
 
248
- return wrap_output(html_output)
 
 
249
 
250
  except Exception as e:
251
  import traceback
252
- return wrap_output(f"<pre style='padding: 20px;'>Error: {str(e)}\n{traceback.format_exc()}</pre>")
253
 
254
 
255
  with gr.Blocks() as demo:
@@ -267,6 +547,30 @@ with gr.Blocks() as demo:
267
  sources=["microphone", "upload"],
268
  format="wav"
269
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  transcribe_button = gr.Button("Transcribe")
271
  transcription_output = gr.Textbox(label="Transcription", lines=5)
272
 
@@ -290,24 +594,47 @@ with gr.Blocks() as demo:
290
  # Track last played interval for smart replay
291
  last_interval_state = gr.State("")
292
 
 
 
 
293
  # Waveform player - below interval controls
294
  waveform_player = gr.HTML(label="Segment Player")
295
 
296
- def transcribe_and_setup_audio(audio):
297
- text, timestamps_data, audio_data, raw_text, export_metadata = transcribe_audio(audio)
 
 
298
 
299
- # Build word data as JSON for the iframe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  words_json = json.dumps([{
301
  'word': item['word'],
302
- 'start': round(item['start'], 2),
303
- 'end': round(item['end'], 2)
304
  } for item in timestamps_data])
305
 
306
  # Pre-generate full export JSON
307
  segments = [{
308
  'word': item['word'],
309
- 'start': round(item['start'], 2),
310
- 'end': round(item['end'], 2),
311
  'word_index': i
312
  } for i, item in enumerate(timestamps_data)]
313
 
@@ -343,50 +670,202 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
343
  }}
344
  .word-btn:hover {{ background: #c5e5f5; }}
345
  .word-btn.selected {{ background: #4CAF50; color: white; border-color: #3a9; }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  .time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
 
 
347
  .word {{ margin-left: 4px; }}
348
  </style>
349
  </head>
350
  <body>
351
  <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
352
- <h3 style="margin: 0;">Word Timestamps</h3>
 
 
 
 
 
 
353
  <a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
354
  </div>
355
  <script>var exportJsonStr = {json.dumps(export_json_str)};</script>
356
- <p class="help"><b>Click</b> = select word &nbsp;|&nbsp; <b>Ctrl+Click</b> = add to selection</p>
357
  <div class="container" id="words"></div>
358
  <script>
359
- var words = {words_json};
360
  var container = document.getElementById('words');
361
 
362
- words.forEach(function(w, i) {{
363
- var btn = document.createElement('span');
364
- btn.className = 'word-btn';
365
- btn.dataset.s = w.start;
366
- btn.dataset.e = w.end;
367
- btn.innerHTML = '<span class="time">[' + w.start.toFixed(2) + '-' + w.end.toFixed(2) + 's]</span><span class="word">' + w.word + '</span>';
368
- btn.onclick = function(e) {{
369
- var all = document.querySelectorAll('.word-btn');
370
- if (e.ctrlKey) {{
371
- this.classList.toggle('selected');
 
 
 
 
 
372
  }} else {{
373
- all.forEach(function(b) {{ b.classList.remove('selected'); }});
374
- this.classList.add('selected');
 
 
 
 
375
  }}
376
- updateInterval();
377
- }};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  container.appendChild(btn);
379
  }});
380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  function updateInterval() {{
382
- var sel = document.querySelectorAll('.word-btn.selected');
383
  if (sel.length === 0) return;
384
  var minS = Infinity, maxE = 0;
385
  sel.forEach(function(b) {{
386
  minS = Math.min(minS, parseFloat(b.dataset.s));
387
  maxE = Math.max(maxE, parseFloat(b.dataset.e));
388
  }});
389
- var interval = minS.toFixed(2) + '-' + maxE.toFixed(2);
390
  // Find the textbox in parent and update it
391
  try {{
392
  var boxes = parent.document.querySelectorAll('input[data-testid="textbox"], textarea');
@@ -399,7 +878,7 @@ function updateInterval() {{
399
  }} catch(err) {{ console.log('Could not update parent:', err); }}
400
  }}
401
 
402
- // Highlight words that overlap with manually entered interval (>50% of word must be in interval)
403
  function highlightFromInterval(intervalStr) {{
404
  if (!intervalStr) return;
405
  var parts = intervalStr.replace(',', '-').split('-');
@@ -410,13 +889,13 @@ function highlightFromInterval(intervalStr) {{
410
  document.querySelectorAll('.word-btn').forEach(function(btn) {{
411
  var ws = parseFloat(btn.dataset.s);
412
  var we = parseFloat(btn.dataset.e);
413
- var wordDuration = we - ws;
414
- // Calculate overlap between word and interval
415
  var overlapStart = Math.max(ws, s);
416
  var overlapEnd = Math.min(we, e);
417
  var overlap = Math.max(0, overlapEnd - overlapStart);
418
- // Highlight only if >50% of word is in interval
419
- if (wordDuration > 0 && (overlap / wordDuration) > 0.5) {{
420
  btn.classList.add('selected');
421
  }} else {{
422
  btn.classList.remove('selected');
@@ -481,10 +960,10 @@ document.getElementById('download-json').onclick = function(e) {{
481
 
482
  return text, audio, timestamps_html, audio_data, timestamps_data, initial_player
483
 
484
- def play_time_interval_fast(audio_state, time_interval, last_interval):
485
  """Fast extraction using preloaded audio from memory."""
486
  def wrap_error(msg):
487
- return f'<div style="min-height: 150px; padding: 20px; text-align: center; background: #f5f5f5; border-radius: 8px;"><p style="color: #666;">{msg}</p></div>', last_interval
488
 
489
  try:
490
  if not time_interval or not audio_state:
@@ -503,8 +982,9 @@ document.getElementById('download-json').onclick = function(e) {{
503
  return wrap_error("Start time must be before end time.")
504
 
505
  # Load/reload audio segment (autoplay will replay even if same interval)
506
- result = extract_audio_segment(audio_state, start_time, end_time)
507
- return result, time_interval
 
508
 
509
  except Exception as e:
510
  import traceback
@@ -512,15 +992,15 @@ document.getElementById('download-json').onclick = function(e) {{
512
 
513
  transcribe_button.click(
514
  fn=transcribe_and_setup_audio,
515
- inputs=audio_input,
516
  outputs=[transcription_output, audio_output, timestamps_output, audio_state, timestamps_state, waveform_player]
517
  )
518
 
519
  # Play interval button
520
  play_interval_button.click(
521
  fn=play_time_interval_fast,
522
- inputs=[audio_state, time_input, last_interval_state],
523
- outputs=[waveform_player, last_interval_state]
524
  )
525
 
526
  demo.launch()
 
8
  import base64
9
  import io
10
  import json
11
+ from ten_vad import TenVad
12
  import matplotlib
13
  matplotlib.use('Agg')
14
  import matplotlib.pyplot as plt
 
21
  punct_fixer = PunctFixer(language="da", device=device)
22
 
23
 
24
def detect_silence_periods(audio_data, sample_rate, prob_threshold=0.5, frame_rep_threshold=2):
    """Run TEN VAD to detect silence periods in audio.

    Args:
        audio_data: numpy array of audio samples (float, mono, 16kHz)
        sample_rate: sample rate (must be 16000)
        prob_threshold: VAD probability threshold (0.0-1.0), higher = less sensitive
        frame_rep_threshold: Number of consecutive frames required before state change

    Returns:
        List of dicts with 'start' and 'end' times (seconds, rounded to 3 decimals)
        for each silence period.
    """
    TARGET_SR = 16000  # TEN VAD requires 16kHz
    HOP_SIZE = 256     # 16ms at 16kHz

    print(f"[VAD] Settings: prob_threshold={prob_threshold}, frame_rep_threshold={frame_rep_threshold}")

    if sample_rate != TARGET_SR:
        print(f"[VAD] Warning: Expected 16kHz audio, got {sample_rate}Hz")

    # Convert float audio to int16 (TEN VAD expects int16).
    # FIX: clip to [-1.0, 1.0] first — scaling out-of-range float samples and
    # casting to int16 wraps around (e.g. 1.5 -> large negative value), which
    # would feed garbage frames to the VAD. Clipping makes the cast safe.
    if audio_data.dtype in (np.float32, np.float64):
        audio_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
    else:
        audio_int16 = audio_data.astype(np.int16)

    # Create VAD instance
    vad = TenVad(hop_size=HOP_SIZE, threshold=prob_threshold)

    silence_periods = []
    frame_duration = HOP_SIZE / TARGET_SR  # 0.016s = 16ms

    # Process frame by frame and collect raw speech/silence flags
    num_frames = len(audio_int16) // HOP_SIZE
    raw_flags = []

    for i in range(num_frames):
        frame_start = i * HOP_SIZE
        frame = audio_int16[frame_start:frame_start + HOP_SIZE]

        result = vad.process(frame)
        # TEN VAD returns tuple: (probability, flag) or has .flag attribute
        if isinstance(result, tuple):
            flag = result[1]  # (probability, flag)
        else:
            flag = result.flag
        raw_flags.append(flag)

    # Apply frame repetition threshold smoothing:
    # only switch state after seeing frame_rep_threshold consecutive frames
    # of the opposite state. This suppresses single-frame VAD flickers.
    in_silence = False
    silence_start = 0.0
    consecutive_count = 0
    pending_state = None  # The state we're potentially switching to
    potential_silence_start = 0.0  # Time the candidate silence run began

    for i, flag in enumerate(raw_flags):
        current_time = i * frame_duration
        is_silence = (flag == 0)

        if in_silence:
            # Currently in silence, looking for speech
            if not is_silence:
                # Potential speech detected
                if pending_state != 'speech':
                    pending_state = 'speech'
                    consecutive_count = 1
                else:
                    consecutive_count += 1

                if consecutive_count >= frame_rep_threshold:
                    # Confirmed speech - end silence period.
                    # Roll the end time back to when speech actually started
                    # (the first frame of the confirmed run).
                    actual_end = current_time - (consecutive_count - 1) * frame_duration
                    silence_periods.append({
                        'start': round(silence_start, 3),
                        'end': round(actual_end, 3)
                    })
                    in_silence = False
                    pending_state = None
                    consecutive_count = 0
            else:
                # Still silence, reset any pending speech detection
                pending_state = None
                consecutive_count = 0
        else:
            # Currently in speech, looking for silence
            if is_silence:
                # Potential silence detected
                if pending_state != 'silence':
                    pending_state = 'silence'
                    consecutive_count = 1
                    potential_silence_start = current_time
                else:
                    consecutive_count += 1

                if consecutive_count >= frame_rep_threshold:
                    # Confirmed silence - start silence period from the time
                    # the candidate silence run actually began.
                    silence_start = potential_silence_start
                    in_silence = True
                    pending_state = None
                    consecutive_count = 0
            else:
                # Still speech, reset any pending silence detection
                pending_state = None
                consecutive_count = 0

    # Handle case where audio ends in silence
    if in_silence:
        silence_periods.append({
            'start': round(silence_start, 3),
            'end': round(num_frames * frame_duration, 3)
        })

    return silence_periods
139
+
140
+
141
def print_speech_silence_log(timestamps_data, silence_periods):
    """Print interleaved speech and silence log sorted by start time."""

    # Merge word timestamps and silence periods into one chronological timeline.
    timeline = [
        {'type': 'speech', 'start': w['start'], 'end': w['end'], 'word': w['word']}
        for w in timestamps_data
    ]
    timeline.extend(
        {'type': 'silence', 'start': s['start'], 'end': s['end']}
        for s in silence_periods
    )
    timeline.sort(key=lambda item: item['start'])

    # Emit one log line per timeline entry.
    print("\n=== SPEECH & SILENCE LOG ===")
    for item in timeline:
        if item['type'] == 'speech':
            print(f"[Speech] [{item['start']:.3f}-{item['end']:.3f}] {item['word']}")
        else:
            duration_ms = int((item['end'] - item['start']) * 1000)
            print(f"[Silence] [{item['start']:.3f}-{item['end']:.3f}] [{duration_ms}ms]")

    # Summary: counts plus accumulated silence duration.
    total_silence = sum(s['end'] - s['start'] for s in silence_periods)
    print(f"\n=== SUMMARY ===")
    print(f"Words: {len(timestamps_data)}, Silence periods: {len(silence_periods)}, Total silence: {total_silence:.2f}s")
    print("=" * 30 + "\n")
181
+
182
+
183
+ def transcribe_audio(audio, prob_threshold=0.5, frame_rep_threshold=2):
184
  try:
185
  # Check if audio is provided
186
  if audio is None:
187
+ return "No audio provided. Please record or upload audio first.", [], None, "", {}
188
 
189
  # Preprocess audio: convert to mono if stereo
190
  audio_data, sample_rate = sf.read(audio)
 
193
  if len(audio_data.shape) > 1 and audio_data.shape[1] == 2:
194
  audio_data = np.mean(audio_data, axis=1)
195
 
196
+ # Resample to 16kHz if needed (required by both Parakeet and TEN VAD)
197
+ TARGET_SR = 16000
198
+ if sample_rate != TARGET_SR:
199
+ duration = len(audio_data) / sample_rate
200
+ new_length = int(duration * TARGET_SR)
201
+ x_old = np.linspace(0, duration, len(audio_data), endpoint=False)
202
+ x_new = np.linspace(0, duration, new_length, endpoint=False)
203
+ audio_data = np.interp(x_new, x_old, audio_data).astype(np.float32)
204
+ print(f"[AUDIO] Resampled from {sample_rate}Hz to {TARGET_SR}Hz")
205
+ sample_rate = TARGET_SR
206
+
207
+ # Save as temporary mono 16kHz file
208
  import tempfile
209
  import os
210
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
 
289
  import traceback
290
  print(f"Timestamp extraction failed: {str(e)}\n{traceback.format_exc()}")
291
 
292
+ # Map punctuated words back to timestamps_data
293
+ # -----------------------------------------------------------------
294
+ # The ASR model outputs raw lowercase text without punctuation.
295
+ # PunctFixer adds punctuation and capitalization, but may occasionally:
296
+ # - Merge words (e.g., "i morgen" → "imorgen")
297
+ # - Split contractions differently
298
+ # - Result in different word counts than the raw output
299
+ #
300
+ # We handle this by:
301
+ # 1. If word counts match: direct position-based mapping (common case)
302
+ # 2. If counts differ: fuzzy matching with lookahead to realign
303
+ # -----------------------------------------------------------------
304
+ try:
305
+ import re
306
+ # Split punctuated text into words, keeping punctuation attached
307
+ punctuated_words = punctuated_text.split()
308
+
309
+ # Helper to strip punctuation for comparison (normalize for matching)
310
+ def strip_punct(s):
311
+ return re.sub(r'[^\w]', '', s).lower()
312
+
313
+ # Align punctuated words to raw words
314
+ if len(punctuated_words) == len(timestamps_data):
315
+ # Same word count - direct mapping (most common case)
316
+ # Verify base word matches before replacing to catch any edge cases
317
+ for i, pw in enumerate(punctuated_words):
318
+ if strip_punct(pw) == strip_punct(timestamps_data[i]['word']):
319
+ timestamps_data[i]['word'] = pw
320
+ else:
321
+ # Different word counts - PunctFixer may have merged/split words
322
+ # Use two-pointer approach with lookahead for realignment
323
+ pi = 0 # punctuated index
324
+ for ti in range(len(timestamps_data)):
325
+ if pi >= len(punctuated_words):
326
+ break
327
+ raw_word = strip_punct(timestamps_data[ti]['word'])
328
+ punct_word = strip_punct(punctuated_words[pi])
329
+ if raw_word == punct_word:
330
+ timestamps_data[ti]['word'] = punctuated_words[pi]
331
+ pi += 1
332
+ else:
333
+ # Words don't match - try lookahead to find alignment
334
+ # This handles cases where PunctFixer inserted/removed words
335
+ for look_ahead in range(1, min(3, len(punctuated_words) - pi)):
336
+ if strip_punct(punctuated_words[pi + look_ahead]) == raw_word:
337
+ pi += look_ahead
338
+ timestamps_data[ti]['word'] = punctuated_words[pi]
339
+ pi += 1
340
+ break
341
+ except Exception as e:
342
+ # Graceful fallback: keep original raw words if mapping fails
343
+ print(f"Punctuation mapping failed: {str(e)}")
344
+
345
+ # Run VAD to detect silence periods
346
+ silence_periods = []
347
+ try:
348
+ silence_periods = detect_silence_periods(audio_data, sample_rate, prob_threshold, frame_rep_threshold)
349
+ print_speech_silence_log(timestamps_data, silence_periods)
350
+ except Exception as e:
351
+ import traceback
352
+ print(f"[VAD] Error during silence detection: {str(e)}\n{traceback.format_exc()}")
353
+
354
  # Calculate audio duration
355
  audio_duration = len(audio_data) / sample_rate
356
 
 
364
  'frame_duration': 0.08
365
  }
366
 
367
+ # Return text, timestamps, audio data, raw_text, export metadata, and silence periods
368
+ return punctuated_text, timestamps_data, (audio_data, sample_rate), raw_text, export_metadata, silence_periods
369
+ return "No transcription available.", [], None, "", {}, []
370
 
371
  except Exception as e:
372
  import traceback
373
+ return f"Error during transcription: {str(e)}\n{traceback.format_exc()}", [], None, "", {}, []
374
 
375
 
376
+ def extract_audio_segment(audio_state, start_time, end_time, current_window=None):
377
+ """Fast audio extraction from memory with waveform visualization.
378
+
379
+ Args:
380
+ audio_state: Tuple of (audio_data, sample_rate)
381
+ start_time: Start time of the interval to play
382
+ end_time: End time of the interval to play
383
+ current_window: Dict with 'start' and 'end' of current waveform window, or None
384
+
385
+ Returns:
386
+ Tuple of (html_output, new_window_state)
387
+ """
388
  # Wrapper to ensure controls never collapse
389
+ def wrap_output(content, window_state=None):
390
+ return f'<div style="min-height: 200px;">{content}</div>', window_state
391
 
392
  try:
393
  if audio_state is None:
394
  return wrap_output("<p style='color: red; padding: 20px;'>No audio loaded. Please transcribe audio first.</p>")
395
 
396
  audio_data, sample_rate = audio_state
397
+ audio_duration = len(audio_data) / sample_rate
398
+
399
+ # Default context padding is 160ms
400
+ DEFAULT_PADDING = 0.16
401
+
402
+ # Determine if we need to redraw the waveform or just update the shaded area
403
+ need_redraw = True
404
+ if current_window is not None:
405
+ # Check if the new interval fits within the current window
406
+ if start_time >= current_window['start'] and end_time <= current_window['end']:
407
+ need_redraw = False
408
+ # Reuse the current window boundaries
409
+ padded_start = current_window['start']
410
+ padded_end = current_window['end']
411
 
412
+ if need_redraw:
413
+ # Calculate new window with ±160ms padding
414
+ padded_start = max(0, start_time - DEFAULT_PADDING)
415
+ padded_end = min(audio_duration, end_time + DEFAULT_PADDING)
416
 
417
  # Extract padded segment for waveform visualization
418
  start_sample_padded = int(padded_start * sample_rate)
 
457
 
458
  ax.set_xlabel('Time (seconds)', fontsize=10)
459
  ax.set_ylabel('Amplitude', fontsize=10)
460
+
461
+ # Calculate context on each side in ms
462
+ left_context_ms = int((start_time - padded_start) * 1000)
463
+ right_context_ms = int((padded_end - end_time) * 1000)
464
+
465
+ # Format context string - symmetric or asymmetric
466
+ if left_context_ms == right_context_ms:
467
+ context_str = f'(±{left_context_ms}ms context)'
468
+ else:
469
+ context_str = f'(-{left_context_ms}ms context +{right_context_ms}ms context)'
470
+
471
+ ax.set_title(f'Audio Segment: {start_time:.3f}s – {end_time:.3f}s {context_str}', fontsize=11)
472
  ax.legend(fontsize=9)
473
  ax.grid(True, alpha=0.3)
474
 
 
491
  import time
492
  unique_id = int(time.time() * 1000)
493
 
494
+ # Calculate context on each side in ms for the info text
495
+ left_context_ms = int((start_time - padded_start) * 1000)
496
+ right_context_ms = int((padded_end - end_time) * 1000)
497
+
498
+ # Format context string - symmetric or asymmetric
499
+ if left_context_ms == right_context_ms:
500
+ context_info = f'±{left_context_ms}ms'
501
+ else:
502
+ context_info = f'-{left_context_ms}ms / +{right_context_ms}ms'
503
+
504
  # Create HTML with waveform and native audio controls
505
  html_output = f'''
506
  <div style="margin: 10px 0;" data-render-id="{unique_id}">
 
514
 
515
  <div style="margin-top: 8px; text-align: center;">
516
  <span style="font-size: 14px; font-weight: bold; color: #333;">
517
+ Segment: {start_time:.3f}s {end_time:.3f}s
518
  </span>
519
  <span style="font-size: 12px; color: #666; margin-left: 15px;">
520
+ Duration: {(end_time - start_time)*1000:.0f}ms | Context shown: {context_info}
521
  </span>
522
  </div>
523
  </div>
524
  '''
525
 
526
+ # Return HTML and new window state
527
+ new_window = {'start': padded_start, 'end': padded_end}
528
+ return wrap_output(html_output, new_window)
529
 
530
  except Exception as e:
531
  import traceback
532
+ return wrap_output(f"<pre style='padding: 20px;'>Error: {str(e)}\n{traceback.format_exc()}</pre>", current_window)
533
 
534
 
535
  with gr.Blocks() as demo:
 
547
  sources=["microphone", "upload"],
548
  format="wav"
549
  )
550
+
551
+ # VAD Controls - inline labels with number inputs
552
+ with gr.Row():
553
+ gr.Markdown("**VAD: Probability Threshold**")
554
+ vad_prob_threshold = gr.Number(
555
+ show_label=False,
556
+ value=0.5,
557
+ minimum=0.0,
558
+ maximum=1.0,
559
+ step=0.05,
560
+ scale=0,
561
+ min_width=80
562
+ )
563
+ gr.Markdown("**VAD: Frame Repetition Threshold**")
564
+ vad_frame_rep = gr.Number(
565
+ show_label=False,
566
+ value=2,
567
+ minimum=1,
568
+ maximum=10,
569
+ step=1,
570
+ scale=0,
571
+ min_width=80
572
+ )
573
+
574
  transcribe_button = gr.Button("Transcribe")
575
  transcription_output = gr.Textbox(label="Transcription", lines=5)
576
 
 
594
  # Track last played interval for smart replay
595
  last_interval_state = gr.State("")
596
 
597
+ # Track current waveform window boundaries for smart redraw
598
+ waveform_window_state = gr.State(None)
599
+
600
  # Waveform player - below interval controls
601
  waveform_player = gr.HTML(label="Segment Player")
602
 
603
+ def transcribe_and_setup_audio(audio, prob_threshold, frame_rep_threshold):
604
+ text, timestamps_data, audio_data, raw_text, export_metadata, silence_periods = transcribe_audio(
605
+ audio, prob_threshold, int(frame_rep_threshold)
606
+ )
607
 
608
+ # Build combined entries (words + silence) sorted by start time
609
+ entries = []
610
+ for item in timestamps_data:
611
+ entries.append({
612
+ 'type': 'word',
613
+ 'word': item['word'],
614
+ 'start': round(item['start'], 3),
615
+ 'end': round(item['end'], 3)
616
+ })
617
+ for item in silence_periods:
618
+ entries.append({
619
+ 'type': 'silence',
620
+ 'start': round(item['start'], 3),
621
+ 'end': round(item['end'], 3)
622
+ })
623
+ entries.sort(key=lambda x: x['start'])
624
+ entries_json = json.dumps(entries)
625
+
626
+ # Build word data as JSON for the iframe (kept for backward compat)
627
  words_json = json.dumps([{
628
  'word': item['word'],
629
+ 'start': round(item['start'], 3),
630
+ 'end': round(item['end'], 3)
631
  } for item in timestamps_data])
632
 
633
  # Pre-generate full export JSON
634
  segments = [{
635
  'word': item['word'],
636
+ 'start': round(item['start'], 3),
637
+ 'end': round(item['end'], 3),
638
  'word_index': i
639
  } for i, item in enumerate(timestamps_data)]
640
 
 
670
  }}
671
  .word-btn:hover {{ background: #c5e5f5; }}
672
  .word-btn.selected {{ background: #4CAF50; color: white; border-color: #3a9; }}
673
+ .silence-btn {{
674
+ display: inline-block;
675
+ background: #ffe4c4;
676
+ padding: 5px 8px;
677
+ margin: 3px;
678
+ border-radius: 4px;
679
+ cursor: pointer;
680
+ border: 1px solid #dca;
681
+ font-size: 11px;
682
+ transition: all 0.15s;
683
+ }}
684
+ .silence-btn:hover {{ background: #ffd4a4; }}
685
+ .silence-btn.selected {{ background: #ff9800; color: white; border-color: #e68a00; }}
686
+ .checkbox-container {{
687
+ display: inline-flex;
688
+ align-items: center;
689
+ margin-left: 15px;
690
+ font-size: 12px;
691
+ cursor: pointer;
692
+ }}
693
+ .checkbox-container input {{
694
+ margin-right: 5px;
695
+ cursor: pointer;
696
+ }}
697
+ .checkbox-container:hover {{
698
+ color: #0066cc;
699
+ }}
700
  .time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
701
+ .silence-time {{ color: #996600; font-size: 10px; font-weight: bold; }}
702
+ .duration {{ color: #666; font-size: 10px; margin-left: 3px; }}
703
  .word {{ margin-left: 4px; }}
704
  </style>
705
  </head>
706
  <body>
707
  <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
708
+ <div style="display: flex; align-items: center;">
709
+ <h3 style="margin: 0;">Word Timestamps</h3>
710
+ <label class="checkbox-container" title="Extends word end times toward midpoint of gap to next word (max 120ms). Helps capture word endings that may be cut off.">
711
+ <input type="checkbox" id="adjust-intervals">
712
+ Apply Time Interval Adjustment
713
+ </label>
714
+ </div>
715
  <a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
716
  </div>
717
  <script>var exportJsonStr = {json.dumps(export_json_str)};</script>
718
+ <p class="help"><b>Click</b> = select &nbsp;|&nbsp; <b>Ctrl+Click</b> = toggle &nbsp;|&nbsp; <b>Shift+Click</b> = range &nbsp;&nbsp;&nbsp; <span style="background: #ffe4c4; padding: 2px 8px; border-radius: 3px; border: 1px solid #dca;"></span> = detected non speech</p>
719
  <div class="container" id="words"></div>
720
  <script>
721
+ var entries = {entries_json};
722
  var container = document.getElementById('words');
723
 
724
+ // Merge consecutive silence periods (no word between them)
725
+ // This keeps the raw data in Python logs but shows cleaner UI
726
+ function mergeConsecutiveSilences(entryList) {{
727
+ var merged = [];
728
+ var pendingSilence = null;
729
+
730
+ entryList.forEach(function(entry) {{
731
+ if (entry.type === 'silence') {{
732
+ if (pendingSilence === null) {{
733
+ // Start a new pending silence
734
+ pendingSilence = {{ type: 'silence', start: entry.start, end: entry.end }};
735
+ }} else {{
736
+ // Extend the pending silence
737
+ pendingSilence.end = entry.end;
738
+ }}
739
  }} else {{
740
+ // It's a word - flush any pending silence first
741
+ if (pendingSilence !== null) {{
742
+ merged.push(pendingSilence);
743
+ pendingSilence = null;
744
+ }}
745
+ merged.push(entry);
746
  }}
747
+ }});
748
+
749
+ // Don't forget trailing silence
750
+ if (pendingSilence !== null) {{
751
+ merged.push(pendingSilence);
752
+ }}
753
+
754
+ return merged;
755
+ }}
756
+
757
+ // Apply merging to entries for display
758
+ entries = mergeConsecutiveSilences(entries);
759
+
760
+ // Separate words and silence for adjustment calculations
761
+ var words = entries.filter(function(e) {{ return e.type === 'word'; }});
762
+ var silences = entries.filter(function(e) {{ return e.type === 'silence'; }});
763
+
764
+ // Calculate adjusted end times for words (simple: extend to midpoint, max 120ms)
765
+ function calculateAdjustedEnd(wordIndex) {{
766
+ var word = words[wordIndex];
767
+ var nextWord = words[wordIndex + 1];
768
+
769
+ if (!nextWord) return word.end; // Last word, no adjustment
770
+
771
+ var gap = nextWord.start - word.end;
772
+ var extension = Math.min(gap / 2, 0.12); // max 120ms, never beyond midpoint
773
+
774
+ return word.end + extension;
775
+ }}
776
+
777
+ // Store adjusted ends for each word
778
+ var adjustedEnds = words.map(function(w, i) {{ return calculateAdjustedEnd(i); }});
779
+
780
+ // Track last clicked item index for Shift+Click range selection
781
+ // (-1 means nothing was clicked yet, so Shift+Click falls through to a
+ // plain click)
+ var lastClickedIndex = -1;
782
+
783
+ // Get all clickable buttons in order
784
+ // Word and silence buttons in document order; indexes into this array
+ // define the Shift+Click selection range.
+ function getAllButtons() {{
785
+ return Array.from(container.querySelectorAll('.word-btn, .silence-btn'));
786
+ }}
787
+
788
+ // Handle click with modifiers
+ //   plain click - select only the clicked item
+ //   Ctrl+Click  - toggle the clicked item in the selection
+ //   Shift+Click - select the range from the previously clicked item
+ // NOTE(review): Shift+Click adds the range without clearing the prior
+ // selection, and macOS Cmd (metaKey) is not treated like Ctrl — confirm
+ // both behaviours are intended.
789
+ function handleItemClick(btn, e) {{
790
+ var allBtns = getAllButtons();
791
+ var clickedIndex = allBtns.indexOf(btn);
792
+
793
+ if (e.shiftKey && lastClickedIndex >= 0) {{
794
+ // Shift+Click: select range between lastClickedIndex and clickedIndex
795
+ var start = Math.min(lastClickedIndex, clickedIndex);
796
+ var end = Math.max(lastClickedIndex, clickedIndex);
797
+ allBtns.forEach(function(b, i) {{
798
+ if (i >= start && i <= end) {{
799
+ b.classList.add('selected');
800
+ }}
801
+ }});
802
+ }} else if (e.ctrlKey) {{
803
+ // Ctrl+Click: toggle selection
804
+ btn.classList.toggle('selected');
805
+ }} else {{
806
+ // Regular click: select only this item
807
+ allBtns.forEach(function(b) {{ b.classList.remove('selected'); }});
808
+ btn.classList.add('selected');
809
+ }}
810
+
811
+ lastClickedIndex = clickedIndex;
812
+ // Push the new selection's time span to the interval textbox.
+ updateInterval();
813
+ }}
814
+
815
+ // Render all entries
816
+ // Each entry becomes a clickable span. Word buttons carry both the
+ // original timestamps (origS/origE) and the precomputed adjusted end
+ // (adjE) in data attributes, so the adjust-intervals toggle can relabel
+ // them without re-rendering; s/e are the values the selection logic
+ // reads and start out as the originals.
+ var wordIndex = 0;
817
+ entries.forEach(function(entry, i) {{
818
+ var btn = document.createElement('span');
819
+
820
+ if (entry.type === 'word') {{
821
+ var wi = wordIndex;
822
+ btn.className = 'word-btn';
823
+ btn.dataset.origS = entry.start;
824
+ btn.dataset.origE = entry.end;
825
+ btn.dataset.adjE = adjustedEnds[wi];
826
+ btn.dataset.s = entry.start;
827
+ btn.dataset.e = entry.end;
828
+ btn.dataset.word = entry.word;
829
+ btn.innerHTML = '<span class="time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="word"> ' + entry.word + '</span>';
830
+
831
+ btn.onclick = function(e) {{ handleItemClick(this, e); }};
832
+ wordIndex++;
833
+ }} else {{
834
+ btn.className = 'silence-btn';
835
+ btn.dataset.s = entry.start;
836
+ btn.dataset.e = entry.end;
837
+ // Label silences with their duration in whole milliseconds.
+ var durationMs = Math.round((entry.end - entry.start) * 1000);
838
+ btn.innerHTML = '<span class="silence-time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="duration">' + durationMs + 'ms</span>';
839
+ btn.onclick = function(e) {{ handleItemClick(this, e); }};
840
+ }}
841
+
842
  container.appendChild(btn);
843
  }});
844
 
845
+ // Toggle adjustment checkbox handler
846
+ // Switches every word button between its original and adjusted end time
+ // (both stored in data attributes at render time), rewrites the visible
+ // label, then refreshes the interval textbox. Rewriting innerHTML only
+ // replaces the button's children, so the onclick handler on the button
+ // itself survives.
+ // NOTE(review): this queries document-wide while rendering queries
+ // 'container' — confirm there is only one such widget per page.
+ function updateWordLabels() {{
847
+ var adjusted = document.getElementById('adjust-intervals').checked;
848
+ document.querySelectorAll('.word-btn').forEach(function(btn) {{
849
+ var s = parseFloat(btn.dataset.origS);
850
+ var e = adjusted ? parseFloat(btn.dataset.adjE) : parseFloat(btn.dataset.origE);
851
+ btn.dataset.s = s;
852
+ btn.dataset.e = e;
853
+ btn.innerHTML = '<span class="time">[' + s.toFixed(3) + '-' + e.toFixed(3) + 's]</span><span class="word"> ' + btn.dataset.word + '</span>';
854
+ }});
855
+ updateInterval();
856
+ }}
857
+
858
+ document.getElementById('adjust-intervals').addEventListener('change', updateWordLabels);
859
+
860
  function updateInterval() {{
861
+ var sel = document.querySelectorAll('.word-btn.selected, .silence-btn.selected');
862
  if (sel.length === 0) return;
863
  var minS = Infinity, maxE = 0;
864
  sel.forEach(function(b) {{
865
  minS = Math.min(minS, parseFloat(b.dataset.s));
866
  maxE = Math.max(maxE, parseFloat(b.dataset.e));
867
  }});
868
+ var interval = minS.toFixed(3) + '-' + maxE.toFixed(3);
869
  // Find the textbox in parent and update it
870
  try {{
871
  var boxes = parent.document.querySelectorAll('input[data-testid="textbox"], textarea');
 
878
  }} catch(err) {{ console.log('Could not update parent:', err); }}
879
  }}
880
 
881
+ // Highlight words that overlap with manually entered interval (>50% must be in interval)
882
  function highlightFromInterval(intervalStr) {{
883
  if (!intervalStr) return;
884
  var parts = intervalStr.replace(',', '-').split('-');
 
889
  document.querySelectorAll('.word-btn').forEach(function(btn) {{
890
  var ws = parseFloat(btn.dataset.s);
891
  var we = parseFloat(btn.dataset.e);
892
+ var itemDuration = we - ws;
893
+ // Calculate overlap between item and interval
894
  var overlapStart = Math.max(ws, s);
895
  var overlapEnd = Math.min(we, e);
896
  var overlap = Math.max(0, overlapEnd - overlapStart);
897
+ // Highlight only if >50% of item is in interval
898
+ if (itemDuration > 0 && (overlap / itemDuration) > 0.5) {{
899
  btn.classList.add('selected');
900
  }} else {{
901
  btn.classList.remove('selected');
 
960
 
961
  return text, audio, timestamps_html, audio_data, timestamps_data, initial_player
962
 
963
+ def play_time_interval_fast(audio_state, time_interval, last_interval, current_window):
964
  """Fast extraction using preloaded audio from memory."""
965
  def wrap_error(msg):
966
+ return f'<div style="min-height: 150px; padding: 20px; text-align: center; background: #f5f5f5; border-radius: 8px;"><p style="color: #666;">{msg}</p></div>', last_interval, current_window
967
 
968
  try:
969
  if not time_interval or not audio_state:
 
982
  return wrap_error("Start time must be before end time.")
983
 
984
  # Load/reload audio segment (autoplay will replay even if same interval)
985
+ # Pass current window state for smart redraw logic
986
+ result_html, new_window = extract_audio_segment(audio_state, start_time, end_time, current_window)
987
+ return result_html, time_interval, new_window
988
 
989
  except Exception as e:
990
  import traceback
 
992
 
993
  # Wire the transcribe button: runs ASR + VAD setup and populates the
  # transcript, audio player, timestamp view, cached state, and waveform.
  transcribe_button.click(
994
  fn=transcribe_and_setup_audio,
995
+ inputs=[audio_input, vad_prob_threshold, vad_frame_rep],
996
  outputs=[transcription_output, audio_output, timestamps_output, audio_state, timestamps_state, waveform_player]
997
  )
998

999
  # Play interval button
1000
  # Extracts the requested time span from the cached audio; also threads
  # through the last-interval and waveform-window state for smart redraws.
  play_interval_button.click(
1001
  fn=play_time_interval_fast,
1002
+ inputs=[audio_state, time_input, last_interval_state, waveform_window_state],
1003
  outputs=[waveform_player, last_interval_state, waveform_window_state]
1004
  )
1005

1006
  demo.launch()
packages.txt CHANGED
@@ -1 +1,2 @@
1
- ffmpeg
 
 
1
+ ffmpeg
2
+ libc++1
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  nemo-toolkit[asr]>=1.23.0
2
  punctfix==0.11.1
3
  soundfile
4
- matplotlib
 
 
1
  nemo-toolkit[asr]>=1.23.0
2
  punctfix==0.11.1
3
  soundfile
4
+ matplotlib
5
+ ten-vad