Spaces:
Sleeping
Sleeping
Enhance word timestamp logic and graphing
Browse files- app.py +538 -58
- packages.txt +2 -1
- requirements.txt +2 -1
app.py
CHANGED
|
@@ -8,6 +8,7 @@ import numpy as np
|
|
| 8 |
import base64
|
| 9 |
import io
|
| 10 |
import json
|
|
|
|
| 11 |
import matplotlib
|
| 12 |
matplotlib.use('Agg')
|
| 13 |
import matplotlib.pyplot as plt
|
|
@@ -20,11 +21,170 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
| 20 |
punct_fixer = PunctFixer(language="da", device=device)
|
| 21 |
|
| 22 |
|
| 23 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
try:
|
| 25 |
# Check if audio is provided
|
| 26 |
if audio is None:
|
| 27 |
-
return "No audio provided. Please record or upload audio first.", [], None
|
| 28 |
|
| 29 |
# Preprocess audio: convert to mono if stereo
|
| 30 |
audio_data, sample_rate = sf.read(audio)
|
|
@@ -33,7 +193,18 @@ def transcribe_audio(audio):
|
|
| 33 |
if len(audio_data.shape) > 1 and audio_data.shape[1] == 2:
|
| 34 |
audio_data = np.mean(audio_data, axis=1)
|
| 35 |
|
| 36 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
import tempfile
|
| 38 |
import os
|
| 39 |
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
|
|
@@ -118,6 +289,68 @@ def transcribe_audio(audio):
|
|
| 118 |
import traceback
|
| 119 |
print(f"Timestamp extraction failed: {str(e)}\n{traceback.format_exc()}")
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
# Calculate audio duration
|
| 122 |
audio_duration = len(audio_data) / sample_rate
|
| 123 |
|
|
@@ -131,31 +364,55 @@ def transcribe_audio(audio):
|
|
| 131 |
'frame_duration': 0.08
|
| 132 |
}
|
| 133 |
|
| 134 |
-
# Return text, timestamps, audio data, raw_text,
|
| 135 |
-
return punctuated_text, timestamps_data, (audio_data, sample_rate), raw_text, export_metadata
|
| 136 |
-
return "No transcription available.", [], None, "", {}
|
| 137 |
|
| 138 |
except Exception as e:
|
| 139 |
import traceback
|
| 140 |
-
return f"Error during transcription: {str(e)}\n{traceback.format_exc()}", [], None, "", {}
|
| 141 |
|
| 142 |
|
| 143 |
-
def extract_audio_segment(audio_state, start_time, end_time):
|
| 144 |
-
"""Fast audio extraction from memory with waveform visualization.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
# Wrapper to ensure controls never collapse
|
| 146 |
-
def wrap_output(content):
|
| 147 |
-
return f'<div style="min-height: 200px;">{content}</div>'
|
| 148 |
|
| 149 |
try:
|
| 150 |
if audio_state is None:
|
| 151 |
return wrap_output("<p style='color: red; padding: 20px;'>No audio loaded. Please transcribe audio first.</p>")
|
| 152 |
|
| 153 |
audio_data, sample_rate = audio_state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
|
| 160 |
# Extract padded segment for waveform visualization
|
| 161 |
start_sample_padded = int(padded_start * sample_rate)
|
|
@@ -200,7 +457,18 @@ def extract_audio_segment(audio_state, start_time, end_time):
|
|
| 200 |
|
| 201 |
ax.set_xlabel('Time (seconds)', fontsize=10)
|
| 202 |
ax.set_ylabel('Amplitude', fontsize=10)
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
ax.legend(fontsize=9)
|
| 205 |
ax.grid(True, alpha=0.3)
|
| 206 |
|
|
@@ -223,6 +491,16 @@ def extract_audio_segment(audio_state, start_time, end_time):
|
|
| 223 |
import time
|
| 224 |
unique_id = int(time.time() * 1000)
|
| 225 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
# Create HTML with waveform and native audio controls
|
| 227 |
html_output = f'''
|
| 228 |
<div style="margin: 10px 0;" data-render-id="{unique_id}">
|
|
@@ -236,20 +514,22 @@ def extract_audio_segment(audio_state, start_time, end_time):
|
|
| 236 |
|
| 237 |
<div style="margin-top: 8px; text-align: center;">
|
| 238 |
<span style="font-size: 14px; font-weight: bold; color: #333;">
|
| 239 |
-
Segment: {start_time:.
|
| 240 |
</span>
|
| 241 |
<span style="font-size: 12px; color: #666; margin-left: 15px;">
|
| 242 |
-
Duration: {end_time - start_time:.
|
| 243 |
</span>
|
| 244 |
</div>
|
| 245 |
</div>
|
| 246 |
'''
|
| 247 |
|
| 248 |
-
|
|
|
|
|
|
|
| 249 |
|
| 250 |
except Exception as e:
|
| 251 |
import traceback
|
| 252 |
-
return wrap_output(f"<pre style='padding: 20px;'>Error: {str(e)}\n{traceback.format_exc()}</pre>")
|
| 253 |
|
| 254 |
|
| 255 |
with gr.Blocks() as demo:
|
|
@@ -267,6 +547,30 @@ with gr.Blocks() as demo:
|
|
| 267 |
sources=["microphone", "upload"],
|
| 268 |
format="wav"
|
| 269 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
transcribe_button = gr.Button("Transcribe")
|
| 271 |
transcription_output = gr.Textbox(label="Transcription", lines=5)
|
| 272 |
|
|
@@ -290,24 +594,47 @@ with gr.Blocks() as demo:
|
|
| 290 |
# Track last played interval for smart replay
|
| 291 |
last_interval_state = gr.State("")
|
| 292 |
|
|
|
|
|
|
|
|
|
|
| 293 |
# Waveform player - below interval controls
|
| 294 |
waveform_player = gr.HTML(label="Segment Player")
|
| 295 |
|
| 296 |
-
def transcribe_and_setup_audio(audio):
|
| 297 |
-
text, timestamps_data, audio_data, raw_text, export_metadata = transcribe_audio(
|
|
|
|
|
|
|
| 298 |
|
| 299 |
-
# Build
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
words_json = json.dumps([{
|
| 301 |
'word': item['word'],
|
| 302 |
-
'start': round(item['start'],
|
| 303 |
-
'end': round(item['end'],
|
| 304 |
} for item in timestamps_data])
|
| 305 |
|
| 306 |
# Pre-generate full export JSON
|
| 307 |
segments = [{
|
| 308 |
'word': item['word'],
|
| 309 |
-
'start': round(item['start'],
|
| 310 |
-
'end': round(item['end'],
|
| 311 |
'word_index': i
|
| 312 |
} for i, item in enumerate(timestamps_data)]
|
| 313 |
|
|
@@ -343,50 +670,202 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
|
|
| 343 |
}}
|
| 344 |
.word-btn:hover {{ background: #c5e5f5; }}
|
| 345 |
.word-btn.selected {{ background: #4CAF50; color: white; border-color: #3a9; }}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
.time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
|
|
|
|
|
|
|
| 347 |
.word {{ margin-left: 4px; }}
|
| 348 |
</style>
|
| 349 |
</head>
|
| 350 |
<body>
|
| 351 |
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
|
| 352 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
<a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
|
| 354 |
</div>
|
| 355 |
<script>var exportJsonStr = {json.dumps(export_json_str)};</script>
|
| 356 |
-
<p class="help"><b>Click</b> = select
|
| 357 |
<div class="container" id="words"></div>
|
| 358 |
<script>
|
| 359 |
-
var
|
| 360 |
var container = document.getElementById('words');
|
| 361 |
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
}} else {{
|
| 373 |
-
|
| 374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
}}
|
| 376 |
-
|
| 377 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
container.appendChild(btn);
|
| 379 |
}});
|
| 380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
function updateInterval() {{
|
| 382 |
-
var sel = document.querySelectorAll('.word-btn.selected');
|
| 383 |
if (sel.length === 0) return;
|
| 384 |
var minS = Infinity, maxE = 0;
|
| 385 |
sel.forEach(function(b) {{
|
| 386 |
minS = Math.min(minS, parseFloat(b.dataset.s));
|
| 387 |
maxE = Math.max(maxE, parseFloat(b.dataset.e));
|
| 388 |
}});
|
| 389 |
-
var interval = minS.toFixed(
|
| 390 |
// Find the textbox in parent and update it
|
| 391 |
try {{
|
| 392 |
var boxes = parent.document.querySelectorAll('input[data-testid="textbox"], textarea');
|
|
@@ -399,7 +878,7 @@ function updateInterval() {{
|
|
| 399 |
}} catch(err) {{ console.log('Could not update parent:', err); }}
|
| 400 |
}}
|
| 401 |
|
| 402 |
-
// Highlight words that overlap with manually entered interval (>50%
|
| 403 |
function highlightFromInterval(intervalStr) {{
|
| 404 |
if (!intervalStr) return;
|
| 405 |
var parts = intervalStr.replace(',', '-').split('-');
|
|
@@ -410,13 +889,13 @@ function highlightFromInterval(intervalStr) {{
|
|
| 410 |
document.querySelectorAll('.word-btn').forEach(function(btn) {{
|
| 411 |
var ws = parseFloat(btn.dataset.s);
|
| 412 |
var we = parseFloat(btn.dataset.e);
|
| 413 |
-
var
|
| 414 |
-
// Calculate overlap between
|
| 415 |
var overlapStart = Math.max(ws, s);
|
| 416 |
var overlapEnd = Math.min(we, e);
|
| 417 |
var overlap = Math.max(0, overlapEnd - overlapStart);
|
| 418 |
-
// Highlight only if >50% of
|
| 419 |
-
if (
|
| 420 |
btn.classList.add('selected');
|
| 421 |
}} else {{
|
| 422 |
btn.classList.remove('selected');
|
|
@@ -481,10 +960,10 @@ document.getElementById('download-json').onclick = function(e) {{
|
|
| 481 |
|
| 482 |
return text, audio, timestamps_html, audio_data, timestamps_data, initial_player
|
| 483 |
|
| 484 |
-
def play_time_interval_fast(audio_state, time_interval, last_interval):
|
| 485 |
"""Fast extraction using preloaded audio from memory."""
|
| 486 |
def wrap_error(msg):
|
| 487 |
-
return f'<div style="min-height: 150px; padding: 20px; text-align: center; background: #f5f5f5; border-radius: 8px;"><p style="color: #666;">{msg}</p></div>', last_interval
|
| 488 |
|
| 489 |
try:
|
| 490 |
if not time_interval or not audio_state:
|
|
@@ -503,8 +982,9 @@ document.getElementById('download-json').onclick = function(e) {{
|
|
| 503 |
return wrap_error("Start time must be before end time.")
|
| 504 |
|
| 505 |
# Load/reload audio segment (autoplay will replay even if same interval)
|
| 506 |
-
|
| 507 |
-
|
|
|
|
| 508 |
|
| 509 |
except Exception as e:
|
| 510 |
import traceback
|
|
@@ -512,15 +992,15 @@ document.getElementById('download-json').onclick = function(e) {{
|
|
| 512 |
|
| 513 |
transcribe_button.click(
|
| 514 |
fn=transcribe_and_setup_audio,
|
| 515 |
-
inputs=audio_input,
|
| 516 |
outputs=[transcription_output, audio_output, timestamps_output, audio_state, timestamps_state, waveform_player]
|
| 517 |
)
|
| 518 |
|
| 519 |
# Play interval button
|
| 520 |
play_interval_button.click(
|
| 521 |
fn=play_time_interval_fast,
|
| 522 |
-
inputs=[audio_state, time_input, last_interval_state],
|
| 523 |
-
outputs=[waveform_player, last_interval_state]
|
| 524 |
)
|
| 525 |
|
| 526 |
demo.launch()
|
|
|
|
| 8 |
import base64
|
| 9 |
import io
|
| 10 |
import json
|
| 11 |
+
from ten_vad import TenVad
|
| 12 |
import matplotlib
|
| 13 |
matplotlib.use('Agg')
|
| 14 |
import matplotlib.pyplot as plt
|
|
|
|
| 21 |
punct_fixer = PunctFixer(language="da", device=device)
|
| 22 |
|
| 23 |
|
| 24 |
+
def _silence_periods_from_flags(raw_flags, frame_duration, frame_rep_threshold):
    """Debounce per-frame VAD flags into a list of silence periods.

    The state (speech/silence) only flips after ``frame_rep_threshold``
    consecutive frames of the opposite kind; the boundary is back-dated to
    the first frame of that run. The initial state is "speech".

    Args:
        raw_flags: Sequence of per-frame flags; 0 means silence.
        frame_duration: Duration of one frame in seconds.
        frame_rep_threshold: Consecutive contradicting frames required
            before the state changes.

    Returns:
        List of dicts with 'start' and 'end' times (rounded to 3 decimals).
    """
    silence_periods = []
    in_silence = False
    silence_start = 0.0
    run_length = 0   # consecutive frames contradicting the current state
    run_start = 0.0  # time at which that contradicting run began

    for i, flag in enumerate(raw_flags):
        is_silence = (flag == 0)

        if is_silence == in_silence:
            # Frame agrees with the current state: drop any pending switch.
            run_length = 0
            continue

        if run_length == 0:
            run_start = i * frame_duration
        run_length += 1

        if run_length < frame_rep_threshold:
            continue

        # Confirmed state change, effective from the start of the run.
        if in_silence:
            silence_periods.append({
                'start': round(silence_start, 3),
                'end': round(run_start, 3)
            })
        else:
            silence_start = run_start
        in_silence = not in_silence
        run_length = 0

    # Handle case where audio ends in silence
    if in_silence:
        silence_periods.append({
            'start': round(silence_start, 3),
            'end': round(len(raw_flags) * frame_duration, 3)
        })

    return silence_periods


def detect_silence_periods(audio_data, sample_rate, prob_threshold=0.5, frame_rep_threshold=2):
    """Run TEN VAD to detect silence periods in audio.

    Args:
        audio_data: numpy array of audio samples (float, mono, 16kHz)
        sample_rate: sample rate (must be 16000); a mismatch is only logged,
            frame times are always computed for 16kHz
        prob_threshold: VAD probability threshold (0.0-1.0), higher = less sensitive
        frame_rep_threshold: Number of consecutive frames required before state change

    Returns:
        List of dicts with 'start' and 'end' times for each silence period.
        Audio shorter than one 256-sample hop yields an empty list.
    """
    TARGET_SR = 16000  # TEN VAD requires 16kHz
    HOP_SIZE = 256     # 16ms at 16kHz

    print(f"[VAD] Settings: prob_threshold={prob_threshold}, frame_rep_threshold={frame_rep_threshold}")

    if sample_rate != TARGET_SR:
        print(f"[VAD] Warning: Expected 16kHz audio, got {sample_rate}Hz")

    # Convert float audio to int16 (TEN VAD expects int16).
    # Clip first: samples outside [-1, 1] would otherwise wrap around when
    # cast to int16 and show up as loud artefacts.
    audio_data = np.asarray(audio_data)
    if np.issubdtype(audio_data.dtype, np.floating):
        audio_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
    else:
        audio_int16 = audio_data.astype(np.int16)

    frame_duration = HOP_SIZE / TARGET_SR  # 0.016s = 16ms
    num_frames = len(audio_int16) // HOP_SIZE

    # Nothing to analyse: skip creating the VAD instance altogether.
    if num_frames == 0:
        return []

    # Create VAD instance
    vad = TenVad(hop_size=HOP_SIZE, threshold=prob_threshold)

    # Process frame by frame and collect raw flags
    raw_flags = []
    for i in range(num_frames):
        frame_start = i * HOP_SIZE
        frame = audio_int16[frame_start:frame_start + HOP_SIZE]

        result = vad.process(frame)
        # TEN VAD returns tuple: (probability, flag) or has .flag attribute
        flag = result[1] if isinstance(result, tuple) else result.flag
        raw_flags.append(flag)

    # Apply frame repetition threshold smoothing
    return _silence_periods_from_flags(raw_flags, frame_duration, frame_rep_threshold)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def print_speech_silence_log(timestamps_data, silence_periods):
    """Print interleaved speech and silence log sorted by start time.

    Args:
        timestamps_data: List of dicts with 'word', 'start' and 'end' keys
            (times in seconds).
        silence_periods: List of dicts with 'start' and 'end' keys
            (times in seconds).

    Returns:
        None. The log and a summary line are written to stdout.
    """
    # Build unified list: speech entries (word timestamps) first, then silence.
    entries = [
        {
            'type': 'speech',
            'start': item['start'],
            'end': item['end'],
            'word': item['word']
        }
        for item in timestamps_data
    ]
    entries.extend(
        {
            'type': 'silence',
            'start': item['start'],
            'end': item['end']
        }
        for item in silence_periods
    )

    # Sort by start time (stable, so speech stays ahead of silence on ties)
    entries.sort(key=lambda x: x['start'])

    # Print log
    print("\n=== SPEECH & SILENCE LOG ===")
    for entry in entries:
        if entry['type'] == 'speech':
            print(f"[Speech] [{entry['start']:.3f}-{entry['end']:.3f}] {entry['word']}")
        else:
            # round(), not int(): float error makes e.g. (0.3 - 0.1) * 1000
            # slightly less than 200, which int() would truncate to 199.
            duration_ms = round((entry['end'] - entry['start']) * 1000)
            print(f"[Silence] [{entry['start']:.3f}-{entry['end']:.3f}] [{duration_ms}ms]")

    # Calculate summary
    total_silence = sum(p['end'] - p['start'] for p in silence_periods)
    print("\n=== SUMMARY ===")
    print(f"Words: {len(timestamps_data)}, Silence periods: {len(silence_periods)}, Total silence: {total_silence:.2f}s")
    print("=" * 30 + "\n")
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def transcribe_audio(audio, prob_threshold=0.5, frame_rep_threshold=2):
|
| 184 |
try:
|
| 185 |
# Check if audio is provided
|
| 186 |
if audio is None:
|
| 187 |
+
return "No audio provided. Please record or upload audio first.", [], None, "", {}
|
| 188 |
|
| 189 |
# Preprocess audio: convert to mono if stereo
|
| 190 |
audio_data, sample_rate = sf.read(audio)
|
|
|
|
| 193 |
if len(audio_data.shape) > 1 and audio_data.shape[1] == 2:
|
| 194 |
audio_data = np.mean(audio_data, axis=1)
|
| 195 |
|
| 196 |
+
# Resample to 16kHz if needed (required by both Parakeet and TEN VAD)
|
| 197 |
+
TARGET_SR = 16000
|
| 198 |
+
if sample_rate != TARGET_SR:
|
| 199 |
+
duration = len(audio_data) / sample_rate
|
| 200 |
+
new_length = int(duration * TARGET_SR)
|
| 201 |
+
x_old = np.linspace(0, duration, len(audio_data), endpoint=False)
|
| 202 |
+
x_new = np.linspace(0, duration, new_length, endpoint=False)
|
| 203 |
+
audio_data = np.interp(x_new, x_old, audio_data).astype(np.float32)
|
| 204 |
+
print(f"[AUDIO] Resampled from {sample_rate}Hz to {TARGET_SR}Hz")
|
| 205 |
+
sample_rate = TARGET_SR
|
| 206 |
+
|
| 207 |
+
# Save as temporary mono 16kHz file
|
| 208 |
import tempfile
|
| 209 |
import os
|
| 210 |
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
|
|
|
|
| 289 |
import traceback
|
| 290 |
print(f"Timestamp extraction failed: {str(e)}\n{traceback.format_exc()}")
|
| 291 |
|
| 292 |
+
# Map punctuated words back to timestamps_data
|
| 293 |
+
# -----------------------------------------------------------------
|
| 294 |
+
# The ASR model outputs raw lowercase text without punctuation.
|
| 295 |
+
# PunctFixer adds punctuation and capitalization, but may occasionally:
|
| 296 |
+
# - Merge words (e.g., "i morgen" → "imorgen")
|
| 297 |
+
# - Split contractions differently
|
| 298 |
+
# - Result in different word counts than the raw output
|
| 299 |
+
#
|
| 300 |
+
# We handle this by:
|
| 301 |
+
# 1. If word counts match: direct position-based mapping (common case)
|
| 302 |
+
# 2. If counts differ: fuzzy matching with lookahead to realign
|
| 303 |
+
# -----------------------------------------------------------------
|
| 304 |
+
try:
|
| 305 |
+
import re
|
| 306 |
+
# Split punctuated text into words, keeping punctuation attached
|
| 307 |
+
punctuated_words = punctuated_text.split()
|
| 308 |
+
|
| 309 |
+
# Helper to strip punctuation for comparison (normalize for matching)
|
| 310 |
+
def strip_punct(s):
|
| 311 |
+
return re.sub(r'[^\w]', '', s).lower()
|
| 312 |
+
|
| 313 |
+
# Align punctuated words to raw words
|
| 314 |
+
if len(punctuated_words) == len(timestamps_data):
|
| 315 |
+
# Same word count - direct mapping (most common case)
|
| 316 |
+
# Verify base word matches before replacing to catch any edge cases
|
| 317 |
+
for i, pw in enumerate(punctuated_words):
|
| 318 |
+
if strip_punct(pw) == strip_punct(timestamps_data[i]['word']):
|
| 319 |
+
timestamps_data[i]['word'] = pw
|
| 320 |
+
else:
|
| 321 |
+
# Different word counts - PunctFixer may have merged/split words
|
| 322 |
+
# Use two-pointer approach with lookahead for realignment
|
| 323 |
+
pi = 0 # punctuated index
|
| 324 |
+
for ti in range(len(timestamps_data)):
|
| 325 |
+
if pi >= len(punctuated_words):
|
| 326 |
+
break
|
| 327 |
+
raw_word = strip_punct(timestamps_data[ti]['word'])
|
| 328 |
+
punct_word = strip_punct(punctuated_words[pi])
|
| 329 |
+
if raw_word == punct_word:
|
| 330 |
+
timestamps_data[ti]['word'] = punctuated_words[pi]
|
| 331 |
+
pi += 1
|
| 332 |
+
else:
|
| 333 |
+
# Words don't match - try lookahead to find alignment
|
| 334 |
+
# This handles cases where PunctFixer inserted/removed words
|
| 335 |
+
for look_ahead in range(1, min(3, len(punctuated_words) - pi)):
|
| 336 |
+
if strip_punct(punctuated_words[pi + look_ahead]) == raw_word:
|
| 337 |
+
pi += look_ahead
|
| 338 |
+
timestamps_data[ti]['word'] = punctuated_words[pi]
|
| 339 |
+
pi += 1
|
| 340 |
+
break
|
| 341 |
+
except Exception as e:
|
| 342 |
+
# Graceful fallback: keep original raw words if mapping fails
|
| 343 |
+
print(f"Punctuation mapping failed: {str(e)}")
|
| 344 |
+
|
| 345 |
+
# Run VAD to detect silence periods
|
| 346 |
+
silence_periods = []
|
| 347 |
+
try:
|
| 348 |
+
silence_periods = detect_silence_periods(audio_data, sample_rate, prob_threshold, frame_rep_threshold)
|
| 349 |
+
print_speech_silence_log(timestamps_data, silence_periods)
|
| 350 |
+
except Exception as e:
|
| 351 |
+
import traceback
|
| 352 |
+
print(f"[VAD] Error during silence detection: {str(e)}\n{traceback.format_exc()}")
|
| 353 |
+
|
| 354 |
# Calculate audio duration
|
| 355 |
audio_duration = len(audio_data) / sample_rate
|
| 356 |
|
|
|
|
| 364 |
'frame_duration': 0.08
|
| 365 |
}
|
| 366 |
|
| 367 |
+
# Return text, timestamps, audio data, raw_text, export metadata, and silence periods
|
| 368 |
+
return punctuated_text, timestamps_data, (audio_data, sample_rate), raw_text, export_metadata, silence_periods
|
| 369 |
+
return "No transcription available.", [], None, "", {}, []
|
| 370 |
|
| 371 |
except Exception as e:
|
| 372 |
import traceback
|
| 373 |
+
return f"Error during transcription: {str(e)}\n{traceback.format_exc()}", [], None, "", {}, []
|
| 374 |
|
| 375 |
|
| 376 |
+
def extract_audio_segment(audio_state, start_time, end_time, current_window=None):
|
| 377 |
+
"""Fast audio extraction from memory with waveform visualization.
|
| 378 |
+
|
| 379 |
+
Args:
|
| 380 |
+
audio_state: Tuple of (audio_data, sample_rate)
|
| 381 |
+
start_time: Start time of the interval to play
|
| 382 |
+
end_time: End time of the interval to play
|
| 383 |
+
current_window: Dict with 'start' and 'end' of current waveform window, or None
|
| 384 |
+
|
| 385 |
+
Returns:
|
| 386 |
+
Tuple of (html_output, new_window_state)
|
| 387 |
+
"""
|
| 388 |
# Wrapper to ensure controls never collapse
|
| 389 |
+
def wrap_output(content, window_state=None):
|
| 390 |
+
return f'<div style="min-height: 200px;">{content}</div>', window_state
|
| 391 |
|
| 392 |
try:
|
| 393 |
if audio_state is None:
|
| 394 |
return wrap_output("<p style='color: red; padding: 20px;'>No audio loaded. Please transcribe audio first.</p>")
|
| 395 |
|
| 396 |
audio_data, sample_rate = audio_state
|
| 397 |
+
audio_duration = len(audio_data) / sample_rate
|
| 398 |
+
|
| 399 |
+
# Default context padding is 160ms
|
| 400 |
+
DEFAULT_PADDING = 0.16
|
| 401 |
+
|
| 402 |
+
# Determine if we need to redraw the waveform or just update the shaded area
|
| 403 |
+
need_redraw = True
|
| 404 |
+
if current_window is not None:
|
| 405 |
+
# Check if the new interval fits within the current window
|
| 406 |
+
if start_time >= current_window['start'] and end_time <= current_window['end']:
|
| 407 |
+
need_redraw = False
|
| 408 |
+
# Reuse the current window boundaries
|
| 409 |
+
padded_start = current_window['start']
|
| 410 |
+
padded_end = current_window['end']
|
| 411 |
|
| 412 |
+
if need_redraw:
|
| 413 |
+
# Calculate new window with ±160ms padding
|
| 414 |
+
padded_start = max(0, start_time - DEFAULT_PADDING)
|
| 415 |
+
padded_end = min(audio_duration, end_time + DEFAULT_PADDING)
|
| 416 |
|
| 417 |
# Extract padded segment for waveform visualization
|
| 418 |
start_sample_padded = int(padded_start * sample_rate)
|
|
|
|
| 457 |
|
| 458 |
ax.set_xlabel('Time (seconds)', fontsize=10)
|
| 459 |
ax.set_ylabel('Amplitude', fontsize=10)
|
| 460 |
+
|
| 461 |
+
# Calculate context on each side in ms
|
| 462 |
+
left_context_ms = int((start_time - padded_start) * 1000)
|
| 463 |
+
right_context_ms = int((padded_end - end_time) * 1000)
|
| 464 |
+
|
| 465 |
+
# Format context string - symmetric or asymmetric
|
| 466 |
+
if left_context_ms == right_context_ms:
|
| 467 |
+
context_str = f'(±{left_context_ms}ms context)'
|
| 468 |
+
else:
|
| 469 |
+
context_str = f'(-{left_context_ms}ms context +{right_context_ms}ms context)'
|
| 470 |
+
|
| 471 |
+
ax.set_title(f'Audio Segment: {start_time:.3f}s – {end_time:.3f}s {context_str}', fontsize=11)
|
| 472 |
ax.legend(fontsize=9)
|
| 473 |
ax.grid(True, alpha=0.3)
|
| 474 |
|
|
|
|
| 491 |
import time
|
| 492 |
unique_id = int(time.time() * 1000)
|
| 493 |
|
| 494 |
+
# Calculate context on each side in ms for the info text
|
| 495 |
+
left_context_ms = int((start_time - padded_start) * 1000)
|
| 496 |
+
right_context_ms = int((padded_end - end_time) * 1000)
|
| 497 |
+
|
| 498 |
+
# Format context string - symmetric or asymmetric
|
| 499 |
+
if left_context_ms == right_context_ms:
|
| 500 |
+
context_info = f'±{left_context_ms}ms'
|
| 501 |
+
else:
|
| 502 |
+
context_info = f'-{left_context_ms}ms / +{right_context_ms}ms'
|
| 503 |
+
|
| 504 |
# Create HTML with waveform and native audio controls
|
| 505 |
html_output = f'''
|
| 506 |
<div style="margin: 10px 0;" data-render-id="{unique_id}">
|
|
|
|
| 514 |
|
| 515 |
<div style="margin-top: 8px; text-align: center;">
|
| 516 |
<span style="font-size: 14px; font-weight: bold; color: #333;">
|
| 517 |
+
Segment: {start_time:.3f}s – {end_time:.3f}s
|
| 518 |
</span>
|
| 519 |
<span style="font-size: 12px; color: #666; margin-left: 15px;">
|
| 520 |
+
Duration: {(end_time - start_time)*1000:.0f}ms | Context shown: {context_info}
|
| 521 |
</span>
|
| 522 |
</div>
|
| 523 |
</div>
|
| 524 |
'''
|
| 525 |
|
| 526 |
+
# Return HTML and new window state
|
| 527 |
+
new_window = {'start': padded_start, 'end': padded_end}
|
| 528 |
+
return wrap_output(html_output, new_window)
|
| 529 |
|
| 530 |
except Exception as e:
|
| 531 |
import traceback
|
| 532 |
+
return wrap_output(f"<pre style='padding: 20px;'>Error: {str(e)}\n{traceback.format_exc()}</pre>", current_window)
|
| 533 |
|
| 534 |
|
| 535 |
with gr.Blocks() as demo:
|
|
|
|
| 547 |
sources=["microphone", "upload"],
|
| 548 |
format="wav"
|
| 549 |
)
|
| 550 |
+
|
| 551 |
+
# VAD Controls - inline labels with number inputs
|
| 552 |
+
with gr.Row():
|
| 553 |
+
gr.Markdown("**VAD: Probability Threshold**")
|
| 554 |
+
vad_prob_threshold = gr.Number(
|
| 555 |
+
show_label=False,
|
| 556 |
+
value=0.5,
|
| 557 |
+
minimum=0.0,
|
| 558 |
+
maximum=1.0,
|
| 559 |
+
step=0.05,
|
| 560 |
+
scale=0,
|
| 561 |
+
min_width=80
|
| 562 |
+
)
|
| 563 |
+
gr.Markdown("**VAD: Frame Repetition Threshold**")
|
| 564 |
+
vad_frame_rep = gr.Number(
|
| 565 |
+
show_label=False,
|
| 566 |
+
value=2,
|
| 567 |
+
minimum=1,
|
| 568 |
+
maximum=10,
|
| 569 |
+
step=1,
|
| 570 |
+
scale=0,
|
| 571 |
+
min_width=80
|
| 572 |
+
)
|
| 573 |
+
|
| 574 |
transcribe_button = gr.Button("Transcribe")
|
| 575 |
transcription_output = gr.Textbox(label="Transcription", lines=5)
|
| 576 |
|
|
|
|
| 594 |
# Track last played interval for smart replay
|
| 595 |
last_interval_state = gr.State("")
|
| 596 |
|
| 597 |
+
# Track current waveform window boundaries for smart redraw
|
| 598 |
+
waveform_window_state = gr.State(None)
|
| 599 |
+
|
| 600 |
# Waveform player - below interval controls
|
| 601 |
waveform_player = gr.HTML(label="Segment Player")
|
| 602 |
|
| 603 |
+
def transcribe_and_setup_audio(audio, prob_threshold, frame_rep_threshold):
|
| 604 |
+
text, timestamps_data, audio_data, raw_text, export_metadata, silence_periods = transcribe_audio(
|
| 605 |
+
audio, prob_threshold, int(frame_rep_threshold)
|
| 606 |
+
)
|
| 607 |
|
| 608 |
+
# Build combined entries (words + silence) sorted by start time
|
| 609 |
+
entries = []
|
| 610 |
+
for item in timestamps_data:
|
| 611 |
+
entries.append({
|
| 612 |
+
'type': 'word',
|
| 613 |
+
'word': item['word'],
|
| 614 |
+
'start': round(item['start'], 3),
|
| 615 |
+
'end': round(item['end'], 3)
|
| 616 |
+
})
|
| 617 |
+
for item in silence_periods:
|
| 618 |
+
entries.append({
|
| 619 |
+
'type': 'silence',
|
| 620 |
+
'start': round(item['start'], 3),
|
| 621 |
+
'end': round(item['end'], 3)
|
| 622 |
+
})
|
| 623 |
+
entries.sort(key=lambda x: x['start'])
|
| 624 |
+
entries_json = json.dumps(entries)
|
| 625 |
+
|
| 626 |
+
# Build word data as JSON for the iframe (kept for backward compat)
|
| 627 |
words_json = json.dumps([{
|
| 628 |
'word': item['word'],
|
| 629 |
+
'start': round(item['start'], 3),
|
| 630 |
+
'end': round(item['end'], 3)
|
| 631 |
} for item in timestamps_data])
|
| 632 |
|
| 633 |
# Pre-generate full export JSON
|
| 634 |
segments = [{
|
| 635 |
'word': item['word'],
|
| 636 |
+
'start': round(item['start'], 3),
|
| 637 |
+
'end': round(item['end'], 3),
|
| 638 |
'word_index': i
|
| 639 |
} for i, item in enumerate(timestamps_data)]
|
| 640 |
|
|
|
|
| 670 |
}}
|
| 671 |
.word-btn:hover {{ background: #c5e5f5; }}
|
| 672 |
.word-btn.selected {{ background: #4CAF50; color: white; border-color: #3a9; }}
|
| 673 |
+
.silence-btn {{
|
| 674 |
+
display: inline-block;
|
| 675 |
+
background: #ffe4c4;
|
| 676 |
+
padding: 5px 8px;
|
| 677 |
+
margin: 3px;
|
| 678 |
+
border-radius: 4px;
|
| 679 |
+
cursor: pointer;
|
| 680 |
+
border: 1px solid #dca;
|
| 681 |
+
font-size: 11px;
|
| 682 |
+
transition: all 0.15s;
|
| 683 |
+
}}
|
| 684 |
+
.silence-btn:hover {{ background: #ffd4a4; }}
|
| 685 |
+
.silence-btn.selected {{ background: #ff9800; color: white; border-color: #e68a00; }}
|
| 686 |
+
.checkbox-container {{
|
| 687 |
+
display: inline-flex;
|
| 688 |
+
align-items: center;
|
| 689 |
+
margin-left: 15px;
|
| 690 |
+
font-size: 12px;
|
| 691 |
+
cursor: pointer;
|
| 692 |
+
}}
|
| 693 |
+
.checkbox-container input {{
|
| 694 |
+
margin-right: 5px;
|
| 695 |
+
cursor: pointer;
|
| 696 |
+
}}
|
| 697 |
+
.checkbox-container:hover {{
|
| 698 |
+
color: #0066cc;
|
| 699 |
+
}}
|
| 700 |
.time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
|
| 701 |
+
.silence-time {{ color: #996600; font-size: 10px; font-weight: bold; }}
|
| 702 |
+
.duration {{ color: #666; font-size: 10px; margin-left: 3px; }}
|
| 703 |
.word {{ margin-left: 4px; }}
|
| 704 |
</style>
|
| 705 |
</head>
|
| 706 |
<body>
|
| 707 |
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
|
| 708 |
+
<div style="display: flex; align-items: center;">
|
| 709 |
+
<h3 style="margin: 0;">Word Timestamps</h3>
|
| 710 |
+
<label class="checkbox-container" title="Extends word end times toward midpoint of gap to next word (max 120ms). Helps capture word endings that may be cut off.">
|
| 711 |
+
<input type="checkbox" id="adjust-intervals">
|
| 712 |
+
Apply Time Interval Adjustment
|
| 713 |
+
</label>
|
| 714 |
+
</div>
|
| 715 |
<a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
|
| 716 |
</div>
|
| 717 |
<script>var exportJsonStr = {json.dumps(export_json_str)};</script>
|
| 718 |
+
<p class="help"><b>Click</b> = select | <b>Ctrl+Click</b> = toggle | <b>Shift+Click</b> = range <span style="background: #ffe4c4; padding: 2px 8px; border-radius: 3px; border: 1px solid #dca;"></span> = detected non speech</p>
|
| 719 |
<div class="container" id="words"></div>
|
| 720 |
<script>
|
| 721 |
+
var entries = {entries_json};
|
| 722 |
var container = document.getElementById('words');
|
| 723 |
|
| 724 |
+
// Merge consecutive silence periods (no word between them)
|
| 725 |
+
// This keeps the raw data in Python logs but shows cleaner UI
|
| 726 |
+
function mergeConsecutiveSilences(entryList) {{
|
| 727 |
+
var merged = [];
|
| 728 |
+
var pendingSilence = null;
|
| 729 |
+
|
| 730 |
+
entryList.forEach(function(entry) {{
|
| 731 |
+
if (entry.type === 'silence') {{
|
| 732 |
+
if (pendingSilence === null) {{
|
| 733 |
+
// Start a new pending silence
|
| 734 |
+
pendingSilence = {{ type: 'silence', start: entry.start, end: entry.end }};
|
| 735 |
+
}} else {{
|
| 736 |
+
// Extend the pending silence
|
| 737 |
+
pendingSilence.end = entry.end;
|
| 738 |
+
}}
|
| 739 |
}} else {{
|
| 740 |
+
// It's a word - flush any pending silence first
|
| 741 |
+
if (pendingSilence !== null) {{
|
| 742 |
+
merged.push(pendingSilence);
|
| 743 |
+
pendingSilence = null;
|
| 744 |
+
}}
|
| 745 |
+
merged.push(entry);
|
| 746 |
}}
|
| 747 |
+
}});
|
| 748 |
+
|
| 749 |
+
// Don't forget trailing silence
|
| 750 |
+
if (pendingSilence !== null) {{
|
| 751 |
+
merged.push(pendingSilence);
|
| 752 |
+
}}
|
| 753 |
+
|
| 754 |
+
return merged;
|
| 755 |
+
}}
|
| 756 |
+
|
| 757 |
+
// Apply merging to entries for display
|
| 758 |
+
entries = mergeConsecutiveSilences(entries);
|
| 759 |
+
|
| 760 |
+
// Separate words and silence for adjustment calculations
|
| 761 |
+
var words = entries.filter(function(e) {{ return e.type === 'word'; }});
|
| 762 |
+
var silences = entries.filter(function(e) {{ return e.type === 'silence'; }});
|
| 763 |
+
|
| 764 |
+
// Compute the adjusted end time (in seconds) for words[wordIndex].
//
// The end is pushed toward the next word by half of the gap between the two
// words, capped at 120 ms, so it never crosses the midpoint of the gap.
// The original end is returned unchanged when:
//   - this is the last word (there is no following word), or
//   - the gap is not positive (words touch or overlap, or a time is NaN).
// The second case matters: a negative gap would otherwise yield a negative
// extension and move the end EARLIER, the opposite of what the adjustment
// is meant to do.
function calculateAdjustedEnd(wordIndex) {{
    var MAX_EXTENSION_SECONDS = 0.12;

    var word = words[wordIndex];
    var nextWord = words[wordIndex + 1];

    if (!nextWord) return word.end;  // Last word, no adjustment

    var gap = nextWord.start - word.end;
    if (!(gap > 0)) return word.end;  // Nothing to extend into

    return word.end + Math.min(gap / 2, MAX_EXTENSION_SECONDS);
}}
|
| 776 |
+
|
| 777 |
+
// Store adjusted ends for each word
|
| 778 |
+
var adjustedEnds = words.map(function(w, i) {{ return calculateAdjustedEnd(i); }});
|
| 779 |
+
|
| 780 |
+
// Track last clicked item index for Shift+Click range selection
|
| 781 |
+
var lastClickedIndex = -1;
|
| 782 |
+
|
| 783 |
+
// Return every clickable word and silence button inside the container as a
// real array, in document order, so callers can use indexOf and forEach.
function getAllButtons() {{
    var nodes = container.querySelectorAll('.word-btn, .silence-btn');
    return Array.prototype.slice.call(nodes);
}}
|
| 787 |
+
|
| 788 |
+
// Apply a click on a word or silence button to the current selection.
//
//   Shift+Click         add every item between the previously clicked item
//                       and this one (inclusive) to the selection
//   Ctrl+Click or       toggle only this item; the Meta (Cmd) key is accepted
//   Cmd+Click           too because Ctrl+Click opens the context menu on macOS
//   plain click         make this item the only selected one
//
// btn: the clicked button element.
// e:   the click event, used for its modifier-key flags.
// Afterwards the clicked position is remembered as the anchor for the next
// Shift+Click, and updateInterval() is called to refresh the interval.
function handleItemClick(btn, e) {{
    var allBtns = getAllButtons();
    var clickedIndex = allBtns.indexOf(btn);

    if (e.shiftKey && lastClickedIndex >= 0) {{
        // Range selection is additive: existing selections are kept.
        var start = Math.min(lastClickedIndex, clickedIndex);
        var end = Math.max(lastClickedIndex, clickedIndex);
        allBtns.forEach(function(b, i) {{
            if (i >= start && i <= end) {{
                b.classList.add('selected');
            }}
        }});
    }} else if (e.ctrlKey || e.metaKey) {{
        btn.classList.toggle('selected');
    }} else {{
        allBtns.forEach(function(b) {{ b.classList.remove('selected'); }});
        btn.classList.add('selected');
    }}

    lastClickedIndex = clickedIndex;
    updateInterval();
}}
|
| 814 |
+
|
| 815 |
+
// Render all entries as clickable spans inside the container.
//
// Labels are built with textContent rather than innerHTML so that a
// transcribed word containing markup characters (for example an angle
// bracket or an ampersand) is shown literally instead of being parsed as HTML.

// Create a <span> with the given CSS class and plain-text content.
function makeLabelSpan(className, text) {{
    var span = document.createElement('span');
    span.className = className;
    span.textContent = text;
    return span;
}}

// Position of the next word within the words / adjustedEnds arrays; silence
// entries do not advance it.
var wordIndex = 0;
entries.forEach(function(entry) {{
    var btn = document.createElement('span');
    var rangeLabel = '[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]';

    if (entry.type === 'word') {{
        btn.className = 'word-btn';
        // origS / origE / adjE keep both variants of the timing so the
        // adjustment checkbox can switch between them; s / e hold whichever
        // variant is currently active and start out as the original times.
        btn.dataset.origS = entry.start;
        btn.dataset.origE = entry.end;
        btn.dataset.adjE = adjustedEnds[wordIndex];
        btn.dataset.s = entry.start;
        btn.dataset.e = entry.end;
        btn.dataset.word = entry.word;
        btn.appendChild(makeLabelSpan('time', rangeLabel));
        btn.appendChild(makeLabelSpan('word', ' ' + entry.word));
        wordIndex++;
    }} else {{
        btn.className = 'silence-btn';
        btn.dataset.s = entry.start;
        btn.dataset.e = entry.end;
        var durationMs = Math.round((entry.end - entry.start) * 1000);
        btn.appendChild(makeLabelSpan('silence-time', rangeLabel));
        btn.appendChild(makeLabelSpan('duration', durationMs + 'ms'));
    }}

    btn.onclick = function(e) {{ handleItemClick(this, e); }};
    container.appendChild(btn);
}});
|
| 844 |
|
| 845 |
+
// Toggle adjustment checkbox handler.
//
// Switches every word button between its original end time and its adjusted
// end time, depending on the state of the 'adjust-intervals' checkbox:
//   - data-s / data-e are rewritten to the active start and end, and
//   - the visible time label is refreshed to match.
// Only the text of the existing time label is replaced. The word label is
// left alone, so the transcribed word is never re-inserted as HTML and a
// word containing markup characters cannot be parsed as markup here.
// Finally updateInterval() is called so the interval reflects the new times.
function updateWordLabels() {{
    var adjusted = document.getElementById('adjust-intervals').checked;
    document.querySelectorAll('.word-btn').forEach(function(btn) {{
        var s = parseFloat(btn.dataset.origS);
        var e = adjusted ? parseFloat(btn.dataset.adjE) : parseFloat(btn.dataset.origE);
        btn.dataset.s = s;
        btn.dataset.e = e;

        var timeLabel = btn.querySelector('.time');
        if (timeLabel) {{
            timeLabel.textContent = '[' + s.toFixed(3) + '-' + e.toFixed(3) + 's]';
        }}
    }});
    updateInterval();
}}

document.getElementById('adjust-intervals').addEventListener('change', updateWordLabels);
|
| 859 |
+
|
| 860 |
function updateInterval() {{
|
| 861 |
+
var sel = document.querySelectorAll('.word-btn.selected, .silence-btn.selected');
|
| 862 |
if (sel.length === 0) return;
|
| 863 |
var minS = Infinity, maxE = 0;
|
| 864 |
sel.forEach(function(b) {{
|
| 865 |
minS = Math.min(minS, parseFloat(b.dataset.s));
|
| 866 |
maxE = Math.max(maxE, parseFloat(b.dataset.e));
|
| 867 |
}});
|
| 868 |
+
var interval = minS.toFixed(3) + '-' + maxE.toFixed(3);
|
| 869 |
// Find the textbox in parent and update it
|
| 870 |
try {{
|
| 871 |
var boxes = parent.document.querySelectorAll('input[data-testid="textbox"], textarea');
|
|
|
|
| 878 |
}} catch(err) {{ console.log('Could not update parent:', err); }}
|
| 879 |
}}
|
| 880 |
|
| 881 |
+
// Highlight words that overlap with manually entered interval (>50% must be in interval)
|
| 882 |
function highlightFromInterval(intervalStr) {{
|
| 883 |
if (!intervalStr) return;
|
| 884 |
var parts = intervalStr.replace(',', '-').split('-');
|
|
|
|
| 889 |
document.querySelectorAll('.word-btn').forEach(function(btn) {{
|
| 890 |
var ws = parseFloat(btn.dataset.s);
|
| 891 |
var we = parseFloat(btn.dataset.e);
|
| 892 |
+
var itemDuration = we - ws;
|
| 893 |
+
// Calculate overlap between item and interval
|
| 894 |
var overlapStart = Math.max(ws, s);
|
| 895 |
var overlapEnd = Math.min(we, e);
|
| 896 |
var overlap = Math.max(0, overlapEnd - overlapStart);
|
| 897 |
+
// Highlight only if >50% of item is in interval
|
| 898 |
+
if (itemDuration > 0 && (overlap / itemDuration) > 0.5) {{
|
| 899 |
btn.classList.add('selected');
|
| 900 |
}} else {{
|
| 901 |
btn.classList.remove('selected');
|
|
|
|
| 960 |
|
| 961 |
return text, audio, timestamps_html, audio_data, timestamps_data, initial_player
|
| 962 |
|
| 963 |
+
def play_time_interval_fast(audio_state, time_interval, last_interval, current_window):
|
| 964 |
"""Fast extraction using preloaded audio from memory."""
|
| 965 |
def wrap_error(msg):
|
| 966 |
+
return f'<div style="min-height: 150px; padding: 20px; text-align: center; background: #f5f5f5; border-radius: 8px;"><p style="color: #666;">{msg}</p></div>', last_interval, current_window
|
| 967 |
|
| 968 |
try:
|
| 969 |
if not time_interval or not audio_state:
|
|
|
|
| 982 |
return wrap_error("Start time must be before end time.")
|
| 983 |
|
| 984 |
# Load/reload audio segment (autoplay will replay even if same interval)
|
| 985 |
+
# Pass current window state for smart redraw logic
|
| 986 |
+
result_html, new_window = extract_audio_segment(audio_state, start_time, end_time, current_window)
|
| 987 |
+
return result_html, time_interval, new_window
|
| 988 |
|
| 989 |
except Exception as e:
|
| 990 |
import traceback
|
|
|
|
| 992 |
|
| 993 |
transcribe_button.click(
|
| 994 |
fn=transcribe_and_setup_audio,
|
| 995 |
+
inputs=[audio_input, vad_prob_threshold, vad_frame_rep],
|
| 996 |
outputs=[transcription_output, audio_output, timestamps_output, audio_state, timestamps_state, waveform_player]
|
| 997 |
)
|
| 998 |
|
| 999 |
# Play interval button
|
| 1000 |
play_interval_button.click(
|
| 1001 |
fn=play_time_interval_fast,
|
| 1002 |
+
inputs=[audio_state, time_input, last_interval_state, waveform_window_state],
|
| 1003 |
+
outputs=[waveform_player, last_interval_state, waveform_window_state]
|
| 1004 |
)
|
| 1005 |
|
| 1006 |
demo.launch()
|
packages.txt
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-
ffmpeg
|
|
|
|
|
|
| 1 |
+
ffmpeg
|
| 2 |
+
libc++1
|
requirements.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
nemo-toolkit[asr]>=1.23.0
|
| 2 |
punctfix==0.11.1
|
| 3 |
soundfile
|
| 4 |
-
matplotlib
|
|
|
|
|
|
| 1 |
nemo-toolkit[asr]>=1.23.0
|
| 2 |
punctfix==0.11.1
|
| 3 |
soundfile
|
| 4 |
+
matplotlib
|
| 5 |
+
ten-vad
|