Spaces:
Running
Running
Detect untranscribed speech and allow replacing the transcript
Browse files
app.py
CHANGED
|
@@ -262,7 +262,7 @@ def parse_transcript_file(file_path):
|
|
| 262 |
raise ValueError("Unrecognized transcript format. Expected segments[].words[], words[], or segments[] with {start, end, text}")
|
| 263 |
|
| 264 |
|
| 265 |
-
def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, min_on_ms=64):
|
| 266 |
"""Load external transcript and run VAD on audio.
|
| 267 |
|
| 268 |
Args:
|
|
@@ -271,15 +271,12 @@ def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, m
|
|
| 271 |
prob_threshold: VAD probability threshold
|
| 272 |
min_off_ms: Minimum silence duration in ms
|
| 273 |
min_on_ms: Minimum voice duration in ms
|
|
|
|
| 274 |
|
| 275 |
Returns:
|
| 276 |
Tuple of (text, timestamps_data, audio_data_tuple, raw_text, export_metadata, silence_periods)
|
| 277 |
"""
|
| 278 |
try:
|
| 279 |
-
# Check if audio is provided
|
| 280 |
-
if audio is None:
|
| 281 |
-
return "No audio provided. Please upload audio first.", [], None, "", {}, []
|
| 282 |
-
|
| 283 |
# Check if transcript file is provided
|
| 284 |
if transcript_file is None:
|
| 285 |
return "No transcript file provided.", [], None, "", {}, []
|
|
@@ -291,23 +288,29 @@ def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, m
|
|
| 291 |
except Exception as e:
|
| 292 |
return f"Error parsing transcript file: {str(e)}", [], None, "", {}, []
|
| 293 |
|
| 294 |
-
#
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
audio_data =
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
|
| 312 |
# Run VAD to detect silence periods
|
| 313 |
silence_periods = []
|
|
@@ -578,6 +581,19 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
|
|
| 578 |
}}
|
| 579 |
.silence-btn:hover {{ background: #ffd4a4; }}
|
| 580 |
.silence-btn.selected {{ background: #ff9800; color: white; border-color: #e68a00; }}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
.checkbox-container {{
|
| 582 |
display: inline-flex;
|
| 583 |
align-items: center;
|
|
@@ -594,6 +610,7 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
|
|
| 594 |
}}
|
| 595 |
.time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
|
| 596 |
.silence-time {{ color: #996600; font-size: 10px; font-weight: bold; }}
|
|
|
|
| 597 |
.duration {{ color: #666; font-size: 10px; margin-left: 3px; }}
|
| 598 |
.word {{ margin-left: 4px; }}
|
| 599 |
</style>
|
|
@@ -610,7 +627,7 @@ h3 {{ margin-bottom: 8px; font-size: 16px; }}
|
|
| 610 |
<a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
|
| 611 |
</div>
|
| 612 |
<script>var exportJsonStr = {json.dumps(export_json_str)};</script>
|
| 613 |
-
<p class="help"><b>Click</b> = select | <b>Ctrl+Click</b> = toggle | <b>Shift+Click</b> = range <span style="background: #ffe4c4; padding: 2px 8px; border-radius: 3px; border: 1px solid #dca;"></span> = detected non speech</p>
|
| 614 |
<div class="container" id="words"></div>
|
| 615 |
<script>
|
| 616 |
var entries = {entries_json};
|
|
@@ -666,7 +683,7 @@ var adjustedEnds = words.map(function(w, i) {{ return calculateAdjustedEnd(i); }
|
|
| 666 |
var lastClickedIndex = -1;
|
| 667 |
|
| 668 |
function getAllButtons() {{
|
| 669 |
-
return Array.from(container.querySelectorAll('.word-btn, .silence-btn'));
|
| 670 |
}}
|
| 671 |
|
| 672 |
function handleItemClick(btn, e) {{
|
|
@@ -709,13 +726,20 @@ entries.forEach(function(entry, i) {{
|
|
| 709 |
|
| 710 |
btn.onclick = function(e) {{ handleItemClick(this, e); }};
|
| 711 |
wordIndex++;
|
| 712 |
-
}} else {{
|
| 713 |
btn.className = 'silence-btn';
|
| 714 |
btn.dataset.s = entry.start;
|
| 715 |
btn.dataset.e = entry.end;
|
| 716 |
var durationMs = Math.round((entry.end - entry.start) * 1000);
|
| 717 |
btn.innerHTML = '<span class="silence-time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="duration">' + durationMs + 'ms</span>';
|
| 718 |
btn.onclick = function(e) {{ handleItemClick(this, e); }};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 719 |
}}
|
| 720 |
|
| 721 |
container.appendChild(btn);
|
|
@@ -746,7 +770,7 @@ function updateWordLabels() {{
|
|
| 746 |
document.getElementById('adjust-intervals').addEventListener('change', updateWordLabels);
|
| 747 |
|
| 748 |
function updateInterval() {{
|
| 749 |
-
var sel = Array.from(document.querySelectorAll('.word-btn.selected, .silence-btn.selected'));
|
| 750 |
if (sel.length === 0) return;
|
| 751 |
|
| 752 |
// Sort selected items by start time
|
|
@@ -969,14 +993,15 @@ with gr.Blocks() as demo:
|
|
| 969 |
# Waveform player - below interval controls
|
| 970 |
waveform_player = gr.HTML(label="Segment Player")
|
| 971 |
|
| 972 |
-
def load_transcript_and_setup(audio, transcript_file, prob_threshold, min_off_ms, min_on_ms):
|
| 973 |
"""Load external transcript and setup UI."""
|
| 974 |
if transcript_file is None:
|
| 975 |
# Return empty/unchanged outputs if no file selected
|
| 976 |
return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
|
| 977 |
|
| 978 |
text, timestamps_data, audio_data, raw_text, export_metadata, silence_periods = load_transcript(
|
| 979 |
-
audio, transcript_file, prob_threshold, int(min_off_ms), int(min_on_ms)
|
|
|
|
| 980 |
)
|
| 981 |
|
| 982 |
# Check for errors
|
|
@@ -1000,14 +1025,23 @@ with gr.Blocks() as demo:
|
|
| 1000 |
'end': round(item['end'], 3)
|
| 1001 |
})
|
| 1002 |
entries.sort(key=lambda x: x['start'])
|
| 1003 |
-
entries_json = json.dumps(entries)
|
| 1004 |
|
| 1005 |
-
#
|
| 1006 |
-
|
| 1007 |
-
|
| 1008 |
-
|
| 1009 |
-
|
| 1010 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1011 |
|
| 1012 |
# Pre-generate full export JSON
|
| 1013 |
segments = [{
|
|
@@ -1077,7 +1111,7 @@ with gr.Blocks() as demo:
|
|
| 1077 |
# Load transcript file input
|
| 1078 |
transcript_file_input.change(
|
| 1079 |
fn=load_transcript_and_setup,
|
| 1080 |
-
inputs=[audio_input, transcript_file_input, vad_prob_threshold, vad_min_off, vad_min_on],
|
| 1081 |
outputs=[transcription_output, timestamps_output, audio_state, timestamps_state, waveform_player]
|
| 1082 |
)
|
| 1083 |
|
|
|
|
| 262 |
raise ValueError("Unrecognized transcript format. Expected segments[].words[], words[], or segments[] with {start, end, text}")
|
| 263 |
|
| 264 |
|
| 265 |
+
def load_transcript(audio, transcript_file, prob_threshold=0.5, min_off_ms=48, min_on_ms=64, existing_audio_state=None):
|
| 266 |
"""Load external transcript and run VAD on audio.
|
| 267 |
|
| 268 |
Args:
|
|
|
|
| 271 |
prob_threshold: VAD probability threshold
|
| 272 |
min_off_ms: Minimum silence duration in ms
|
| 273 |
min_on_ms: Minimum voice duration in ms
|
| 274 |
+
existing_audio_state: Optional (audio_data, sample_rate) tuple to reuse
|
| 275 |
|
| 276 |
Returns:
|
| 277 |
Tuple of (text, timestamps_data, audio_data_tuple, raw_text, export_metadata, silence_periods)
|
| 278 |
"""
|
| 279 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
# Check if transcript file is provided
|
| 281 |
if transcript_file is None:
|
| 282 |
return "No transcript file provided.", [], None, "", {}, []
|
|
|
|
| 288 |
except Exception as e:
|
| 289 |
return f"Error parsing transcript file: {str(e)}", [], None, "", {}, []
|
| 290 |
|
| 291 |
+
# Get audio data: reuse existing state or load from file
|
| 292 |
+
if existing_audio_state is not None:
|
| 293 |
+
audio_data, sample_rate = existing_audio_state
|
| 294 |
+
print("[AUDIO] Reusing audio from memory")
|
| 295 |
+
elif audio is not None:
|
| 296 |
+
audio_data, sample_rate = sf.read(audio)
|
| 297 |
+
|
| 298 |
+
# Convert stereo to mono by averaging channels
|
| 299 |
+
if len(audio_data.shape) > 1 and audio_data.shape[1] == 2:
|
| 300 |
+
audio_data = np.mean(audio_data, axis=1)
|
| 301 |
+
|
| 302 |
+
# Resample to 16kHz if needed (required by TEN VAD)
|
| 303 |
+
TARGET_SR = 16000
|
| 304 |
+
if sample_rate != TARGET_SR:
|
| 305 |
+
duration = len(audio_data) / sample_rate
|
| 306 |
+
new_length = int(duration * TARGET_SR)
|
| 307 |
+
x_old = np.linspace(0, duration, len(audio_data), endpoint=False)
|
| 308 |
+
x_new = np.linspace(0, duration, new_length, endpoint=False)
|
| 309 |
+
audio_data = np.interp(x_new, x_old, audio_data).astype(np.float32)
|
| 310 |
+
print(f"[AUDIO] Resampled from {sample_rate}Hz to {TARGET_SR}Hz")
|
| 311 |
+
sample_rate = TARGET_SR
|
| 312 |
+
else:
|
| 313 |
+
return "No audio provided. Please upload audio first.", [], None, "", {}, []
|
| 314 |
|
| 315 |
# Run VAD to detect silence periods
|
| 316 |
silence_periods = []
|
|
|
|
| 581 |
}}
|
| 582 |
.silence-btn:hover {{ background: #ffd4a4; }}
|
| 583 |
.silence-btn.selected {{ background: #ff9800; color: white; border-color: #e68a00; }}
|
| 584 |
+
.untranscribed-btn {{
|
| 585 |
+
display: inline-block;
|
| 586 |
+
background: #ffcccc;
|
| 587 |
+
padding: 5px 8px;
|
| 588 |
+
margin: 3px;
|
| 589 |
+
border-radius: 4px;
|
| 590 |
+
cursor: pointer;
|
| 591 |
+
border: 1px solid #c88;
|
| 592 |
+
font-size: 11px;
|
| 593 |
+
transition: all 0.15s;
|
| 594 |
+
}}
|
| 595 |
+
.untranscribed-btn:hover {{ background: #ffaaaa; }}
|
| 596 |
+
.untranscribed-btn.selected {{ background: #e53935; color: white; border-color: #c62828; }}
|
| 597 |
.checkbox-container {{
|
| 598 |
display: inline-flex;
|
| 599 |
align-items: center;
|
|
|
|
| 610 |
}}
|
| 611 |
.time {{ color: #0066cc; font-size: 10px; font-weight: bold; }}
|
| 612 |
.silence-time {{ color: #996600; font-size: 10px; font-weight: bold; }}
|
| 613 |
+
.untranscribed-time {{ color: #b71c1c; font-size: 10px; font-weight: bold; }}
|
| 614 |
.duration {{ color: #666; font-size: 10px; margin-left: 3px; }}
|
| 615 |
.word {{ margin-left: 4px; }}
|
| 616 |
</style>
|
|
|
|
| 627 |
<a href="#" id="download-json" style="font-size: 12px; color: #0066cc; text-decoration: none;">📥 Download JSON</a>
|
| 628 |
</div>
|
| 629 |
<script>var exportJsonStr = {json.dumps(export_json_str)};</script>
|
| 630 |
+
<p class="help"><b>Click</b> = select | <b>Ctrl+Click</b> = toggle | <b>Shift+Click</b> = range <span style="background: #ffe4c4; padding: 2px 8px; border-radius: 3px; border: 1px solid #dca;"></span> = detected non speech <span style="background: #ffcccc; padding: 2px 8px; border-radius: 3px; border: 1px solid #c88;"></span> = speech without transcript</p>
|
| 631 |
<div class="container" id="words"></div>
|
| 632 |
<script>
|
| 633 |
var entries = {entries_json};
|
|
|
|
| 683 |
var lastClickedIndex = -1;
|
| 684 |
|
| 685 |
function getAllButtons() {{
|
| 686 |
+
return Array.from(container.querySelectorAll('.word-btn, .silence-btn, .untranscribed-btn'));
|
| 687 |
}}
|
| 688 |
|
| 689 |
function handleItemClick(btn, e) {{
|
|
|
|
| 726 |
|
| 727 |
btn.onclick = function(e) {{ handleItemClick(this, e); }};
|
| 728 |
wordIndex++;
|
| 729 |
+
}} else if (entry.type === 'silence') {{
|
| 730 |
btn.className = 'silence-btn';
|
| 731 |
btn.dataset.s = entry.start;
|
| 732 |
btn.dataset.e = entry.end;
|
| 733 |
var durationMs = Math.round((entry.end - entry.start) * 1000);
|
| 734 |
btn.innerHTML = '<span class="silence-time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="duration">' + durationMs + 'ms</span>';
|
| 735 |
btn.onclick = function(e) {{ handleItemClick(this, e); }};
|
| 736 |
+
}} else if (entry.type === 'untranscribed_speech') {{
|
| 737 |
+
btn.className = 'untranscribed-btn';
|
| 738 |
+
btn.dataset.s = entry.start;
|
| 739 |
+
btn.dataset.e = entry.end;
|
| 740 |
+
var durationMs = Math.round((entry.end - entry.start) * 1000);
|
| 741 |
+
btn.innerHTML = '<span class="untranscribed-time">[' + entry.start.toFixed(3) + '-' + entry.end.toFixed(3) + 's]</span><span class="duration"> XXXXX ' + durationMs + 'ms</span>';
|
| 742 |
+
btn.onclick = function(e) {{ handleItemClick(this, e); }};
|
| 743 |
}}
|
| 744 |
|
| 745 |
container.appendChild(btn);
|
|
|
|
| 770 |
document.getElementById('adjust-intervals').addEventListener('change', updateWordLabels);
|
| 771 |
|
| 772 |
function updateInterval() {{
|
| 773 |
+
var sel = Array.from(document.querySelectorAll('.word-btn.selected, .silence-btn.selected, .untranscribed-btn.selected'));
|
| 774 |
if (sel.length === 0) return;
|
| 775 |
|
| 776 |
// Sort selected items by start time
|
|
|
|
| 993 |
# Waveform player - below interval controls
|
| 994 |
waveform_player = gr.HTML(label="Segment Player")
|
| 995 |
|
| 996 |
+
def load_transcript_and_setup(audio, transcript_file, prob_threshold, min_off_ms, min_on_ms, existing_audio_state):
|
| 997 |
"""Load external transcript and setup UI."""
|
| 998 |
if transcript_file is None:
|
| 999 |
# Return empty/unchanged outputs if no file selected
|
| 1000 |
return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
|
| 1001 |
|
| 1002 |
text, timestamps_data, audio_data, raw_text, export_metadata, silence_periods = load_transcript(
|
| 1003 |
+
audio, transcript_file, prob_threshold, int(min_off_ms), int(min_on_ms),
|
| 1004 |
+
existing_audio_state=existing_audio_state
|
| 1005 |
)
|
| 1006 |
|
| 1007 |
# Check for errors
|
|
|
|
| 1025 |
'end': round(item['end'], 3)
|
| 1026 |
})
|
| 1027 |
entries.sort(key=lambda x: x['start'])
|
|
|
|
| 1028 |
|
| 1029 |
+
# Detect untranscribed speech: gaps between consecutive silence entries
|
| 1030 |
+
# where VAD detected voice but the transcript has no words
|
| 1031 |
+
augmented_entries = []
|
| 1032 |
+
for i, entry in enumerate(entries):
|
| 1033 |
+
augmented_entries.append(entry)
|
| 1034 |
+
if entry['type'] == 'silence' and i + 1 < len(entries) and entries[i + 1]['type'] == 'silence':
|
| 1035 |
+
gap_start = entry['end']
|
| 1036 |
+
gap_end = entries[i + 1]['start']
|
| 1037 |
+
if gap_end - gap_start > 0.01:
|
| 1038 |
+
augmented_entries.append({
|
| 1039 |
+
'type': 'untranscribed_speech',
|
| 1040 |
+
'start': round(gap_start, 3),
|
| 1041 |
+
'end': round(gap_end, 3)
|
| 1042 |
+
})
|
| 1043 |
+
entries = augmented_entries
|
| 1044 |
+
entries_json = json.dumps(entries)
|
| 1045 |
|
| 1046 |
# Pre-generate full export JSON
|
| 1047 |
segments = [{
|
|
|
|
| 1111 |
# Load transcript file input
|
| 1112 |
transcript_file_input.change(
|
| 1113 |
fn=load_transcript_and_setup,
|
| 1114 |
+
inputs=[audio_input, transcript_file_input, vad_prob_threshold, vad_min_off, vad_min_on, audio_state],
|
| 1115 |
outputs=[transcription_output, timestamps_output, audio_state, timestamps_state, waveform_player]
|
| 1116 |
)
|
| 1117 |
|