Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# app.py
|
| 2 |
-
# Whisper Transcriber —
|
| 3 |
# Requirements: gradio, whisper, pydub, pyzipper, python-docx, ffmpeg
|
| 4 |
|
| 5 |
import os
|
|
@@ -12,7 +12,9 @@ import traceback
|
|
| 12 |
import threading
|
| 13 |
import re
|
| 14 |
from difflib import get_close_matches
|
|
|
|
| 15 |
from pathlib import Path
|
|
|
|
| 16 |
|
| 17 |
# Force unbuffered prints for logs
|
| 18 |
os.environ["PYTHONUNBUFFERED"] = "1"
|
|
@@ -386,19 +388,19 @@ def segments_to_srt(segments):
|
|
| 386 |
return "\n".join(lines)
|
| 387 |
|
| 388 |
|
| 389 |
-
# ---------- ZIP extraction + mapping for UI ----------
|
| 390 |
def extract_zip_and_map(zip_path, zip_password=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
global EXTRACT_MAP
|
| 392 |
EXTRACT_MAP = {}
|
| 393 |
-
|
|
|
|
|
|
|
| 394 |
try:
|
| 395 |
-
if os.path.exists(temp_extract_dir):
|
| 396 |
-
try:
|
| 397 |
-
shutil.rmtree(temp_extract_dir)
|
| 398 |
-
except Exception:
|
| 399 |
-
pass
|
| 400 |
os.makedirs(temp_extract_dir, exist_ok=True)
|
| 401 |
-
logs = []
|
| 402 |
with pyzipper.ZipFile(zip_path, "r") as zf:
|
| 403 |
if zip_password:
|
| 404 |
try:
|
|
@@ -442,53 +444,251 @@ def extract_zip_and_map(zip_path, zip_password=None):
|
|
| 442 |
return friendly, "\n".join(logs)
|
| 443 |
except Exception as e:
|
| 444 |
traceback.print_exc()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
return [], f"Extraction failed: {e}"
|
| 446 |
|
| 447 |
|
| 448 |
-
# ----------
|
| 449 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
logs = []
|
| 451 |
try:
|
| 452 |
if not path:
|
| 453 |
-
return None,
|
| 454 |
p = path.name if hasattr(path, "name") else str(path)
|
| 455 |
device = None if device_choice == "auto" else device_choice
|
| 456 |
-
|
| 457 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
wav = convert_to_wav_if_needed(p)
|
| 459 |
logs.append(f"Converted to WAV: {os.path.basename(wav)}")
|
| 460 |
-
|
| 461 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
if enable_memory:
|
| 463 |
-
|
| 464 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
srt_path = None
|
| 466 |
-
if generate_srt
|
| 467 |
-
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
| 469 |
with open(srt_fp, "w", encoding="utf-8") as fh:
|
| 470 |
fh.write(srt_text)
|
| 471 |
srt_path = srt_fp
|
| 472 |
logs.append(f"SRT generated: {srt_path}")
|
| 473 |
-
|
| 474 |
-
try:
|
| 475 |
-
update_memory_with_transcript(text)
|
| 476 |
-
logs.append("Memory updated.")
|
| 477 |
-
except Exception:
|
| 478 |
-
pass
|
| 479 |
if wav and os.path.exists(wav) and wav != p:
|
| 480 |
try:
|
| 481 |
os.unlink(wav)
|
| 482 |
except Exception:
|
| 483 |
pass
|
| 484 |
-
|
|
|
|
|
|
|
| 485 |
except Exception as e:
|
| 486 |
tb = traceback.format_exc()
|
| 487 |
return "", None, f"Transcription error: {e}\n{tb}"
|
| 488 |
|
| 489 |
|
| 490 |
-
# ---------- Batch transcribe (
|
| 491 |
-
def batch_transcribe(friendly_selected, uploaded_files, model_name, device_name, merge_flag, enable_mem, generate_srt):
|
| 492 |
logs = []
|
| 493 |
transcripts = []
|
| 494 |
srt_files = []
|
|
@@ -513,7 +713,17 @@ def batch_transcribe(friendly_selected, uploaded_files, model_name, device_name,
|
|
| 513 |
total = len(paths)
|
| 514 |
for idx, p in enumerate(paths, start=1):
|
| 515 |
logs.append(f"[{idx}/{total}] Processing: {p}")
|
| 516 |
-
text, srt_path, lg = transcribe_single_file(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
logs.append(lg)
|
| 518 |
transcripts.append(f"FILE: {os.path.basename(p)}\n{text}\n")
|
| 519 |
if srt_path:
|
|
@@ -561,7 +771,7 @@ body { background: var(--bg); color: var(--text); font-family: Inter, system-ui,
|
|
| 561 |
.small-note { color:var(--muted); font-size:12px;}
|
| 562 |
"""
|
| 563 |
|
| 564 |
-
with gr.Blocks(title="Whisper Transcriber (dark/light)", css=CSS) as demo:
|
| 565 |
# apply saved theme early
|
| 566 |
gr.HTML("""
|
| 567 |
<script>
|
|
@@ -585,7 +795,7 @@ with gr.Blocks(title="Whisper Transcriber (dark/light)", css=CSS) as demo:
|
|
| 585 |
gr.HTML("<div class='app-icon'>WT</div>")
|
| 586 |
with gr.Column():
|
| 587 |
gr.Markdown("<h3 style='margin:0'>Whisper Transcriber — improved</h3>")
|
| 588 |
-
gr.Markdown("<div class='small-note'>
|
| 589 |
|
| 590 |
with gr.Tabs():
|
| 591 |
# Single Audio Tab
|
|
@@ -601,6 +811,10 @@ with gr.Blocks(title="Whisper Transcriber (dark/light)", css=CSS) as demo:
|
|
| 601 |
with gr.Row():
|
| 602 |
mem_toggle = gr.Checkbox(label="Enable memory corrections", value=False)
|
| 603 |
srt_toggle = gr.Checkbox(label="Generate SRT", value=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
transcribe_btn = gr.Button("Transcribe", variant="primary")
|
| 605 |
with gr.Column(scale=1):
|
| 606 |
with gr.Group(elem_classes="card"):
|
|
@@ -610,15 +824,29 @@ with gr.Blocks(title="Whisper Transcriber (dark/light)", css=CSS) as demo:
|
|
| 610 |
srt_download = gr.File(label="SRT (if generated / available)")
|
| 611 |
single_logs = gr.Textbox(label="Logs", lines=8, interactive=False)
|
| 612 |
|
| 613 |
-
def _single_action(audio_file, model_name, device, mem_on, srt_on):
|
| 614 |
if not audio_file:
|
| 615 |
return None, "", None, "No audio file provided."
|
| 616 |
path = audio_file if isinstance(audio_file, str) else (audio_file.name if hasattr(audio_file, "name") else str(audio_file))
|
| 617 |
-
text, srt_path, logs = transcribe_single_file(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
preview = audio_file
|
| 619 |
return preview, text, srt_path, logs
|
| 620 |
|
| 621 |
-
transcribe_btn.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
|
| 623 |
# Batch Tab
|
| 624 |
with gr.TabItem("Batch Transcribe"):
|
|
@@ -638,6 +866,10 @@ with gr.Blocks(title="Whisper Transcriber (dark/light)", css=CSS) as demo:
|
|
| 638 |
batch_merge = gr.Checkbox(label="Merge transcripts to DOCX", value=True)
|
| 639 |
batch_mem = gr.Checkbox(label="Enable memory corrections", value=False)
|
| 640 |
batch_srt = gr.Checkbox(label="Generate SRT(s) if available", value=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
batch_run_btn = gr.Button("Start Batch Transcription", variant="primary")
|
| 642 |
with gr.Column(scale=1):
|
| 643 |
with gr.Group(elem_classes="card"):
|
|
@@ -649,18 +881,34 @@ with gr.Blocks(title="Whisper Transcriber (dark/light)", css=CSS) as demo:
|
|
| 649 |
|
| 650 |
def _do_extract(zip_file, password):
|
| 651 |
if not zip_file:
|
| 652 |
-
return [], "No ZIP provided."
|
| 653 |
zip_path = zip_file.name if hasattr(zip_file, "name") else str(zip_file)
|
| 654 |
friendly, logs = extract_zip_and_map(zip_path, password)
|
| 655 |
-
return
|
|
|
|
| 656 |
|
| 657 |
batch_extract_btn.click(fn=_do_extract, inputs=[batch_zip, zip_password], outputs=[batch_select, batch_extract_logs])
|
| 658 |
|
| 659 |
-
def _do_batch(friendly_selected, uploaded_files, model_name, device, merge_flag, mem_flag, srt_flag):
|
| 660 |
-
combined, logs, out_doc, srt_path = batch_transcribe(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
return combined, logs, out_doc, srt_path
|
| 662 |
|
| 663 |
-
batch_run_btn.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 664 |
|
| 665 |
# Memory Tab
|
| 666 |
with gr.TabItem("Memory"):
|
|
@@ -761,13 +1009,13 @@ with gr.Blocks(title="Whisper Transcriber (dark/light)", css=CSS) as demo:
|
|
| 761 |
with gr.Group(elem_classes="card"):
|
| 762 |
gr.Markdown("### Runtime & tips")
|
| 763 |
gr.Markdown("- Use `large-v3` only if your whisper package supports it.")
|
| 764 |
-
gr.Markdown("- Extraction writes to system temp
|
| 765 |
-
gr.Markdown("-
|
| 766 |
with gr.Column():
|
| 767 |
with gr.Group(elem_classes="card"):
|
| 768 |
gr.Markdown("### Theme")
|
| 769 |
theme_toggle = gr.Button("Toggle Dark / Light Theme")
|
| 770 |
-
|
| 771 |
gr.Markdown("### Diagnostics")
|
| 772 |
diag_btn = gr.Button("Show memory summary")
|
| 773 |
diag_out = gr.Textbox(label="Diagnostics", lines=12, interactive=False)
|
|
|
|
| 1 |
# app.py
|
| 2 |
+
# Whisper Transcriber — Fixed: per-run extract dirs + CheckboxGroup update + misc imports
|
| 3 |
# Requirements: gradio, whisper, pydub, pyzipper, python-docx, ffmpeg
|
| 4 |
|
| 5 |
import os
|
|
|
|
| 12 |
import threading
|
| 13 |
import re
|
| 14 |
from difflib import get_close_matches
|
| 15 |
+
from uuid import uuid4
|
| 16 |
from pathlib import Path
|
| 17 |
+
from difflib import get_close_matches
|
| 18 |
|
| 19 |
# Force unbuffered prints for logs
|
| 20 |
os.environ["PYTHONUNBUFFERED"] = "1"
|
|
|
|
| 388 |
return "\n".join(lines)
|
| 389 |
|
| 390 |
|
| 391 |
+
# ---------- ZIP extraction + mapping for UI (per-run temp dir) ----------
|
| 392 |
def extract_zip_and_map(zip_path, zip_password=None):
|
| 393 |
+
"""
|
| 394 |
+
Extract to a unique per-run temp directory and populate EXTRACT_MAP with absolute paths.
|
| 395 |
+
Returns (friendly_list, logs)
|
| 396 |
+
"""
|
| 397 |
global EXTRACT_MAP
|
| 398 |
EXTRACT_MAP = {}
|
| 399 |
+
run_id = uuid4().hex
|
| 400 |
+
temp_extract_dir = os.path.join(tempfile.gettempdir(), f"extracted_audio_{run_id}")
|
| 401 |
+
logs = []
|
| 402 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
os.makedirs(temp_extract_dir, exist_ok=True)
|
|
|
|
| 404 |
with pyzipper.ZipFile(zip_path, "r") as zf:
|
| 405 |
if zip_password:
|
| 406 |
try:
|
|
|
|
| 444 |
return friendly, "\n".join(logs)
|
| 445 |
except Exception as e:
|
| 446 |
traceback.print_exc()
|
| 447 |
+
# on failure, attempt cleanup
|
| 448 |
+
try:
|
| 449 |
+
if os.path.exists(temp_extract_dir):
|
| 450 |
+
shutil.rmtree(temp_extract_dir)
|
| 451 |
+
except Exception:
|
| 452 |
+
pass
|
| 453 |
return [], f"Extraction failed: {e}"
|
| 454 |
|
| 455 |
|
| 456 |
+
# ---------- Audio trimming helper for two-pass ----------
|
| 457 |
+
def trim_audio_segment(src_path, start_sec, end_sec, timeout=30):
    """Cut the span [start_sec, end_sec] out of an audio file with ffmpeg.

    The clip is written to a freshly created temporary ``.wav`` file,
    resampled to 16 kHz mono (``-ar 16000 -ac 1``).

    Args:
        src_path: Source audio file; converted with ``str()`` before use.
        start_sec: Clip start, passed to ffmpeg's ``-ss`` as ``str(start_sec)``.
        end_sec: Clip end, passed to ffmpeg's ``-to`` as ``str(end_sec)``.
        timeout: Seconds to wait for ffmpeg before giving up (default 30,
            the value that used to be hard-coded).

    Returns:
        Path of the temporary wav file. The caller owns it and must delete it.

    Raises:
        RuntimeError: ffmpeg exited non-zero, or the output file is missing
            or smaller than ``MIN_WAV_SIZE`` bytes.
        subprocess.TimeoutExpired: ffmpeg did not finish within ``timeout``.
        OSError: ffmpeg could not be started (e.g. it is not installed).
    """
    # Reserve a unique output name; ffmpeg overwrites the empty file (-y).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_tmp:
        out_path = out_tmp.name

    cmd = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel",
        "error",
        "-y",
        "-ss",
        str(start_sec),
        "-to",
        str(end_sec),
        "-i",
        str(src_path),
        "-ar",
        "16000",
        "-ac",
        "1",
        out_path,
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
        if (
            proc.returncode != 0
            or not os.path.exists(out_path)
            or os.path.getsize(out_path) < MIN_WAV_SIZE
        ):
            raise RuntimeError(f"ffmpeg trim failed: {proc.stderr or proc.stdout}")
        return out_path
    except BaseException:
        # Single cleanup path for every failure (including timeouts and
        # Ctrl-C) so the reserved temp file is never left behind.
        try:
            if os.path.exists(out_path):
                os.unlink(out_path)
        except OSError:
            pass
        raise
|
| 501 |
+
|
| 502 |
+
|
| 503 |
+
# ---------- Transcribe single file (supports two-pass) ----------
|
| 504 |
+
def _discard_temp_file(file_path, keep=None):
    """Delete ``file_path`` if it exists and differs from ``keep``; never raise."""
    if not file_path or file_path == keep:
        return
    try:
        if os.path.exists(file_path):
            os.unlink(file_path)
    except (OSError, TypeError, ValueError):
        # Best-effort cleanup: a leftover temp file must not fail the run.
        pass


def _export_srt(segments, source_path, suffix, logs):
    """Write ``segments`` as an SRT file in the system temp dir and return its path."""
    stem = os.path.splitext(os.path.basename(source_path))[0]
    srt_fp = os.path.join(tempfile.gettempdir(), f"{stem}{suffix}.srt")
    with open(srt_fp, "w", encoding="utf-8") as fh:
        fh.write(segments_to_srt(segments))
    logs.append(f"SRT generated: {srt_fp}")
    return srt_fp


def _record_memory(text, logs):
    """Feed ``text`` to the memory store, logging (not raising) on failure."""
    try:
        update_memory_with_transcript(text)
    except Exception as exc:
        # Memory is an optional extra; report the problem instead of hiding it.
        logs.append(f"Memory update failed: {exc}")
    else:
        logs.append("Memory updated.")


def _refine_low_confidence(queued, wav, refine, enable_memory, logs):
    """Re-transcribe each queued segment with ``refine`` and update it in place."""
    for seg in queued:
        start = seg.get("start", 0.0)
        end = seg.get("end", start + seg.get("duration", 0.0))
        if end <= start:
            continue
        seg_wav = None
        try:
            seg_wav = trim_audio_segment(wav, start, end)
            r_result = refine.transcribe(seg_wav)
            new_text = r_result.get("text", "").strip()
            if enable_memory:
                new_text = memory_correct_text(new_text)
            # ``seg`` is the very dict stored in the refined list, so it can be
            # updated directly instead of being searched for by start/end.
            seg["text"] = new_text
            r_segments = r_result.get("segments")
            if r_segments:
                seg["avg_logprob"] = r_segments[0].get("avg_logprob", seg.get("avg_logprob"))
        except Exception as e:
            logs.append(f"Refine failed for segment {start}-{end}: {e}")
        finally:
            # Remove the clip even when the refine model raised.
            _discard_temp_file(seg_wav)


def transcribe_single_file(
    path,
    model_name="small",
    device_choice="auto",
    enable_memory=False,
    generate_srt=False,
    use_two_pass=False,
    fast_model="small",
    refine_model=None,
    refine_threshold=-1.0,
):
    """Transcribe one audio file, optionally with a fast-then-refine two-pass flow.

    Single pass (``use_two_pass`` false): load ``model_name``, transcribe the
    converted WAV, apply memory correction (if enabled) and post-processing.

    Two pass (``use_two_pass`` true):
      1) run ``fast_model`` for a quick pass
      2) apply memory corrections per segment
      3) for segments with avg_logprob < refine_threshold, re-run
         ``refine_model`` (defaults to ``model_name``) on the trimmed audio
      4) recombine segments in start order, apply memory, post-process
      5) optionally write an SRT built from the (refined) segments

    Args:
        path: File path, or an object with a ``name`` attribute holding it.
        model_name: Model for the single pass / default refine model.
        device_choice: ``"auto"`` is passed on as ``device=None``.
        enable_memory: Apply memory corrections and update memory afterwards.
        generate_srt: Write an SRT file into the system temp directory.
        use_two_pass: Enable the two-pass flow described above.
        fast_model: Model used for the first pass in two-pass mode.
        refine_model: Model used for refinement; ``None`` means ``model_name``.
        refine_threshold: avg_logprob cut-off; values that cannot be converted
            to ``float`` (e.g. ``None``) fall back to -1.0 and are logged.

    Returns:
        ``(text, srt_path, logs)``. ``(None, None, "No file provided.")`` for an
        empty ``path``; ``("", None, "Transcription error: ...")`` with the
        traceback when anything raises.
    """
    logs = []
    wav = None
    p = None
    try:
        if not path:
            return None, None, "No file provided."
        p = path.name if hasattr(path, "name") else str(path)
        device = None if device_choice == "auto" else device_choice

        # ---------------- Single-pass flow ----------------
        if not use_two_pass:
            model = get_whisper_model(model_name, device=device)
            logs.append(f"Loaded model: {model_name}")
            wav = convert_to_wav_if_needed(p)
            logs.append(f"Converted to WAV: {os.path.basename(wav)}")
            result = model.transcribe(wav)
            text = result.get("text", "").strip()
            if enable_memory:
                text = memory_correct_text(text)
            text = postprocess_transcript(text)
            srt_path = None
            if generate_srt and result.get("segments"):
                srt_path = _export_srt(result["segments"], p, "", logs)
            if enable_memory:
                _record_memory(text, logs)
            return text, srt_path, "\n".join(logs)

        # ---------------- Two-pass flow ----------------
        if refine_model is None:
            refine_model = model_name

        logs.append(f"Two-pass enabled: fast_model={fast_model}, refine_model={refine_model}, threshold={refine_threshold}")

        # Parse the threshold once instead of once per segment.
        try:
            threshold = float(refine_threshold)
        except (TypeError, ValueError):
            threshold = -1.0
            logs.append(f"Invalid refine threshold {refine_threshold!r}; using {threshold}.")

        # 1) fast pass
        fast = get_whisper_model(fast_model, device=device)
        logs.append(f"Loaded fast model: {fast_model}")
        wav = convert_to_wav_if_needed(p)
        logs.append(f"Converted to WAV: {os.path.basename(wav)}")

        fast_result = fast.transcribe(wav)
        segments = fast_result.get("segments") or []

        # fallback: no segments -> treat as single text (no SRT is possible)
        if not segments:
            text = fast_result.get("text", "").strip()
            if enable_memory:
                text = memory_correct_text(text)
                _record_memory(text, logs)
            text = postprocess_transcript(text)
            return text, None, "\n".join(logs)

        # 2) memory-correct segments and tag low-confidence ones
        refined_segments = []
        segments_to_refine = []
        for seg in segments:
            seg_text = seg.get("text", "").strip()
            seg_copy = dict(seg)
            seg_copy["text"] = memory_correct_text(seg_text) if enable_memory else seg_text
            refined_segments.append(seg_copy)
            avg_lp = seg.get("avg_logprob", None)
            if avg_lp is None:
                continue
            try:
                is_low_confidence = float(avg_lp) < threshold
            except (TypeError, ValueError):
                continue
            if is_low_confidence:
                segments_to_refine.append(seg_copy)

        logs.append(f"Fast pass produced {len(segments)} segments; {len(segments_to_refine)} queued for refinement.")

        # 3) refine low-confidence segments
        if segments_to_refine:
            refine = get_whisper_model(refine_model, device=device)
            logs.append(f"Loaded refine model: {refine_model}")
            _refine_low_confidence(segments_to_refine, wav, refine, enable_memory, logs)

        # 4) recombine segments
        ordered = sorted(refined_segments, key=lambda x: x.get("start", 0.0))
        pieces = (s.get("text", "").strip() for s in ordered)
        combined_text = " ".join(piece for piece in pieces if piece)
        if enable_memory:
            combined_text = memory_correct_text(combined_text)
            _record_memory(combined_text, logs)
        combined_text = postprocess_transcript(combined_text)

        # 5) generate SRT if requested
        srt_path = None
        if generate_srt:
            srt_segs = [
                {"start": rs.get("start", 0.0), "end": rs.get("end", 0.0), "text": rs.get("text", "")}
                for rs in ordered
            ]
            srt_path = _export_srt(srt_segs, p, "_two_pass", logs)

        return combined_text, srt_path, "\n".join(logs)

    except Exception as e:
        tb = traceback.format_exc()
        return "", None, f"Transcription error: {e}\n{tb}"
    finally:
        # Runs on success and failure alike; never deletes the caller's input.
        _discard_temp_file(wav, keep=p)
|
| 688 |
|
| 689 |
|
| 690 |
+
# ---------- Batch transcribe (uses transcribe_single_file's two-pass when requested) ----------
|
| 691 |
+
def batch_transcribe(friendly_selected, uploaded_files, model_name, device_name, merge_flag, enable_mem, generate_srt, use_two_pass=False, fast_model="small", refine_threshold=-1.0):
|
| 692 |
logs = []
|
| 693 |
transcripts = []
|
| 694 |
srt_files = []
|
|
|
|
| 713 |
total = len(paths)
|
| 714 |
for idx, p in enumerate(paths, start=1):
|
| 715 |
logs.append(f"[{idx}/{total}] Processing: {p}")
|
| 716 |
+
text, srt_path, lg = transcribe_single_file(
|
| 717 |
+
p,
|
| 718 |
+
model_name=model_name,
|
| 719 |
+
device_choice=device_name,
|
| 720 |
+
enable_memory=enable_mem,
|
| 721 |
+
generate_srt=generate_srt,
|
| 722 |
+
use_two_pass=use_two_pass,
|
| 723 |
+
fast_model=fast_model,
|
| 724 |
+
refine_model=model_name,
|
| 725 |
+
refine_threshold=refine_threshold,
|
| 726 |
+
)
|
| 727 |
logs.append(lg)
|
| 728 |
transcripts.append(f"FILE: {os.path.basename(p)}\n{text}\n")
|
| 729 |
if srt_path:
|
|
|
|
| 771 |
.small-note { color:var(--muted); font-size:12px;}
|
| 772 |
"""
|
| 773 |
|
| 774 |
+
with gr.Blocks(title="Whisper Transcriber (dark/light + two-pass)", css=CSS) as demo:
|
| 775 |
# apply saved theme early
|
| 776 |
gr.HTML("""
|
| 777 |
<script>
|
|
|
|
| 795 |
gr.HTML("<div class='app-icon'>WT</div>")
|
| 796 |
with gr.Column():
|
| 797 |
gr.Markdown("<h3 style='margin:0'>Whisper Transcriber — improved</h3>")
|
| 798 |
+
gr.Markdown("<div class='small-note'>Two-pass speedup, per-file selection after unzip, SRT export, model availability checks, dark/light toggle.</div>")
|
| 799 |
|
| 800 |
with gr.Tabs():
|
| 801 |
# Single Audio Tab
|
|
|
|
| 811 |
with gr.Row():
|
| 812 |
mem_toggle = gr.Checkbox(label="Enable memory corrections", value=False)
|
| 813 |
srt_toggle = gr.Checkbox(label="Generate SRT", value=False)
|
| 814 |
+
with gr.Row():
|
| 815 |
+
use_two_pass_single = gr.Checkbox(label="Use two-pass speedup (fast then refine)", value=False)
|
| 816 |
+
fast_model_choice = gr.Dropdown(choices=[c for c in ["tiny", "base", "small"] if c in AVAILABLE_MODEL_SET], value="small", label="Fast model")
|
| 817 |
+
refine_threshold_single = gr.Number(value=-1.0, label="Refine threshold (avg_logprob) — lower is stricter", precision=2)
|
| 818 |
transcribe_btn = gr.Button("Transcribe", variant="primary")
|
| 819 |
with gr.Column(scale=1):
|
| 820 |
with gr.Group(elem_classes="card"):
|
|
|
|
| 824 |
srt_download = gr.File(label="SRT (if generated / available)")
|
| 825 |
single_logs = gr.Textbox(label="Logs", lines=8, interactive=False)
|
| 826 |
|
| 827 |
+
def _single_action(audio_file, model_name, device, mem_on, srt_on, use_two_pass_flag, fast_model, refine_thresh):
|
| 828 |
if not audio_file:
|
| 829 |
return None, "", None, "No audio file provided."
|
| 830 |
path = audio_file if isinstance(audio_file, str) else (audio_file.name if hasattr(audio_file, "name") else str(audio_file))
|
| 831 |
+
text, srt_path, logs = transcribe_single_file(
|
| 832 |
+
path,
|
| 833 |
+
model_name=model_name,
|
| 834 |
+
device_choice=device,
|
| 835 |
+
enable_memory=mem_on,
|
| 836 |
+
generate_srt=srt_on,
|
| 837 |
+
use_two_pass=use_two_pass_flag,
|
| 838 |
+
fast_model=fast_model,
|
| 839 |
+
refine_model=model_name,
|
| 840 |
+
refine_threshold=refine_thresh,
|
| 841 |
+
)
|
| 842 |
preview = audio_file
|
| 843 |
return preview, text, srt_path, logs
|
| 844 |
|
| 845 |
+
transcribe_btn.click(
|
| 846 |
+
fn=_single_action,
|
| 847 |
+
inputs=[single_audio, model_select, device_choice, mem_toggle, srt_toggle, use_two_pass_single, fast_model_choice, refine_threshold_single],
|
| 848 |
+
outputs=[audio_preview, transcript_out, srt_download, single_logs],
|
| 849 |
+
)
|
| 850 |
|
| 851 |
# Batch Tab
|
| 852 |
with gr.TabItem("Batch Transcribe"):
|
|
|
|
| 866 |
batch_merge = gr.Checkbox(label="Merge transcripts to DOCX", value=True)
|
| 867 |
batch_mem = gr.Checkbox(label="Enable memory corrections", value=False)
|
| 868 |
batch_srt = gr.Checkbox(label="Generate SRT(s) if available", value=False)
|
| 869 |
+
with gr.Row():
|
| 870 |
+
batch_use_two_pass = gr.Checkbox(label="Use two-pass speedup", value=False)
|
| 871 |
+
batch_fast_model = gr.Dropdown(choices=[c for c in ["tiny", "base", "small"] if c in AVAILABLE_MODEL_SET], value="small", label="Fast model")
|
| 872 |
+
batch_refine_threshold = gr.Number(value=-1.0, label="Refine threshold (avg_logprob)", precision=2)
|
| 873 |
batch_run_btn = gr.Button("Start Batch Transcription", variant="primary")
|
| 874 |
with gr.Column(scale=1):
|
| 875 |
with gr.Group(elem_classes="card"):
|
|
|
|
| 881 |
|
| 882 |
def _do_extract(zip_file, password):
    """Click handler: extract the uploaded ZIP and refresh the file picker.

    Returns:
        ``(checkbox_update, logs)`` - an update carrying the new choices for
        the CheckboxGroup, and the extraction log text.
    """
    if not zip_file:
        return gr.update(choices=[], value=[]), "No ZIP provided."
    zip_path = zip_file.name if hasattr(zip_file, "name") else str(zip_file)
    friendly, logs = extract_zip_and_map(zip_path, password)
    # gr.update() works on Gradio 3 and 4+, whereas the per-component
    # CheckboxGroup.update() classmethod was removed in Gradio 4.
    # The selection is cleared too: extract_zip_and_map rebuilds EXTRACT_MAP,
    # so names ticked for a previous archive no longer refer to anything.
    return gr.update(choices=friendly, value=[]), logs
|
| 889 |
|
| 890 |
batch_extract_btn.click(fn=_do_extract, inputs=[batch_zip, zip_password], outputs=[batch_select, batch_extract_logs])
|
| 891 |
|
| 892 |
+
def _do_batch(friendly_selected, uploaded_files, model_name, device, merge_flag, mem_flag, srt_flag, use_two_pass_flag, fast_model, refine_thresh):
    """Click handler for the batch tab: forward the UI settings to ``batch_transcribe``.

    Returns:
        The four values produced by ``batch_transcribe``, unchanged:
        ``(combined, logs, out_doc, srt_path)``.
    """
    if refine_thresh is None:
        # A cleared Number field arrives as None; fall back to the default
        # so two-pass refinement is not silently disabled.
        refine_thresh = -1.0
    combined, logs, out_doc, srt_path = batch_transcribe(
        friendly_selected,
        uploaded_files,
        model_name,
        device,
        merge_flag,
        mem_flag,
        srt_flag,
        use_two_pass=use_two_pass_flag,
        fast_model=fast_model,
        refine_threshold=refine_thresh,
    )
    return combined, logs, out_doc, srt_path
|
| 906 |
|
| 907 |
+
batch_run_btn.click(
|
| 908 |
+
fn=_do_batch,
|
| 909 |
+
inputs=[batch_select, batch_files, batch_model, batch_device, batch_merge, batch_mem, batch_srt, batch_use_two_pass, batch_fast_model, batch_refine_threshold],
|
| 910 |
+
outputs=[batch_trans_out, batch_logs, batch_doc_download, batch_srt_download],
|
| 911 |
+
)
|
| 912 |
|
| 913 |
# Memory Tab
|
| 914 |
with gr.TabItem("Memory"):
|
|
|
|
| 1009 |
with gr.Group(elem_classes="card"):
|
| 1010 |
gr.Markdown("### Runtime & tips")
|
| 1011 |
gr.Markdown("- Use `large-v3` only if your whisper package supports it.")
|
| 1012 |
+
gr.Markdown("- Extraction writes to a per-run temp directory under system temp. Re-extracting creates a new run dir.")
|
| 1013 |
+
gr.Markdown("- Two-pass helps on long files where heavy model is costly.")
|
| 1014 |
with gr.Column():
|
| 1015 |
with gr.Group(elem_classes="card"):
|
| 1016 |
gr.Markdown("### Theme")
|
| 1017 |
theme_toggle = gr.Button("Toggle Dark / Light Theme")
|
| 1018 |
+
gr.Markdown("Theme preference is saved in your browser (localStorage).")
|
| 1019 |
gr.Markdown("### Diagnostics")
|
| 1020 |
diag_btn = gr.Button("Show memory summary")
|
| 1021 |
diag_out = gr.Textbox(label="Diagnostics", lines=12, interactive=False)
|