staraks committed on
Commit
9f68e0b
·
verified ·
1 Parent(s): ec6edb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +262 -370
app.py CHANGED
@@ -1,6 +1,6 @@
1
  # app.py
2
- # Whisper Transcriber Full corrected app.py (multi-tab, Audio Transcribe focused)
3
- # Requirements: gradio, whisper, pydub, pyzipper, python-docx, ffmpeg installed.
4
 
5
  import os
6
  import sys
@@ -14,10 +14,10 @@ import re
14
  from difflib import get_close_matches
15
  from pathlib import Path
16
 
17
- # Force unbuffered output so container logs show prints immediately
18
  os.environ["PYTHONUNBUFFERED"] = "1"
19
 
20
- print("DEBUG: app.py bootstrap starting", flush=True)
21
 
22
  # Third-party imports
23
  try:
@@ -43,10 +43,11 @@ FFMPEG_CANDIDATES = [
43
  ("mulaw", 8000, 1),
44
  ]
45
  MODEL_CACHE = {}
 
46
  FINETUNE_WORKDIR = os.path.join(tempfile.gettempdir(), "finetune_workdir")
47
  os.makedirs(FINETUNE_WORKDIR, exist_ok=True)
48
 
49
- # ---------- Helpers: Memory & Postprocessing ----------
50
  def load_memory():
51
  try:
52
  if os.path.exists(MEMORY_FILE):
@@ -67,6 +68,7 @@ def load_memory():
67
  pass
68
  return mem
69
 
 
70
  def save_memory(mem):
71
  with MEMORY_LOCK:
72
  try:
@@ -75,6 +77,7 @@ def save_memory(mem):
75
  except Exception:
76
  traceback.print_exc()
77
 
 
78
  memory = load_memory()
79
 
80
  MEDICAL_ABBREVIATIONS = {
@@ -90,13 +93,13 @@ MEDICAL_ABBREVIATIONS = {
90
  "adm": "admit",
91
  "disch": "discharge",
92
  }
93
-
94
  DRUG_NORMALIZATION = {
95
  "metformin": "Metformin",
96
  "aspirin": "Aspirin",
97
  "amoxicillin": "Amoxicillin",
98
  }
99
 
 
100
  def expand_abbreviations(text):
101
  tokens = re.split(r"(\s+)", text)
102
  out = []
@@ -112,11 +115,13 @@ def expand_abbreviations(text):
112
  out.append(t)
113
  return "".join(out)
114
 
 
115
  def normalize_drugs(text):
116
  for k, v in DRUG_NORMALIZATION.items():
117
  text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
118
  return text
119
 
 
120
  def punctuation_and_capitalization(text):
121
  text = text.strip()
122
  if not text:
@@ -132,6 +137,7 @@ def punctuation_and_capitalization(text):
132
  out.append(p)
133
  return "".join(out)
134
 
 
135
  def postprocess_transcript(text):
136
  if not text:
137
  return text
@@ -141,11 +147,13 @@ def postprocess_transcript(text):
141
  t = punctuation_and_capitalization(t)
142
  return t
143
 
 
144
  def extract_words_and_phrases(text):
145
  words = re.findall(r"[A-Za-z0-9\-']+", text)
146
  sentences = [s.strip() for s in re.split(r"(?<=[.?!])\s+", text) if s.strip()]
147
  return [w for w in words if w.strip()], sentences
148
 
 
149
  def update_memory_with_transcript(transcript):
150
  global memory
151
  words, sentences = extract_words_and_phrases(transcript)
@@ -161,6 +169,7 @@ def update_memory_with_transcript(transcript):
161
  if changed:
162
  save_memory(memory)
163
 
 
164
  def memory_correct_text(text, min_ratio=0.85):
165
  if not text or (not memory.get("words") and not memory.get("phrases")):
166
  return text
@@ -194,7 +203,8 @@ def memory_correct_text(text, min_ratio=0.85):
194
  corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
195
  return corrected
196
 
197
- # ---------- File utilities ----------
 
198
  def save_as_word(text, filename=None):
199
  if filename is None:
200
  filename = os.path.join(tempfile.gettempdir(), "merged_transcripts.docx")
@@ -203,6 +213,7 @@ def save_as_word(text, filename=None):
203
  doc.save(filename)
204
  return filename
205
 
 
206
  # ---------- Conversion helpers ----------
207
  def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
208
  try:
@@ -230,6 +241,7 @@ def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
230
  pass
231
  return False, str(e)
232
 
 
233
  def convert_to_wav_if_needed(input_path):
234
  input_path = str(input_path)
235
  lower = input_path.lower()
@@ -312,7 +324,38 @@ def convert_to_wav_if_needed(input_path):
312
 
313
  raise Exception(f"Could not convert file to WAV. Diagnostics saved to: {diag_log}")
314
 
315
- # ---------- Whisper model loader ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  def get_whisper_model(name, device=None):
317
  if name not in MODEL_CACHE:
318
  print(f"DEBUG: loading whisper model '{name}'", flush=True)
@@ -322,12 +365,44 @@ def get_whisper_model(name, device=None):
322
  else:
323
  MODEL_CACHE[name] = whisper.load_model(name)
324
  except TypeError:
325
- # some whisper versions don't accept device arg
326
  MODEL_CACHE[name] = whisper.load_model(name)
327
  return MODEL_CACHE[name]
328
 
329
- # ---------- ZIP extraction helper ----------
330
- def extract_zip_list(zip_file, zip_password):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  temp_extract_dir = os.path.join(tempfile.gettempdir(), "extracted_audio")
332
  try:
333
  if os.path.exists(temp_extract_dir):
@@ -336,382 +411,247 @@ def extract_zip_list(zip_file, zip_password):
336
  except Exception:
337
  pass
338
  os.makedirs(temp_extract_dir, exist_ok=True)
339
- extracted = []
340
  logs = []
341
- with pyzipper.ZipFile(zip_file, "r") as zf:
342
  if zip_password:
343
  try:
344
  zf.setpassword(zip_password.encode())
345
  except Exception:
346
- logs.append("Warning: failed to set zip password (unexpected).")
347
- exts = [".mp3", ".wav", ".aac", ".flac", ".ogg", ".m4a", ".dat", ".dct"]
 
348
  for info in zf.infolist():
349
  if info.is_dir():
350
  continue
351
  _, ext = os.path.splitext(info.filename)
352
- if ext.lower() in exts:
353
- try:
354
- zf.extract(info, path=temp_extract_dir)
355
- except RuntimeError as e:
356
- logs.append(f"Password required/incorrect for {info.filename}: {e}")
357
- continue
358
- except pyzipper.BadZipFile:
359
- logs.append(f"Bad zip entry: {info.filename}")
360
- continue
361
- except Exception as e:
362
- logs.append(f"Error extracting {info.filename}: {e}")
363
- continue
364
- p = os.path.normpath(os.path.join(temp_extract_dir, info.filename))
365
- if os.path.exists(p):
366
- extracted.append(p)
367
- logs.append(f"Extracted: {info.filename}")
368
- if not extracted:
369
- logs.append("No supported audio files found in zip.")
 
 
 
 
 
 
 
 
 
 
 
370
  return [], "\n".join(logs)
371
- return extracted, "\n".join(logs)
 
 
372
  except Exception as e:
373
  traceback.print_exc()
374
  return [], f"Extraction failed: {e}"
375
 
376
# ---------- Simple single-file transcriber ----------
def transcribe_single(audio_path, model_name="small", enable_memory=False, device_choice="auto"):
    """Transcribe one audio file with Whisper.

    Returns a (player_path, transcript_text, log_text) triple; on failure the
    first two elements are None/"" and the log carries the traceback.
    """
    log_lines = []
    try:
        if not audio_path:
            return None, "No audio provided.", "No file provided."
        src = str(audio_path)
        chosen_device = None if device_choice == "auto" else device_choice
        model = get_whisper_model(model_name, device=chosen_device)
        log_lines.append(f"Loaded model: {model_name}")
        wav_path = convert_to_wav_if_needed(src)
        log_lines.append(f"Converted to WAV: {os.path.basename(wav_path)}")
        raw_result = model.transcribe(wav_path)
        final_text = raw_result.get("text", "").strip()
        if enable_memory:
            final_text = memory_correct_text(final_text)
        final_text = postprocess_transcript(final_text)
        if enable_memory:
            try:
                update_memory_with_transcript(final_text)
                log_lines.append("Memory updated.")
            except Exception:
                pass
        # Drop the temporary wav only when conversion produced a new file.
        if wav_path and wav_path != src and os.path.exists(wav_path):
            try:
                os.unlink(wav_path)
            except Exception:
                pass
        return src, final_text, "\n".join(log_lines)
    except Exception as e:
        return None, "", f"Error: {e}\n{traceback.format_exc()}"
411
 
412
- # ---------- Fine-tune helpers (include old-files support) ----------
413
- def _collect_old_files_into(dst_dir, old_dir_path):
414
- msgs = []
415
- copied = 0
416
- try:
417
- if not os.path.isdir(old_dir_path):
418
- return 0, f"Old-files path is not a directory: {old_dir_path}"
419
- for root, _, files in os.walk(old_dir_path):
420
- for f in files:
421
- if f.lower().endswith((".wav", ".mp3", ".flac", ".m4a", ".ogg")):
422
- src_audio = os.path.join(root, f)
423
- base = os.path.splitext(f)[0]
424
- possible_txt = os.path.join(root, base + ".txt")
425
- rel_subdir = os.path.relpath(root, old_dir_path)
426
- target_subdir = os.path.join(dst_dir, rel_subdir)
427
- os.makedirs(target_subdir, exist_ok=True)
428
- target_audio = os.path.join(target_subdir, f)
429
- shutil.copy2(src_audio, target_audio)
430
- if os.path.exists(possible_txt):
431
- shutil.copy2(possible_txt, os.path.join(target_subdir, base + ".txt"))
432
- msgs.append(f"Copied pair: {os.path.join(rel_subdir, f)} + .txt")
433
- else:
434
- msgs.append(f"Copied audio (no transcript found): {os.path.join(rel_subdir, f)}")
435
- copied += 1
436
- return copied, "\n".join(msgs)
437
- except Exception as e:
438
- traceback.print_exc()
439
- return copied, f"Error copying old files: {e}"
440
 
441
def _resolve_upload_path(obj):
    # Gradio file inputs arrive as a path string, a tempfile wrapper with a
    # .name attribute, or a dict payload, depending on version/configuration.
    if not obj:
        return None
    if isinstance(obj, (str, os.PathLike)):
        return str(obj)
    if hasattr(obj, "name"):
        return obj.name
    if isinstance(obj, dict) and obj.get("name"):
        return obj["name"]
    return None


def prepare_finetune_dataset(uploaded_zip_or_dir, include_old_files=False, old_files_dir=""):
    """Stage a fine-tuning dataset under FINETUNE_WORKDIR and build a manifest.

    Accepts an uploaded ZIP or directory, optionally merges an "old files"
    folder (audio plus same-basename .txt transcripts), then either copies a
    manifest shipped with the dataset or synthesizes a tab-separated manifest
    of ``audio_path<TAB>transcript`` lines.

    Returns (status_message, manifest_path); manifest_path is "" on failure.

    Improvement: the duplicated upload-object -> path resolution logic for
    both file inputs is factored into _resolve_upload_path().
    """
    dst = os.path.join(FINETUNE_WORKDIR, "data")
    try:
        if os.path.exists(dst):
            shutil.rmtree(dst)
        os.makedirs(dst, exist_ok=True)
    except Exception as e:
        return f"Failed to prepare workdir: {e}", ""
    try:
        path = _resolve_upload_path(uploaded_zip_or_dir)
    except Exception as e:
        return f"Unable to determine uploaded path: {e}", ""
    # Extract or copy the uploaded dataset, if one was provided.
    if path and os.path.isfile(path) and path.lower().endswith(".zip"):
        try:
            with pyzipper.ZipFile(path, "r") as zf:
                zf.extractall(dst)
        except Exception as e:
            return f"Failed to extract ZIP: {e}", ""
    elif path and os.path.isdir(path):
        try:
            for item in os.listdir(path):
                s = os.path.join(path, item)
                d = os.path.join(dst, item)
                if os.path.isdir(s):
                    shutil.copytree(s, d)
                else:
                    shutil.copy2(s, d)
        except Exception as e:
            return f"Failed to copy dataset dir: {e}", ""
    # Optionally merge the legacy audio+transcript folder.
    old_msgs = ""
    if include_old_files and old_files_dir:
        old_path = _resolve_upload_path(old_files_dir)
        if old_path:
            copied, msg = _collect_old_files_into(dst, old_path)
            old_msgs = f"\nOld-files: copied {copied} audio files.\nDetails:\n{msg}"
    # Prefer a manifest shipped with the dataset.
    transcripts_candidates = [
        os.path.join(dst, "transcripts.tsv"),
        os.path.join(dst, "metadata.tsv"),
        os.path.join(dst, "manifest.tsv"),
        os.path.join(dst, "transcripts.txt"),
        os.path.join(dst, "manifest.jsonl"),
    ]
    manifest_path = os.path.join(FINETUNE_WORKDIR, "manifest.tsv")
    found = False
    for tpath in transcripts_candidates:
        if os.path.exists(tpath):
            try:
                shutil.copy2(tpath, manifest_path)
                found = True
                break
            except Exception:
                pass
    # Otherwise synthesize a manifest from audio files plus sibling .txt files.
    missing_transcripts = 0
    if not found:
        audio_files = []
        for root, _, files in os.walk(dst):
            for f in files:
                if f.lower().endswith((".wav", ".mp3", ".flac", ".m4a", ".ogg")):
                    audio_files.append(os.path.join(root, f))
        if not audio_files:
            return f"No audio files found in dataset.{old_msgs}", ""
        entries = []
        for a in audio_files:
            base = os.path.splitext(a)[0]
            t_candidate = base + ".txt"
            transcript = ""
            if os.path.exists(t_candidate):
                try:
                    with open(t_candidate, "r", encoding="utf-8") as fh:
                        transcript = fh.read().strip().replace("\n", " ")
                except Exception:
                    transcript = ""
            else:
                missing_transcripts += 1
            entries.append(f"{a}\t{transcript}")
        try:
            with open(manifest_path, "w", encoding="utf-8") as fh:
                fh.write("\n".join(entries))
            found = True
        except Exception as e:
            return f"Failed to write manifest: {e}{old_msgs}", ""
    if not found:
        return f"Failed to locate or build manifest.{old_msgs}", ""
    status_msg = f"Dataset prepared. Manifest: {manifest_path}{old_msgs}"
    if missing_transcripts > 0:
        status_msg += f"\nWarning: {missing_transcripts} audio files have no matching .txt transcript (empty transcripts saved)."
    return status_msg, manifest_path
544
-
545
def start_finetune(manifest_path, base_model, epochs, batch_size, lr, output_dir):
    """Launch fine_tune.py as a background subprocess and return a status string.

    The child process writes stdout/stderr to finetune_stdout.log inside the
    output directory.

    Fix vs. original: the parent previously kept its copy of the log file
    handle open forever (one leaked handle per launch). The handle is now
    closed after Popen — the child keeps its own inherited descriptor.
    """
    outdir = output_dir or os.path.join(FINETUNE_WORKDIR, "output")
    os.makedirs(outdir, exist_ok=True)
    cmd = [
        sys.executable,
        "fine_tune.py",
        "--manifest", manifest_path,
        "--base_model", base_model,
        "--epochs", str(epochs),
        "--batch_size", str(batch_size),
        "--lr", str(lr),
        "--output_dir", outdir,
    ]
    logfile = None
    try:
        logfile = open(os.path.join(outdir, "finetune_stdout.log"), "a", encoding="utf-8")
        proc = subprocess.Popen(cmd, stdout=logfile, stderr=logfile, cwd=os.getcwd())
        return f"Fine-tune started (PID={proc.pid}). Logs: {logfile.name}"
    except FileNotFoundError as e:
        return f"Training script not found: {e}. Put 'fine_tune.py' in project root or change START_CMD."
    except Exception as e:
        return f"Failed to start fine-tune: {e}"
    finally:
        # Parent no longer needs the handle once the child has inherited it.
        if logfile is not None:
            try:
                logfile.close()
            except Exception:
                pass
572
 
573
def tail_finetune_logs(logpath, lines=200):
    """Return the last *lines* lines of the training log as one string.

    Returns "No logs yet." when the file does not exist, and an error
    message string if reading fails.
    """
    try:
        if not os.path.exists(logpath):
            return "No logs yet."
        with open(logpath, "r", encoding="utf-8", errors="ignore") as fh:
            content = fh.read().splitlines()
        return "\n".join(content[-lines:])
    except Exception as e:
        return f"Failed to read logs: {e}"
583
 
584
- # ---------- UI CSS ----------
 
 
 
 
585
  CSS = """
586
- :root{
587
- --accent:#4f46e5;
588
- --muted:#6b7280;
589
- --card:#ffffff;
590
- --bg:#f7f8fb;
591
- }
592
  body { background: var(--bg); font-family: Inter, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
593
- .header { padding: 18px 24px; border-radius: 12px; background: linear-gradient(90deg, rgba(79,70,229,0.12), rgba(99,102,241,0.04)); margin-bottom: 18px; display:flex;align-items:center;gap:16px; }
594
- .app-icon { width:62px;height:62px;border-radius:12px;background:linear-gradient(135deg,var(--accent),#06b6d4);display:flex;align-items:center;justify-content:center;color:white;font-weight:700;font-size:24px; }
595
- .header-title h1 { margin:0;font-size:20px;}
596
- .header-sub { color:var(--muted); margin-top:4px;font-size:13px;}
597
- .card { background:var(--card); border-radius:12px; padding:14px; box-shadow: 0 6px 20px rgba(16,24,40,0.06); }
598
- .transcript-area { white-space:pre-wrap; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, "Roboto Mono", monospace; background:#0f172a; color:#e6eef8; padding:12px; border-radius:10px; min-height:220px; }
599
  .small-note { color:var(--muted); font-size:12px;}
600
  """
601
 
602
- # ---------- Build UI ----------
603
- print("DEBUG: building Gradio Blocks", flush=True)
604
- with gr.Blocks(title="Whisper Transcriber", css=CSS) as demo:
605
- # Header
606
  with gr.Row(elem_classes="header"):
607
  with gr.Column(scale=0):
608
  gr.HTML("<div class='app-icon'>WT</div>")
609
  with gr.Column():
610
- gr.HTML("<h1 style='margin:0'>Whisper Transcriber</h1>")
611
- gr.Markdown("<div class='header-sub'>Transcribe, batch, memory & fine-tune multi-tab UI</div>")
612
 
613
  with gr.Tabs():
614
- # Audio Transcribe Tab
615
  with gr.TabItem("Audio Transcribe"):
616
  with gr.Row():
617
  with gr.Column(scale=1):
618
  with gr.Group(elem_classes="card"):
619
- gr.Markdown("### Quick Single Audio Transcribe")
620
  single_audio = gr.Audio(label="Upload or record audio", type="filepath")
621
  with gr.Row():
622
- model_select = gr.Dropdown(choices=["small","medium","large","large-v3","base"], value="large-v3", label="Model")
623
- device_select = gr.Dropdown(choices=["auto","cpu","cuda"], value="auto", label="Device")
624
  with gr.Row():
625
- mem_toggle = gr.Checkbox(label="Enable correction memory", value=False)
626
- format_choice = gr.Dropdown(choices=["Plain","SOAP (medical)"], value="Plain", label="Format")
627
  transcribe_btn = gr.Button("Transcribe", variant="primary")
628
- gr.Markdown("<div class='small-note'>Tip: choose large-v3 if your environment supports it.</div>")
629
  with gr.Column(scale=1):
630
  with gr.Group(elem_classes="card"):
631
- gr.Markdown("### Player & Transcript")
632
- audio_preview = gr.Audio(label="Player", interactive=False)
633
- transcript_out = gr.Textbox(label="Transcript", lines=14, interactive=False, elem_classes="transcript-area")
634
- transcript_logs = gr.Textbox(label="Logs", lines=6, interactive=False)
635
-
636
def _do_single_transcribe(audio_file, model_name, device_choice, enable_memory, fmt_choice):
    """UI callback: transcribe one file, optionally reformat as a SOAP note.

    Bug fix: the Format dropdown's value is "SOAP (medical)", but the old
    code compared fmt_choice against the bare string "SOAP", so the SOAP
    branch was unreachable. We now match any choice starting with "SOAP".
    """
    player_path, transcript, logs = transcribe_single(
        audio_file,
        model_name=model_name,
        enable_memory=enable_memory,
        device_choice=device_choice,
    )
    if fmt_choice and fmt_choice.startswith("SOAP"):
        sentences = re.split(r"(?<=[.?!])\s+", transcript)
        subj = sentences[0] if sentences else ""
        obj = sentences[1] if len(sentences) > 1 else ""
        transcript = f"S: {subj}\nO: {obj}\nA: Assessment pending\nP: Plan: follow up"
    return player_path, transcript, logs
645
-
646
- transcribe_btn.click(fn=_do_single_transcribe, inputs=[single_audio, model_select, device_select, mem_toggle, format_choice], outputs=[audio_preview, transcript_out, transcript_logs])
647
-
648
- # Batch Transcribe Tab
649
  with gr.TabItem("Batch Transcribe"):
650
  with gr.Row():
651
  with gr.Column(scale=1):
652
  with gr.Group(elem_classes="card"):
653
- gr.Markdown("### Batch / ZIP workflow")
654
- batch_files = gr.File(label="Upload multiple audio files (optional)", file_count="multiple", type="filepath")
655
  batch_zip = gr.File(label="Or upload ZIP with audio (optional)", file_count="single", type="filepath")
656
  zip_password = gr.Textbox(label="ZIP password (optional)")
 
 
 
657
  with gr.Row():
658
- batch_model = gr.Dropdown(choices=["small","medium","large","large-v3","base"], value="small", label="Model")
659
- batch_device = gr.Dropdown(choices=["auto","cpu","cuda"], value="auto", label="Device")
660
- batch_merge = gr.Checkbox(label="Merge all transcripts into one .docx", value=True)
661
  batch_mem = gr.Checkbox(label="Enable memory corrections", value=False)
662
- batch_extract_btn = gr.Button("Extract ZIP & List Files")
663
- batch_extract_logs = gr.Textbox(label="Extraction logs", lines=6, interactive=False)
664
- batch_select = gr.CheckboxGroup(choices=[], label="Select extracted files to transcribe", interactive=True)
665
- batch_trans_btn = gr.Button("Start Batch Transcription", variant="primary")
666
  with gr.Column(scale=1):
667
  with gr.Group(elem_classes="card"):
668
- gr.Markdown("### Output")
669
  batch_trans_out = gr.Textbox(label="Transcript (combined)", lines=16, interactive=False)
670
  batch_logs = gr.Textbox(label="Logs", lines=10, interactive=False)
671
- batch_download = gr.File(label="Merged .docx (when available)")
 
672
 
673
def _extract_zip_for_ui(zip_file, password):
    """UI callback: extract the uploaded ZIP and return (paths, log text)."""
    if not zip_file:
        return [], "No zip provided."
    zip_path = getattr(zip_file, "name", str(zip_file))
    extracted, logs = extract_zip_list(zip_path, password)
    names = "\n".join(os.path.basename(p) for p in extracted)
    return extracted, logs + "\n\nFiles:\n" + names
680
-
681
- batch_extract_btn.click(fn=_extract_zip_for_ui, inputs=[batch_zip, zip_password], outputs=[batch_select, batch_extract_logs])
682
-
683
def _batch_transcribe(selected_check, uploaded_files, model_name, device_name, merge_flag, enable_mem):
    """UI callback: transcribe selected/uploaded files, optionally merging to .docx.

    Returns (combined_transcript, log_text, docx_path_or_None).
    """
    paths = list(selected_check) if selected_check else []
    if uploaded_files:
        if isinstance(uploaded_files, (list, tuple)):
            paths.extend(str(item) for item in uploaded_files)
        else:
            paths.append(str(uploaded_files))
    if not paths:
        return "", "No files selected or uploaded.", None
    logs = []
    transcripts = []
    out_doc = None
    for p in paths:
        try:
            _, txt, lg = transcribe_single(
                p, model_name=model_name, enable_memory=enable_mem, device_choice=device_name
            )
            logs.append(lg)
            transcripts.append(f"FILE: {os.path.basename(str(p))}\n{txt}\n")
        except Exception as e:
            logs.append(f"Failed {p}: {e}")
    combined = "\n\n".join(transcripts)
    if merge_flag:
        try:
            out_doc = save_as_word(combined)
            logs.append(f"Merged saved: {out_doc}")
        except Exception as e:
            logs.append(f"Merge failed: {e}")
    return combined, "\n".join(logs), out_doc
713
 
714
- batch_trans_btn.click(fn=_batch_transcribe, inputs=[batch_select, batch_files, batch_model, batch_device, batch_merge, batch_mem], outputs=[batch_trans_out, batch_logs, batch_download])
715
 
716
  # Memory Tab
717
  with gr.TabItem("Memory"):
@@ -719,9 +659,9 @@ with gr.Blocks(title="Whisper Transcriber", css=CSS) as demo:
719
  with gr.Column(scale=1):
720
  with gr.Group(elem_classes="card"):
721
  gr.Markdown("### Correction Memory")
722
- mem_upload = gr.File(label="Import memory (JSON or text)", file_count="single", type="filepath")
723
  mem_import_btn = gr.Button("Import Memory")
724
- mem_add_text = gr.Textbox(label="Add word / phrase", placeholder="Type and click Add")
725
  mem_add_btn = gr.Button("Add to Memory")
726
  mem_clear_btn = gr.Button("Clear Memory")
727
  mem_view_btn = gr.Button("View Memory")
@@ -790,87 +730,39 @@ with gr.Blocks(title="Whisper Transcriber", css=CSS) as demo:
790
  def _view_mem():
791
  w = memory.get("words", {})
792
  p = memory.get("phrases", {})
793
- out = []
794
- out.append("WORDS (top 30):")
795
  for k, v in sorted(w.items(), key=lambda kv: -kv[1])[:30]:
796
- out.append(f"{k}: {v}")
797
- out.append("")
798
- out.append("PHRASES (top 20):")
799
  for k, v in sorted(p.items(), key=lambda kv: -kv[1])[:20]:
800
- out.append(f"{k}: {v}")
801
- return "\n".join(out)
802
 
803
  mem_import_btn.click(fn=_import_mem, inputs=[mem_upload], outputs=[mem_status])
804
- mem_add_btn.click(fn=_add_mem, inputs=[mem_add_text], outputs=[mem_status])
805
  mem_clear_btn.click(fn=_clear_mem, inputs=[], outputs=[mem_status])
806
  mem_view_btn.click(fn=_view_mem, inputs=[], outputs=[mem_status])
807
 
808
- # Fine-tune Tab
809
- with gr.TabItem("Fine-tune"):
810
- with gr.Row():
811
- with gr.Column(scale=1):
812
- with gr.Group(elem_classes="card"):
813
- gr.Markdown("### Prepare & Launch Fine-tune")
814
- ft_upload = gr.File(label="Upload dataset ZIP (optional)", file_count="single", type="filepath")
815
- ft_include_old = gr.Checkbox(label="Include old audio+transcript folder", value=False)
816
- ft_old = gr.File(label="Old files folder (optional)", file_count="single", type="filepath")
817
- ft_prepare_btn = gr.Button("Prepare dataset")
818
- ft_manifest_box = gr.Textbox(label="Prepare status / manifest", lines=4, interactive=False)
819
- ft_base_model = gr.Dropdown(choices=["small","base","medium","large","large-v3"], value="small", label="Base model")
820
- ft_epochs = gr.Slider(minimum=1, maximum=100, value=3, step=1, label="Epochs")
821
- ft_batch = gr.Number(label="Batch size", value=8)
822
- ft_lr = gr.Number(label="Learning rate", value=1e-5, precision=8)
823
- ft_output_dir = gr.Textbox(label="Output dir (optional)", value="", placeholder="Leave blank to use temp output")
824
- ft_start_btn = gr.Button("Start Fine-tune")
825
- ft_stop_btn = gr.Button("Stop Fine-tune")
826
- ft_start_status = gr.Textbox(label="Start/Stop status", interactive=False, lines=4)
827
- ft_tail_btn = gr.Button("Tail training logs")
828
- ft_logs = gr.Textbox(label="Training logs (tail)", interactive=False, lines=12)
829
- with gr.Column(scale=1):
830
- with gr.Group(elem_classes="card"):
831
- gr.Markdown("### Notes")
832
- gr.Markdown("- Old-files folder should contain audio files and matching .txt transcripts with the same basename.")
833
- gr.Markdown("- The app prepares a manifest and calls your `fine_tune.py` training script (you must provide it).")
834
-
835
- def _prepare_action(ft_upload_file, include_old, old_dir):
836
- status, manifest = prepare_finetune_dataset(ft_upload_file, include_old_files=include_old, old_files_dir=old_dir)
837
- return status
838
-
839
- def _start_action(manifest_text, base_model, epochs, batch_size, lr, output_dir):
840
- manifest_guess = os.path.join(FINETUNE_WORKDIR, "manifest.tsv")
841
- if not os.path.exists(manifest_guess):
842
- return "Manifest not found. Prepare dataset first or manually provide manifest."
843
- status = start_finetune(manifest_guess, base_model, int(epochs), int(batch_size), float(lr), output_dir)
844
- return status
845
-
846
- ft_prepare_btn.click(fn=_prepare_action, inputs=[ft_upload, ft_include_old, ft_old], outputs=[ft_manifest_box])
847
- ft_start_btn.click(fn=_start_action, inputs=[ft_manifest_box, ft_base_model, ft_epochs, ft_batch, ft_lr, ft_output_dir], outputs=[ft_start_status])
848
- ft_stop_btn.click(fn=lambda: "Stop not implemented in placeholder", inputs=[], outputs=[ft_start_status])
849
- ft_tail_btn.click(fn=lambda: "Tail logs not implemented in placeholder", inputs=[], outputs=[ft_logs])
850
-
851
  # Settings Tab
852
  with gr.TabItem("Settings"):
853
  with gr.Row():
854
  with gr.Column():
855
  with gr.Group(elem_classes="card"):
856
  gr.Markdown("### Runtime & tips")
857
- gr.Markdown("- Use large-v3 only if your whisper package supports it.")
858
  gr.Markdown("- Extraction writes to system temp `extracted_audio`. Re-extracting overwrites it.")
859
- gr.Markdown("- Provide your `fine_tune.py` for real fine-tuning.")
860
  with gr.Column():
861
  with gr.Group(elem_classes="card"):
862
  gr.Markdown("### Diagnostics")
863
  diag_btn = gr.Button("Show memory summary")
864
  diag_out = gr.Textbox(label="Diagnostics", lines=12, interactive=False)
865
- diag_btn.click(fn=lambda: _view_mem(), inputs=[], outputs=[diag_out])
866
 
867
- # ---------- Launch ----------
868
  if __name__ == "__main__":
869
  port = int(os.environ.get("PORT", 7860))
870
- print("DEBUG: launching Gradio on port", port, flush=True)
871
- try:
872
- demo.queue().launch(server_name="0.0.0.0", server_port=port)
873
- except Exception as e:
874
- print("FATAL: demo.launch failed:", e, flush=True)
875
- traceback.print_exc()
876
- raise
 
1
  # app.py
2
+ # Improved Whisper Transcriber (per-file selection after unzip, model availability check, SRT export)
3
+ # Requirements: gradio, whisper, pydub, pyzipper, python-docx, ffmpeg
4
 
5
  import os
6
  import sys
 
14
  from difflib import get_close_matches
15
  from pathlib import Path
16
 
17
+ # Force unbuffered prints for logs
18
  os.environ["PYTHONUNBUFFERED"] = "1"
19
 
20
+ print("DEBUG: improved app.py bootstrap starting", flush=True)
21
 
22
  # Third-party imports
23
  try:
 
43
  ("mulaw", 8000, 1),
44
  ]
45
  MODEL_CACHE = {}
46
+ EXTRACT_MAP = {} # maps friendly basename -> full path (populated after unzip)
47
  FINETUNE_WORKDIR = os.path.join(tempfile.gettempdir(), "finetune_workdir")
48
  os.makedirs(FINETUNE_WORKDIR, exist_ok=True)
49
 
50
+ # ---------- Helpers: memory & postprocessing ----------
51
  def load_memory():
52
  try:
53
  if os.path.exists(MEMORY_FILE):
 
68
  pass
69
  return mem
70
 
71
+
72
  def save_memory(mem):
73
  with MEMORY_LOCK:
74
  try:
 
77
  except Exception:
78
  traceback.print_exc()
79
 
80
+
81
  memory = load_memory()
82
 
83
  MEDICAL_ABBREVIATIONS = {
 
93
  "adm": "admit",
94
  "disch": "discharge",
95
  }
 
96
  DRUG_NORMALIZATION = {
97
  "metformin": "Metformin",
98
  "aspirin": "Aspirin",
99
  "amoxicillin": "Amoxicillin",
100
  }
101
 
102
+
103
  def expand_abbreviations(text):
104
  tokens = re.split(r"(\s+)", text)
105
  out = []
 
115
  out.append(t)
116
  return "".join(out)
117
 
118
+
119
  def normalize_drugs(text):
120
  for k, v in DRUG_NORMALIZATION.items():
121
  text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
122
  return text
123
 
124
+
125
  def punctuation_and_capitalization(text):
126
  text = text.strip()
127
  if not text:
 
137
  out.append(p)
138
  return "".join(out)
139
 
140
+
141
  def postprocess_transcript(text):
142
  if not text:
143
  return text
 
147
  t = punctuation_and_capitalization(t)
148
  return t
149
 
150
+
151
  def extract_words_and_phrases(text):
152
  words = re.findall(r"[A-Za-z0-9\-']+", text)
153
  sentences = [s.strip() for s in re.split(r"(?<=[.?!])\s+", text) if s.strip()]
154
  return [w for w in words if w.strip()], sentences
155
 
156
+
157
  def update_memory_with_transcript(transcript):
158
  global memory
159
  words, sentences = extract_words_and_phrases(transcript)
 
169
  if changed:
170
  save_memory(memory)
171
 
172
+
173
  def memory_correct_text(text, min_ratio=0.85):
174
  if not text or (not memory.get("words") and not memory.get("phrases")):
175
  return text
 
203
  corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
204
  return corrected
205
 
206
+
207
+ # ---------- File utils ----------
208
  def save_as_word(text, filename=None):
209
  if filename is None:
210
  filename = os.path.join(tempfile.gettempdir(), "merged_transcripts.docx")
 
213
  doc.save(filename)
214
  return filename
215
 
216
+
217
  # ---------- Conversion helpers ----------
218
  def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
219
  try:
 
241
  pass
242
  return False, str(e)
243
 
244
+
245
  def convert_to_wav_if_needed(input_path):
246
  input_path = str(input_path)
247
  lower = input_path.lower()
 
324
 
325
  raise Exception(f"Could not convert file to WAV. Diagnostics saved to: {diag_log}")
326
 
327
+
328
# ---------- Whisper model loader & availability ----------
def whisper_available_models():
    """Return set of model names if whisper provides helper; otherwise conservative fallback."""
    listed = None
    try:
        # Many whisper forks expose available_models().
        listed = whisper.available_models()
    except Exception:
        pass
    if isinstance(listed, (list, tuple, set)):
        return set(listed)
    # Fallback: offer the common set; availability can't be verified at startup.
    return {"tiny", "base", "small", "medium", "large", "large-v3"}
340
+
341
+
342
+ AVAILABLE_MODEL_SET = whisper_available_models()
343
+
344
+
345
def safe_model_choices(prefer_default="small", available=None):
    """Build the (choices, default) pair for the model-selection dropdown.

    Filters a canonical model list against the set of detected models; if
    nothing survives the filter (detection failed) the full list is offered.

    Args:
        prefer_default: model to preselect when it appears in the choices.
        available: iterable of model names to filter against; defaults to the
            module-level AVAILABLE_MODEL_SET (new optional parameter, backward
            compatible — existing callers are unaffected).

    Returns:
        (choices, default) where choices is a non-empty list of model names.
    """
    model_set = set(AVAILABLE_MODEL_SET) if available is None else set(available)
    base_choices = ["small", "medium", "large", "large-v3", "base", "tiny"]
    choices = [m for m in base_choices if m in model_set]
    if not choices:
        # Could not verify availability — still present the canonical choices.
        choices = base_choices
    default = prefer_default if prefer_default in choices else choices[0]
    return choices, default
357
+
358
+
359
  def get_whisper_model(name, device=None):
360
  if name not in MODEL_CACHE:
361
  print(f"DEBUG: loading whisper model '{name}'", flush=True)
 
365
  else:
366
  MODEL_CACHE[name] = whisper.load_model(name)
367
  except TypeError:
 
368
  MODEL_CACHE[name] = whisper.load_model(name)
369
  return MODEL_CACHE[name]
370
 
371
+
372
# ---------- SRT export ----------
def segments_to_srt(segments):
    """Build an SRT subtitle document from whisper-style segments.

    Args:
        segments: iterable of dicts with 'start', 'end' (seconds) and 'text'.

    Returns:
        The SRT text ("" for an empty iterable).

    Fix vs. original: timestamps are rounded to the nearest millisecond
    instead of float-truncated, so e.g. 1.9996 s renders as 00:00:02,000
    rather than 00:00:01,999; negative times are clamped to zero.
    """
    def fmt_time(t):
        total_ms = int(round(max(t, 0) * 1000))
        s, ms = divmod(total_ms, 1000)
        m, s = divmod(s, 60)
        h, m = divmod(m, 60)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    lines = []
    for i, seg in enumerate(segments, start=1):
        start = seg.get("start", 0)
        end = seg.get("end", 0)
        text = seg.get("text", "").strip()
        lines.append(str(i))
        lines.append(f"{fmt_time(start)} --> {fmt_time(end)}")
        lines.append(text)
        lines.append("")  # blank line separating cues
    return "\n".join(lines)
396
+
397
+
398
# ---------- ZIP extraction + mapping for UI ----------
def extract_zip_and_map(zip_path, zip_password=None):
    """Extract supported audio files from a (possibly encrypted) ZIP.

    Extracts into the system temp dir and rebuilds the module-level
    EXTRACT_MAP (friendly basename -> full extracted path).

    Args:
        zip_path: path to the uploaded ZIP archive.
        zip_password: optional password string for encrypted archives.

    Returns:
        (sorted list of friendly basenames, newline-joined log string).
        On fatal failure returns ([], "Extraction failed: ...").
    """
    global EXTRACT_MAP
    EXTRACT_MAP = {}
    temp_extract_dir = os.path.join(tempfile.gettempdir(), "extracted_audio")
    try:
        # Best-effort wipe of any previous extraction; failures are ignored.
        if os.path.exists(temp_extract_dir):
            try:
                shutil.rmtree(temp_extract_dir)
            except Exception:
                pass
        os.makedirs(temp_extract_dir, exist_ok=True)
        logs = []
        with pyzipper.ZipFile(zip_path, "r") as zf:
            if zip_password:
                try:
                    zf.setpassword(zip_password.encode())
                except Exception:
                    logs.append("Warning: failed to set zip password (continuing).")
            count = {}
            supported = [".mp3", ".wav", ".aac", ".flac", ".ogg", ".m4a", ".dat", ".dct"]
            for info in zf.infolist():
                if info.is_dir():
                    continue
                _, ext = os.path.splitext(info.filename)
                if ext.lower() not in supported:
                    continue
                # Zip-slip guard: user-supplied archives may contain "../"
                # entries; refuse anything that resolves outside the target.
                fullp = os.path.normpath(os.path.join(temp_extract_dir, info.filename))
                if not (fullp == temp_extract_dir or fullp.startswith(temp_extract_dir + os.sep)):
                    logs.append(f"Skipped unsafe path in archive: {info.filename}")
                    continue
                try:
                    zf.extract(info, path=temp_extract_dir)
                except RuntimeError as e:
                    # pyzipper signals missing/incorrect passwords this way.
                    logs.append(f"Password required or incorrect for {info.filename}: {e}")
                    continue
                except Exception as e:
                    logs.append(f"Error extracting {info.filename}: {e}")
                    continue
                if not os.path.exists(fullp):
                    continue
                # Friendly basename; disambiguate collisions with " (n)".
                base = os.path.basename(info.filename)
                key = base
                if key in EXTRACT_MAP:
                    idx = count.get(base, 1) + 1
                    count[base] = idx
                    name_only, extn = os.path.splitext(base)
                    key = f"{name_only} ({idx}){extn}"
                else:
                    count[base] = 1
                EXTRACT_MAP[key] = fullp
                logs.append(f"Extracted: {info.filename} -> {key}")
        if not EXTRACT_MAP:
            logs.append("No supported audio files found in ZIP.")
            return [], "\n".join(logs)
        # Return sorted friendly names for a stable UI listing.
        return sorted(EXTRACT_MAP.keys()), "\n".join(logs)
    except Exception as e:
        traceback.print_exc()
        return [], f"Extraction failed: {e}"
463
 
464
+
465
# ---------- Single-file transcribe (with SRT option) ----------
def transcribe_single_file(path, model_name="small", device_choice="auto", enable_memory=False, generate_srt=False):
    """Transcribe one audio file with whisper.

    Args:
        path: filepath string, or a file-like object with a ``.name``.
        model_name: whisper model to load (cached via get_whisper_model).
        device_choice: "auto" lets whisper pick; otherwise passed through
            (e.g. "cpu"/"cuda").
        enable_memory: apply correction memory to the text and record the
            transcript back into the memory store.
        generate_srt: also write an .srt next to the temp dir when whisper
            returned segments.

    Returns:
        (transcript_text, srt_path_or_None, log_string). On failure the
        transcript is "" and the log contains the error plus traceback.
    """
    logs = []
    try:
        if not path:
            # Fix: keep the (text, srt, logs) positions consistent with the
            # other return paths (this used to return (None, "", msg)).
            return "", None, "No file provided."
        # Accept gradio file objects as well as plain path strings.
        p = path.name if hasattr(path, "name") else str(path)
        device = None if device_choice == "auto" else device_choice
        model = get_whisper_model(model_name, device=device)
        logs.append(f"Loaded model: {model_name}")
        wav = convert_to_wav_if_needed(p)
        logs.append(f"Converted to WAV: {os.path.basename(wav)}")
        result = model.transcribe(wav)
        text = result.get("text", "").strip()
        if enable_memory:
            text = memory_correct_text(text)
        text = postprocess_transcript(text)
        srt_path = None
        if generate_srt and result.get("segments"):
            srt_text = segments_to_srt(result["segments"])
            # Save the SRT beside other temp artifacts, named after the input.
            srt_fp = os.path.join(tempfile.gettempdir(), f"{os.path.splitext(os.path.basename(p))[0]}.srt")
            with open(srt_fp, "w", encoding="utf-8") as fh:
                fh.write(srt_text)
            srt_path = srt_fp
            logs.append(f"SRT generated: {srt_path}")
        if enable_memory:
            try:
                update_memory_with_transcript(text)
                logs.append("Memory updated.")
            except Exception:
                pass
        # Remove the intermediate WAV only if conversion produced a new file.
        if wav and os.path.exists(wav) and wav != p:
            try:
                os.unlink(wav)
            except Exception:
                pass
        return text, srt_path, "\n".join(logs)
    except Exception as e:
        tb = traceback.format_exc()
        return "", None, f"Transcription error: {e}\n{tb}"
509
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
 
511
# ---------- Batch transcribe (maps friendly names to real paths) ----------
def batch_transcribe(friendly_selected, uploaded_files, model_name, device_name, merge_flag, enable_mem, generate_srt):
    """Transcribe ZIP-selected entries and/or directly uploaded files.

    Returns a 4-tuple: (combined transcript text, newline-joined logs,
    merged DOCX path or None, first generated SRT path or None).
    """
    logs = []
    # Resolve friendly ZIP names and direct uploads into concrete paths.
    paths = []
    for key in friendly_selected or []:
        real_path = EXTRACT_MAP.get(key)
        if real_path:
            paths.append(real_path)
        else:
            logs.append(f"Warning: selected file not found in extract map: {key}")
    if uploaded_files:
        uploads = uploaded_files if isinstance(uploaded_files, (list, tuple)) else [uploaded_files]
        for item in uploads:
            paths.append(str(item))
    if not paths:
        return "", "No files selected or uploaded.", None, None

    transcripts = []
    srt_files = []
    out_doc = None
    total = len(paths)
    for idx, p in enumerate(paths, start=1):
        logs.append(f"[{idx}/{total}] Processing: {p}")
        text, srt_path, lg = transcribe_single_file(p, model_name=model_name, device_choice=device_name, enable_memory=enable_mem, generate_srt=generate_srt)
        logs.append(lg)
        transcripts.append(f"FILE: {os.path.basename(p)}\n{text}\n")
        if srt_path:
            srt_files.append(srt_path)

    combined = "\n\n".join(transcripts)
    if merge_flag:
        try:
            out_doc = save_as_word(combined)
            logs.append(f"Merged transcript saved: {out_doc}")
        except Exception as e:
            logs.append(f"Merge failed: {e}")
    # Only the first SRT is surfaced; zipping all of them is a possible extension.
    first_srt = srt_files[0] if srt_files else None
    return combined, "\n".join(logs), out_doc, first_srt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
 
 
 
 
 
 
 
 
 
 
 
555
 
556
+ # ---------- UI building ----------
557
+ print("DEBUG: building Gradio UI", flush=True)
558
+
559
+ available_choices, default_choice = safe_model_choices(prefer_default="small")
560
+
561
  CSS = """
562
+ :root{ --accent:#4f46e5; --muted:#6b7280; --card:#ffffff; --bg:#f7f8fb; }
 
 
 
 
 
563
  body { background: var(--bg); font-family: Inter, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
564
+ .header { padding: 14px; border-radius: 10px; background: linear-gradient(90deg, rgba(79,70,229,0.08), rgba(99,102,241,0.02)); margin-bottom: 12px; display:flex;align-items:center;gap:12px; }
565
+ .app-icon { width:50px;height:50px;border-radius:10px;background:linear-gradient(135deg,var(--accent),#06b6d4);display:flex;align-items:center;justify-content:center;color:white;font-weight:700;font-size:20px; }
566
+ .card { background:var(--card); border-radius:10px; padding:12px; box-shadow: 0 6px 20px rgba(16,24,40,0.04); }
567
+ .transcript-area { white-space:pre-wrap; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, "Roboto Mono", monospace; background:#0f172a; color:#e6eef8; padding:12px; border-radius:8px; min-height:200px; }
 
 
568
  .small-note { color:var(--muted); font-size:12px;}
569
  """
570
 
571
+ with gr.Blocks(title="Whisper Transcriber (improved)", css=CSS) as demo:
 
 
 
572
  with gr.Row(elem_classes="header"):
573
  with gr.Column(scale=0):
574
  gr.HTML("<div class='app-icon'>WT</div>")
575
  with gr.Column():
576
+ gr.Markdown("<h3 style='margin:0'>Whisper Transcriber — improved</h3>")
577
+ gr.Markdown("<div class='small-note'>Per-file selection after unzip, SRT export, model availability checks.</div>")
578
 
579
  with gr.Tabs():
580
+ # Single Audio Tab
581
  with gr.TabItem("Audio Transcribe"):
582
  with gr.Row():
583
  with gr.Column(scale=1):
584
  with gr.Group(elem_classes="card"):
585
+ gr.Markdown("### Single audio")
586
  single_audio = gr.Audio(label="Upload or record audio", type="filepath")
587
  with gr.Row():
588
+ model_select = gr.Dropdown(choices=available_choices, value=default_choice, label="Model")
589
+ device_choice = gr.Dropdown(choices=["auto", "cpu", "cuda"], value="auto", label="Device")
590
  with gr.Row():
591
+ mem_toggle = gr.Checkbox(label="Enable memory corrections", value=False)
592
+ srt_toggle = gr.Checkbox(label="Generate SRT", value=False)
593
  transcribe_btn = gr.Button("Transcribe", variant="primary")
 
594
  with gr.Column(scale=1):
595
  with gr.Group(elem_classes="card"):
596
+ gr.Markdown("### Output")
597
+ audio_preview = gr.Audio(interactive=False)
598
+ transcript_out = gr.Textbox(label="Transcript", lines=12, interactive=False, elem_classes="transcript-area")
599
+ srt_download = gr.File(label="SRT (if generated / available)")
600
+ single_logs = gr.Textbox(label="Logs", lines=8, interactive=False)
601
+
602
+ def _single_action(audio_file, model_name, device, mem_on, srt_on):
603
+ if not audio_file:
604
+ return None, "", None, "No audio file provided."
605
+ path = audio_file if isinstance(audio_file, str) else (audio_file.name if hasattr(audio_file, "name") else str(audio_file))
606
+ text, srt_path, logs = transcribe_single_file(path, model_name=model_name, device_choice=device, enable_memory=mem_on, generate_srt=srt_on)
607
+ # set audio preview to original file
608
+ preview = audio_file
609
+ return preview, text, srt_path, logs
610
+
611
+ transcribe_btn.click(fn=_single_action, inputs=[single_audio, model_select, device_choice, mem_toggle, srt_toggle], outputs=[audio_preview, transcript_out, srt_download, single_logs])
612
+
613
+ # Batch Tab
614
  with gr.TabItem("Batch Transcribe"):
615
  with gr.Row():
616
  with gr.Column(scale=1):
617
  with gr.Group(elem_classes="card"):
618
+ gr.Markdown("### Batch (upload multiple or ZIP)")
619
+ batch_files = gr.File(label="Upload audio files (optional)", file_count="multiple", type="filepath")
620
  batch_zip = gr.File(label="Or upload ZIP with audio (optional)", file_count="single", type="filepath")
621
  zip_password = gr.Textbox(label="ZIP password (optional)")
622
+ batch_extract_btn = gr.Button("Extract ZIP & List files")
623
+ batch_extract_logs = gr.Textbox(label="Extraction logs", lines=6, interactive=False)
624
+ batch_select = gr.CheckboxGroup(choices=[], label="Select extracted files (friendly names)", interactive=True)
625
  with gr.Row():
626
+ batch_model = gr.Dropdown(choices=available_choices, value=default_choice, label="Model")
627
+ batch_device = gr.Dropdown(choices=["auto", "cpu", "cuda"], value="auto", label="Device")
628
+ batch_merge = gr.Checkbox(label="Merge transcripts to DOCX", value=True)
629
  batch_mem = gr.Checkbox(label="Enable memory corrections", value=False)
630
+ batch_srt = gr.Checkbox(label="Generate SRT(s) if available", value=False)
631
+ batch_run_btn = gr.Button("Start Batch Transcription", variant="primary")
 
 
632
  with gr.Column(scale=1):
633
  with gr.Group(elem_classes="card"):
634
+ gr.Markdown("### Batch Output")
635
  batch_trans_out = gr.Textbox(label="Transcript (combined)", lines=16, interactive=False)
636
  batch_logs = gr.Textbox(label="Logs", lines=10, interactive=False)
637
+ batch_doc_download = gr.File(label="Merged DOCX (if created)")
638
+ batch_srt_download = gr.File(label="First SRT (if any)")
639
 
640
+ def _do_extract(zip_file, password):
641
  if not zip_file:
642
+ return [], "No ZIP provided."
643
  zip_path = zip_file.name if hasattr(zip_file, "name") else str(zip_file)
644
+ friendly, logs = extract_zip_and_map(zip_path, password)
645
+ # Show friendly names and logs
646
+ return friendly, logs
647
+
648
+ batch_extract_btn.click(fn=_do_extract, inputs=[batch_zip, zip_password], outputs=[batch_select, batch_extract_logs])
649
+
650
+ def _do_batch(friendly_selected, uploaded_files, model_name, device, merge_flag, mem_flag, srt_flag):
651
+ combined, logs, out_doc, srt_path = batch_transcribe(friendly_selected, uploaded_files, model_name, device, merge_flag, mem_flag, srt_flag)
652
+ return combined, logs, out_doc, srt_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
653
 
654
+ batch_run_btn.click(fn=_do_batch, inputs=[batch_select, batch_files, batch_model, batch_device, batch_merge, batch_mem, batch_srt], outputs=[batch_trans_out, batch_logs, batch_doc_download, batch_srt_download])
655
 
656
  # Memory Tab
657
  with gr.TabItem("Memory"):
 
659
  with gr.Column(scale=1):
660
  with gr.Group(elem_classes="card"):
661
  gr.Markdown("### Correction Memory")
662
+ mem_upload = gr.File(label="Import memory file (JSON or text)", file_count="single", type="filepath")
663
  mem_import_btn = gr.Button("Import Memory")
664
+ mem_text = gr.Textbox(label="Add word/phrase", placeholder="Type word or phrase")
665
  mem_add_btn = gr.Button("Add to Memory")
666
  mem_clear_btn = gr.Button("Clear Memory")
667
  mem_view_btn = gr.Button("View Memory")
 
730
  def _view_mem():
731
  w = memory.get("words", {})
732
  p = memory.get("phrases", {})
733
+ out_lines = []
734
+ out_lines.append("WORDS (top 30):")
735
  for k, v in sorted(w.items(), key=lambda kv: -kv[1])[:30]:
736
+ out_lines.append(f"{k}: {v}")
737
+ out_lines.append("")
738
+ out_lines.append("PHRASES (top 20):")
739
  for k, v in sorted(p.items(), key=lambda kv: -kv[1])[:20]:
740
+ out_lines.append(f"{k}: {v}")
741
+ return "\n".join(out_lines)
742
 
743
  mem_import_btn.click(fn=_import_mem, inputs=[mem_upload], outputs=[mem_status])
744
+ mem_add_btn.click(fn=_add_mem, inputs=[mem_text], outputs=[mem_status])
745
  mem_clear_btn.click(fn=_clear_mem, inputs=[], outputs=[mem_status])
746
  mem_view_btn.click(fn=_view_mem, inputs=[], outputs=[mem_status])
747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
748
  # Settings Tab
749
  with gr.TabItem("Settings"):
750
  with gr.Row():
751
  with gr.Column():
752
  with gr.Group(elem_classes="card"):
753
  gr.Markdown("### Runtime & tips")
754
+ gr.Markdown("- Use `large-v3` only if your whisper package supports it.")
755
  gr.Markdown("- Extraction writes to system temp `extracted_audio`. Re-extracting overwrites it.")
756
+ gr.Markdown("- Provide `fine_tune.py` if you plan to use the Fine-tune workflow.")
757
  with gr.Column():
758
  with gr.Group(elem_classes="card"):
759
  gr.Markdown("### Diagnostics")
760
  diag_btn = gr.Button("Show memory summary")
761
  diag_out = gr.Textbox(label="Diagnostics", lines=12, interactive=False)
762
+ diag_btn.click(fn=lambda: (lambda: _view_mem())(), inputs=[], outputs=[diag_out])
763
 
764
# Launch the app when run as a script (not on import).
if __name__ == "__main__":
    serve_port = int(os.environ.get("PORT", 7860))
    print("DEBUG: launching improved Gradio on port", serve_port, flush=True)
    demo.queue().launch(server_name="0.0.0.0", server_port=serve_port)