staraks committed on
Commit
fff2ebd
·
verified ·
1 Parent(s): c7a9d02

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +715 -279
app.py CHANGED
@@ -1,6 +1,6 @@
1
  # app.py
2
- # Whisper Transcriber - single / multi / zip options + memory
3
- # Compatible with gradio 3.x, whisper, pydub, pyzipper, python-docx, ffmpeg
4
 
5
  import os
6
  import sys
@@ -11,16 +11,17 @@ import subprocess
11
  import traceback
12
  import threading
13
  import re
 
14
  from difflib import get_close_matches
15
- from pathlib import Path
16
  from uuid import uuid4
 
17
 
18
- # immediate prints
19
  os.environ["PYTHONUNBUFFERED"] = "1"
20
 
21
  print("DEBUG: app.py bootstrap starting", flush=True)
22
 
23
- # third-party imports
24
  try:
25
  import gradio as gr
26
  import whisper
@@ -33,9 +34,9 @@ except Exception as e:
33
  raise
34
 
35
  # ---------- Config ----------
36
- MIN_WAV_SIZE = 1024
37
  MEMORY_FILE = "memory.json"
38
  MEMORY_LOCK = threading.Lock()
 
39
  FFMPEG_CANDIDATES = [
40
  ("s16le", 16000, 1),
41
  ("s16le", 44100, 2),
@@ -44,8 +45,9 @@ FFMPEG_CANDIDATES = [
44
  ("mulaw", 8000, 1),
45
  ]
46
  MODEL_CACHE = {}
 
47
 
48
- # ---------- Memory helpers ----------
49
  def load_memory():
50
  try:
51
  if os.path.exists(MEMORY_FILE):
@@ -78,7 +80,6 @@ def save_memory(mem):
78
 
79
  memory = load_memory()
80
 
81
- # ---------- Postprocessing ----------
82
  MEDICAL_ABBREVIATIONS = {
83
  "pt": "patient",
84
  "dx": "diagnosis",
@@ -92,7 +93,6 @@ MEDICAL_ABBREVIATIONS = {
92
  "adm": "admit",
93
  "disch": "discharge",
94
  }
95
-
96
  DRUG_NORMALIZATION = {
97
  "metformin": "Metformin",
98
  "aspirin": "Aspirin",
@@ -214,19 +214,24 @@ def save_as_word(text, filename=None):
214
  return filename
215
 
216
 
217
- def _ffmpeg_convert(input_path, out_path):
218
- cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", out_path]
219
  try:
 
 
 
 
 
220
  proc = subprocess.run(cmd, capture_output=True, timeout=60, text=True)
 
221
  if proc.returncode == 0 and os.path.exists(out_path) and os.path.getsize(out_path) > MIN_WAV_SIZE:
222
- return True, proc.stdout + proc.stderr
223
  else:
224
  try:
225
  if os.path.exists(out_path):
226
  os.unlink(out_path)
227
  except Exception:
228
  pass
229
- return False, proc.stdout + proc.stderr
230
  except Exception as e:
231
  try:
232
  if os.path.exists(out_path):
@@ -242,6 +247,7 @@ def convert_to_wav_if_needed(input_path):
242
  if lower.endswith(".wav"):
243
  return input_path
244
 
 
245
  tmp = None
246
  try:
247
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
@@ -255,25 +261,29 @@ def convert_to_wav_if_needed(input_path):
255
  except Exception:
256
  pass
257
  except Exception:
 
258
  try:
259
  if tmp and os.path.exists(tmp.name):
260
  os.unlink(tmp.name)
261
  except Exception:
262
  pass
263
 
264
- # ffmpeg fallback attempts
265
  diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
266
  diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
267
  diagnostics = []
268
  for fmt, sr, ch in FFMPEG_CANDIDATES:
269
  out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
270
  out_wav.close()
271
- success, debug = _ffmpeg_convert(input_path, out_wav.name)
272
  diagnostics.append(f"TRY fmt={fmt} sr={sr} ch={ch} success={success}\n{debug}\n")
273
  if success:
274
  try:
275
  with open(diag_log, "w", encoding="utf-8") as fh:
276
- fh.write("pydub auto error: fallback used\n\n")
 
 
 
277
  fh.write("Diagnostics:\n")
278
  fh.write("\n".join(diagnostics))
279
  except Exception:
@@ -286,6 +296,16 @@ def convert_to_wav_if_needed(input_path):
286
  except Exception:
287
  pass
288
 
 
 
 
 
 
 
 
 
 
 
289
  try:
290
  with open(input_path, "rb") as fh:
291
  head = fh.read(512)
@@ -295,6 +315,8 @@ def convert_to_wav_if_needed(input_path):
295
 
296
  try:
297
  with open(diag_log, "w", encoding="utf-8") as fh:
 
 
298
  fh.write("Full diagnostics:\n\n")
299
  fh.write("\n\n".join(diagnostics))
300
  except Exception as e:
@@ -327,38 +349,61 @@ def safe_model_choices(prefer_default="small"):
327
 
328
 
329
  def get_whisper_model(name, device=None):
330
- key = f"{name}|{device}"
331
- if key not in MODEL_CACHE:
332
- print(f"DEBUG: loading whisper model '{name}' (device={device})", flush=True)
333
  try:
334
- if device and device != "auto":
335
- MODEL_CACHE[key] = whisper.load_model(name, device=device)
336
  else:
337
- MODEL_CACHE[key] = whisper.load_model(name)
338
  except TypeError:
339
- MODEL_CACHE[key] = whisper.load_model(name)
340
- return MODEL_CACHE[key]
341
-
342
-
343
- # ---------- ZIP extraction ----------
344
- def extract_zip_and_list(zip_path, zip_password=None):
345
- temp_extract_dir = os.path.join(tempfile.gettempdir(), f"extracted_{uuid4().hex[:8]}")
346
- os.makedirs(temp_extract_dir, exist_ok=True)
347
- extracted = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  logs = []
349
  try:
 
350
  with pyzipper.ZipFile(zip_path, "r") as zf:
351
  if zip_password:
352
  try:
353
  zf.setpassword(zip_password.encode())
354
  except Exception:
355
- # non-fatal
356
- pass
 
357
  for info in zf.infolist():
358
  if info.is_dir():
359
  continue
360
  _, ext = os.path.splitext(info.filename)
361
- if ext.lower() not in [".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac", ".dct", ".dat"]:
362
  continue
363
  try:
364
  zf.extract(info, path=temp_extract_dir)
@@ -371,304 +416,658 @@ def extract_zip_and_list(zip_path, zip_password=None):
371
  fullp = os.path.normpath(os.path.join(temp_extract_dir, info.filename))
372
  if not os.path.exists(fullp):
373
  continue
374
- key = os.path.basename(info.filename)
375
- # ensure unique key
376
- i = 1
377
- original = key
378
- while key in extracted:
379
- name_only, extn = os.path.splitext(original)
380
- i += 1
381
- key = f"{name_only} ({i}){extn}"
382
- extracted[key] = fullp
 
383
  logs.append(f"Extracted: {info.filename} -> {key}")
384
- if not extracted:
385
  logs.append("No supported audio files found in ZIP.")
386
- except pyzipper.BadZipFile:
387
- logs.append("Invalid zip file.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
  except Exception as e:
389
- logs.append(f"Zip extraction failed: {e}")
390
- return extracted, "\n".join(logs)
 
 
 
 
391
 
392
 
393
- # ---------- Transcription operations ----------
394
- def transcribe_file(path, model_name="small", device_choice="auto", enable_memory=False):
 
 
 
 
 
 
 
 
 
 
395
  logs = []
396
  try:
397
  if not path:
398
- return "", "No path", "\n".join(logs)
399
- # normalize path
400
- p = str(path)
401
- logs.append(f"Converting: {p}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  wav = convert_to_wav_if_needed(p)
403
- logs.append(f"WAV ready: {wav}")
404
- model = get_whisper_model(model_name, device=None if device_choice == "auto" else device_choice)
405
- logs.append(f"Model loaded: {model_name}")
406
- result = model.transcribe(wav)
407
- text = result.get("text", "").strip()
408
- if enable_memory:
409
- text = memory_correct_text(text)
410
- text = postprocess_transcript(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  if enable_memory:
 
412
  try:
413
- update_memory_with_transcript(text)
414
- logs.append("Memory updated")
415
  except Exception:
416
  pass
417
- # cleanup tmp wav if it was created
418
- if wav and os.path.exists(wav) and not p.lower().endswith(".wav"):
 
 
 
 
 
 
 
 
 
 
 
 
 
419
  try:
420
  os.unlink(wav)
421
  except Exception:
422
  pass
423
- return text, None, "\n".join(logs)
 
 
424
  except Exception as e:
425
  tb = traceback.format_exc()
426
- return "", f"ERROR: {e}", tb
427
 
428
 
429
- # ---------- Batch orchestration ----------
430
- def batch_transcribe_from_paths(paths, model_name, device_name, enable_mem, merge_flag):
431
  logs = []
432
  transcripts = []
433
- perfile_docx = []
434
- errors = []
435
- for idx, p in enumerate(paths, start=1):
436
- logs.append(f"[{idx}/{len(paths)}] {p}")
437
- text, err, lg = transcribe_file(p, model_name=model_name, device_choice=device_name, enable_memory=enable_mem)
438
- logs.append(lg or "")
439
- if err:
440
- errors.append(f"{os.path.basename(p)}: {err}")
441
- transcripts.append(f"FILE: {os.path.basename(p)}\nERROR: {err}\n")
 
 
 
 
 
442
  else:
443
- transcripts.append(f"FILE: {os.path.basename(p)}\n{text}\n")
444
- # create per-file docx
445
- try:
446
- safe_name = Path(p).stem
447
- out_doc = os.path.join(tempfile.gettempdir(), f"{safe_name}_{uuid4().hex[:8]}.docx")
448
- save_as_word(text or "", out_doc)
449
- perfile_docx.append((os.path.basename(p), out_doc))
450
- except Exception as e:
451
- errors.append(f"Failed to write docx for {p}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  combined = "\n\n".join(transcripts)
453
- merged_doc = None
454
  if merge_flag:
455
  try:
456
- merged_doc = os.path.join(tempfile.gettempdir(), f"merged_{uuid4().hex[:8]}.docx")
457
- save_as_word(combined, merged_doc)
458
- logs.append(f"Merged saved: {merged_doc}")
459
  except Exception as e:
460
  logs.append(f"Merge failed: {e}")
461
- # zip per-file docx for download
462
- zip_path = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  try:
464
- zip_path = os.path.join(tempfile.gettempdir(), f"perfiles_{uuid4().hex[:8]}.zip")
465
- import zipfile
466
- with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
467
- for base, pth in perfile_docx:
468
- arcname = Path(base).stem + ".docx"
469
- zf.write(pth, arcname=arcname)
470
- logs.append(f"Per-file ZIP created: {zip_path}")
471
- except Exception as e:
472
- logs.append(f"Failed to create per-file ZIP: {e}")
473
- zip_path = None
474
- # add errors to logs
475
- if errors:
476
- logs.append("Errors:")
477
- logs.extend(errors)
478
- # also list produced files
479
- logs.append("Per-file outputs:")
480
- for base, pth in perfile_docx:
481
- logs.append(f" - {base} -> {Path(pth).name}")
482
- return combined, "\n".join(logs), zip_path, merged_doc
483
-
484
-
485
- # ---------- UI building ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
  print("DEBUG: building Gradio UI", flush=True)
487
  available_choices, default_choice = safe_model_choices(prefer_default="small")
488
 
489
  CSS = """
490
  :root{
491
  --accent:#4f46e5;
492
- --muted:#9ca3af;
493
- --card:#0b1220;
494
- --bg:#071022;
495
- --text:#e6eef8;
496
- --transcript-bg:#071026;
497
  --transcript-color:#e6eef8;
498
  }
 
 
 
 
 
 
 
 
 
499
  body { background: var(--bg); color: var(--text); font-family: Inter, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
500
- .card { background: var(--card); border-radius:10px; padding:12px; box-shadow: 0 6px 20px rgba(0,0,0,0.4); }
 
 
 
501
  .small-note { color:var(--muted); font-size:12px;}
502
- .transcript-area { white-space:pre-wrap; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, "Roboto Mono", monospace; background: var(--transcript-bg); color: var(--transcript-color); padding:12px; border-radius:8px; min-height:160px; }
503
  """
504
 
505
- with gr.Blocks(title="Whisper Transcriber - single/multi/zip", css=CSS) as demo:
506
- # set dark theme default via a tiny script
507
- gr.HTML("""<script>document.documentElement.setAttribute('data-theme','dark');</script>""")
508
-
509
- gr.Markdown("<h3 style='margin:6px 0'>Whisper Transcriber — Single / Multi / ZIP</h3>")
510
- gr.Markdown("<div class='small-note'>Options: single file, multiple files, or ZIP (default ZIP password: <code>dietcoke1</code>). Memory available.</div>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
 
512
  with gr.Tabs():
513
- # --- Single file tab ---
514
- with gr.TabItem("Single file"):
515
  with gr.Row():
516
  with gr.Column(scale=1):
517
- single_audio = gr.File(label="Select audio file", file_count="single", type="filepath")
518
- model_select_single = gr.Dropdown(choices=available_choices, value=default_choice, label="Model")
519
- device_single = gr.Dropdown(choices=["auto", "cpu", "cuda"], value="auto", label="Device")
520
- mem_single = gr.Checkbox(label="Enable correction memory", value=False)
521
- single_transcribe_btn = gr.Button("Transcribe single file", variant="primary")
 
 
 
 
 
522
  with gr.Column(scale=1):
523
- single_transcript = gr.Textbox(label="Transcript", lines=14, interactive=False)
 
 
 
524
  single_logs = gr.Textbox(label="Logs", lines=8, interactive=False)
525
- single_doc_download = gr.File(label="Download .docx (single)")
526
-
527
- def _single_run(file_path, model_name, device_choice, enable_mem):
528
- if not file_path:
529
- return "", "No file selected.", None
530
- path = file_path if isinstance(file_path, str) else (file_path.name if hasattr(file_path, "name") else str(file_path))
531
- text, err, logs = transcribe_file(path, model_name=model_name, device_choice=device_choice, enable_memory=enable_mem)
532
- if err:
533
- return text, logs, None
534
- # write docx
535
- try:
536
- out_doc = os.path.join(tempfile.gettempdir(), f"{Path(path).stem}_{uuid4().hex[:8]}.docx")
537
- save_as_word(text or "", out_doc)
538
- except Exception as e:
539
- logs = (logs or "") + f"\nFailed to write docx: {e}"
540
- out_doc = None
541
- return text, logs, out_doc
542
 
543
- single_transcribe_btn.click(fn=_single_run, inputs=[single_audio, model_select_single, device_single, mem_single], outputs=[single_transcript, single_logs, single_doc_download])
544
-
545
- # --- Multi-file tab ---
546
- with gr.TabItem("Multi-file"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
547
  with gr.Row():
548
  with gr.Column(scale=1):
549
- multi_files = gr.File(label="Upload multiple audio files", file_count="multiple", type="filepath")
550
- model_multi = gr.Dropdown(choices=available_choices, value=default_choice, label="Model")
551
- device_multi = gr.Dropdown(choices=["auto", "cpu", "cuda"], value="auto", label="Device")
552
- mem_multi = gr.Checkbox(label="Enable correction memory", value=False)
553
- merge_multi = gr.Checkbox(label="Merge into single .docx (optional)", value=True)
554
- multi_run_btn = gr.Button("Start multi-file transcription", variant="primary")
 
 
 
 
 
 
 
 
 
 
555
  with gr.Column(scale=1):
556
- multi_transcript = gr.Textbox(label="Combined Transcript", lines=14, interactive=False)
557
- multi_logs = gr.Textbox(label="Logs", lines=10, interactive=False)
558
- multi_perfiles_zip = gr.File(label="Download per-file .docx ZIP (separate docs)", interactive=False)
559
- multi_merged_doc = gr.File(label="Download merged docx (if created)", interactive=False)
560
-
561
- def _multi_run(files, model_name, device_choice, enable_mem, merge_flag):
562
- if not files:
563
- return "", "No files uploaded.", None, None
564
- paths = [str(f) for f in files] if isinstance(files, (list, tuple)) else [str(files)]
565
- combined, logs, zip_path, merged_path = batch_transcribe_from_paths(paths, model_name, device_choice, enable_mem, merge_flag)
566
- return combined, logs, zip_path, merged_path
567
-
568
- multi_run_btn.click(fn=_multi_run, inputs=[multi_files, model_multi, device_multi, mem_multi, merge_multi], outputs=[multi_transcript, multi_logs, multi_perfiles_zip, multi_merged_doc])
569
-
570
- # --- ZIP tab ---
571
- with gr.TabItem("ZIP"):
572
- with gr.Row():
573
- with gr.Column(scale=1):
574
- zip_file = gr.File(label="Upload ZIP containing audio files", file_count="single", type="filepath")
575
- use_default_zip_pass = gr.Checkbox(label="Use default ZIP password (dietcoke1)", value=True)
576
- zip_password = gr.Textbox(label="ZIP password (override)", placeholder="If left empty and default checked, 'dietcoke1' will be used")
577
- model_zip = gr.Dropdown(choices=available_choices, value=default_choice, label="Model")
578
- device_zip = gr.Dropdown(choices=["auto", "cpu", "cuda"], value="auto", label="Device")
579
- mem_zip = gr.Checkbox(label="Enable correction memory", value=False)
580
- merge_zip = gr.Checkbox(label="Merge into single .docx (optional)", value=True)
581
- zip_extract_btn = gr.Button("Extract & Transcribe ZIP", variant="primary")
582
- with gr.Column(scale=1):
583
- zip_extract_logs = gr.Textbox(label="Extraction & transcription logs", lines=12, interactive=False)
584
- zip_perfiles_zip = gr.File(label="Download per-file .docx ZIP", interactive=False)
585
- zip_merged_doc = gr.File(label="Download merged docx (if created)", interactive=False)
586
-
587
- def _zip_run(zfile, use_default, pwd_override, model_name, device_choice, enable_mem, merge_flag):
588
- if not zfile:
589
- return "No ZIP provided.", None, None
590
- zip_path = zfile if isinstance(zfile, str) else (zfile.name if hasattr(zfile, "name") else str(zfile))
591
- final_pwd = None
592
- if use_default and (not pwd_override or pwd_override.strip() == ""):
593
- final_pwd = "dietcoke1"
594
- elif pwd_override and pwd_override.strip():
595
- final_pwd = pwd_override.strip()
596
- extracted_map, logs0 = extract_zip_and_list(zip_path, final_pwd)
597
- logs_lines = [logs0]
598
- if not extracted_map:
599
- return "\n".join(logs_lines), None, None
600
- # transcribe in file order
601
- paths = [extracted_map[k] for k in sorted(extracted_map.keys())]
602
- combined, logs1, per_zip, merged_doc = batch_transcribe_from_paths(paths, model_name, device_choice, enable_mem, merge_flag)
603
- logs_lines.append(logs1)
604
- # final logs
605
- return "\n\n".join(logs_lines), per_zip, merged_doc
606
-
607
- zip_extract_btn.click(fn=_zip_run, inputs=[zip_file, use_default_zip_pass, zip_password, model_zip, device_zip, mem_zip, merge_zip], outputs=[zip_extract_logs, zip_perfiles_zip, zip_merged_doc])
608
-
609
- # --- Memory tab ---
610
  with gr.TabItem("Memory"):
611
  with gr.Row():
612
  with gr.Column(scale=1):
613
- mem_upload = gr.File(label="Import memory file (JSON or text)", file_count="single", type="filepath")
614
- mem_import_btn = gr.Button("Import memory file")
615
- mem_add_text = gr.Textbox(label="Add single word or phrase", placeholder="Type word or phrase")
616
- mem_add_btn = gr.Button("Add to memory")
617
- mem_clear_btn = gr.Button("Clear memory")
618
- mem_view_btn = gr.Button("View memory")
 
 
 
619
  with gr.Column(scale=1):
620
- mem_status = gr.Textbox(label="Memory status / preview", lines=14, interactive=False)
 
 
 
621
 
622
- def _import_mem(uploaded):
623
- if not uploaded:
624
- return "No file provided."
625
- path = uploaded if isinstance(uploaded, str) else (uploaded.name if hasattr(uploaded, "name") else str(uploaded))
626
- try:
627
- with open(path, "r", encoding="utf-8") as fh:
628
- raw = fh.read()
629
- parsed = None
630
- try:
631
- parsed = json.loads(raw)
632
- except Exception:
633
- parsed = None
634
- added = 0
635
- if isinstance(parsed, dict):
636
- with MEMORY_LOCK:
637
- for k, v in parsed.get("words", {}).items():
638
- memory["words"][k.lower()] = memory["words"].get(k.lower(), 0) + int(v)
639
- added += 1
640
- for k, v in parsed.get("phrases", {}).items():
641
- memory["phrases"][k] = memory["phrases"].get(k, 0) + int(v)
642
- added += 1
643
- save_memory(memory)
644
- return f"Imported memory JSON entries: {added}"
645
- # fallback to line-per-entry
646
- lines = [l.strip() for l in raw.splitlines() if l.strip()]
647
- with MEMORY_LOCK:
648
- for line in lines:
649
- if "," in line:
650
- k, c = line.split(",", 1)
651
- try:
652
- cnt = int(c)
653
- except:
654
- cnt = 1
655
- memory["words"][k.lower()] = memory["words"].get(k.lower(), 0) + cnt
656
- else:
657
- # short lines -> words, longer -> phrase
658
- if len(line.split()) <= 3:
659
- memory["words"][line.lower()] = memory["words"].get(line.lower(), 0) + 1
660
- else:
661
- memory["phrases"][line] = memory["phrases"].get(line, 0) + 1
662
- added += 1
663
- save_memory(memory)
664
- return f"Imported {added} entries from text."
665
- except Exception as e:
666
- return f"Import failed: {e}"
667
 
668
- def _add_mem(txt):
669
- if not txt or not txt.strip():
670
  return "No entry provided."
671
- e = txt.strip()
672
  with MEMORY_LOCK:
673
  if len(e.split()) <= 3:
674
  memory["words"][e.lower()] = memory["words"].get(e.lower(), 0) + 1
@@ -690,20 +1089,57 @@ with gr.Blocks(title="Whisper Transcriber - single/multi/zip", css=CSS) as demo:
690
  w = memory.get("words", {})
691
  p = memory.get("phrases", {})
692
  out_lines = []
693
- out_lines.append("WORDS (top 50):")
694
- for k, v in sorted(w.items(), key=lambda kv: -kv[1])[:50]:
695
  out_lines.append(f"{k}: {v}")
696
  out_lines.append("")
697
- out_lines.append("PHRASES (top 50):")
698
- for k, v in sorted(p.items(), key=lambda kv: -kv[1])[:50]:
699
  out_lines.append(f"{k}: {v}")
700
  return "\n".join(out_lines)
701
 
702
- mem_import_btn.click(fn=_import_mem, inputs=[mem_upload], outputs=[mem_status])
703
- mem_add_btn.click(fn=_add_mem, inputs=[mem_add_text], outputs=[mem_status])
704
  mem_clear_btn.click(fn=_clear_mem, inputs=[], outputs=[mem_status])
705
  mem_view_btn.click(fn=_view_mem, inputs=[], outputs=[mem_status])
706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
707
  # ---------- Launch ----------
708
  if __name__ == "__main__":
709
  port = int(os.environ.get("PORT", 7860))
 
1
  # app.py
2
+ # Whisper Transcriber Gradio 3.x compatible full file
3
+ # Requirements: gradio (3.x), whisper, pydub, pyzipper, python-docx, ffmpeg installed
4
 
5
  import os
6
  import sys
 
11
  import traceback
12
  import threading
13
  import re
14
+ import zipfile
15
  from difflib import get_close_matches
 
16
  from uuid import uuid4
17
+ from pathlib import Path
18
 
19
+ # Force unbuffered prints for logs
20
  os.environ["PYTHONUNBUFFERED"] = "1"
21
 
22
  print("DEBUG: app.py bootstrap starting", flush=True)
23
 
24
+ # Third-party imports (ensure installed)
25
  try:
26
  import gradio as gr
27
  import whisper
 
34
  raise
35
 
36
  # ---------- Config ----------
 
37
  MEMORY_FILE = "memory.json"
38
  MEMORY_LOCK = threading.Lock()
39
+ MIN_WAV_SIZE = 1024
40
  FFMPEG_CANDIDATES = [
41
  ("s16le", 16000, 1),
42
  ("s16le", 44100, 2),
 
45
  ("mulaw", 8000, 1),
46
  ]
47
  MODEL_CACHE = {}
48
+ EXTRACT_MAP = {} # friendly_name -> absolute path
49
 
50
+ # ---------- Memory & postprocessing ----------
51
  def load_memory():
52
  try:
53
  if os.path.exists(MEMORY_FILE):
 
80
 
81
  memory = load_memory()
82
 
 
83
  MEDICAL_ABBREVIATIONS = {
84
  "pt": "patient",
85
  "dx": "diagnosis",
 
93
  "adm": "admit",
94
  "disch": "discharge",
95
  }
 
96
  DRUG_NORMALIZATION = {
97
  "metformin": "Metformin",
98
  "aspirin": "Aspirin",
 
214
  return filename
215
 
216
 
217
+ def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
 
218
  try:
219
+ cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"]
220
+ if fmt in ("s16le", "pcm_s16le", "mulaw"):
221
+ cmd += ["-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path, out_path]
222
+ else:
223
+ cmd += ["-i", input_path, "-ar", str(sr), "-ac", str(ch), out_path]
224
  proc = subprocess.run(cmd, capture_output=True, timeout=60, text=True)
225
+ stdout_stderr = (proc.stdout or "") + (proc.stderr or "")
226
  if proc.returncode == 0 and os.path.exists(out_path) and os.path.getsize(out_path) > MIN_WAV_SIZE:
227
+ return True, stdout_stderr
228
  else:
229
  try:
230
  if os.path.exists(out_path):
231
  os.unlink(out_path)
232
  except Exception:
233
  pass
234
+ return False, stdout_stderr
235
  except Exception as e:
236
  try:
237
  if os.path.exists(out_path):
 
247
  if lower.endswith(".wav"):
248
  return input_path
249
 
250
+ auto_err = ""
251
  tmp = None
252
  try:
253
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
 
261
  except Exception:
262
  pass
263
  except Exception:
264
+ auto_err = traceback.format_exc()
265
  try:
266
  if tmp and os.path.exists(tmp.name):
267
  os.unlink(tmp.name)
268
  except Exception:
269
  pass
270
 
271
+ # ffmpeg fallback
272
  diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
273
  diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
274
  diagnostics = []
275
  for fmt, sr, ch in FFMPEG_CANDIDATES:
276
  out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
277
  out_wav.close()
278
+ success, debug = _ffmpeg_convert(input_path, out_wav.name, fmt, sr, ch)
279
  diagnostics.append(f"TRY fmt={fmt} sr={sr} ch={ch} success={success}\n{debug}\n")
280
  if success:
281
  try:
282
  with open(diag_log, "w", encoding="utf-8") as fh:
283
+ fh.write("pydub auto error:\n")
284
+ fh.write(auto_err + "\n\n")
285
+ fh.write("Successful ffmpeg candidate:\n")
286
+ fh.write(f"fmt={fmt} sr={sr} ch={ch}\n\n")
287
  fh.write("Diagnostics:\n")
288
  fh.write("\n".join(diagnostics))
289
  except Exception:
 
296
  except Exception:
297
  pass
298
 
299
+ try:
300
+ fp = subprocess.run(
301
+ ["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
302
+ capture_output=True,
303
+ text=True,
304
+ timeout=10,
305
+ )
306
+ diagnostics.append("FFPROBE:\n" + (fp.stdout.strip() or fp.stderr.strip()))
307
+ except Exception as e:
308
+ diagnostics.append("ffprobe failed: " + str(e))
309
  try:
310
  with open(input_path, "rb") as fh:
311
  head = fh.read(512)
 
315
 
316
  try:
317
  with open(diag_log, "w", encoding="utf-8") as fh:
318
+ fh.write("pydub auto error:\n")
319
+ fh.write(auto_err + "\n\n")
320
  fh.write("Full diagnostics:\n\n")
321
  fh.write("\n\n".join(diagnostics))
322
  except Exception as e:
 
349
 
350
 
351
  def get_whisper_model(name, device=None):
352
+ if name not in MODEL_CACHE:
353
+ print(f"DEBUG: loading whisper model '{name}'", flush=True)
 
354
  try:
355
+ if device:
356
+ MODEL_CACHE[name] = whisper.load_model(name, device=device)
357
  else:
358
+ MODEL_CACHE[name] = whisper.load_model(name)
359
  except TypeError:
360
+ MODEL_CACHE[name] = whisper.load_model(name)
361
+ return MODEL_CACHE[name]
362
+
363
+
364
+ # ---------- SRT helper ----------
365
def segments_to_srt(segments):
    """Render transcription segments as SubRip (SRT) subtitle text.

    Args:
        segments: Iterable of mappings. Each may provide ``start`` and ``end``
            (seconds, defaulting to 0) and ``text`` (defaulting to empty).
            ``None`` is treated as "no segments".

    Returns:
        A string in which every segment contributes four lines: its 1-based
        index, ``HH:MM:SS,mmm --> HH:MM:SS,mmm``, the stripped text, and a
        blank line. The lines are joined with ``"\\n"``; an empty input
        yields an empty string.
    """

    def fmt_time(t):
        # Convert to whole milliseconds once, with rounding. Taking the
        # fractional part via float subtraction and truncating it loses a
        # millisecond on values such as 1.001 (-> 0.99999... -> 0).
        total_ms = max(0, int(round(float(t or 0) * 1000)))
        total_s, ms = divmod(total_ms, 1000)
        total_m, s = divmod(total_s, 60)
        h, m = divmod(total_m, 60)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    lines = []
    for i, seg in enumerate(segments or [], start=1):
        start = seg.get("start", 0)
        end = seg.get("end", 0)
        # Tolerate an explicit None text instead of crashing on .strip().
        text = (seg.get("text") or "").strip()
        lines.extend((str(i), f"{fmt_time(start)} --> {fmt_time(end)}", text, ""))
    return "\n".join(lines)
383
+
384
+
385
+ # ---------- ZIP extraction (per-run dir) ----------
386
+ def extract_zip_and_map(zip_path, zip_password=None):
387
+ global EXTRACT_MAP
388
+ EXTRACT_MAP = {}
389
+ run_id = uuid4().hex
390
+ temp_extract_dir = os.path.join(tempfile.gettempdir(), f"extracted_audio_{run_id}")
391
  logs = []
392
  try:
393
+ os.makedirs(temp_extract_dir, exist_ok=True)
394
  with pyzipper.ZipFile(zip_path, "r") as zf:
395
  if zip_password:
396
  try:
397
  zf.setpassword(zip_password.encode())
398
  except Exception:
399
+ logs.append("Warning: failed to set zip password (continuing).")
400
+ count = {}
401
+ supported = [".mp3", ".wav", ".aac", ".flac", ".ogg", ".m4a", ".dat", ".dct"]
402
  for info in zf.infolist():
403
  if info.is_dir():
404
  continue
405
  _, ext = os.path.splitext(info.filename)
406
+ if ext.lower() not in supported:
407
  continue
408
  try:
409
  zf.extract(info, path=temp_extract_dir)
 
416
  fullp = os.path.normpath(os.path.join(temp_extract_dir, info.filename))
417
  if not os.path.exists(fullp):
418
  continue
419
+ base = os.path.basename(info.filename)
420
+ key = base
421
+ if key in EXTRACT_MAP:
422
+ idx = count.get(base, 1) + 1
423
+ count[base] = idx
424
+ name_only, extn = os.path.splitext(base)
425
+ key = f"{name_only} ({idx}){extn}"
426
+ else:
427
+ count[base] = 1
428
+ EXTRACT_MAP[key] = fullp
429
  logs.append(f"Extracted: {info.filename} -> {key}")
430
+ if not EXTRACT_MAP:
431
  logs.append("No supported audio files found in ZIP.")
432
+ return [], "\n".join(logs)
433
+ friendly = sorted(EXTRACT_MAP.keys())
434
+ return friendly, "\n".join(logs)
435
+ except Exception as e:
436
+ traceback.print_exc()
437
+ try:
438
+ if os.path.exists(temp_extract_dir):
439
+ shutil.rmtree(temp_extract_dir)
440
+ except Exception:
441
+ pass
442
+ return [], f"Extraction failed: {e}"
443
+
444
+
445
+ # ---------- Trim helper used in two-pass ----------
446
def trim_audio_segment(src_path, start_sec, end_sec):
    """Cut a time range out of an audio file into a new temporary WAV.

    Runs ``ffmpeg`` with ``-ss start_sec -to end_sec`` on the source and
    writes the result at 16000 Hz, one channel, to a freshly created
    temporary ``.wav`` file.

    Args:
        src_path: Source audio path (converted with ``str``).
        start_sec: Value passed to ffmpeg's ``-ss`` option.
        end_sec: Value passed to ffmpeg's ``-to`` option.

    Returns:
        Path of the temporary WAV file; the caller is responsible for
        deleting it.

    Raises:
        Exception: If ffmpeg exits non-zero, produces no file, or produces a
            file smaller than ``MIN_WAV_SIZE``. Exceptions raised while
            running ffmpeg propagate unchanged. In every failure case the
            temporary file is removed on a best-effort basis first.
    """

    def _discard(target):
        # Best-effort removal; a cleanup failure must not mask the real error.
        try:
            if os.path.exists(target):
                os.unlink(target)
        except Exception:
            pass

    source = str(src_path)
    # Reserve a unique output name; ffmpeg overwrites the placeholder (-y).
    placeholder = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    placeholder.close()
    out_path = placeholder.name
    try:
        command = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"]
        command += ["-ss", str(start_sec), "-to", str(end_sec)]
        command += ["-i", source, "-ar", "16000", "-ac", "1", out_path]
        completed = subprocess.run(command, capture_output=True, timeout=30, text=True)
        usable = (
            completed.returncode == 0
            and os.path.exists(out_path)
            and os.path.getsize(out_path) >= MIN_WAV_SIZE
        )
        if not usable:
            _discard(out_path)
            raise Exception(f"ffmpeg trim failed: {completed.stderr or completed.stdout}")
        return out_path
    except Exception:
        _discard(out_path)
        raise
486
 
487
 
488
+ # ---------- Core transcription (single file, supports two-pass) ----------
489
+ def transcribe_single_file(
490
+ path,
491
+ model_name="small",
492
+ device_choice="auto",
493
+ enable_memory=False,
494
+ generate_srt=False,
495
+ use_two_pass=False,
496
+ fast_model="small",
497
+ refine_model=None,
498
+ refine_threshold=-1.0,
499
+ ):
500
  logs = []
501
  try:
502
  if not path:
503
+ return None, None, "No file provided."
504
+ p = path.name if hasattr(path, "name") else str(path)
505
+ device = None if device_choice == "auto" else device_choice
506
+
507
+ if not use_two_pass:
508
+ model = get_whisper_model(model_name, device=device)
509
+ logs.append(f"Loaded model: {model_name}")
510
+ wav = convert_to_wav_if_needed(p)
511
+ logs.append(f"Converted to WAV: {os.path.basename(wav)}")
512
+ result = model.transcribe(wav)
513
+ text = result.get("text", "").strip()
514
+ if enable_memory:
515
+ text = memory_correct_text(text)
516
+ text = postprocess_transcript(text)
517
+ srt_path = None
518
+ if generate_srt and result.get("segments"):
519
+ srt_text = segments_to_srt(result["segments"])
520
+ srt_fp = os.path.join(tempfile.gettempdir(), f"{os.path.splitext(os.path.basename(p))[0]}.srt")
521
+ with open(srt_fp, "w", encoding="utf-8") as fh:
522
+ fh.write(srt_text)
523
+ srt_path = srt_fp
524
+ logs.append(f"SRT generated: {srt_path}")
525
+ if enable_memory:
526
+ try:
527
+ update_memory_with_transcript(text)
528
+ logs.append("Memory updated.")
529
+ except Exception:
530
+ pass
531
+ if wav and os.path.exists(wav) and wav != p:
532
+ try:
533
+ os.unlink(wav)
534
+ except Exception:
535
+ pass
536
+ return text, srt_path, "\n".join(logs)
537
+
538
+ # Two-pass
539
+ if refine_model is None:
540
+ refine_model = model_name
541
+
542
+ logs.append(f"Two-pass enabled: fast_model={fast_model}, refine_model={refine_model}, threshold={refine_threshold}")
543
+
544
+ fast = get_whisper_model(fast_model, device=device)
545
+ logs.append(f"Loaded fast model: {fast_model}")
546
  wav = convert_to_wav_if_needed(p)
547
+ logs.append(f"Converted to WAV: {os.path.basename(wav)}")
548
+
549
+ fast_result = fast.transcribe(wav)
550
+ segments = fast_result.get("segments") or []
551
+
552
+ if not segments:
553
+ text = fast_result.get("text", "").strip()
554
+ if enable_memory:
555
+ text = memory_correct_text(text)
556
+ update_memory_with_transcript(text)
557
+ text = postprocess_transcript(text)
558
+ srt_ret = None
559
+ if generate_srt and fast_result.get("segments"):
560
+ srt_text = segments_to_srt(fast_result["segments"])
561
+ srt_fp = os.path.join(tempfile.gettempdir(), f"{os.path.splitext(os.path.basename(p))[0]}.srt")
562
+ with open(srt_fp, "w", encoding="utf-8") as fh:
563
+ fh.write(srt_text)
564
+ srt_ret = srt_fp
565
+ logs.append(f"SRT generated: {srt_fp}")
566
+ if wav and os.path.exists(wav) and wav != p:
567
+ try:
568
+ os.unlink(wav)
569
+ except Exception:
570
+ pass
571
+ return text, srt_ret, "\n".join(logs)
572
+
573
+ refined_segments = []
574
+ segments_to_refine = []
575
+ for seg in segments:
576
+ seg_text = seg.get("text", "").strip()
577
+ if enable_memory:
578
+ corrected = memory_correct_text(seg_text)
579
+ else:
580
+ corrected = seg_text
581
+ seg_copy = dict(seg)
582
+ seg_copy["text"] = corrected
583
+ refined_segments.append(seg_copy)
584
+ avg_lp = seg.get("avg_logprob", None)
585
+ if avg_lp is None:
586
+ continue
587
+ try:
588
+ if float(avg_lp) < float(refine_threshold):
589
+ segments_to_refine.append(seg_copy)
590
+ except Exception:
591
+ continue
592
+
593
+ logs.append(f"Fast pass: {len(segments)} segments, {len(segments_to_refine)} to refine.")
594
+
595
+ if segments_to_refine:
596
+ refine = get_whisper_model(refine_model, device=device)
597
+ logs.append(f"Loaded refine model: {refine_model}")
598
+ for seg in segments_to_refine:
599
+ start = seg.get("start", 0.0)
600
+ end = seg.get("end", start + seg.get("duration", 0.0))
601
+ if end <= start:
602
+ continue
603
+ try:
604
+ seg_wav = trim_audio_segment(wav, start, end)
605
+ r_result = refine.transcribe(seg_wav)
606
+ new_text = r_result.get("text", "").strip()
607
+ if enable_memory:
608
+ new_text = memory_correct_text(new_text)
609
+ for rs in refined_segments:
610
+ if abs(rs.get("start", 0.0) - start) < 0.001 and abs(rs.get("end", 0.0) - end) < 0.001:
611
+ rs["text"] = new_text
612
+ if r_result.get("segments"):
613
+ rs["avg_logprob"] = r_result["segments"][0].get("avg_logprob", rs.get("avg_logprob"))
614
+ break
615
+ try:
616
+ if os.path.exists(seg_wav):
617
+ os.unlink(seg_wav)
618
+ except Exception:
619
+ pass
620
+ except Exception as e:
621
+ logs.append(f"Refine failed for {start}-{end}: {e}")
622
+ continue
623
+
624
+ full_text_parts = [s.get("text", "").strip() for s in sorted(refined_segments, key=lambda x: x.get("start", 0.0))]
625
+ combined_text = " ".join([p for p in full_text_parts if p])
626
  if enable_memory:
627
+ combined_text = memory_correct_text(combined_text)
628
  try:
629
+ update_memory_with_transcript(combined_text)
630
+ logs.append("Memory updated.")
631
  except Exception:
632
  pass
633
+ combined_text = postprocess_transcript(combined_text)
634
+
635
+ srt_path = None
636
+ if generate_srt:
637
+ srt_segs = []
638
+ for rs in sorted(refined_segments, key=lambda x: x.get("start", 0.0)):
639
+ srt_segs.append({"start": rs.get("start", 0.0), "end": rs.get("end", 0.0), "text": rs.get("text", "")})
640
+ srt_text = segments_to_srt(srt_segs)
641
+ srt_fp = os.path.join(tempfile.gettempdir(), f"{os.path.splitext(os.path.basename(p))[0]}_two_pass.srt")
642
+ with open(srt_fp, "w", encoding="utf-8") as fh:
643
+ fh.write(srt_text)
644
+ srt_path = srt_fp
645
+ logs.append(f"SRT generated: {srt_path}")
646
+
647
+ if wav and os.path.exists(wav) and wav != p:
648
  try:
649
  os.unlink(wav)
650
  except Exception:
651
  pass
652
+
653
+ return combined_text, srt_path, "\n".join(logs)
654
+
655
  except Exception as e:
656
  tb = traceback.format_exc()
657
+ return "", None, f"Transcription error: {e}\n{tb}"
658
 
659
 
660
+ # ---------- Batch transcribe ----------
661
def _batch_upload_to_path(entry):
    """Return the filesystem path carried by one upload entry, or None.

    Accepts a path string / ``os.PathLike``, a dict with a ``"name"`` key, or
    any object exposing a ``name`` attribute (the same shapes
    ``import_memory_files`` accepts).
    """
    if isinstance(entry, (str, os.PathLike)):
        candidate = str(entry)
    elif isinstance(entry, dict):
        candidate = entry.get("name")
    else:
        candidate = getattr(entry, "name", None)
    return candidate if isinstance(candidate, str) and candidate else None


def batch_transcribe(friendly_selected, uploaded_files, model_name, device_name, merge_flag, enable_mem, generate_srt, use_two_pass=False, fast_model="small", refine_threshold=-1.0):
    """Transcribe several audio files and optionally merge the results.

    Args:
        friendly_selected: Keys to look up in ``EXTRACT_MAP`` (files taken
            from a previously extracted ZIP); may be empty or None.
        uploaded_files: A single upload or a list/tuple of uploads; each may
            be a path, a dict with ``"name"``, or an object with ``.name``.
        model_name: Model passed to ``transcribe_single_file`` both as
            ``model_name`` and as ``refine_model``.
        device_name: Forwarded as ``device_choice``.
        merge_flag: When true, the combined transcript is written with
            ``save_as_word``.
        enable_mem: Forwarded as ``enable_memory``.
        generate_srt: Forwarded as ``generate_srt``.
        use_two_pass, fast_model, refine_threshold: Forwarded unchanged.

    Returns:
        Tuple ``(combined_text, logs, docx_path_or_None, first_srt_or_None)``.
    """
    logs = []
    paths = []

    for key in friendly_selected or []:
        mapped = EXTRACT_MAP.get(key)
        if mapped:
            paths.append(mapped)
        else:
            logs.append(f"Warning: selected not found in extract map: {key}")

    if uploaded_files:
        if isinstance(uploaded_files, (list, tuple)):
            entries = uploaded_files
        else:
            entries = [uploaded_files]
        for entry in entries:
            upload_path = _batch_upload_to_path(entry)
            if upload_path:
                paths.append(upload_path)
            else:
                # str(entry) would only yield a repr here, never a real path.
                logs.append(f"Warning: skipped unusable upload entry: {entry!r}")

    if not paths:
        # Keep any warnings collected above so the user can see why nothing ran.
        return "", "\n".join(logs + ["No files selected or uploaded."]), None, None

    transcripts = []
    srt_files = []
    out_doc = None
    total = len(paths)
    for idx, p in enumerate(paths, start=1):
        logs.append(f"[{idx}/{total}] Processing: {p}")
        text, srt_path, lg = transcribe_single_file(
            p,
            model_name=model_name,
            device_choice=device_name,
            enable_memory=enable_mem,
            generate_srt=generate_srt,
            use_two_pass=use_two_pass,
            fast_model=fast_model,
            refine_model=model_name,
            refine_threshold=refine_threshold,
        )
        logs.append(lg)
        transcripts.append(f"FILE: {os.path.basename(p)}\n{text}\n")
        if srt_path:
            srt_files.append(srt_path)

    combined = "\n\n".join(transcripts)

    if merge_flag:
        try:
            out_doc = save_as_word(combined)
            logs.append(f"Merged saved: {out_doc}")
        except Exception as e:
            # A failed merge must not discard the transcripts already produced.
            logs.append(f"Merge failed: {e}")

    # Only one SRT can be returned; the first generated one is handed back.
    srt_return = srt_files[0] if srt_files else None
    return combined, "\n".join(logs), out_doc, srt_return
710
+
711
+
712
+ # ---------- Robust multi-file memory importer ----------
713
+ def _read_file_text_try_encodings(path):
714
+ """
715
+ Try multiple encodings to read a text file. Returns tuple (text(str), encoding_used or None).
716
+ On failure returns (None, None).
717
+ """
718
+ encodings = ["utf-8", "utf-16", "latin-1"]
719
+ for enc in encodings:
720
+ try:
721
+ with open(path, "r", encoding=enc) as fh:
722
+ return fh.read(), enc
723
+ except UnicodeDecodeError:
724
+ continue
725
+ except Exception:
726
+ break
727
+
728
+ # Last resort: try open as binary and attempt utf-8 with errors='replace'
729
  try:
730
+ with open(path, "rb") as fh:
731
+ raw = fh.read()
732
+ try:
733
+ text = raw.decode("utf-8")
734
+ return text, "utf-8(guessed)"
735
+ except Exception:
736
+ text = raw.decode("latin-1", errors="replace")
737
+ return text, "latin-1(replaced)"
738
+ except Exception:
739
+ return None, None
740
+
741
+
742
def _memory_entry_count(value):
    """Coerce an imported count to int, defaulting to 1 when it is not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return 1


def _process_single_memory_text(text):
    """Merge the contents of one imported file into the global ``memory`` dict.

    Supported inputs:
      * a JSON object with optional ``"words"`` / ``"phrases"`` mappings of
        entry -> count (non-mapping values are ignored);
      * a JSON array of strings, each handled like a plain-text line;
      * plain text with one entry per line. ``entry,count`` adds ``count`` to
        the word ``entry``; ``entry,`` counts once. A comma that is not
        followed by an integer is treated as part of the entry itself.
        Entries of up to three whitespace-separated tokens are stored
        lower-cased under ``"words"``, longer ones verbatim under
        ``"phrases"``.

    Returns:
        The number of entries merged.
    """
    if not text:
        return 0

    # Parse outside the lock; only the dict updates need to be serialised.
    try:
        parsed = json.loads(text)
    except (TypeError, ValueError):
        parsed = None

    added = 0

    if isinstance(parsed, dict):
        words = parsed.get("words")
        phrases = parsed.get("phrases")
        if not isinstance(words, dict):
            words = {}
        if not isinstance(phrases, dict):
            phrases = {}
        with MEMORY_LOCK:
            word_bucket = memory.setdefault("words", {})
            phrase_bucket = memory.setdefault("phrases", {})
            for raw_key, raw_count in words.items():
                key = str(raw_key).lower()
                word_bucket[key] = word_bucket.get(key, 0) + _memory_entry_count(raw_count)
                added += 1
            for raw_key, raw_count in phrases.items():
                key = str(raw_key)
                phrase_bucket[key] = phrase_bucket.get(key, 0) + _memory_entry_count(raw_count)
                added += 1
        return added

    if isinstance(parsed, list):
        candidates = [item for item in parsed if isinstance(item, str)]
    else:
        candidates = text.splitlines()
    lines = [entry.strip() for entry in candidates if entry.strip()]

    with MEMORY_LOCK:
        word_bucket = memory.setdefault("words", {})
        phrase_bucket = memory.setdefault("phrases", {})
        for line in lines:
            if "," in line:
                head, tail = (part.strip() for part in line.split(",", 1))
                count = None
                if not tail:
                    count = 1
                else:
                    try:
                        count = int(tail)
                    except ValueError:
                        # The comma belongs to the phrase; keep the whole line.
                        count = None
                if count is not None:
                    if head:
                        key = head.lower()
                        word_bucket[key] = word_bucket.get(key, 0) + count
                        added += 1
                    continue
            # Short entries are treated as words, longer ones as phrases.
            if len(line.split()) <= 3:
                key = line.lower()
                word_bucket[key] = word_bucket.get(key, 0) + 1
            else:
                phrase_bucket[line] = phrase_bucket.get(line, 0) + 1
            added += 1
    return added
795
+
796
+
797
# Upper bound on bytes read from a single ZIP member during memory import.
_MEMORY_ZIP_MEMBER_LIMIT = 20 * 1024 * 1024


def _memory_upload_path(entry):
    """Return the path carried by one upload entry (str, PathLike, dict or object with .name), else None."""
    if isinstance(entry, (str, os.PathLike)):
        return str(entry)
    if isinstance(entry, dict):
        name = entry.get("name")
    else:
        name = getattr(entry, "name", None)
    return name if isinstance(name, str) and name else None


def _decode_memory_bytes(raw):
    """Decode bytes using the BOM if present, else UTF-8, falling back to latin-1."""
    if raw.startswith(b"\xef\xbb\xbf"):
        preferred = "utf-8-sig"
    elif raw.startswith((b"\xff\xfe", b"\xfe\xff")):
        preferred = "utf-16"
    else:
        preferred = "utf-8"
    try:
        return raw.decode(preferred)
    except UnicodeError:
        # latin-1 maps every byte, so this cannot fail.
        return raw.decode("latin-1")


def _import_memory_zip(fp, messages):
    """Import every file member of the ZIP at ``fp``; return the entries added.

    Per-member failures are reported in ``messages`` and do not abort the
    archive. ``zipfile.BadZipFile`` from opening the archive propagates.
    """
    added_total = 0
    with zipfile.ZipFile(fp, "r") as zf:
        for info in zf.infolist():
            if info.is_dir():
                continue
            name = info.filename
            try:
                with zf.open(info) as member:
                    # Read one byte past the cap so oversized members are detectable
                    # without trusting the size recorded in the archive header.
                    raw = member.read(_MEMORY_ZIP_MEMBER_LIMIT + 1)
                if len(raw) > _MEMORY_ZIP_MEMBER_LIMIT:
                    messages.append(f"Skipped oversized ZIP member {name}")
                    continue
                added = _process_single_memory_text(_decode_memory_bytes(raw))
            except Exception as e:
                # Best effort: one unreadable member should not lose the rest.
                messages.append(f"Failed ZIP member {name}: {e}")
                continue
            added_total += added
            messages.append(f"Imported {added} from ZIP member {name}")
    messages.append(f"Processed ZIP: {os.path.basename(fp)}")
    return added_total


def import_memory_files(uploaded_files):
    """Import correction-memory entries from uploaded files.

    Accepts a single upload or a list/tuple of uploads; each upload may be a
    path, a dict with a ``"name"`` key, or an object with a ``name``
    attribute. Files ending in ``.zip`` are opened and every member is
    imported; other files are read as text. Parsed entries are merged through
    ``_process_single_memory_text`` and the memory is then saved.

    Returns:
        A multi-line, human-readable status string.
    """
    if not uploaded_files:
        return "No files provided."

    messages = []
    if isinstance(uploaded_files, (list, tuple)):
        file_paths = []
        for entry in uploaded_files:
            entry_path = _memory_upload_path(entry)
            if entry_path is None:
                messages.append(f"Skipped unrecognised upload entry: {entry!r}")
            else:
                file_paths.append(entry_path)
    else:
        single_path = _memory_upload_path(uploaded_files)
        if single_path is None:
            return "Unable to interpret uploaded files."
        file_paths = [single_path]

    total_added = 0
    skipped = []

    for fp in file_paths:
        try:
            if not os.path.exists(fp):
                messages.append(f"Skipped missing: {fp}")
                continue
            if fp.lower().endswith(".zip"):
                try:
                    total_added += _import_memory_zip(fp, messages)
                except zipfile.BadZipFile:
                    messages.append(f"Bad zip: {fp}")
                continue
            # Otherwise read as text, trying several encodings.
            text, used_enc = _read_file_text_try_encodings(fp)
            if text is None:
                skipped.append(fp)
                continue
            added = _process_single_memory_text(text)
            total_added += added
            messages.append(f"Imported {added} from {os.path.basename(fp)} (enc={used_enc})")
        except Exception as e:
            skipped.append(f"{fp}: {e}")

    try:
        save_memory(memory)
    except Exception as e:
        # Report instead of hiding it: the import worked but was not persisted.
        messages.append(f"Warning: memory could not be saved: {e}")

    summary_lines = [f"Total entries added: {total_added}"]
    if messages:
        summary_lines.append("Details:")
        summary_lines.extend(messages)
    if skipped:
        summary_lines.append("Skipped/failed:")
        summary_lines.extend(skipped)
    return "\n".join(summary_lines)
885
+
886
+
887
+ # ---------- Build Gradio UI (3.x compatible) ----------
888
  print("DEBUG: building Gradio UI", flush=True)
889
  available_choices, default_choice = safe_model_choices(prefer_default="small")
890
 
891
  CSS = """
892
  :root{
893
  --accent:#4f46e5;
894
+ --muted:#6b7280;
895
+ --card:#ffffff;
896
+ --bg:#f7f8fb;
897
+ --text:#0f172a;
898
+ --transcript-bg:#0f172a;
899
  --transcript-color:#e6eef8;
900
  }
901
+ [data-theme="dark"] {
902
+ --accent: #7c3aed;
903
+ --muted: #9ca3af;
904
+ --card: #0b1220;
905
+ --bg: #071022;
906
+ --text: #e6eef8;
907
+ --transcript-bg: #071026;
908
+ --transcript-color: #e6eef8;
909
+ }
910
  body { background: var(--bg); color: var(--text); font-family: Inter, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
911
+ .header { padding: 14px; border-radius: 10px; background: linear-gradient(90deg, rgba(79,70,229,0.08), rgba(99,102,241,0.02)); margin-bottom: 12px; display:flex;align-items:center;gap:12px; }
912
+ .app-icon { width:50px;height:50px;border-radius:10px;background:linear-gradient(135deg,var(--accent),#06b6d4);display:flex;align-items:center;justify-content:center;color:white;font-weight:700;font-size:20px; }
913
+ .card { background:var(--card); border-radius:10px; padding:12px; box-shadow: 0 6px 20px rgba(16,24,40,0.04); }
914
+ .transcript-area { white-space:pre-wrap; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, "Roboto Mono", monospace; background: var(--transcript-bg); color: var(--transcript-color); padding:12px; border-radius:8px; min-height:200px; }
915
  .small-note { color:var(--muted); font-size:12px;}
 
916
  """
917
 
918
+ with gr.Blocks(title="Whisper Transcriber (3.x)", css=CSS) as demo:
919
+ # Theme initializer + toggle injected via HTML (works across gradio versions)
920
+ gr.HTML("""
921
+ <script>
922
+ (function() {
923
+ try {
924
+ const saved = localStorage.getItem('wt_theme');
925
+ if (saved) {
926
+ document.documentElement.setAttribute('data-theme', saved);
927
+ } else {
928
+ document.documentElement.setAttribute('data-theme', 'dark');
929
+ }
930
+ } catch (e) { console.warn('theme init failed', e); }
931
+ })();
932
+ </script>
933
+ """)
934
+
935
+ # Header
936
+ with gr.Row():
937
+ with gr.Column(scale=0):
938
+ gr.HTML("<div style='width:50px;height:50px;border-radius:10px;background:linear-gradient(135deg,#4f46e5,#06b6d4);display:flex;align-items:center;justify-content:center;color:white;font-weight:700;font-size:20px;'>WT</div>")
939
+ with gr.Column():
940
+ gr.Markdown("<h3 style='margin:0'>Whisper Transcriber (Gradio 3.x)</h3>")
941
+ gr.Markdown("<div class='small-note'>Two-pass speedup, per-run ZIP extraction, memory corrections, SRT export, dark theme default</div>")
942
 
943
  with gr.Tabs():
944
+ # Single audio
945
+ with gr.TabItem("Audio Transcribe"):
946
  with gr.Row():
947
  with gr.Column(scale=1):
948
+ gr.Markdown("### Input")
949
+ single_audio = gr.Audio(label="Upload or record audio", type="filepath")
950
+ model_select = gr.Dropdown(choices=available_choices, value=default_choice, label="Model")
951
+ device_choice = gr.Dropdown(choices=["auto", "cpu", "cuda"], value="auto", label="Device")
952
+ mem_toggle = gr.Checkbox(label="Enable memory corrections", value=False)
953
+ srt_toggle = gr.Checkbox(label="Generate SRT", value=False)
954
+ use_two_pass_single = gr.Checkbox(label="Use two-pass speedup (fast then refine)", value=False)
955
+ fast_model_choice = gr.Dropdown(choices=[c for c in ["tiny", "base", "small"] if c in AVAILABLE_MODEL_SET], value="small", label="Fast model")
956
+ refine_threshold_single = gr.Number(value=-1.0, label="Refine threshold (avg_logprob)", precision=2)
957
+ transcribe_btn = gr.Button("Transcribe", variant="primary")
958
  with gr.Column(scale=1):
959
+ gr.Markdown("### Output")
960
+ audio_preview = gr.Audio(interactive=False)
961
+ transcript_out = gr.Textbox(label="Transcript", lines=14, interactive=False)
962
+ srt_download = gr.File(label="SRT (if generated)")
963
  single_logs = gr.Textbox(label="Logs", lines=8, interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
964
 
965
+ def _single_action(audio_file, model_name, device, mem_on, srt_on, use_two_pass_flag, fast_model, refine_thresh):
966
+ if not audio_file:
967
+ return None, "", None, "No audio provided."
968
+ path = audio_file if isinstance(audio_file, str) else (audio_file.name if hasattr(audio_file, "name") else str(audio_file))
969
+ text, srt_path, logs = transcribe_single_file(
970
+ path,
971
+ model_name=model_name,
972
+ device_choice=device,
973
+ enable_memory=mem_on,
974
+ generate_srt=srt_on,
975
+ use_two_pass=use_two_pass_flag,
976
+ fast_model=fast_model,
977
+ refine_model=model_name,
978
+ refine_threshold=refine_thresh,
979
+ )
980
+ preview = audio_file
981
+ return preview, text, srt_path, logs
982
+
983
+ transcribe_btn.click(
984
+ fn=_single_action,
985
+ inputs=[single_audio, model_select, device_choice, mem_toggle, srt_toggle, use_two_pass_single, fast_model_choice, refine_threshold_single],
986
+ outputs=[audio_preview, transcript_out, srt_download, single_logs],
987
+ )
988
+
989
+ # Batch tab
990
+ with gr.TabItem("Batch Transcribe"):
991
  with gr.Row():
992
  with gr.Column(scale=1):
993
+ gr.Markdown("### Batch input")
994
+ batch_files = gr.File(label="Upload audio files (optional)", file_count="multiple", type="filepath")
995
+ batch_zip = gr.File(label="Or upload ZIP with audio (optional)", file_count="single", type="filepath")
996
+ zip_password = gr.Textbox(label="ZIP password (optional)")
997
+ batch_extract_btn = gr.Button("Extract ZIP & List files")
998
+ batch_extract_logs = gr.Textbox(label="Extraction logs", lines=6, interactive=False)
999
+ batch_select = gr.CheckboxGroup(choices=[], label="Select extracted files", interactive=True)
1000
+ batch_model = gr.Dropdown(choices=available_choices, value=default_choice, label="Model")
1001
+ batch_device = gr.Dropdown(choices=["auto", "cpu", "cuda"], value="auto", label="Device")
1002
+ batch_merge = gr.Checkbox(label="Merge transcripts to DOCX", value=True)
1003
+ batch_mem = gr.Checkbox(label="Enable memory corrections", value=False)
1004
+ batch_srt = gr.Checkbox(label="Generate SRT(s)", value=False)
1005
+ batch_use_two_pass = gr.Checkbox(label="Use two-pass speedup", value=False)
1006
+ batch_fast_model = gr.Dropdown(choices=[c for c in ["tiny", "base", "small"] if c in AVAILABLE_MODEL_SET], value="small", label="Fast model")
1007
+ batch_refine_threshold = gr.Number(value=-1.0, label="Refine threshold", precision=2)
1008
+ batch_run_btn = gr.Button("Start Batch Transcription", variant="primary")
1009
  with gr.Column(scale=1):
1010
+ gr.Markdown("### Batch Output")
1011
+ batch_trans_out = gr.Textbox(label="Transcript (combined)", lines=16, interactive=False)
1012
+ batch_logs = gr.Textbox(label="Logs", lines=10, interactive=False)
1013
+ batch_doc_download = gr.File(label="Merged DOCX (if created)")
1014
+ batch_srt_download = gr.File(label="First SRT (if any)")
1015
+
1016
def _do_extract(zip_file, password):
    """Click handler for "Extract ZIP & List files".

    Passes the uploaded archive's path and the password to
    ``extract_zip_and_map`` and returns an update for the file-selection
    checkbox group together with the extraction log text.
    """
    if zip_file:
        try:
            archive_path = zip_file.name
        except AttributeError:
            # No .name attribute: fall back to the string form of the value.
            archive_path = str(zip_file)
        names, extract_log = extract_zip_and_map(archive_path, password)
        return gr.update(choices=names), extract_log
    return gr.update(choices=[]), "No ZIP provided."
1022
+
1023
+ batch_extract_btn.click(fn=_do_extract, inputs=[batch_zip, zip_password], outputs=[batch_select, batch_extract_logs])
1024
+
1025
def _do_batch(friendly_selected, uploaded_files, model_name, device, merge_flag, mem_flag, srt_flag, use_two_pass_flag, fast_model, refine_thresh):
    """Click handler for "Start Batch Transcription".

    Forwards the UI values to ``batch_transcribe`` and returns its
    ``(combined_text, logs, docx_path, srt_path)`` result in the order of the
    button's output components.
    """
    # An emptied number field arrives as None; use the default threshold so
    # the two-pass comparison downstream still works.
    threshold = -1.0 if refine_thresh is None else refine_thresh
    return batch_transcribe(
        friendly_selected,
        uploaded_files,
        model_name,
        device,
        merge_flag,
        mem_flag,
        srt_flag,
        use_two_pass=use_two_pass_flag,
        fast_model=fast_model,
        refine_threshold=threshold,
    )
1039
+
1040
+ batch_run_btn.click(
1041
+ fn=_do_batch,
1042
+ inputs=[batch_select, batch_files, batch_model, batch_device, batch_merge, batch_mem, batch_srt, batch_use_two_pass, batch_fast_model, batch_refine_threshold],
1043
+ outputs=[batch_trans_out, batch_logs, batch_doc_download, batch_srt_download],
1044
+ )
1045
+
1046
+ # Memory tab (updated to accept multiple files or zips)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1047
  with gr.TabItem("Memory"):
1048
  with gr.Row():
1049
  with gr.Column(scale=1):
1050
+ gr.Markdown("### Correction Memory")
1051
+ mem_upload = gr.File(label="Import memory files (text/JSON/zip). You may select multiple files", file_count="multiple", type="filepath")
1052
+ mem_import_btn = gr.Button("Import memory files")
1053
+ mem_text = gr.Textbox(label="Add word/phrase", placeholder="Type word or phrase")
1054
+ mem_add_btn = gr.Button("Add to Memory")
1055
+ mem_clear_btn = gr.Button("Clear Memory")
1056
+ mem_view_btn = gr.Button("View Memory")
1057
+ mem_status = gr.Textbox(label="Memory status / preview", lines=12, interactive=False)
1058
+
1059
  with gr.Column(scale=1):
1060
+ gr.Markdown("### Memory controls")
1061
+ gr.Markdown("- JSON format: {\"words\": {\"word\": count}, \"phrases\": {\"phrase\": count}}")
1062
+ gr.Markdown("- Plain text: one word/phrase per line or `word,count` per line")
1063
+ gr.Markdown("- ZIP files: will be scanned and any text/JSON files imported")
1064
 
1065
+ mem_import_btn.click(fn=import_memory_files, inputs=[mem_upload], outputs=[mem_status])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1066
 
1067
+ def _add_mem(entry):
1068
+ if not entry or not entry.strip():
1069
  return "No entry provided."
1070
+ e = entry.strip()
1071
  with MEMORY_LOCK:
1072
  if len(e.split()) <= 3:
1073
  memory["words"][e.lower()] = memory["words"].get(e.lower(), 0) + 1
 
1089
  w = memory.get("words", {})
1090
  p = memory.get("phrases", {})
1091
  out_lines = []
1092
+ out_lines.append("WORDS (top 30):")
1093
+ for k, v in sorted(w.items(), key=lambda kv: -kv[1])[:30]:
1094
  out_lines.append(f"{k}: {v}")
1095
  out_lines.append("")
1096
+ out_lines.append("PHRASES (top 20):")
1097
+ for k, v in sorted(p.items(), key=lambda kv: -kv[1])[:20]:
1098
  out_lines.append(f"{k}: {v}")
1099
  return "\n".join(out_lines)
1100
 
1101
+ mem_add_btn.click(fn=_add_mem, inputs=[mem_text], outputs=[mem_status])
 
1102
  mem_clear_btn.click(fn=_clear_mem, inputs=[], outputs=[mem_status])
1103
  mem_view_btn.click(fn=_view_mem, inputs=[], outputs=[mem_status])
1104
 
1105
+ # Settings tab (theme toggle via injected HTML)
1106
+ with gr.TabItem("Settings"):
1107
+ with gr.Row():
1108
+ with gr.Column():
1109
+ gr.Markdown("### Runtime & tips")
1110
+ gr.Markdown("- Use `large-v3` only if your whisper package supports it.")
1111
+ gr.Markdown("- Extraction writes to a per-run temp directory under system temp.")
1112
+ gr.Markdown("- Two-pass helps when heavy model is slow.")
1113
+ with gr.Column():
1114
+ gr.Markdown("### Theme")
1115
+ gr.HTML("""
1116
+ <div style="display:flex;gap:8px;align-items:center;">
1117
+ <button id="wt_theme_btn" style="padding:8px 12px;border-radius:8px;border:1px solid rgba(0,0,0,0.06);background:var(--card);cursor:pointer;">
1118
+ Toggle Dark / Light Theme
1119
+ </button>
1120
+ <span style="color:var(--muted);font-size:13px;">Theme preference saved in browser</span>
1121
+ </div>
1122
+ <script>
1123
+ (function(){
1124
+ try {
1125
+ const root = document.documentElement;
1126
+ const btn = document.getElementById('wt_theme_btn');
1127
+ try {
1128
+ const saved = localStorage.getItem('wt_theme');
1129
+ if (saved) root.setAttribute('data-theme', saved);
1130
+ } catch(e){}
1131
+ btn.addEventListener('click', function(){
1132
+ try {
1133
+ const cur = root.getAttribute('data-theme') === 'dark' ? 'light' : 'dark';
1134
+ root.setAttribute('data-theme', cur);
1135
+ try { localStorage.setItem('wt_theme', cur); } catch(e){}
1136
+ } catch(e){ console.error(e); }
1137
+ });
1138
+ } catch(e){}
1139
+ })();
1140
+ </script>
1141
+ """)
1142
+
1143
  # ---------- Launch ----------
1144
  if __name__ == "__main__":
1145
  port = int(os.environ.get("PORT", 7860))