staraks committed on
Commit
847997b
·
verified ·
1 Parent(s): f22f266

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +175 -108
app.py CHANGED
@@ -1,6 +1,10 @@
1
  # app.py
2
- # Whisper Transcriber — Gradio 3.x compatible full file (chunked streaming)
3
- # Requirements: gradio (3.x), whisper, pydub, pyzipper, python-docx, ffmpeg installed
 
 
 
 
4
 
5
  import os
6
  import sys
@@ -14,13 +18,15 @@ import re
14
  from difflib import get_close_matches
15
  from uuid import uuid4
16
  from pathlib import Path
 
 
17
 
18
  # Force unbuffered prints for logs
19
  os.environ["PYTHONUNBUFFERED"] = "1"
20
 
21
  print("DEBUG: app.py bootstrap starting", flush=True)
22
 
23
- # Third-party imports (ensure installed)
24
  try:
25
  import gradio as gr
26
  import whisper
@@ -46,6 +52,40 @@ FFMPEG_CANDIDATES = [
46
  MODEL_CACHE = {}
47
  EXTRACT_MAP = {} # friendly_name -> absolute path
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # ---------- Memory & postprocessing ----------
50
  def load_memory():
51
  try:
@@ -67,7 +107,6 @@ def load_memory():
67
  pass
68
  return mem
69
 
70
-
71
  def save_memory(mem):
72
  with MEMORY_LOCK:
73
  try:
@@ -76,7 +115,6 @@ def save_memory(mem):
76
  except Exception:
77
  traceback.print_exc()
78
 
79
-
80
  memory = load_memory()
81
 
82
  MEDICAL_ABBREVIATIONS = {
@@ -98,7 +136,6 @@ DRUG_NORMALIZATION = {
98
  "amoxicillin": "Amoxicillin",
99
  }
100
 
101
-
102
  def expand_abbreviations(text):
103
  tokens = re.split(r"(\s+)", text)
104
  out = []
@@ -114,13 +151,11 @@ def expand_abbreviations(text):
114
  out.append(t)
115
  return "".join(out)
116
 
117
-
118
  def normalize_drugs(text):
119
  for k, v in DRUG_NORMALIZATION.items():
120
  text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
121
  return text
122
 
123
-
124
  def punctuation_and_capitalization(text):
125
  text = text.strip()
126
  if not text:
@@ -136,7 +171,6 @@ def punctuation_and_capitalization(text):
136
  out.append(p)
137
  return "".join(out)
138
 
139
-
140
  def postprocess_transcript(text):
141
  if not text:
142
  return text
@@ -146,13 +180,11 @@ def postprocess_transcript(text):
146
  t = punctuation_and_capitalization(t)
147
  return t
148
 
149
-
150
  def extract_words_and_phrases(text):
151
  words = re.findall(r"[A-Za-z0-9\-']+", text)
152
  sentences = [s.strip() for s in re.split(r"(?<=[.?!])\s+", text) if s.strip()]
153
  return [w for w in words if w.strip()], sentences
154
 
155
-
156
  def update_memory_with_transcript(transcript):
157
  global memory
158
  words, sentences = extract_words_and_phrases(transcript)
@@ -168,7 +200,6 @@ def update_memory_with_transcript(transcript):
168
  if changed:
169
  save_memory(memory)
170
 
171
-
172
  def memory_correct_text(text, min_ratio=0.85):
173
  if not text or (not memory.get("words") and not memory.get("phrases")):
174
  return text
@@ -202,7 +233,6 @@ def memory_correct_text(text, min_ratio=0.85):
202
  corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
203
  return corrected
204
 
205
-
206
  # ---------- Utilities ----------
207
  def save_as_word(text, filename=None):
208
  if filename is None:
@@ -212,7 +242,6 @@ def save_as_word(text, filename=None):
212
  doc.save(filename)
213
  return filename
214
 
215
-
216
  def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
217
  try:
218
  cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"]
@@ -239,7 +268,6 @@ def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
239
  pass
240
  return False, str(e)
241
 
242
-
243
  def convert_to_wav_if_needed(input_path):
244
  input_path = str(input_path)
245
  lower = input_path.lower()
@@ -322,7 +350,6 @@ def convert_to_wav_if_needed(input_path):
322
 
323
  raise Exception(f"Could not convert file to WAV. Diagnostics saved to: {diag_log}")
324
 
325
-
326
  # ---------- Whisper helper ----------
327
  def whisper_available_models():
328
  try:
@@ -333,10 +360,8 @@ def whisper_available_models():
333
  pass
334
  return set(["tiny", "base", "small", "medium", "large", "large-v3"])
335
 
336
-
337
  AVAILABLE_MODEL_SET = whisper_available_models()
338
 
339
-
340
  def safe_model_choices(prefer_default="small"):
341
  base_choices = ["small", "medium", "large", "large-v3", "base", "tiny"]
342
  choices = [m for m in base_choices if m in AVAILABLE_MODEL_SET]
@@ -345,7 +370,6 @@ def safe_model_choices(prefer_default="small"):
345
  default = prefer_default if prefer_default in choices else choices[0]
346
  return choices, default
347
 
348
-
349
  def get_whisper_model(name, device=None):
350
  if name not in MODEL_CACHE:
351
  print(f"DEBUG: loading whisper model '{name}'", flush=True)
@@ -358,7 +382,6 @@ def get_whisper_model(name, device=None):
358
  MODEL_CACHE[name] = whisper.load_model(name)
359
  return MODEL_CACHE[name]
360
 
361
-
362
  # ---------- SRT helper ----------
363
  def segments_to_srt(segments):
364
  def fmt_time(t):
@@ -379,7 +402,6 @@ def segments_to_srt(segments):
379
  lines.append("")
380
  return "\n".join(lines)
381
 
382
-
383
  # ---------- ZIP extraction (per-run dir) ----------
384
  def extract_zip_and_map(zip_path, zip_password=None):
385
  global EXTRACT_MAP
@@ -439,7 +461,6 @@ def extract_zip_and_map(zip_path, zip_password=None):
439
  pass
440
  return [], f"Extraction failed: {e}"
441
 
442
-
443
  # ---------- Trim helper used in two-pass ----------
444
  def trim_audio_segment(src_path, start_sec, end_sec):
445
  src = str(src_path)
@@ -482,8 +503,7 @@ def trim_audio_segment(src_path, start_sec, end_sec):
482
  pass
483
  raise
484
 
485
-
486
- # ---------- Core transcription (single file, supports two-pass) ----------
487
  def transcribe_single_file(
488
  path,
489
  model_name="small",
@@ -495,6 +515,7 @@ def transcribe_single_file(
495
  refine_model=None,
496
  refine_threshold=-1.0,
497
  ):
 
498
  logs = []
499
  try:
500
  if not path:
@@ -533,16 +554,13 @@ def transcribe_single_file(
533
  pass
534
  return text, srt_path, "\n".join(logs)
535
 
536
- # Two-pass path remains unchanged (not used by generator directly)
537
- # ... omitted here for brevity (two-pass logic same as previous full file) ...
538
- # For the generator flow we use chunking; two-pass heavy refinement is optional
539
- return "", None, "Two-pass is not invoked in this helper in streaming mode."
540
  except Exception as e:
541
  tb = traceback.format_exc()
542
  return "", None, f"Transcription error: {e}\n{tb}"
543
 
544
-
545
- # ---------- Batch transcribe (unchanged, uses transcribe_single_file) ----------
546
  def batch_transcribe(friendly_selected, uploaded_files, model_name, device_name, merge_flag, enable_mem, generate_srt, use_two_pass=False, fast_model="small", refine_threshold=-1.0):
547
  logs = []
548
  transcripts = []
@@ -593,7 +611,6 @@ def batch_transcribe(friendly_selected, uploaded_files, model_name, device_name,
593
  srt_return = srt_files[0] if srt_files else None
594
  return combined, "\n".join(logs), out_doc, srt_return
595
 
596
-
597
  # ---------- Build Gradio UI (3.x compatible) ----------
598
  print("DEBUG: building Gradio UI", flush=True)
599
  available_choices, default_choice = safe_model_choices(prefer_default="small")
@@ -631,7 +648,6 @@ with gr.Blocks(title="Whisper Transcriber (3.x)", css=CSS) as demo:
631
  <script>
632
  (function() {
633
  try {
634
- // Load saved preference or fall back to OS preference, then 'light'
635
  var saved = null;
636
  try { saved = localStorage.getItem('wt_theme'); } catch(e){ saved = null; }
637
  var chosen = null;
@@ -643,7 +659,6 @@ with gr.Blocks(title="Whisper Transcriber (3.x)", css=CSS) as demo:
643
  chosen = 'light';
644
  }
645
  document.documentElement.setAttribute('data-theme', chosen);
646
-
647
  try {
648
  var style = document.createElement('style');
649
  style.innerHTML = `
@@ -662,7 +677,7 @@ with gr.Blocks(title="Whisper Transcriber (3.x)", css=CSS) as demo:
662
  gr.HTML("<div style='width:50px;height:50px;border-radius:10px;background:linear-gradient(135deg,#4f46e5,#06b6d4);display:flex;align-items:center;justify-content:center;color:white;font-weight:700;font-size:20px;'>WT</div>")
663
  with gr.Column():
664
  gr.Markdown("<h3 style='margin:0'>Whisper Transcriber (Gradio 3.x)</h3>")
665
- gr.Markdown("<div class='small-note'>Two-pass speedup, per-run ZIP extraction, memory corrections, SRT export, dark/light toggle</div>")
666
 
667
  with gr.Tabs():
668
  # Single audio
@@ -675,118 +690,170 @@ with gr.Blocks(title="Whisper Transcriber (3.x)", css=CSS) as demo:
675
  device_choice = gr.Dropdown(choices=["auto", "cpu", "cuda"], value="auto", label="Device")
676
  mem_toggle = gr.Checkbox(label="Enable memory corrections", value=False)
677
  srt_toggle = gr.Checkbox(label="Generate SRT", value=False)
 
 
 
 
 
 
 
678
  use_two_pass_single = gr.Checkbox(label="Use two-pass speedup (fast then refine)", value=False)
679
- fast_model_choice = gr.Dropdown(choices=[c for c in ["tiny", "base", "small"] if c in AVAILABLE_MODEL_SET], value="small", label="Fast model")
680
  refine_threshold_single = gr.Number(value=-1.0, label="Refine threshold (avg_logprob)", precision=2)
681
  transcribe_btn = gr.Button("Transcribe", variant="primary")
682
  with gr.Column(scale=1):
683
  gr.Markdown("### Output")
684
- # progress: numeric slider visually works as a progress bar in Gradio 3.x
685
  progress_num = gr.Slider(minimum=0, maximum=100, value=0, label="Progress (%)", interactive=False)
686
  transcript_out = gr.Textbox(label="Transcript", lines=14, interactive=False)
687
  srt_download = gr.File(label="SRT (if generated)")
688
  single_logs = gr.Textbox(label="Logs", lines=8, interactive=False)
689
 
690
- # ---------- streaming, chunked single-file transcription ----------
691
- def _single_generator(audio_file, model_name, device, mem_on, srt_on, use_two_pass_flag, fast_model, refine_thresh, chunk_size_sec=30, enable_chunking=True):
692
- """
693
- Generator yields tuples for Gradio outputs: (progress_num, transcript_text, srt_path_or_none, logs)
694
- """
695
  yield 0, "", None, "Starting..."
696
  try:
697
  if not audio_file:
698
  yield 100, "", None, "No audio provided."
699
  return
700
 
701
- # resolve input path
702
  path = audio_file if isinstance(audio_file, str) else (audio_file.name if hasattr(audio_file, "name") else str(audio_file))
703
 
704
- # Convert file to wav (yield while converting)
705
  yield 2, "", None, "Converting input to WAV..."
706
  wav = convert_to_wav_if_needed(path)
707
  yield 8, "", None, f"Converted to WAV: {os.path.basename(wav)}"
708
 
709
- # Determine duration and chunking
710
- if enable_chunking:
 
 
 
 
711
  duration = None
 
 
712
  try:
713
- p = subprocess.run(["ffprobe","-v","error","-show_entries","format=duration","-of","default=noprint_wrappers=1:nokey=1", wav], capture_output=True, text=True, timeout=8)
714
- duration = float(p.stdout.strip()) if p.stdout and p.stdout.strip() else None
715
  except Exception:
716
  duration = None
717
 
718
- if duration is None:
719
- try:
720
- aud = AudioSegment.from_file(wav)
721
- duration = len(aud) / 1000.0
722
- except Exception:
723
- duration = None
724
-
725
- if duration and duration > chunk_size_sec * 1.5:
726
- num_chunks = max(1, int((duration + chunk_size_sec - 1) // chunk_size_sec))
727
- chunk_ranges = []
728
- start = 0.0
729
- for i in range(num_chunks):
730
- end = min(duration, start + chunk_size_sec)
731
- chunk_ranges.append((start, end))
732
- start = end
733
- else:
734
- enable_chunking = False
735
- chunk_ranges = [(0.0, None)]
736
  else:
737
  chunk_ranges = [(0.0, None)]
 
 
 
738
 
739
- # load model (single load)
740
- yield 10, "", None, f"Loading model: {model_name}..."
741
  model = get_whisper_model(model_name, device=None if device == "auto" else device)
742
  yield 15, "", None, f"Model loaded: {model_name}"
743
 
744
- # Prepare transcription loop
745
- overall_text_parts = []
746
  total_chunks = len(chunk_ranges)
747
- for idx, (st, ed) in enumerate(chunk_ranges, start=1):
748
- try:
749
- if ed is None:
750
- chunk_wav = wav
751
- note = "full file"
752
- else:
753
- chunk_wav = trim_audio_segment(wav, st, ed)
754
- note = f"{st:.1f}s - {ed:.1f}s"
755
-
756
- yield int(15 + (idx - 1) * 70 / max(1, total_chunks)), "", None, f"Transcribing chunk {idx}/{total_chunks} ({note})..."
757
-
758
- whisper_opts = {}
759
- # keep whisper_opts minimal to speed transcribe call; model implementation may ignore unknown opts
760
- result = model.transcribe(chunk_wav, **whisper_opts)
761
- chunk_text = result.get("text", "").strip()
762
-
763
- if mem_on:
764
- chunk_text = memory_correct_text(chunk_text)
765
-
766
- chunk_text = postprocess_transcript(chunk_text)
767
- overall_text_parts.append(chunk_text)
768
 
769
- if ed is not None and chunk_wav and os.path.exists(chunk_wav) and chunk_wav != wav:
770
- try:
771
- os.unlink(chunk_wav)
772
- except Exception:
773
- pass
774
-
775
- partial = "\n\n".join(overall_text_parts)
776
- prog = int(15 + idx * 70 / max(1, total_chunks))
777
- yield prog, partial, None, f"Completed chunk {idx}/{total_chunks}."
778
- except Exception as e:
779
- yield int(15 + idx * 70 / max(1, total_chunks)), "\n\n".join(overall_text_parts), None, f"Chunk {idx} failed: {e}\n{traceback.format_exc()}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
780
 
781
  # final assembly
782
- final_text = "\n\n".join([p for p in overall_text_parts if p])
783
  if mem_on:
784
  try:
785
  update_memory_with_transcript(final_text)
786
  except Exception:
787
  pass
788
 
789
- # generate SRT if requested (best-effort using full model segments)
790
  srt_path = None
791
  if srt_on:
792
  try:
@@ -816,11 +883,11 @@ with gr.Blocks(title="Whisper Transcriber (3.x)", css=CSS) as demo:
816
 
817
  transcribe_btn.click(
818
  fn=_single_generator,
819
- inputs=[single_audio, model_select, device_choice, mem_toggle, srt_toggle, use_two_pass_single, fast_model_choice, refine_threshold_single],
820
  outputs=[progress_num, transcript_out, srt_download, single_logs],
821
  )
822
 
823
- # Batch tab
824
  with gr.TabItem("Batch Transcribe"):
825
  with gr.Row():
826
  with gr.Column(scale=1):
@@ -877,7 +944,7 @@ with gr.Blocks(title="Whisper Transcriber (3.x)", css=CSS) as demo:
877
  outputs=[batch_trans_out, batch_logs, batch_doc_download, batch_srt_download],
878
  )
879
 
880
- # Memory tab
881
  with gr.TabItem("Memory"):
882
  with gr.Row():
883
  with gr.Column(scale=1):
@@ -968,14 +1035,14 @@ with gr.Blocks(title="Whisper Transcriber (3.x)", css=CSS) as demo:
968
  mem_clear_btn.click(fn=_clear_mem, inputs=[], outputs=[mem_status])
969
  mem_view_btn.click(fn=_view_mem, inputs=[], outputs=[mem_status])
970
 
971
- # Settings tab (theme toggle via injected HTML)
972
  with gr.TabItem("Settings"):
973
  with gr.Row():
974
  with gr.Column():
975
  gr.Markdown("### Runtime & tips")
976
  gr.Markdown("- Use `large-v3` only if your whisper package supports it.")
977
  gr.Markdown("- Extraction writes to a per-run temp directory under system temp.")
978
- gr.Markdown("- Two-pass helps when heavy model is slow.")
979
  with gr.Column():
980
  gr.Markdown("### Theme")
981
  gr.HTML("""
 
1
  # app.py
2
+ # Whisper Transcriber — Gradio 3.x compatible full file
3
+ # Features added: chunk size control, experimental parallel chunk transcription (CPU-only),
4
+ # streaming progress bar (no audio preview), memory corrections, ZIP extraction, theme toggle.
5
+ #
6
+ # Requirements: gradio (3.x), whisper, pydub, pyzipper, python-docx, ffmpeg installed.
7
+ # Experimental parallel mode uses multiprocessing and loads the 'fast' model in each worker.
8
 
9
  import os
10
  import sys
 
18
  from difflib import get_close_matches
19
  from uuid import uuid4
20
  from pathlib import Path
21
+ from multiprocessing import get_context
22
+ from typing import Tuple, List
23
 
24
  # Force unbuffered prints for logs
25
  os.environ["PYTHONUNBUFFERED"] = "1"
26
 
27
  print("DEBUG: app.py bootstrap starting", flush=True)
28
 
29
+ # Third-party imports
30
  try:
31
  import gradio as gr
32
  import whisper
 
52
  MODEL_CACHE = {}
53
  EXTRACT_MAP = {} # friendly_name -> absolute path
54
 
55
+ # ---------- Worker-global for multiprocessing ----------
56
+ # These are defined for worker processes (initialized via initializer)
57
+ WORKER_MODEL = None # type: ignore
58
+
59
def worker_init(model_name: str, device: str):
    """
    Multiprocessing worker initializer: load a whisper model per worker.
    Use device='cpu' for workers (recommended).

    The loaded model is stored in the module-global WORKER_MODEL.

    Args:
        model_name: Model name passed to whisper.load_model.
        device: Device string passed to whisper.load_model. An empty value
            or "auto" loads the model without a device argument.

    Raises:
        Exception: whatever whisper.load_model raises when the model cannot
            be loaded without a device argument.
    """
    global WORKER_MODEL
    if device and device != "auto":
        try:
            WORKER_MODEL = whisper.load_model(model_name, device=device)
            return
        except Exception as e:
            # Report the failure instead of hiding it, then fall back to a
            # load without the device argument.
            print(
                f"DEBUG: worker failed to load '{model_name}' on device '{device}': {e}; "
                "retrying without device argument",
                flush=True,
            )
    # Single attempt without a device: repeating the identical call after a
    # failure would only double the time it takes to surface the error.
    WORKER_MODEL = whisper.load_model(model_name)
73
+
74
def worker_transcribe_chunk(chunk_path: str) -> Tuple[str, str]:
    """
    Worker function to transcribe a chunk using WORKER_MODEL.

    Args:
        chunk_path: Path of the audio chunk handed to WORKER_MODEL.transcribe.

    Returns:
        (text, error_message). error_message is empty if OK; text is empty
        whenever error_message is set. Exceptions are never propagated, they
        are reported through error_message together with the traceback.
    """
    if not chunk_path:
        # Nothing to transcribe; answer directly instead of invoking the model.
        return "", "No chunk path provided"
    try:
        if WORKER_MODEL is None:
            return "", "Worker model not loaded"
        res = WORKER_MODEL.transcribe(chunk_path)
        # Treat a missing or None "text" entry as an empty transcript rather
        # than failing on .strip().
        text = (res.get("text") or "").strip()
        return text, ""
    except Exception as e:
        return "", f"Worker transcription error: {e}\n{traceback.format_exc()}"
88
+
89
  # ---------- Memory & postprocessing ----------
90
  def load_memory():
91
  try:
 
107
  pass
108
  return mem
109
 
 
110
  def save_memory(mem):
111
  with MEMORY_LOCK:
112
  try:
 
115
  except Exception:
116
  traceback.print_exc()
117
 
 
118
  memory = load_memory()
119
 
120
  MEDICAL_ABBREVIATIONS = {
 
136
  "amoxicillin": "Amoxicillin",
137
  }
138
 
 
139
  def expand_abbreviations(text):
140
  tokens = re.split(r"(\s+)", text)
141
  out = []
 
151
  out.append(t)
152
  return "".join(out)
153
 
 
154
  def normalize_drugs(text):
155
  for k, v in DRUG_NORMALIZATION.items():
156
  text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
157
  return text
158
 
 
159
  def punctuation_and_capitalization(text):
160
  text = text.strip()
161
  if not text:
 
171
  out.append(p)
172
  return "".join(out)
173
 
 
174
  def postprocess_transcript(text):
175
  if not text:
176
  return text
 
180
  t = punctuation_and_capitalization(t)
181
  return t
182
 
 
183
  def extract_words_and_phrases(text):
184
  words = re.findall(r"[A-Za-z0-9\-']+", text)
185
  sentences = [s.strip() for s in re.split(r"(?<=[.?!])\s+", text) if s.strip()]
186
  return [w for w in words if w.strip()], sentences
187
 
 
188
  def update_memory_with_transcript(transcript):
189
  global memory
190
  words, sentences = extract_words_and_phrases(transcript)
 
200
  if changed:
201
  save_memory(memory)
202
 
 
203
  def memory_correct_text(text, min_ratio=0.85):
204
  if not text or (not memory.get("words") and not memory.get("phrases")):
205
  return text
 
233
  corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
234
  return corrected
235
 
 
236
  # ---------- Utilities ----------
237
  def save_as_word(text, filename=None):
238
  if filename is None:
 
242
  doc.save(filename)
243
  return filename
244
 
 
245
  def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
246
  try:
247
  cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"]
 
268
  pass
269
  return False, str(e)
270
 
 
271
  def convert_to_wav_if_needed(input_path):
272
  input_path = str(input_path)
273
  lower = input_path.lower()
 
350
 
351
  raise Exception(f"Could not convert file to WAV. Diagnostics saved to: {diag_log}")
352
 
 
353
  # ---------- Whisper helper ----------
354
  def whisper_available_models():
355
  try:
 
360
  pass
361
  return set(["tiny", "base", "small", "medium", "large", "large-v3"])
362
 
 
363
  AVAILABLE_MODEL_SET = whisper_available_models()
364
 
 
365
  def safe_model_choices(prefer_default="small"):
366
  base_choices = ["small", "medium", "large", "large-v3", "base", "tiny"]
367
  choices = [m for m in base_choices if m in AVAILABLE_MODEL_SET]
 
370
  default = prefer_default if prefer_default in choices else choices[0]
371
  return choices, default
372
 
 
373
  def get_whisper_model(name, device=None):
374
  if name not in MODEL_CACHE:
375
  print(f"DEBUG: loading whisper model '{name}'", flush=True)
 
382
  MODEL_CACHE[name] = whisper.load_model(name)
383
  return MODEL_CACHE[name]
384
 
 
385
  # ---------- SRT helper ----------
386
  def segments_to_srt(segments):
387
  def fmt_time(t):
 
402
  lines.append("")
403
  return "\n".join(lines)
404
 
 
405
  # ---------- ZIP extraction (per-run dir) ----------
406
  def extract_zip_and_map(zip_path, zip_password=None):
407
  global EXTRACT_MAP
 
461
  pass
462
  return [], f"Extraction failed: {e}"
463
 
 
464
  # ---------- Trim helper used in two-pass ----------
465
  def trim_audio_segment(src_path, start_sec, end_sec):
466
  src = str(src_path)
 
503
  pass
504
  raise
505
 
506
+ # ---------- Core transcription (single file) ----------
 
507
  def transcribe_single_file(
508
  path,
509
  model_name="small",
 
515
  refine_model=None,
516
  refine_threshold=-1.0,
517
  ):
518
+ # non-streaming convenience helper used for batch mode
519
  logs = []
520
  try:
521
  if not path:
 
554
  pass
555
  return text, srt_path, "\n".join(logs)
556
 
557
+ # Two-pass path not used for streaming generator here
558
+ return "", None, "Two-pass not used in this helper."
 
 
559
  except Exception as e:
560
  tb = traceback.format_exc()
561
  return "", None, f"Transcription error: {e}\n{tb}"
562
 
563
+ # ---------- Batch transcribe (unchanged) ----------
 
564
  def batch_transcribe(friendly_selected, uploaded_files, model_name, device_name, merge_flag, enable_mem, generate_srt, use_two_pass=False, fast_model="small", refine_threshold=-1.0):
565
  logs = []
566
  transcripts = []
 
611
  srt_return = srt_files[0] if srt_files else None
612
  return combined, "\n".join(logs), out_doc, srt_return
613
 
 
614
  # ---------- Build Gradio UI (3.x compatible) ----------
615
  print("DEBUG: building Gradio UI", flush=True)
616
  available_choices, default_choice = safe_model_choices(prefer_default="small")
 
648
  <script>
649
  (function() {
650
  try {
 
651
  var saved = null;
652
  try { saved = localStorage.getItem('wt_theme'); } catch(e){ saved = null; }
653
  var chosen = null;
 
659
  chosen = 'light';
660
  }
661
  document.documentElement.setAttribute('data-theme', chosen);
 
662
  try {
663
  var style = document.createElement('style');
664
  style.innerHTML = `
 
677
  gr.HTML("<div style='width:50px;height:50px;border-radius:10px;background:linear-gradient(135deg,#4f46e5,#06b6d4);display:flex;align-items:center;justify-content:center;color:white;font-weight:700;font-size:20px;'>WT</div>")
678
  with gr.Column():
679
  gr.Markdown("<h3 style='margin:0'>Whisper Transcriber (Gradio 3.x)</h3>")
680
+ gr.Markdown("<div class='small-note'>Chunked streaming, experimental CPU parallel, per-run ZIP extraction, memory corrections, SRT export, dark/light toggle</div>")
681
 
682
  with gr.Tabs():
683
  # Single audio
 
690
  device_choice = gr.Dropdown(choices=["auto", "cpu", "cuda"], value="auto", label="Device")
691
  mem_toggle = gr.Checkbox(label="Enable memory corrections", value=False)
692
  srt_toggle = gr.Checkbox(label="Generate SRT", value=False)
693
+ # chunk controls
694
+ chunk_controls_row = gr.Row(visible=True)
695
+ chunk_size_input = gr.Number(value=30, label="Chunk size (seconds)", precision=0)
696
+ enable_chunking = gr.Checkbox(label="Enable chunking (recommended for long files)", value=True)
697
+ # parallel experimental
698
+ parallel_checkbox = gr.Checkbox(label="Enable experimental parallel chunk transcription (CPU only)", value=False)
699
+ parallel_workers = gr.Slider(minimum=1, maximum=max(1, os.cpu_count() or 4), value=2, step=1, label="Parallel workers (processes)")
700
  use_two_pass_single = gr.Checkbox(label="Use two-pass speedup (fast then refine)", value=False)
701
+ fast_model_choice = gr.Dropdown(choices=[c for c in ["tiny", "base", "small"] if c in AVAILABLE_MODEL_SET], value="small", label="Fast model (for two-pass / workers)")
702
  refine_threshold_single = gr.Number(value=-1.0, label="Refine threshold (avg_logprob)", precision=2)
703
  transcribe_btn = gr.Button("Transcribe", variant="primary")
704
  with gr.Column(scale=1):
705
  gr.Markdown("### Output")
 
706
  progress_num = gr.Slider(minimum=0, maximum=100, value=0, label="Progress (%)", interactive=False)
707
  transcript_out = gr.Textbox(label="Transcript", lines=14, interactive=False)
708
  srt_download = gr.File(label="SRT (if generated)")
709
  single_logs = gr.Textbox(label="Logs", lines=8, interactive=False)
710
 
711
+ # streaming generator with optional multiprocessing
712
+ def _single_generator(audio_file, model_name, device, mem_on, srt_on, chunk_size_sec, chunking_enabled, parallel_enabled, workers, use_two_pass_flag, fast_model, refine_thresh):
 
 
 
713
  yield 0, "", None, "Starting..."
714
  try:
715
  if not audio_file:
716
  yield 100, "", None, "No audio provided."
717
  return
718
 
 
719
  path = audio_file if isinstance(audio_file, str) else (audio_file.name if hasattr(audio_file, "name") else str(audio_file))
720
 
 
721
  yield 2, "", None, "Converting input to WAV..."
722
  wav = convert_to_wav_if_needed(path)
723
  yield 8, "", None, f"Converted to WAV: {os.path.basename(wav)}"
724
 
725
+ # determine duration
726
+ duration = None
727
+ try:
728
+ p = subprocess.run(["ffprobe","-v","error","-show_entries","format=duration","-of","default=noprint_wrappers=1:nokey=1", wav], capture_output=True, text=True, timeout=8)
729
+ duration = float(p.stdout.strip()) if p.stdout and p.stdout.strip() else None
730
+ except Exception:
731
  duration = None
732
+
733
+ if duration is None:
734
  try:
735
+ aud = AudioSegment.from_file(wav)
736
+ duration = len(aud) / 1000.0
737
  except Exception:
738
  duration = None
739
 
740
+ # build chunk ranges
741
+ if chunking_enabled and (duration and duration > chunk_size_sec * 1.5):
742
+ num_chunks = max(1, int((duration + chunk_size_sec - 1) // chunk_size_sec))
743
+ chunk_ranges = []
744
+ start = 0.0
745
+ for i in range(num_chunks):
746
+ end = min(duration, start + chunk_size_sec)
747
+ chunk_ranges.append((start, end))
748
+ start = end
 
 
 
 
 
 
 
 
 
749
  else:
750
  chunk_ranges = [(0.0, None)]
751
+ chunking_enabled = False
752
+
753
+ yield 10, "", None, f"Preparing transcription ({len(chunk_ranges)} chunk(s))..."
754
 
755
+ # Load model in main process (for serial or orchestration)
 
756
  model = get_whisper_model(model_name, device=None if device == "auto" else device)
757
  yield 15, "", None, f"Model loaded: {model_name}"
758
 
759
+ overall_parts = []
 
760
  total_chunks = len(chunk_ranges)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
 
762
+ # Decide whether we can/should run parallel workers
763
+ parallel_used = False
764
+ if parallel_enabled and chunking_enabled and total_chunks > 1:
765
+ if device != "cpu" and device != "auto":
766
+ # Most likely GPU requested; parallel across multiple processes with GPU not recommended
767
+ yield 15, "", None, "Parallel mode requested but device is not 'cpu'. Falling back to serial chunking."
768
+ parallel_used = False
769
+ else:
770
+ # attempt to spawn a multiprocessing pool that initializes each worker with fast_model on CPU
771
+ try:
772
+ ctx = get_context("spawn")
773
+ worker_count = max(1, int(workers))
774
+ yield 18, "", None, f"Starting parallel pool with {worker_count} workers (fast_model={fast_model})..."
775
+ pool = ctx.Pool(processes=worker_count, initializer=worker_init, initargs=(fast_model, "cpu"))
776
+ # prepare chunk WAVs
777
+ chunk_paths = []
778
+ temp_chunk_files = []
779
+ for (st, ed) in chunk_ranges:
780
+ if ed is None:
781
+ chunk_paths.append(wav)
782
+ else:
783
+ cw = trim_audio_segment(wav, st, ed)
784
+ chunk_paths.append(cw)
785
+ temp_chunk_files.append(cw)
786
+ # map transcribe jobs
787
+ results = pool.map(worker_transcribe_chunk, chunk_paths)
788
+ pool.close()
789
+ pool.join()
790
+ # process results in order
791
+ for idx, (txt, err) in enumerate(results, start=1):
792
+ if err:
793
+ yield int(20 + idx * 70 / max(1, total_chunks)), "\n\n".join(overall_parts), None, f"Chunk {idx} worker error: {err}"
794
+ else:
795
+ if mem_on:
796
+ txt = memory_correct_text(txt)
797
+ txt = postprocess_transcript(txt)
798
+ overall_parts.append(txt)
799
+ prog = int(20 + idx * 70 / max(1, total_chunks))
800
+ yield prog, "\n\n".join(overall_parts), None, f"Completed chunk {idx}/{total_chunks} (parallel)."
801
+ # cleanup temp chunks (but not original wav)
802
+ for tfile in temp_chunk_files:
803
+ try:
804
+ if os.path.exists(tfile):
805
+ os.unlink(tfile)
806
+ except Exception:
807
+ pass
808
+ parallel_used = True
809
+ except Exception as e:
810
+ yield 20, "", None, f"Parallel execution failed, falling back to serial: {e}\n{traceback.format_exc()}"
811
+ parallel_used = False
812
+
813
+ if not parallel_used:
814
+ # serial chunk processing
815
+ for idx, (st, ed) in enumerate(chunk_ranges, start=1):
816
+ try:
817
+ if ed is None:
818
+ chunk_wav = wav
819
+ note = "full file"
820
+ else:
821
+ chunk_wav = trim_audio_segment(wav, st, ed)
822
+ note = f"{st:.1f}s - {ed:.1f}s"
823
+
824
+ yield int(15 + (idx - 1) * 70 / max(1, total_chunks)), "", None, f"Transcribing chunk {idx}/{total_chunks} ({note})..."
825
+
826
+ # call model.transcribe on chunk
827
+ whisper_opts = {}
828
+ result = model.transcribe(chunk_wav, **whisper_opts)
829
+ chunk_text = result.get("text", "").strip()
830
+
831
+ if mem_on:
832
+ chunk_text = memory_correct_text(chunk_text)
833
+ chunk_text = postprocess_transcript(chunk_text)
834
+ overall_parts.append(chunk_text)
835
+
836
+ if ed is not None and chunk_wav and os.path.exists(chunk_wav) and chunk_wav != wav:
837
+ try:
838
+ os.unlink(chunk_wav)
839
+ except Exception:
840
+ pass
841
+
842
+ partial = "\n\n".join(overall_parts)
843
+ prog = int(15 + idx * 70 / max(1, total_chunks))
844
+ yield prog, partial, None, f"Completed chunk {idx}/{total_chunks}."
845
+ except Exception as e:
846
+ yield int(15 + idx * 70 / max(1, total_chunks)), "\n\n".join(overall_parts), None, f"Chunk {idx} failed: {e}\n{traceback.format_exc()}"
847
 
848
  # final assembly
849
+ final_text = "\n\n".join([p for p in overall_parts if p])
850
  if mem_on:
851
  try:
852
  update_memory_with_transcript(final_text)
853
  except Exception:
854
  pass
855
 
856
+ # SRT generation best-effort (runs a full transcribe to get segments)
857
  srt_path = None
858
  if srt_on:
859
  try:
 
883
 
884
  transcribe_btn.click(
885
  fn=_single_generator,
886
+ inputs=[single_audio, model_select, device_choice, mem_toggle, srt_toggle, chunk_size_input, enable_chunking, parallel_checkbox, parallel_workers, use_two_pass_single, fast_model_choice, refine_threshold_single],
887
  outputs=[progress_num, transcript_out, srt_download, single_logs],
888
  )
889
 
890
+ # Batch tab (unchanged UI and behavior)
891
  with gr.TabItem("Batch Transcribe"):
892
  with gr.Row():
893
  with gr.Column(scale=1):
 
944
  outputs=[batch_trans_out, batch_logs, batch_doc_download, batch_srt_download],
945
  )
946
 
947
+ # Memory tab (unchanged)
948
  with gr.TabItem("Memory"):
949
  with gr.Row():
950
  with gr.Column(scale=1):
 
1035
  mem_clear_btn.click(fn=_clear_mem, inputs=[], outputs=[mem_status])
1036
  mem_view_btn.click(fn=_view_mem, inputs=[], outputs=[mem_status])
1037
 
1038
+ # Settings tab (theme)
1039
  with gr.TabItem("Settings"):
1040
  with gr.Row():
1041
  with gr.Column():
1042
  gr.Markdown("### Runtime & tips")
1043
  gr.Markdown("- Use `large-v3` only if your whisper package supports it.")
1044
  gr.Markdown("- Extraction writes to a per-run temp directory under system temp.")
1045
+ gr.Markdown("- Two-pass helps when heavy model is slow; experimental parallel helps primarily for CPU workloads with many cores.")
1046
  with gr.Column():
1047
  gr.Markdown("### Theme")
1048
  gr.HTML("""