staraks commited on
Commit
ec6edb7
·
verified ·
1 Parent(s): e330361

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +471 -320
app.py CHANGED
@@ -1,6 +1,6 @@
1
  # app.py
2
- # Whisper Transcriber — Beautiful UI + Tabs (Audio Transcribe focused)
3
- # Drop-in replacement. Requires: gradio, whisper, pydub, pyzipper, python-docx, ffmpeg installed.
4
 
5
  import os
6
  import sys
@@ -42,12 +42,11 @@ FFMPEG_CANDIDATES = [
42
  ("pcm_s16le", 44100, 2),
43
  ("mulaw", 8000, 1),
44
  ]
45
-
46
  MODEL_CACHE = {}
47
  FINETUNE_WORKDIR = os.path.join(tempfile.gettempdir(), "finetune_workdir")
48
  os.makedirs(FINETUNE_WORKDIR, exist_ok=True)
49
 
50
- # ---------- Helpers (conversion, whisper, memory, small postprocessing) ----------
51
  def load_memory():
52
  try:
53
  if os.path.exists(MEMORY_FILE):
@@ -78,26 +77,43 @@ def save_memory(mem):
78
 
79
  memory = load_memory()
80
 
81
- MEDICAL_ABBREVIATIONS = {"pt":"patient","dx":"diagnosis","hx":"history","sx":"symptoms","c/o":"complains of","bp":"blood pressure","hr":"heart rate","o2":"oxygen","r/o":"rule out","adm":"admit","disch":"discharge"}
82
- DRUG_NORMALIZATION = {"metformin":"Metformin","aspirin":"Aspirin","amoxicillin":"Amoxicillin"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  def expand_abbreviations(text):
85
  tokens = re.split(r"(\s+)", text)
86
- out=[]
87
  for t in tokens:
88
  key = t.lower().strip(".,;:")
89
  if key in MEDICAL_ABBREVIATIONS:
90
- trailing=""
91
- m=re.match(r"([A-Za-z0-9/]+)([.,;:]*)",t)
92
  if m:
93
- trailing=m.group(2) or ""
94
- out.append(MEDICAL_ABBREVIATIONS[key]+trailing)
95
  else:
96
  out.append(t)
97
  return "".join(out)
98
 
99
  def normalize_drugs(text):
100
- for k,v in DRUG_NORMALIZATION.items():
101
  text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
102
  return text
103
 
@@ -108,7 +124,7 @@ def punctuation_and_capitalization(text):
108
  if not re.search(r"[.?!]\s*$", text):
109
  text = text.rstrip() + "."
110
  parts = re.split(r"([.?!]\s+)", text)
111
- out=[]
112
  for p in parts:
113
  if p and not re.match(r"[.?!]\s+", p):
114
  out.append(p.capitalize())
@@ -119,7 +135,7 @@ def punctuation_and_capitalization(text):
119
  def postprocess_transcript(text):
120
  if not text:
121
  return text
122
- t = re.sub(r"\s+"," ",text).strip()
123
  t = expand_abbreviations(t)
124
  t = normalize_drugs(t)
125
  t = punctuation_and_capitalization(t)
@@ -133,23 +149,24 @@ def extract_words_and_phrases(text):
133
  def update_memory_with_transcript(transcript):
134
  global memory
135
  words, sentences = extract_words_and_phrases(transcript)
136
- changed=False
137
  with MEMORY_LOCK:
138
  for w in words:
139
- lw=w.lower()
140
- memory["words"][lw]=memory["words"].get(lw,0)+1
141
- changed=True
142
  for s in sentences:
143
- memory["phrases"][s]=memory["phrases"].get(s,0)+1
144
- changed=True
145
  if changed:
146
  save_memory(memory)
147
 
148
- def memory_correct_text(text,min_ratio=0.85):
149
  if not text or (not memory.get("words") and not memory.get("phrases")):
150
  return text
 
151
  def fix_word(w):
152
- lw=w.lower()
153
  if lw in memory["words"]:
154
  return w
155
  candidates = get_close_matches(lw, memory["words"].keys(), n=1, cutoff=min_ratio)
@@ -159,178 +176,178 @@ def memory_correct_text(text,min_ratio=0.85):
159
  return cand.capitalize()
160
  return cand
161
  return w
 
162
  tokens = re.split(r"(\W+)", text)
163
- corrected=[]
164
  for tok in tokens:
165
  if re.match(r"^[A-Za-z0-9\-']+$", tok):
166
- corrected.append(fix_word(tok))
167
  else:
168
- corrected.append(tok)
169
- corrected_text = "".join(corrected)
170
- for phrase in sorted(memory.get("phrases",{}).keys(), key=lambda s:-len(s)):
 
171
  low_phrase = phrase.lower()
172
  if len(low_phrase) < 8:
173
  continue
174
- if low_phrase in corrected_text.lower():
175
- corrected_text = re.sub(re.escape(phrase), phrase, corrected_text, flags=re.IGNORECASE)
176
- return corrected_text
 
 
 
 
 
 
 
 
 
177
 
178
  # ---------- Conversion helpers ----------
179
- MIN_WAV_SIZE = MIN_WAV_SIZE
180
  def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
181
- cmd = ["ffmpeg","-hide_banner","-loglevel","error","-y"]
182
- if fmt in ("s16le","pcm_s16le","mulaw"):
183
- cmd += ["-f",fmt,"-ar",str(sr),"-ac",str(ch),"-i",input_path,out_path]
184
- else:
185
- cmd += ["-i",input_path,"-ar",str(sr),"-ac",str(ch),out_path]
186
  try:
 
 
 
 
 
187
  proc = subprocess.run(cmd, capture_output=True, timeout=60, text=True)
188
- stdout_stderr=(proc.stdout or "")+(proc.stderr or "")
189
- if proc.returncode==0 and os.path.exists(out_path) and os.path.getsize(out_path)>MIN_WAV_SIZE:
190
  return True, stdout_stderr
191
  else:
192
  try:
193
- if os.path.exists(out_path): os.unlink(out_path)
194
- except Exception: pass
 
 
195
  return False, stdout_stderr
196
  except Exception as e:
197
  try:
198
- if os.path.exists(out_path): os.unlink(out_path)
199
- except Exception: pass
 
 
200
  return False, str(e)
201
 
202
  def convert_to_wav_if_needed(input_path):
203
- input_path=str(input_path)
204
- if input_path.lower().endswith(".wav"):
 
205
  return input_path
206
- tmp=None
207
- auto_err=""
 
208
  try:
209
  tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
210
  tmp.close()
211
  AudioSegment.from_file(input_path).export(tmp.name, format="wav")
212
- if os.path.exists(tmp.name) and os.path.getsize(tmp.name)>MIN_WAV_SIZE:
213
  return tmp.name
214
  else:
215
- try: os.unlink(tmp.name)
216
- except Exception: pass
 
 
217
  except Exception:
218
- auto_err=traceback.format_exc()
219
  try:
220
- if tmp and os.path.exists(tmp.name): os.unlink(tmp.name)
221
- except Exception: pass
 
 
 
222
  diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
223
  diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
224
- diagnostics=[]
225
- for fmt,sr,ch in FFMPEG_CANDIDATES:
226
  out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
227
  out_wav.close()
228
  success, debug = _ffmpeg_convert(input_path, out_wav.name, fmt, sr, ch)
229
  diagnostics.append(f"TRY fmt={fmt} sr={sr} ch={ch} success={success}\n{debug}\n")
230
  if success:
231
  try:
232
- with open(diag_log,"w",encoding="utf-8") as fh:
233
  fh.write("pydub auto error:\n")
234
- fh.write(auto_err+"\n\n")
235
  fh.write("Successful ffmpeg candidate:\n")
236
  fh.write(f"fmt={fmt} sr={sr} ch={ch}\n\n")
237
  fh.write("Diagnostics:\n")
238
  fh.write("\n".join(diagnostics))
239
- except Exception: pass
 
240
  return out_wav.name
241
  else:
242
  try:
243
- if os.path.exists(out_wav.name): os.unlink(out_wav.name)
244
- except Exception: pass
 
 
 
245
  try:
246
- fp = subprocess.run(["ffprobe","-v","error","-show_format","-show_streams",input_path], capture_output=True, text=True, timeout=10)
247
- diagnostics.append("FFPROBE:\n"+(fp.stdout.strip() or fp.stderr.strip()))
 
 
 
 
 
248
  except Exception as e:
249
- diagnostics.append("ffprobe failed: "+str(e))
250
  try:
251
- with open(input_path,"rb") as fh:
252
  head = fh.read(512)
253
- diagnostics.append("HEX PREVIEW:\n"+head.hex())
254
  except Exception as e:
255
- diagnostics.append("could not read head: "+str(e))
 
256
  try:
257
- with open(diag_log,"w",encoding="utf-8") as fh:
258
  fh.write("pydub auto error:\n")
259
- fh.write(auto_err+"\n\n")
260
  fh.write("Full diagnostics:\n\n")
261
  fh.write("\n\n".join(diagnostics))
262
  except Exception as e:
263
  raise Exception(f"Conversion failed; diagnostics write error: {e}")
 
264
  raise Exception(f"Could not convert file to WAV. Diagnostics saved to: {diag_log}")
265
 
266
  # ---------- Whisper model loader ----------
267
  def get_whisper_model(name, device=None):
268
- # caching
269
  if name not in MODEL_CACHE:
270
  print(f"DEBUG: loading whisper model '{name}'", flush=True)
271
- # whisper.load_model accepts model names like "small","medium", "large-v3" depending on installation
272
- if device:
273
- try:
274
  MODEL_CACHE[name] = whisper.load_model(name, device=device)
275
- except TypeError:
276
- # fallback signature
277
  MODEL_CACHE[name] = whisper.load_model(name)
278
- else:
 
279
  MODEL_CACHE[name] = whisper.load_model(name)
280
  return MODEL_CACHE[name]
281
 
282
- # ---------- Transcription helpers ----------
283
- def transcribe_single(audio_path, model_name="small", enable_memory=False, device_choice="auto"):
284
- logs=[]
285
- transcript_text=""
286
- download_path=None
287
- try:
288
- if not audio_path:
289
- return None, "No audio provided.", ""
290
- path = str(audio_path)
291
- device = None if device_choice=="auto" else device_choice
292
- model = get_whisper_model(model_name, device=device)
293
- logs.append(f"Loaded model: {model_name}")
294
- wav = convert_to_wav_if_needed(path)
295
- logs.append(f"Converted to WAV: {os.path.basename(wav)}")
296
- result = model.transcribe(wav)
297
- text = result.get("text","").strip()
298
- if enable_memory:
299
- text = memory_correct_text(text)
300
- text = postprocess_transcript(text)
301
- transcript_text = text
302
- if enable_memory:
303
- try:
304
- update_memory_with_transcript(text)
305
- logs.append("Memory updated.")
306
- except Exception:
307
- pass
308
- # don't delete original user-uploaded wav; delete tmp if created
309
- if wav and wav != path and os.path.exists(wav):
310
- try: os.unlink(wav)
311
- except Exception: pass
312
- return path, transcript_text, "\n".join(logs)
313
- except Exception as e:
314
- tb = traceback.format_exc()
315
- return None, "", f"Error: {e}\n{tb}"
316
-
317
- # ---------- ZIP helpers (kept small, re-use earlier pattern) ----------
318
  def extract_zip_list(zip_file, zip_password):
319
  temp_extract_dir = os.path.join(tempfile.gettempdir(), "extracted_audio")
320
  try:
321
  if os.path.exists(temp_extract_dir):
322
- try: shutil.rmtree(temp_extract_dir)
323
- except Exception: pass
 
 
324
  os.makedirs(temp_extract_dir, exist_ok=True)
325
- extracted=[]
326
- logs=[]
327
  with pyzipper.ZipFile(zip_file, "r") as zf:
328
  if zip_password:
329
- try: zf.setpassword(zip_password.encode())
330
- except Exception: logs.append("Warning: failed to set zip password (unexpected).")
331
- exts = [".mp3",".wav",".aac",".flac",".ogg",".m4a",".dat",".dct"]
 
 
332
  for info in zf.infolist():
333
- if info.is_dir(): continue
 
334
  _, ext = os.path.splitext(info.filename)
335
  if ext.lower() in exts:
336
  try:
@@ -338,6 +355,9 @@ def extract_zip_list(zip_file, zip_password):
338
  except RuntimeError as e:
339
  logs.append(f"Password required/incorrect for {info.filename}: {e}")
340
  continue
 
 
 
341
  except Exception as e:
342
  logs.append(f"Error extracting {info.filename}: {e}")
343
  continue
@@ -353,86 +373,269 @@ def extract_zip_list(zip_file, zip_password):
353
  traceback.print_exc()
354
  return [], f"Extraction failed: {e}"
355
 
356
- # ---------- UI: Beautiful CSS ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
  CSS = """
358
  :root{
359
  --accent:#4f46e5;
360
  --muted:#6b7280;
361
  --card:#ffffff;
362
  --bg:#f7f8fb;
363
- --glass: rgba(255,255,255,0.65);
364
  }
365
  body { background: var(--bg); font-family: Inter, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
366
- .header {
367
- padding: 18px 24px;
368
- border-radius: 12px;
369
- background: linear-gradient(90deg, rgba(79,70,229,0.12), rgba(99,102,241,0.04));
370
- margin-bottom: 18px;
371
- display:flex;align-items:center;gap:16px;
372
- }
373
- .app-icon {
374
- width:62px;height:62px;border-radius:12px;background:linear-gradient(135deg,var(--accent),#06b6d4);display:flex;align-items:center;justify-content:center;color:white;font-weight:700;font-size:24px;
375
- }
376
  .header-title h1 { margin:0;font-size:20px;}
377
  .header-sub { color:var(--muted); margin-top:4px;font-size:13px;}
378
  .card { background:var(--card); border-radius:12px; padding:14px; box-shadow: 0 6px 20px rgba(16,24,40,0.06); }
379
- .controls { display:flex;flex-direction:column; gap:10px; }
380
- .btn-primary { background: linear-gradient(90deg,var(--accent),#06b6d4); color:white; border-radius:10px; padding:10px 14px; border:none; cursor:pointer; font-weight:600;}
381
- .small-muted { color:var(--muted); font-size:13px; }
382
  .transcript-area { white-space:pre-wrap; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, "Roboto Mono", monospace; background:#0f172a; color:#e6eef8; padding:12px; border-radius:10px; min-height:220px; }
383
- .audio-player-card { border-radius:10px; padding:8px; background: linear-gradient(180deg, rgba(255,255,255,0.9), rgba(255,255,255,0.7)); box-shadow: 0 6px 18px rgba(15,23,42,0.04); }
384
  .small-note { color:var(--muted); font-size:12px;}
385
- .grid { display:grid; grid-template-columns: 1fr 1fr; gap:14px; }
386
- @media (max-width:900px){ .grid{ grid-template-columns: 1fr; } .header{flex-direction:column;align-items:flex-start;} }
387
  """
388
 
389
- # ---------- Build Gradio UI (beautiful + audio transcribe tab) ----------
390
- with gr.Blocks(title="Whisper Transcriber Beautiful UI", css=CSS) as demo:
 
391
  # Header
392
  with gr.Row(elem_classes="header"):
393
  with gr.Column(scale=0):
394
  gr.HTML("<div class='app-icon'>WT</div>")
395
- with gr.Column(elem_id="header-title"):
396
  gr.HTML("<h1 style='margin:0'>Whisper Transcriber</h1>")
397
- gr.Markdown("<div class='header-sub'>Fast single-file transcribe, batch workflows, memory & fine-tune — now with a beautiful Audio tab ✨</div>")
398
 
399
  with gr.Tabs():
400
- # ---- Audio Transcribe Tab ----
401
- with gr.TabItem("Audio Transcribe 🎙️"):
402
  with gr.Row():
403
- # Left: controls
404
  with gr.Column(scale=1):
405
- gr.Markdown("### Quick Single Audio Transcribe")
406
  with gr.Group(elem_classes="card"):
 
407
  single_audio = gr.Audio(label="Upload or record audio", type="filepath")
408
  with gr.Row():
409
  model_select = gr.Dropdown(choices=["small","medium","large","large-v3","base"], value="large-v3", label="Model")
410
  device_select = gr.Dropdown(choices=["auto","cpu","cuda"], value="auto", label="Device")
411
  with gr.Row():
412
  mem_toggle = gr.Checkbox(label="Enable correction memory", value=False)
413
- format_button = gr.Dropdown(choices=["Plain","SOAP (medical)"], value="Plain", label="Format")
414
- transcribe_btn = gr.Button("Transcribe", elem_classes="btn-primary")
415
- gr.Markdown("<div class='small-note'>Tip: Use <strong>large-v3</strong> for best accuracy if your environment supports it.</div>")
416
-
417
- # Right: player + transcript
418
  with gr.Column(scale=1):
419
- with gr.Group(elem_classes="card audio-player-card"):
420
- gr.Markdown("### Preview & Player")
421
- audio_preview = gr.Audio(label="Player", interactive=False)
422
- gr.HTML("<div style='height:8px'></div>")
423
- gr.Markdown("<div class='small-muted'>Use the player to preview. Click Transcribe to generate the cleaned transcript on the right.</div>")
424
-
425
- gr.Markdown("<div style='height:12px'></div>")
426
  with gr.Group(elem_classes="card"):
427
- gr.Markdown("### Transcript")
428
- transcript_out = gr.Textbox(label="", lines=12, interactive=False, elem_classes="transcript-area")
 
429
  transcript_logs = gr.Textbox(label="Logs", lines=6, interactive=False)
430
 
431
- # Transcribe action
432
  def _do_single_transcribe(audio_file, model_name, device_choice, enable_memory, fmt_choice):
433
  player_path, transcript, logs = transcribe_single(audio_file, model_name=model_name, enable_memory=enable_memory, device_choice=device_choice)
434
  if fmt_choice == "SOAP":
435
- # minimal SOAP formatting if user selected it (basic)
436
  sentences = re.split(r"(?<=[.?!])\s+", transcript)
437
  subj = sentences[0] if sentences else ""
438
  obj = sentences[1] if len(sentences) > 1 else ""
@@ -440,29 +643,30 @@ with gr.Blocks(title="Whisper Transcriber — Beautiful UI", css=CSS) as demo:
440
  transcript = soap
441
  return player_path, transcript, logs
442
 
443
- transcribe_btn.click(fn=_do_single_transcribe, inputs=[single_audio, model_select, device_select, mem_toggle, format_button], outputs=[audio_preview, transcript_out, transcript_logs])
444
 
445
- # ---- Batch Transcribe Tab ----
446
- with gr.TabItem("Batch Transcribe 📦"):
447
  with gr.Row():
448
  with gr.Column(scale=1):
449
  with gr.Group(elem_classes="card"):
450
- gr.Markdown("### Batch upload or ZIP")
451
- batch_files = gr.File(label="Upload audio files (multiple)", file_count="multiple", type="filepath")
452
- batch_zip = gr.File(label="Or upload ZIP (optional)", file_count="single", type="filepath")
453
- zip_password = gr.Textbox(label="ZIP password (optional)", placeholder="leave empty if none")
454
- batch_model = gr.Dropdown(choices=["small","medium","large","large-v3","base"], value="small", label="Model")
455
- batch_device = gr.Dropdown(choices=["auto","cpu","cuda"], value="auto", label="Device")
456
- batch_merge = gr.Checkbox(label="Merge all transcripts to .docx", value=True)
 
457
  batch_mem = gr.Checkbox(label="Enable memory corrections", value=False)
458
- batch_extract_btn = gr.Button("Extract ZIP & Show Files")
459
  batch_extract_logs = gr.Textbox(label="Extraction logs", lines=6, interactive=False)
460
- batch_select = gr.CheckboxGroup(choices=[], label="Select extracted files (optional)")
461
- batch_trans_btn = gr.Button("Start Batch Transcription", elem_classes="btn-primary")
462
  with gr.Column(scale=1):
463
  with gr.Group(elem_classes="card"):
464
- gr.Markdown("### Batch Output")
465
- batch_trans_out = gr.Textbox(label="Transcript (combined)", lines=18, interactive=False)
466
  batch_logs = gr.Textbox(label="Logs", lines=10, interactive=False)
467
  batch_download = gr.File(label="Merged .docx (when available)")
468
 
@@ -471,27 +675,26 @@ with gr.Blocks(title="Whisper Transcriber — Beautiful UI", css=CSS) as demo:
471
  return [], "No zip provided."
472
  zip_path = zip_file.name if hasattr(zip_file, "name") else str(zip_file)
473
  extracted, logs = extract_zip_list(zip_path, password)
474
- # return full paths as values, but show basenames in logs
475
- return extracted, logs + "\n\nFiles:\n" + "\n".join([os.path.basename(p) for p in extracted])
476
 
477
  batch_extract_btn.click(fn=_extract_zip_for_ui, inputs=[batch_zip, zip_password], outputs=[batch_select, batch_extract_logs])
478
 
479
- # reuse transcribe_multiple from earlier designs if available; simplified here to call transcribe_single sequentially
480
- def _batch_transcribe(selected_check, uploaded_files, zip_selected, model_name, device_name, merge_flag, enable_mem):
481
- # build list
482
- paths=[]
483
  if selected_check:
484
  paths.extend(selected_check)
485
  if uploaded_files:
486
- if isinstance(uploaded_files, list):
487
- paths.extend([str(x) for x in uploaded_files])
 
488
  else:
489
  paths.append(str(uploaded_files))
490
- if not paths and zip_selected:
491
- paths.extend(zip_selected if isinstance(zip_selected, list) else [zip_selected])
492
- logs=[]
493
- transcripts=[]
494
- out_doc=None
495
  for p in paths:
496
  try:
497
  _, txt, lg = transcribe_single(p, model_name=model_name, enable_memory=enable_mem, device_choice=device_name)
@@ -508,10 +711,10 @@ with gr.Blocks(title="Whisper Transcriber — Beautiful UI", css=CSS) as demo:
508
  logs.append(f"Merge failed: {e}")
509
  return combined, "\n".join(logs), out_doc
510
 
511
- batch_trans_btn.click(fn=_batch_transcribe, inputs=[batch_select, batch_files, batch_select, batch_model, batch_device, batch_merge, batch_mem], outputs=[batch_trans_out, batch_logs, batch_download])
512
 
513
- # ---- Memory Tab ----
514
- with gr.TabItem("Memory 🧠"):
515
  with gr.Row():
516
  with gr.Column(scale=1):
517
  with gr.Group(elem_classes="card"):
@@ -527,36 +730,37 @@ with gr.Blocks(title="Whisper Transcriber — Beautiful UI", css=CSS) as demo:
527
  def _import_mem(uploaded):
528
  if not uploaded:
529
  return "No file provided."
530
- path = uploaded.name if hasattr(uploaded,"name") else str(uploaded)
531
  try:
532
- with open(path,"r",encoding="utf-8") as fh:
533
  raw = fh.read()
534
  parsed = None
535
  try:
536
  parsed = json.loads(raw)
537
  except Exception:
538
- parsed=None
539
  if isinstance(parsed, dict):
540
  with MEMORY_LOCK:
541
- for k,v in parsed.get("words",{}).items():
542
- memory["words"][k.lower()] = memory["words"].get(k.lower(),0)+int(v)
543
- for k,v in parsed.get("phrases",{}).items():
544
- memory["phrases"][k] = memory["phrases"].get(k,0)+int(v)
545
  save_memory(memory)
546
- return f"Imported JSON memory (words={len(parsed.get('words',{}))}, phrases={len(parsed.get('phrases',{}))})."
547
- # fallback parse lines
548
- lines=[l.strip() for l in raw.splitlines() if l.strip()]
549
- added=0
550
  with MEMORY_LOCK:
551
  for line in lines:
552
  if "," in line:
553
- k,c = line.split(",",1)
554
- try: cnt=int(c)
555
- except: cnt=1
556
- memory["words"][k.lower()]=memory["words"].get(k.lower(),0)+cnt
 
 
557
  else:
558
- memory["words"][line.lower()]=memory["words"].get(line.lower(),0)+1
559
- added+=1
560
  save_memory(memory)
561
  return f"Imported {added} entries."
562
  except Exception as e:
@@ -565,34 +769,34 @@ with gr.Blocks(title="Whisper Transcriber — Beautiful UI", css=CSS) as demo:
565
  def _add_mem(entry):
566
  if not entry or not entry.strip():
567
  return "No entry provided."
568
- e=entry.strip()
569
  with MEMORY_LOCK:
570
- if len(e.split())<=3:
571
- memory["words"][e.lower()]=memory["words"].get(e.lower(),0)+1
572
  save_memory(memory)
573
  return f"Added word: {e.lower()}"
574
  else:
575
- memory["phrases"][e]=memory["phrases"].get(e,0)+1
576
  save_memory(memory)
577
  return f"Added phrase: {e}"
578
 
579
  def _clear_mem():
580
  global memory
581
  with MEMORY_LOCK:
582
- memory={"words":{}, "phrases":{}}
583
  save_memory(memory)
584
  return "Memory cleared."
585
 
586
  def _view_mem():
587
- w=memory.get("words",{})
588
- p=memory.get("phrases",{})
589
- out=[]
590
  out.append("WORDS (top 30):")
591
- for k,v in sorted(w.items(), key=lambda kv:-kv[1])[:30]:
592
  out.append(f"{k}: {v}")
593
  out.append("")
594
  out.append("PHRASES (top 20):")
595
- for k,v in sorted(p.items(), key=lambda kv:-kv[1])[:20]:
596
  out.append(f"{k}: {v}")
597
  return "\n".join(out)
598
 
@@ -601,8 +805,8 @@ with gr.Blocks(title="Whisper Transcriber — Beautiful UI", css=CSS) as demo:
601
  mem_clear_btn.click(fn=_clear_mem, inputs=[], outputs=[mem_status])
602
  mem_view_btn.click(fn=_view_mem, inputs=[], outputs=[mem_status])
603
 
604
- # ---- Fine-tune Tab (compact) ----
605
- with gr.TabItem("Fine-tune ⚙️"):
606
  with gr.Row():
607
  with gr.Column(scale=1):
608
  with gr.Group(elem_classes="card"):
@@ -612,114 +816,61 @@ with gr.Blocks(title="Whisper Transcriber — Beautiful UI", css=CSS) as demo:
612
  ft_old = gr.File(label="Old files folder (optional)", file_count="single", type="filepath")
613
  ft_prepare_btn = gr.Button("Prepare dataset")
614
  ft_manifest_box = gr.Textbox(label="Prepare status / manifest", lines=4, interactive=False)
615
- ft_model = gr.Dropdown(choices=["small","base","medium","large","large-v3"], value="small", label="Base model")
616
  ft_epochs = gr.Slider(minimum=1, maximum=100, value=3, step=1, label="Epochs")
617
- ft_start = gr.Button("Start (uses fine_tune.py placeholder)")
618
- ft_stop = gr.Button("Stop")
619
- ft_status = gr.Textbox(label="Start/Stop status", lines=4, interactive=False)
620
- ft_logs = gr.Textbox(label="Training logs (tail)", lines=8, interactive=False)
621
-
622
- # Minimal prepare function reusing earlier code (kept compact)
623
- def _prepare_finetune(uploaded_zip, include_old, old_dir):
624
- dst=os.path.join(FINETUNE_WORKDIR,"data")
625
- try:
626
- if os.path.exists(dst): shutil.rmtree(dst)
627
- os.makedirs(dst, exist_ok=True)
628
- except Exception as e:
629
- return f"Workdir creation failed: {e}"
630
- # extract uploaded zip if provided
631
- if uploaded_zip:
632
- zpath = uploaded_zip.name if hasattr(uploaded_zip,"name") else str(uploaded_zip)
633
- if os.path.isfile(zpath) and zpath.lower().endswith(".zip"):
634
- try:
635
- with pyzipper.ZipFile(zpath,"r") as zf:
636
- zf.extractall(dst)
637
- except Exception as e:
638
- return f"ZIP extract failed: {e}"
639
- # include old files if requested
640
- old_msgs=""
641
- if include_old and old_dir:
642
- old_path = old_dir.name if hasattr(old_dir,"name") else str(old_dir)
643
- if os.path.isdir(old_path):
644
- copied=0; msgs=[]
645
- for root,_,files in os.walk(old_path):
646
- for f in files:
647
- if f.lower().endswith((".wav",".mp3",".flac",".m4a",".ogg")):
648
- base=os.path.splitext(f)[0]
649
- src=os.path.join(root,f)
650
- rel = os.path.relpath(root, old_path)
651
- tgt_dir = os.path.join(dst, rel)
652
- os.makedirs(tgt_dir, exist_ok=True)
653
- shutil.copy2(src, os.path.join(tgt_dir,f))
654
- ttxt = os.path.join(root, base+".txt")
655
- if os.path.exists(ttxt):
656
- shutil.copy2(ttxt, os.path.join(tgt_dir, base+".txt"))
657
- copied+=1
658
- old_msgs = f"\nCopied ~{copied} old audio files."
659
- else:
660
- old_msgs = "\nOld-files path not a directory."
661
- # simple manifest builder: audio \t transcript(empty or from .txt)
662
- manifest = os.path.join(FINETUNE_WORKDIR,"manifest.tsv")
663
- auds=[]
664
- for root,_,files in os.walk(dst):
665
- for f in files:
666
- if f.lower().endswith((".wav",".mp3",".flac",".m4a",".ogg")):
667
- auds.append(os.path.join(root,f))
668
- if not auds:
669
- return "No audio files found in prepared dataset."+old_msgs
670
- lines=[]
671
- missing=0
672
- for a in auds:
673
- base=os.path.splitext(a)[0]; tfile=base+".txt"
674
- txt=""
675
- if os.path.exists(tfile):
676
- try:
677
- with open(tfile,"r",encoding="utf-8") as fh: txt=fh.read().strip().replace("\n"," ")
678
- except: txt=""
679
- else:
680
- missing+=1
681
- lines.append(f"{a}\t{txt}")
682
- try:
683
- with open(manifest,"w",encoding="utf-8") as fh: fh.write("\n".join(lines))
684
- except Exception as e:
685
- return f"Manifest write failed: {e}"
686
- out = f"Prepared manifest: {manifest}{old_msgs}"
687
- if missing>0: out += f"\nWarning: {missing} audio files missing transcripts."
688
- return out
689
-
690
- def _start_ft(dummy, model, epochs):
691
- # placeholder: you must supply fine_tune.py in root or change this behavior
692
- manifest_guess = os.path.join(FINETUNE_WORKDIR,"manifest.tsv")
693
  if not os.path.exists(manifest_guess):
694
- return "Manifest not found. Prepare dataset first."
695
- try:
696
- cmd = [sys.executable, "fine_tune.py", "--manifest", manifest_guess, "--base_model", model, "--epochs", str(int(epochs))]
697
- # run in background (simple)
698
- p = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
699
- return f"Started fine-tune (PID {p.pid}). Make sure your training script exists."
700
- except Exception as e:
701
- return f"Failed to start fine-tune: {e}"
702
 
703
- ft_prepare.click(fn=_prepare_finetune, inputs=[ft_upload, ft_include_old, ft_old], outputs=[ft_manifest_box])
704
- ft_start.click(fn=_start_ft, inputs=[ft_manifest_box, ft_model, ft_epochs], outputs=[ft_status])
705
- ft_stop.click(fn=lambda: "Stop not implemented in placeholder", inputs=[], outputs=[ft_status])
 
706
 
707
- # ---- Settings Tab ----
708
- with gr.TabItem("Settings ⚙️"):
709
  with gr.Row():
710
  with gr.Column():
711
- gr.Markdown("### Runtime & tips")
712
- gr.Markdown("- Use large-v3 only if your `whisper` package supports it.")
713
- gr.Markdown("- Extraction writes to system temp `extracted_audio`. Re-extracting overwrites it.")
714
- gr.Markdown("- Keep default ZIP password empty for safety.")
 
715
  with gr.Column():
716
- gr.Markdown("### Diagnostics")
717
- diag_btn = gr.Button("Show memory summary")
718
- diag_out = gr.Textbox(label="Diagnostics", lines=12, interactive=False)
719
- diag_btn.click(fn=lambda: (lambda: view_memory())(), inputs=[], outputs=[diag_out])
 
720
 
721
- # Launch
722
  if __name__ == "__main__":
723
  port = int(os.environ.get("PORT", 7860))
724
  print("DEBUG: launching Gradio on port", port, flush=True)
725
- demo.queue().launch(server_name="0.0.0.0", server_port=port)
 
 
 
 
 
 
1
  # app.py
2
+ # Whisper Transcriber — Full corrected app.py (multi-tab, Audio Transcribe focused)
3
+ # Requirements: gradio, whisper, pydub, pyzipper, python-docx, ffmpeg installed.
4
 
5
  import os
6
  import sys
 
42
  ("pcm_s16le", 44100, 2),
43
  ("mulaw", 8000, 1),
44
  ]
 
45
  MODEL_CACHE = {}
46
  FINETUNE_WORKDIR = os.path.join(tempfile.gettempdir(), "finetune_workdir")
47
  os.makedirs(FINETUNE_WORKDIR, exist_ok=True)
48
 
49
+ # ---------- Helpers: Memory & Postprocessing ----------
50
  def load_memory():
51
  try:
52
  if os.path.exists(MEMORY_FILE):
 
77
 
78
  memory = load_memory()
79
 
80
+ MEDICAL_ABBREVIATIONS = {
81
+ "pt": "patient",
82
+ "dx": "diagnosis",
83
+ "hx": "history",
84
+ "sx": "symptoms",
85
+ "c/o": "complains of",
86
+ "bp": "blood pressure",
87
+ "hr": "heart rate",
88
+ "o2": "oxygen",
89
+ "r/o": "rule out",
90
+ "adm": "admit",
91
+ "disch": "discharge",
92
+ }
93
+
94
+ DRUG_NORMALIZATION = {
95
+ "metformin": "Metformin",
96
+ "aspirin": "Aspirin",
97
+ "amoxicillin": "Amoxicillin",
98
+ }
99
 
100
  def expand_abbreviations(text):
101
  tokens = re.split(r"(\s+)", text)
102
+ out = []
103
  for t in tokens:
104
  key = t.lower().strip(".,;:")
105
  if key in MEDICAL_ABBREVIATIONS:
106
+ trailing = ""
107
+ m = re.match(r"([A-Za-z0-9/]+)([.,;:]*)", t)
108
  if m:
109
+ trailing = m.group(2) or ""
110
+ out.append(MEDICAL_ABBREVIATIONS[key] + trailing)
111
  else:
112
  out.append(t)
113
  return "".join(out)
114
 
115
  def normalize_drugs(text):
116
+ for k, v in DRUG_NORMALIZATION.items():
117
  text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
118
  return text
119
 
 
124
  if not re.search(r"[.?!]\s*$", text):
125
  text = text.rstrip() + "."
126
  parts = re.split(r"([.?!]\s+)", text)
127
+ out = []
128
  for p in parts:
129
  if p and not re.match(r"[.?!]\s+", p):
130
  out.append(p.capitalize())
 
135
  def postprocess_transcript(text):
136
  if not text:
137
  return text
138
+ t = re.sub(r"\s+", " ", text).strip()
139
  t = expand_abbreviations(t)
140
  t = normalize_drugs(t)
141
  t = punctuation_and_capitalization(t)
 
149
  def update_memory_with_transcript(transcript):
150
  global memory
151
  words, sentences = extract_words_and_phrases(transcript)
152
+ changed = False
153
  with MEMORY_LOCK:
154
  for w in words:
155
+ lw = w.lower()
156
+ memory["words"][lw] = memory["words"].get(lw, 0) + 1
157
+ changed = True
158
  for s in sentences:
159
+ memory["phrases"][s] = memory["phrases"].get(s, 0) + 1
160
+ changed = True
161
  if changed:
162
  save_memory(memory)
163
 
164
+ def memory_correct_text(text, min_ratio=0.85):
165
  if not text or (not memory.get("words") and not memory.get("phrases")):
166
  return text
167
+
168
  def fix_word(w):
169
+ lw = w.lower()
170
  if lw in memory["words"]:
171
  return w
172
  candidates = get_close_matches(lw, memory["words"].keys(), n=1, cutoff=min_ratio)
 
176
  return cand.capitalize()
177
  return cand
178
  return w
179
+
180
  tokens = re.split(r"(\W+)", text)
181
+ corrected_tokens = []
182
  for tok in tokens:
183
  if re.match(r"^[A-Za-z0-9\-']+$", tok):
184
+ corrected_tokens.append(fix_word(tok))
185
  else:
186
+ corrected_tokens.append(tok)
187
+ corrected = "".join(corrected_tokens)
188
+
189
+ for phrase in sorted(memory.get("phrases", {}).keys(), key=lambda s: -len(s)):
190
  low_phrase = phrase.lower()
191
  if len(low_phrase) < 8:
192
  continue
193
+ if low_phrase in corrected.lower():
194
+ corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
195
+ return corrected
196
+
197
+ # ---------- File utilities ----------
198
def save_as_word(text, filename=None):
    """Write ``text`` into a single-paragraph .docx file and return its path.

    When ``filename`` is None, a fixed path in the system temp directory is
    used (and silently overwritten on each call).
    """
    if filename is None:
        filename = os.path.join(tempfile.gettempdir(), "merged_transcripts.docx")
    document = Document()
    document.add_paragraph(text)
    document.save(filename)
    return filename
205
 
206
  # ---------- Conversion helpers ----------
 
207
  def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
 
 
 
 
 
208
  try:
209
+ cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"]
210
+ if fmt in ("s16le", "pcm_s16le", "mulaw"):
211
+ cmd += ["-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path, out_path]
212
+ else:
213
+ cmd += ["-i", input_path, "-ar", str(sr), "-ac", str(ch), out_path]
214
  proc = subprocess.run(cmd, capture_output=True, timeout=60, text=True)
215
+ stdout_stderr = (proc.stdout or "") + (proc.stderr or "")
216
+ if proc.returncode == 0 and os.path.exists(out_path) and os.path.getsize(out_path) > MIN_WAV_SIZE:
217
  return True, stdout_stderr
218
  else:
219
  try:
220
+ if os.path.exists(out_path):
221
+ os.unlink(out_path)
222
+ except Exception:
223
+ pass
224
  return False, stdout_stderr
225
  except Exception as e:
226
  try:
227
+ if os.path.exists(out_path):
228
+ os.unlink(out_path)
229
+ except Exception:
230
+ pass
231
  return False, str(e)
232
 
233
def convert_to_wav_if_needed(input_path):
    """Return a path to WAV audio for ``input_path``.

    Inputs already named *.wav are returned unchanged. Otherwise pydub's
    auto-detection is tried first; if that fails, each ffmpeg recipe in
    FFMPEG_CANDIDATES is attempted in turn. On total failure, diagnostics
    (per-attempt ffmpeg output, ffprobe output and a hex preview of the file
    head) are written to a temp log and an Exception naming it is raised.
    """
    input_path = str(input_path)
    if input_path.lower().endswith(".wav"):
        return input_path

    pydub_error = ""
    candidate = None
    try:
        candidate = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        candidate.close()
        AudioSegment.from_file(input_path).export(candidate.name, format="wav")
        if os.path.exists(candidate.name) and os.path.getsize(candidate.name) > MIN_WAV_SIZE:
            return candidate.name
        try:
            os.unlink(candidate.name)
        except Exception:
            pass
    except Exception:
        pydub_error = traceback.format_exc()
        try:
            if candidate and os.path.exists(candidate.name):
                os.unlink(candidate.name)
        except Exception:
            pass

    diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
    diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
    attempts = []
    for fmt, sr, ch in FFMPEG_CANDIDATES:
        out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        out_wav.close()
        ok, debug = _ffmpeg_convert(input_path, out_wav.name, fmt, sr, ch)
        attempts.append(f"TRY fmt={fmt} sr={sr} ch={ch} success={ok}\n{debug}\n")
        if ok:
            # Best effort: record which recipe worked, then hand back the WAV.
            try:
                with open(diag_log, "w", encoding="utf-8") as fh:
                    fh.write("pydub auto error:\n")
                    fh.write(pydub_error + "\n\n")
                    fh.write("Successful ffmpeg candidate:\n")
                    fh.write(f"fmt={fmt} sr={sr} ch={ch}\n\n")
                    fh.write("Diagnostics:\n")
                    fh.write("\n".join(attempts))
            except Exception:
                pass
            return out_wav.name
        try:
            if os.path.exists(out_wav.name):
                os.unlink(out_wav.name)
        except Exception:
            pass

    # Nothing worked: gather forensic information about the input file.
    try:
        fp = subprocess.run(
            ["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
            capture_output=True,
            text=True,
            timeout=10,
        )
        attempts.append("FFPROBE:\n" + (fp.stdout.strip() or fp.stderr.strip()))
    except Exception as e:
        attempts.append("ffprobe failed: " + str(e))
    try:
        with open(input_path, "rb") as fh:
            head = fh.read(512)
        attempts.append("HEX PREVIEW:\n" + head.hex())
    except Exception as e:
        attempts.append("could not read head: " + str(e))

    try:
        with open(diag_log, "w", encoding="utf-8") as fh:
            fh.write("pydub auto error:\n")
            fh.write(pydub_error + "\n\n")
            fh.write("Full diagnostics:\n\n")
            fh.write("\n\n".join(attempts))
    except Exception as e:
        raise Exception(f"Conversion failed; diagnostics write error: {e}")

    raise Exception(f"Could not convert file to WAV. Diagnostics saved to: {diag_log}")
314
 
315
  # ---------- Whisper model loader ----------
316
def get_whisper_model(name, device=None):
    """Load a whisper model by name, caching it in MODEL_CACHE.

    Repeated requests for the same name reuse the already-loaded weights.
    ``device`` is forwarded when truthy; older whisper releases whose
    ``load_model`` lacks a device parameter are handled by retrying without it.
    """
    if name in MODEL_CACHE:
        return MODEL_CACHE[name]
    print(f"DEBUG: loading whisper model '{name}'", flush=True)
    try:
        if device:
            loaded = whisper.load_model(name, device=device)
        else:
            loaded = whisper.load_model(name)
    except TypeError:
        # Some whisper versions don't accept the device argument.
        loaded = whisper.load_model(name)
    MODEL_CACHE[name] = loaded
    return MODEL_CACHE[name]
328
 
329
+ # ---------- ZIP extraction helper ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  def extract_zip_list(zip_file, zip_password):
331
  temp_extract_dir = os.path.join(tempfile.gettempdir(), "extracted_audio")
332
  try:
333
  if os.path.exists(temp_extract_dir):
334
+ try:
335
+ shutil.rmtree(temp_extract_dir)
336
+ except Exception:
337
+ pass
338
  os.makedirs(temp_extract_dir, exist_ok=True)
339
+ extracted = []
340
+ logs = []
341
  with pyzipper.ZipFile(zip_file, "r") as zf:
342
  if zip_password:
343
+ try:
344
+ zf.setpassword(zip_password.encode())
345
+ except Exception:
346
+ logs.append("Warning: failed to set zip password (unexpected).")
347
+ exts = [".mp3", ".wav", ".aac", ".flac", ".ogg", ".m4a", ".dat", ".dct"]
348
  for info in zf.infolist():
349
+ if info.is_dir():
350
+ continue
351
  _, ext = os.path.splitext(info.filename)
352
  if ext.lower() in exts:
353
  try:
 
355
  except RuntimeError as e:
356
  logs.append(f"Password required/incorrect for {info.filename}: {e}")
357
  continue
358
+ except pyzipper.BadZipFile:
359
+ logs.append(f"Bad zip entry: {info.filename}")
360
+ continue
361
  except Exception as e:
362
  logs.append(f"Error extracting {info.filename}: {e}")
363
  continue
 
373
  traceback.print_exc()
374
  return [], f"Extraction failed: {e}"
375
 
376
+ # ---------- Simple single-file transcriber ----------
377
def transcribe_single(audio_path, model_name="small", enable_memory=False, device_choice="auto"):
    """Transcribe one audio file with whisper.

    Args:
        audio_path: Path to the uploaded/recorded audio file.
        model_name: Whisper checkpoint to load (cached across calls).
        enable_memory: When True, apply learned corrections to the text and
            feed the result back into the correction memory (best effort).
        device_choice: "auto" lets whisper pick; otherwise "cpu"/"cuda".

    Returns:
        (player_path, transcript, logs) on success, or (None, "", error_text)
        on failure — mirroring the three UI outputs.
    """
    logs = []
    wav = None
    try:
        if not audio_path:
            return None, "No audio provided.", "No file provided."
        path = str(audio_path)
        device = None if device_choice == "auto" else device_choice
        model = get_whisper_model(model_name, device=device)
        logs.append(f"Loaded model: {model_name}")
        wav = convert_to_wav_if_needed(path)
        logs.append(f"Converted to WAV: {os.path.basename(wav)}")
        result = model.transcribe(wav)
        text = result.get("text", "").strip()
        if enable_memory:
            text = memory_correct_text(text)
        text = postprocess_transcript(text)
        if enable_memory:
            try:
                update_memory_with_transcript(text)
                logs.append("Memory updated.")
            except Exception:
                pass
        return path, text, "\n".join(logs)
    except Exception as e:
        tb = traceback.format_exc()
        return None, "", f"Error: {e}\n{tb}"
    finally:
        # FIX: remove the temporary WAV even when transcription raised;
        # previously the temp file leaked on any error after conversion.
        if wav and wav != str(audio_path) and os.path.exists(wav):
            try:
                os.unlink(wav)
            except Exception:
                pass
411
+
412
+ # ---------- Fine-tune helpers (include old-files support) ----------
413
+ def _collect_old_files_into(dst_dir, old_dir_path):
414
+ msgs = []
415
+ copied = 0
416
+ try:
417
+ if not os.path.isdir(old_dir_path):
418
+ return 0, f"Old-files path is not a directory: {old_dir_path}"
419
+ for root, _, files in os.walk(old_dir_path):
420
+ for f in files:
421
+ if f.lower().endswith((".wav", ".mp3", ".flac", ".m4a", ".ogg")):
422
+ src_audio = os.path.join(root, f)
423
+ base = os.path.splitext(f)[0]
424
+ possible_txt = os.path.join(root, base + ".txt")
425
+ rel_subdir = os.path.relpath(root, old_dir_path)
426
+ target_subdir = os.path.join(dst_dir, rel_subdir)
427
+ os.makedirs(target_subdir, exist_ok=True)
428
+ target_audio = os.path.join(target_subdir, f)
429
+ shutil.copy2(src_audio, target_audio)
430
+ if os.path.exists(possible_txt):
431
+ shutil.copy2(possible_txt, os.path.join(target_subdir, base + ".txt"))
432
+ msgs.append(f"Copied pair: {os.path.join(rel_subdir, f)} + .txt")
433
+ else:
434
+ msgs.append(f"Copied audio (no transcript found): {os.path.join(rel_subdir, f)}")
435
+ copied += 1
436
+ return copied, "\n".join(msgs)
437
+ except Exception as e:
438
+ traceback.print_exc()
439
+ return copied, f"Error copying old files: {e}"
440
+
441
def prepare_finetune_dataset(uploaded_zip_or_dir, include_old_files=False, old_files_dir=""):
    """Stage a fine-tuning dataset under FINETUNE_WORKDIR and build a manifest.

    Accepts a ZIP upload or a directory (given as a path, a file-like object
    with ``.name``, or a Gradio upload dict). Optionally merges in an
    "old files" directory of audio + .txt transcript pairs. If the dataset
    ships no manifest, one is synthesized from <audio>.txt siblings
    (missing transcripts are recorded as empty strings).

    Returns:
        (status_message, manifest_path): manifest_path is "" on failure.
    """
    data_dir = os.path.join(FINETUNE_WORKDIR, "data")
    try:
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.makedirs(data_dir, exist_ok=True)
    except Exception as e:
        return f"Failed to prepare workdir: {e}", ""

    # Resolve the upload to a filesystem path, whatever shape Gradio gave us.
    src_path = None
    try:
        if uploaded_zip_or_dir:
            if isinstance(uploaded_zip_or_dir, (str, os.PathLike)):
                src_path = str(uploaded_zip_or_dir)
            elif hasattr(uploaded_zip_or_dir, "name"):
                src_path = uploaded_zip_or_dir.name
            elif isinstance(uploaded_zip_or_dir, dict) and uploaded_zip_or_dir.get("name"):
                src_path = uploaded_zip_or_dir["name"]
    except Exception as e:
        return f"Unable to determine uploaded path: {e}", ""

    # Extract (ZIP) or copy (directory) the uploaded dataset, if any.
    if src_path and os.path.isfile(src_path) and src_path.lower().endswith(".zip"):
        try:
            with pyzipper.ZipFile(src_path, "r") as zf:
                zf.extractall(data_dir)
        except Exception as e:
            return f"Failed to extract ZIP: {e}", ""
    elif src_path and os.path.isdir(src_path):
        try:
            for item in os.listdir(src_path):
                entry_src = os.path.join(src_path, item)
                entry_dst = os.path.join(data_dir, item)
                if os.path.isdir(entry_src):
                    shutil.copytree(entry_src, entry_dst)
                else:
                    shutil.copy2(entry_src, entry_dst)
        except Exception as e:
            return f"Failed to copy dataset dir: {e}", ""

    # Merge historical audio/transcript pairs when requested.
    old_msgs = ""
    if include_old_files and old_files_dir:
        old_path = None
        if isinstance(old_files_dir, (str, os.PathLike)):
            old_path = str(old_files_dir)
        elif hasattr(old_files_dir, "name"):
            old_path = old_files_dir.name
        elif isinstance(old_files_dir, dict) and old_files_dir.get("name"):
            old_path = old_files_dir["name"]
        if old_path:
            copied, msg = _collect_old_files_into(data_dir, old_path)
            old_msgs = f"\nOld-files: copied {copied} audio files.\nDetails:\n{msg}"

    # Prefer a manifest that shipped with the dataset.
    manifest_path = os.path.join(FINETUNE_WORKDIR, "manifest.tsv")
    shipped_manifests = [
        os.path.join(data_dir, "transcripts.tsv"),
        os.path.join(data_dir, "metadata.tsv"),
        os.path.join(data_dir, "manifest.tsv"),
        os.path.join(data_dir, "transcripts.txt"),
        os.path.join(data_dir, "manifest.jsonl"),
    ]
    found = False
    for candidate in shipped_manifests:
        if os.path.exists(candidate):
            try:
                shutil.copy2(candidate, manifest_path)
                found = True
                break
            except Exception:
                pass

    # Otherwise synthesize one from audio files + sibling .txt transcripts.
    missing_transcripts = 0
    if not found:
        audio_files = []
        for root, _, files in os.walk(data_dir):
            for f in files:
                if f.lower().endswith((".wav", ".mp3", ".flac", ".m4a", ".ogg")):
                    audio_files.append(os.path.join(root, f))
        if not audio_files:
            return f"No audio files found in dataset.{old_msgs}", ""
        rows = []
        for audio in audio_files:
            txt_path = os.path.splitext(audio)[0] + ".txt"
            transcript = ""
            if os.path.exists(txt_path):
                try:
                    with open(txt_path, "r", encoding="utf-8") as fh:
                        transcript = fh.read().strip().replace("\n", " ")
                except Exception:
                    transcript = ""
            else:
                missing_transcripts += 1
            rows.append(f"{audio}\t{transcript}")
        try:
            with open(manifest_path, "w", encoding="utf-8") as fh:
                fh.write("\n".join(rows))
            found = True
        except Exception as e:
            return f"Failed to write manifest: {e}{old_msgs}", ""

    if not found:
        return f"Failed to locate or build manifest.{old_msgs}", ""
    status_msg = f"Dataset prepared. Manifest: {manifest_path}{old_msgs}"
    if missing_transcripts > 0:
        status_msg += f"\nWarning: {missing_transcripts} audio files have no matching .txt transcript (empty transcripts saved)."
    return status_msg, manifest_path
544
+
545
def start_finetune(manifest_path, base_model, epochs, batch_size, lr, output_dir):
    """Launch ``fine_tune.py`` as a detached subprocess.

    Args mirror the CLI of the user-supplied training script; ``output_dir``
    falls back to a directory under FINETUNE_WORKDIR. The trainer's
    stdout/stderr are appended to <outdir>/finetune_stdout.log.

    Returns:
        A human-readable status string (started PID, or the failure reason).
    """
    outdir = output_dir or os.path.join(FINETUNE_WORKDIR, "output")
    os.makedirs(outdir, exist_ok=True)
    START_CMD = [
        sys.executable,
        "fine_tune.py",
        "--manifest", manifest_path,
        "--base_model", base_model,
        "--epochs", str(epochs),
        "--batch_size", str(batch_size),
        "--lr", str(lr),
        "--output_dir", outdir,
    ]
    log_path = os.path.join(outdir, "finetune_stdout.log")
    try:
        # FIX: close the parent's log handle once Popen has inherited it —
        # previously the file object leaked for the lifetime of the app.
        with open(log_path, "a", encoding="utf-8") as logfile:
            proc = subprocess.Popen(START_CMD, stdout=logfile, stderr=logfile, cwd=os.getcwd())
        return f"Fine-tune started (PID={proc.pid}). Logs: {log_path}"
    except FileNotFoundError as e:
        return f"Training script not found: {e}. Put 'fine_tune.py' in project root or change START_CMD."
    except Exception as e:
        return f"Failed to start fine-tune: {e}"
572
+
573
def tail_finetune_logs(logpath, lines=200):
    """Return the last ``lines`` lines of a training log.

    Yields "No logs yet." when the file does not exist, and an error string
    (never an exception) if the file cannot be read.
    """
    try:
        if not os.path.exists(logpath):
            return "No logs yet."
        with open(logpath, "r", encoding="utf-8", errors="ignore") as fh:
            content = fh.read().splitlines()
        return "\n".join(content[-lines:])
    except Exception as e:
        return f"Failed to read logs: {e}"
583
+
584
+ # ---------- UI CSS ----------
585
  CSS = """
586
  :root{
587
  --accent:#4f46e5;
588
  --muted:#6b7280;
589
  --card:#ffffff;
590
  --bg:#f7f8fb;
 
591
  }
592
  body { background: var(--bg); font-family: Inter, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
593
+ .header { padding: 18px 24px; border-radius: 12px; background: linear-gradient(90deg, rgba(79,70,229,0.12), rgba(99,102,241,0.04)); margin-bottom: 18px; display:flex;align-items:center;gap:16px; }
594
+ .app-icon { width:62px;height:62px;border-radius:12px;background:linear-gradient(135deg,var(--accent),#06b6d4);display:flex;align-items:center;justify-content:center;color:white;font-weight:700;font-size:24px; }
 
 
 
 
 
 
 
 
595
  .header-title h1 { margin:0;font-size:20px;}
596
  .header-sub { color:var(--muted); margin-top:4px;font-size:13px;}
597
  .card { background:var(--card); border-radius:12px; padding:14px; box-shadow: 0 6px 20px rgba(16,24,40,0.06); }
 
 
 
598
  .transcript-area { white-space:pre-wrap; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, "Roboto Mono", monospace; background:#0f172a; color:#e6eef8; padding:12px; border-radius:10px; min-height:220px; }
 
599
  .small-note { color:var(--muted); font-size:12px;}
 
 
600
  """
601
 
602
+ # ---------- Build UI ----------
603
+ print("DEBUG: building Gradio Blocks", flush=True)
604
+ with gr.Blocks(title="Whisper Transcriber", css=CSS) as demo:
605
  # Header
606
  with gr.Row(elem_classes="header"):
607
  with gr.Column(scale=0):
608
  gr.HTML("<div class='app-icon'>WT</div>")
609
+ with gr.Column():
610
  gr.HTML("<h1 style='margin:0'>Whisper Transcriber</h1>")
611
+ gr.Markdown("<div class='header-sub'>Transcribe, batch, memory & fine-tune — multi-tab UI</div>")
612
 
613
  with gr.Tabs():
614
+ # Audio Transcribe Tab
615
+ with gr.TabItem("Audio Transcribe"):
616
  with gr.Row():
 
617
  with gr.Column(scale=1):
 
618
  with gr.Group(elem_classes="card"):
619
+ gr.Markdown("### Quick Single Audio Transcribe")
620
  single_audio = gr.Audio(label="Upload or record audio", type="filepath")
621
  with gr.Row():
622
  model_select = gr.Dropdown(choices=["small","medium","large","large-v3","base"], value="large-v3", label="Model")
623
  device_select = gr.Dropdown(choices=["auto","cpu","cuda"], value="auto", label="Device")
624
  with gr.Row():
625
  mem_toggle = gr.Checkbox(label="Enable correction memory", value=False)
626
+ format_choice = gr.Dropdown(choices=["Plain","SOAP (medical)"], value="Plain", label="Format")
627
+ transcribe_btn = gr.Button("Transcribe", variant="primary")
628
+ gr.Markdown("<div class='small-note'>Tip: choose large-v3 if your environment supports it.</div>")
 
 
629
  with gr.Column(scale=1):
 
 
 
 
 
 
 
630
  with gr.Group(elem_classes="card"):
631
+ gr.Markdown("### Player & Transcript")
632
+ audio_preview = gr.Audio(label="Player", interactive=False)
633
+ transcript_out = gr.Textbox(label="Transcript", lines=14, interactive=False, elem_classes="transcript-area")
634
  transcript_logs = gr.Textbox(label="Logs", lines=6, interactive=False)
635
 
 
636
  def _do_single_transcribe(audio_file, model_name, device_choice, enable_memory, fmt_choice):
637
  player_path, transcript, logs = transcribe_single(audio_file, model_name=model_name, enable_memory=enable_memory, device_choice=device_choice)
638
  if fmt_choice == "SOAP":
 
639
  sentences = re.split(r"(?<=[.?!])\s+", transcript)
640
  subj = sentences[0] if sentences else ""
641
  obj = sentences[1] if len(sentences) > 1 else ""
 
643
  transcript = soap
644
  return player_path, transcript, logs
645
 
646
+ transcribe_btn.click(fn=_do_single_transcribe, inputs=[single_audio, model_select, device_select, mem_toggle, format_choice], outputs=[audio_preview, transcript_out, transcript_logs])
647
 
648
+ # Batch Transcribe Tab
649
+ with gr.TabItem("Batch Transcribe"):
650
  with gr.Row():
651
  with gr.Column(scale=1):
652
  with gr.Group(elem_classes="card"):
653
+ gr.Markdown("### Batch / ZIP workflow")
654
+ batch_files = gr.File(label="Upload multiple audio files (optional)", file_count="multiple", type="filepath")
655
+ batch_zip = gr.File(label="Or upload ZIP with audio (optional)", file_count="single", type="filepath")
656
+ zip_password = gr.Textbox(label="ZIP password (optional)")
657
+ with gr.Row():
658
+ batch_model = gr.Dropdown(choices=["small","medium","large","large-v3","base"], value="small", label="Model")
659
+ batch_device = gr.Dropdown(choices=["auto","cpu","cuda"], value="auto", label="Device")
660
+ batch_merge = gr.Checkbox(label="Merge all transcripts into one .docx", value=True)
661
  batch_mem = gr.Checkbox(label="Enable memory corrections", value=False)
662
+ batch_extract_btn = gr.Button("Extract ZIP & List Files")
663
  batch_extract_logs = gr.Textbox(label="Extraction logs", lines=6, interactive=False)
664
+ batch_select = gr.CheckboxGroup(choices=[], label="Select extracted files to transcribe", interactive=True)
665
+ batch_trans_btn = gr.Button("Start Batch Transcription", variant="primary")
666
  with gr.Column(scale=1):
667
  with gr.Group(elem_classes="card"):
668
+ gr.Markdown("### Output")
669
+ batch_trans_out = gr.Textbox(label="Transcript (combined)", lines=16, interactive=False)
670
  batch_logs = gr.Textbox(label="Logs", lines=10, interactive=False)
671
  batch_download = gr.File(label="Merged .docx (when available)")
672
 
 
675
  return [], "No zip provided."
676
  zip_path = zip_file.name if hasattr(zip_file, "name") else str(zip_file)
677
  extracted, logs = extract_zip_list(zip_path, password)
678
+ short_logs = logs + "\n\nFiles:\n" + "\n".join([os.path.basename(p) for p in extracted])
679
+ return extracted, short_logs
680
 
681
  batch_extract_btn.click(fn=_extract_zip_for_ui, inputs=[batch_zip, zip_password], outputs=[batch_select, batch_extract_logs])
682
 
683
+ def _batch_transcribe(selected_check, uploaded_files, model_name, device_name, merge_flag, enable_mem):
684
+ paths = []
 
 
685
  if selected_check:
686
  paths.extend(selected_check)
687
  if uploaded_files:
688
+ if isinstance(uploaded_files, (list, tuple)):
689
+ for x in uploaded_files:
690
+ paths.append(str(x))
691
  else:
692
  paths.append(str(uploaded_files))
693
+ if not paths:
694
+ return "", "No files selected or uploaded.", None
695
+ logs = []
696
+ transcripts = []
697
+ out_doc = None
698
  for p in paths:
699
  try:
700
  _, txt, lg = transcribe_single(p, model_name=model_name, enable_memory=enable_mem, device_choice=device_name)
 
711
  logs.append(f"Merge failed: {e}")
712
  return combined, "\n".join(logs), out_doc
713
 
714
+ batch_trans_btn.click(fn=_batch_transcribe, inputs=[batch_select, batch_files, batch_model, batch_device, batch_merge, batch_mem], outputs=[batch_trans_out, batch_logs, batch_download])
715
 
716
+ # Memory Tab
717
+ with gr.TabItem("Memory"):
718
  with gr.Row():
719
  with gr.Column(scale=1):
720
  with gr.Group(elem_classes="card"):
 
730
  def _import_mem(uploaded):
731
  if not uploaded:
732
  return "No file provided."
733
+ path = uploaded.name if hasattr(uploaded, "name") else str(uploaded)
734
  try:
735
+ with open(path, "r", encoding="utf-8") as fh:
736
  raw = fh.read()
737
  parsed = None
738
  try:
739
  parsed = json.loads(raw)
740
  except Exception:
741
+ parsed = None
742
  if isinstance(parsed, dict):
743
  with MEMORY_LOCK:
744
+ for k, v in parsed.get("words", {}).items():
745
+ memory["words"][k.lower()] = memory["words"].get(k.lower(), 0) + int(v)
746
+ for k, v in parsed.get("phrases", {}).items():
747
+ memory["phrases"][k] = memory["phrases"].get(k, 0) + int(v)
748
  save_memory(memory)
749
+ return f"Imported JSON memory (words={len(parsed.get('words', {}))}, phrases={len(parsed.get('phrases', {}))})."
750
+ lines = [l.strip() for l in raw.splitlines() if l.strip()]
751
+ added = 0
 
752
  with MEMORY_LOCK:
753
  for line in lines:
754
  if "," in line:
755
+ k, c = line.split(",", 1)
756
+ try:
757
+ cnt = int(c)
758
+ except:
759
+ cnt = 1
760
+ memory["words"][k.lower()] = memory["words"].get(k.lower(), 0) + cnt
761
  else:
762
+ memory["words"][line.lower()] = memory["words"].get(line.lower(), 0) + 1
763
+ added += 1
764
  save_memory(memory)
765
  return f"Imported {added} entries."
766
  except Exception as e:
 
769
  def _add_mem(entry):
770
  if not entry or not entry.strip():
771
  return "No entry provided."
772
+ e = entry.strip()
773
  with MEMORY_LOCK:
774
+ if len(e.split()) <= 3:
775
+ memory["words"][e.lower()] = memory["words"].get(e.lower(), 0) + 1
776
  save_memory(memory)
777
  return f"Added word: {e.lower()}"
778
  else:
779
+ memory["phrases"][e] = memory["phrases"].get(e, 0) + 1
780
  save_memory(memory)
781
  return f"Added phrase: {e}"
782
 
783
  def _clear_mem():
784
  global memory
785
  with MEMORY_LOCK:
786
+ memory = {"words": {}, "phrases": {}}
787
  save_memory(memory)
788
  return "Memory cleared."
789
 
790
  def _view_mem():
791
+ w = memory.get("words", {})
792
+ p = memory.get("phrases", {})
793
+ out = []
794
  out.append("WORDS (top 30):")
795
+ for k, v in sorted(w.items(), key=lambda kv: -kv[1])[:30]:
796
  out.append(f"{k}: {v}")
797
  out.append("")
798
  out.append("PHRASES (top 20):")
799
+ for k, v in sorted(p.items(), key=lambda kv: -kv[1])[:20]:
800
  out.append(f"{k}: {v}")
801
  return "\n".join(out)
802
 
 
805
  mem_clear_btn.click(fn=_clear_mem, inputs=[], outputs=[mem_status])
806
  mem_view_btn.click(fn=_view_mem, inputs=[], outputs=[mem_status])
807
 
808
+ # Fine-tune Tab
809
+ with gr.TabItem("Fine-tune"):
810
  with gr.Row():
811
  with gr.Column(scale=1):
812
  with gr.Group(elem_classes="card"):
 
816
  ft_old = gr.File(label="Old files folder (optional)", file_count="single", type="filepath")
817
  ft_prepare_btn = gr.Button("Prepare dataset")
818
  ft_manifest_box = gr.Textbox(label="Prepare status / manifest", lines=4, interactive=False)
819
+ ft_base_model = gr.Dropdown(choices=["small","base","medium","large","large-v3"], value="small", label="Base model")
820
  ft_epochs = gr.Slider(minimum=1, maximum=100, value=3, step=1, label="Epochs")
821
+ ft_batch = gr.Number(label="Batch size", value=8)
822
+ ft_lr = gr.Number(label="Learning rate", value=1e-5, precision=8)
823
+ ft_output_dir = gr.Textbox(label="Output dir (optional)", value="", placeholder="Leave blank to use temp output")
824
+ ft_start_btn = gr.Button("Start Fine-tune")
825
+ ft_stop_btn = gr.Button("Stop Fine-tune")
826
+ ft_start_status = gr.Textbox(label="Start/Stop status", interactive=False, lines=4)
827
+ ft_tail_btn = gr.Button("Tail training logs")
828
+ ft_logs = gr.Textbox(label="Training logs (tail)", interactive=False, lines=12)
829
+ with gr.Column(scale=1):
830
+ with gr.Group(elem_classes="card"):
831
+ gr.Markdown("### Notes")
832
+ gr.Markdown("- Old-files folder should contain audio files and matching .txt transcripts with the same basename.")
833
+ gr.Markdown("- The app prepares a manifest and calls your `fine_tune.py` training script (you must provide it).")
834
+
835
+ def _prepare_action(ft_upload_file, include_old, old_dir):
836
+ status, manifest = prepare_finetune_dataset(ft_upload_file, include_old_files=include_old, old_files_dir=old_dir)
837
+ return status
838
+
839
+ def _start_action(manifest_text, base_model, epochs, batch_size, lr, output_dir):
840
+ manifest_guess = os.path.join(FINETUNE_WORKDIR, "manifest.tsv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
841
  if not os.path.exists(manifest_guess):
842
+ return "Manifest not found. Prepare dataset first or manually provide manifest."
843
+ status = start_finetune(manifest_guess, base_model, int(epochs), int(batch_size), float(lr), output_dir)
844
+ return status
 
 
 
 
 
845
 
846
+ ft_prepare_btn.click(fn=_prepare_action, inputs=[ft_upload, ft_include_old, ft_old], outputs=[ft_manifest_box])
847
+ ft_start_btn.click(fn=_start_action, inputs=[ft_manifest_box, ft_base_model, ft_epochs, ft_batch, ft_lr, ft_output_dir], outputs=[ft_start_status])
848
+ ft_stop_btn.click(fn=lambda: "Stop not implemented in placeholder", inputs=[], outputs=[ft_start_status])
849
+ ft_tail_btn.click(fn=lambda: "Tail logs not implemented in placeholder", inputs=[], outputs=[ft_logs])
850
 
851
+ # Settings Tab
852
+ with gr.TabItem("Settings"):
853
  with gr.Row():
854
  with gr.Column():
855
+ with gr.Group(elem_classes="card"):
856
+ gr.Markdown("### Runtime & tips")
857
+ gr.Markdown("- Use large-v3 only if your whisper package supports it.")
858
+ gr.Markdown("- Extraction writes to system temp `extracted_audio`. Re-extracting overwrites it.")
859
+ gr.Markdown("- Provide your `fine_tune.py` for real fine-tuning.")
860
  with gr.Column():
861
+ with gr.Group(elem_classes="card"):
862
+ gr.Markdown("### Diagnostics")
863
+ diag_btn = gr.Button("Show memory summary")
864
+ diag_out = gr.Textbox(label="Diagnostics", lines=12, interactive=False)
865
+ diag_btn.click(fn=lambda: _view_mem(), inputs=[], outputs=[diag_out])
866
 
867
+ # ---------- Launch ----------
868
if __name__ == "__main__":
    # Entry point: serve the Gradio app on $PORT (default 7860), binding all
    # interfaces so it works inside containers/Spaces.
    serve_port = int(os.environ.get("PORT", 7860))
    print("DEBUG: launching Gradio on port", serve_port, flush=True)
    try:
        demo.queue().launch(server_name="0.0.0.0", server_port=serve_port)
    except Exception as exc:
        print("FATAL: demo.launch failed:", exc, flush=True)
        traceback.print_exc()
        raise