staraks committed on
Commit
b7cd478
·
verified ·
1 Parent(s): 68f2294

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -1243
app.py CHANGED
@@ -1,1116 +1,3 @@
1
- Skip to content
2
- Chat history
3
-
4
- You said:
5
- # app.py
6
- # Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
7
- # Clean, single-version file for Hugging Face Spaces.
8
-
9
- import os
10
- import sys
11
- import json
12
- import shutil
13
- import tempfile
14
- import subprocess
15
- import traceback
16
- import threading
17
- import re
18
- from difflib import get_close_matches
19
-
20
- # Force unbuffered output so container logs show prints immediately
21
- os.environ["PYTHONUNBUFFERED"] = "1"
22
-
23
- print("DEBUG: app.py bootstrap starting", flush=True)
24
-
25
- # Third-party imports (must be installed in the environment)
26
- try:
27
- from docx import Document
28
- import whisper
29
- import gradio as gr
30
- import pyzipper
31
- from pydub import AudioSegment
32
- except Exception as e:
33
- print("FATAL: import error for third-party libs:", e, flush=True)
34
- traceback.print_exc()
35
- raise
36
-
37
- print("DEBUG: imports OK", flush=True)
38
-
39
- # ---------- Config ----------
40
- MEMORY_FILE = "memory.json"
41
- MEMORY_LOCK = threading.Lock()
42
- MIN_WAV_SIZE = 200 # bytes
43
-
44
- # Small ffmpeg fallback grid (hybrid conversion)
45
- FFMPEG_CANDIDATES = [
46
- ("s16le", 16000, 1),
47
- ("s16le", 44100, 2),
48
- ("pcm_s16le", 16000, 1),
49
- ("pcm_s16le", 44100, 2),
50
- ("mulaw", 8000, 1),
51
- ]
52
- # ----------------------------
53
-
54
- # ---------- Memory helpers ----------
55
- def load_memory():
56
- try:
57
- if os.path.exists(MEMORY_FILE):
58
- with open(MEMORY_FILE, "r", encoding="utf-8") as fh:
59
- return json.load(fh)
60
- except Exception:
61
- pass
62
- mem = {"words": {}, "phrases": {}}
63
- try:
64
- with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
65
- json.dump(mem, fh, ensure_ascii=False, indent=2)
66
- except Exception:
67
- pass
68
- return mem
69
-
70
-
71
- def save_memory(mem):
72
- with MEMORY_LOCK:
73
- with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
74
- json.dump(mem, fh, ensure_ascii=False, indent=2)
75
-
76
-
77
- memory = load_memory()
78
- print(
79
- "DEBUG: memory loaded (words=%d phrases=%d)"
80
- % (len(memory.get("words", {})), len(memory.get("phrases", {}))),
81
- flush=True,
82
- )
83
-
84
- # ---------- Postprocessing ----------
85
- MEDICAL_ABBREVIATIONS = {
86
- "pt": "patient",
87
- "dx": "diagnosis",
88
- "hx": "history",
89
- "sx": "symptoms",
90
- "c/o": "complains of",
91
- "bp": "blood pressure",
92
- "hr": "heart rate",
93
- "o2": "oxygen",
94
- "r/o": "rule out",
95
- "adm": "admit",
96
- "disch": "discharge",
97
- }
98
-
99
- DRUG_NORMALIZATION = {
100
- "metformin": "Metformin",
101
- "aspirin": "Aspirin",
102
- "amoxicillin": "Amoxicillin",
103
- }
104
-
105
-
106
- def expand_abbreviations(text):
107
- tokens = re.split(r"(\s+)", text)
108
- out = []
109
- for t in tokens:
110
- key = t.lower().strip(".,;:")
111
- if key in MEDICAL_ABBREVIATIONS:
112
- trailing = ""
113
- m = re.match(r"([A-Za-z0-9/]+)([.,;:]*)", t)
114
- if m:
115
- trailing = m.group(2) or ""
116
- out.append(MEDICAL_ABBREVIATIONS[key] + trailing)
117
- else:
118
- out.append(t)
119
- return "".join(out)
120
-
121
-
122
- def normalize_drugs(text):
123
- for k, v in DRUG_NORMALIZATION.items():
124
- text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
125
- return text
126
-
127
-
128
- def punctuation_and_capitalization(text):
129
- text = text.strip()
130
- if not text:
131
- return text
132
- if not re.search(r"[.?!]\s*$", text):
133
- text = text.rstrip() + "."
134
- parts = re.split(r"([.?!]\s+)", text)
135
- out = []
136
- for p in parts:
137
- if p and not re.match(r"[.?!]\s+", p):
138
- out.append(p.capitalize())
139
- else:
140
- out.append(p)
141
- return "".join(out)
142
-
143
-
144
- def postprocess_transcript(text, format_soap=False):
145
- if not text:
146
- return text
147
- t = re.sub(r"\s+", " ", text).strip()
148
- t = expand_abbreviations(t)
149
- t = normalize_drugs(t)
150
- t = punctuation_and_capitalization(t)
151
- if format_soap:
152
- sentences = re.split(r"(?<=[.?!])\s+", t)
153
- subj = sentences[0] if len(sentences) >= 1 else ""
154
- obj = sentences[1] if len(sentences) >= 2 else ""
155
- assessment = ""
156
- for kw in ["diagnosis", "dx", "rule out", "r/o", "probable"]:
157
- if kw in t.lower():
158
- assessment = "Assessment: " + subj
159
- break
160
- soap = (
161
- f"S: {subj}\nO: {obj}\nA: {assessment}\nP: Plan: follow up as indicated."
162
- )
163
- return soap
164
- return t
165
-
166
-
167
- # ---------- Memory utilities ----------
168
- def extract_words_and_phrases(text):
169
- # basic tokenization for words; phrases = sentences
170
- words = re.findall(r"[A-Za-z0-9\-']+", text)
171
- sentences = [
172
- s.strip() for s in re.split(r"(?<=[.?!])\s+", text) if s.strip()
173
- ]
174
- return [w for w in words if w.strip()], sentences
175
-
176
-
177
- def update_memory_with_transcript(transcript):
178
- global memory
179
- words, sentences = extract_words_and_phrases(transcript)
180
- changed = False
181
- with MEMORY_LOCK:
182
- for w in words:
183
- lw = w.lower()
184
- if lw in memory["words"]:
185
- memory["words"][lw] += 1
186
- else:
187
- memory["words"][lw] = 1
188
- changed = True
189
- for s in sentences:
190
- key = s.strip()
191
- if key in memory["phrases"]:
192
- memory["phrases"][key] += 1
193
- else:
194
- memory["phrases"][key] = 1
195
- changed = True
196
- if changed:
197
- try:
198
- with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
199
- json.dump(memory, fh, ensure_ascii=False, indent=2)
200
- except Exception:
201
- pass
202
-
203
-
204
- def memory_correct_text(text, min_ratio=0.85):
205
- if not text or (not memory.get("words") and not memory.get("phrases")):
206
- return text
207
-
208
- def fix_word(w):
209
- lw = w.lower()
210
- if lw in memory["words"]:
211
- return w
212
- candidates = get_close_matches(
213
- lw, memory["words"].keys(), n=1, cutoff=min_ratio
214
- )
215
- if candidates:
216
- cand = candidates[0]
217
- if w and w[0].isupper():
218
- return cand.capitalize()
219
- return cand
220
- return w
221
-
222
- tokens = re.split(r"(\W+)", text)
223
- corrected_tokens = []
224
- for tok in tokens:
225
- if re.match(r"^[A-Za-z0-9\-']+$", tok):
226
- corrected_tokens.append(fix_word(tok))
227
- else:
228
- corrected_tokens.append(tok)
229
- corrected = "".join(corrected_tokens)
230
-
231
- for phrase in sorted(memory.get("phrases", {}).keys(), key=lambda s: -len(s)):
232
- low_phrase = phrase.lower()
233
- if len(low_phrase) < 8:
234
- continue
235
- if low_phrase in corrected.lower():
236
- corrected = re.sub(
237
- re.escape(phrase), phrase, corrected, flags=re.IGNORECASE
238
- )
239
- return corrected
240
-
241
-
242
- # ---------- File utilities ----------
243
- def save_as_word(text, filename=None):
244
- if filename is None:
245
- filename = os.path.join(
246
- tempfile.gettempdir(), "merged_transcripts.docx"
247
- )
248
- doc = Document()
249
- doc.add_paragraph(text)
250
- doc.save(filename)
251
- return filename
252
-
253
-
254
- # ---------- Hybrid conversion: pydub + small ffmpeg fallback ----------
255
- def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
256
- cmd = [
257
- "ffmpeg",
258
- "-hide_banner",
259
- "-loglevel",
260
- "error",
261
- "-y",
262
- "-f",
263
- fmt,
264
- "-ar",
265
- str(sr),
266
- "-ac",
267
- str(ch),
268
- "-i",
269
- input_path,
270
- out_path,
271
- ]
272
- try:
273
- proc = subprocess.run(cmd, capture_output=True, timeout=30, text=True)
274
- if (
275
- proc.returncode == 0
276
- and os.path.exists(out_path)
277
- and os.path.getsize(out_path) > MIN_WAV_SIZE
278
- ):
279
- return True, proc.stderr + proc.stdout
280
- else:
281
- try:
282
- if os.path.exists(out_path):
283
- os.unlink(out_path)
284
- except Exception:
285
- pass
286
- return False, proc.stderr + proc.stdout
287
- except Exception as e:
288
- try:
289
- if os.path.exists(out_path):
290
- os.unlink(out_path)
291
- except Exception:
292
- pass
293
- return False, str(e)
294
-
295
-
296
- def convert_to_wav_if_needed(input_path):
297
- input_path = str(input_path)
298
- lower = input_path.lower()
299
- if lower.endswith(".wav"):
300
- return input_path
301
-
302
- auto_err = ""
303
- tmp = None
304
- try:
305
- tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
306
- tmp.close()
307
- AudioSegment.from_file(input_path).export(tmp.name, format="wav")
308
- if os.path.exists(tmp.name) and os.path.getsize(tmp.name) > MIN_WAV_SIZE:
309
- return tmp.name
310
- else:
311
- try:
312
- os.unlink(tmp.name)
313
- except Exception:
314
- pass
315
- except Exception:
316
- auto_err = traceback.format_exc()
317
- try:
318
- if tmp and os.path.exists(tmp.name):
319
- os.unlink(tmp.name)
320
- except Exception:
321
- pass
322
-
323
- # ffmpeg fallback
324
- diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
325
- diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
326
- diagnostics = []
327
- for fmt, sr, ch in FFMPEG_CANDIDATES:
328
- out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
329
- out_wav.close()
330
- success, debug = _ffmpeg_convert(input_path, out_wav.name, fmt, sr, ch)
331
- diagnostics.append(
332
- f"TRY fmt={fmt} sr={sr} ch={ch} success={success}\n{debug}\n"
333
- )
334
- if success:
335
- try:
336
- with open(diag_log, "w", encoding="utf-8") as fh:
337
- fh.write("pydub auto error:\n")
338
- fh.write(auto_err + "\n\n")
339
- fh.write("Successful ffmpeg candidate:\n")
340
- fh.write(f"fmt={fmt} sr={sr} ch={ch}\n\n")
341
- fh.write("Diagnostics:\n")
342
- fh.write("\n".join(diagnostics))
343
- except Exception:
344
- pass
345
- return out_wav.name
346
- else:
347
- try:
348
- if os.path.exists(out_wav.name):
349
- os.unlink(out_wav.name)
350
- except Exception:
351
- pass
352
-
353
- # final diagnostics
354
- try:
355
- fp = subprocess.run(
356
- ["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
357
- capture_output=True,
358
- text=True,
359
- timeout=10,
360
- )
361
- diagnostics.append("FFPROBE:\n" + (fp.stdout.strip() or fp.stderr.strip()))
362
- except Exception as e:
363
- diagnostics.append("ffprobe failed: " + str(e))
364
- try:
365
- with open(input_path, "rb") as fh:
366
- head = fh.read(512)
367
- diagnostics.append("HEX PREVIEW:\n" + head.hex())
368
- except Exception as e:
369
- diagnostics.append("could not read head: " + str(e))
370
-
371
- try:
372
- with open(diag_log, "w", encoding="utf-8") as fh:
373
- fh.write("pydub auto error:\n")
374
- fh.write(auto_err + "\n\n")
375
- fh.write("Full diagnostics:\n\n")
376
- fh.write("\n\n".join(diagnostics))
377
- except Exception as e:
378
- raise Exception(f"Conversion failed; diagnostics write error: {e}")
379
-
380
- raise Exception(
381
- f"Could not convert file to WAV. Diagnostics saved to: {diag_log}"
382
- )
383
-
384
-
385
- # ---------- Whisper model cache ----------
386
- MODEL_CACHE = {}
387
-
388
-
389
- def get_whisper_model(name):
390
- if name not in MODEL_CACHE:
391
- print(f"DEBUG: loading whisper model '{name}'", flush=True)
392
- MODEL_CACHE[name] = whisper.load_model(name)
393
- return MODEL_CACHE[name]
394
-
395
-
396
- # ---------- Main transcription generator ----------
397
- def transcribe_multiple(
398
- audio_files,
399
- model_name,
400
- advanced_options,
401
- merge_checkbox,
402
- zip_file=None,
403
- zip_password=None,
404
- enable_memory=False,
405
- ):
406
- """
407
- Generator yields (log_text, transcripts_text, merged_file_path_or_None, percent_int)
408
- """
409
- log = []
410
- transcripts = []
411
- word_file_path = None
412
- temp_extract_dir = os.path.join(tempfile.gettempdir(), "extracted_audio")
413
- extracted_audio_paths = []
414
-
415
- # initial yield
416
- yield "", "", None, 0
417
-
418
- # cleanup previous temp dir
419
- if os.path.exists(temp_extract_dir):
420
- try:
421
- shutil.rmtree(temp_extract_dir)
422
- log.append(f"Cleaned previous temp dir: {temp_extract_dir}")
423
- except Exception:
424
- pass
425
-
426
- # handle zip
427
- if zip_file:
428
- log.append(f"Processing zip: {zip_file}")
429
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 2
430
- try:
431
- os.makedirs(temp_extract_dir, exist_ok=True)
432
- with pyzipper.ZipFile(zip_file, "r") as zf:
433
- if zip_password:
434
- try:
435
- zf.setpassword(zip_password.encode())
436
- except Exception:
437
- log.append("Incorrect zip password")
438
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
439
- return
440
- exts = [
441
- ".mp3",
442
- ".wav",
443
- ".aac",
444
- ".flac",
445
- ".ogg",
446
- ".m4a",
447
- ".dat",
448
- ".dct",
449
- ]
450
- count = 0
451
- for info in zf.infolist():
452
- if info.is_dir():
453
- continue
454
- _, ext = os.path.splitext(info.filename)
455
- if ext.lower() in exts:
456
- try:
457
- zf.extract(info, path=temp_extract_dir)
458
- except Exception as e:
459
- log.append(f"Error extracting {info.filename}: {e}")
460
- continue
461
- p = os.path.normpath(
462
- os.path.join(temp_extract_dir, info.filename)
463
- )
464
- if os.path.exists(p):
465
- extracted_audio_paths.append(p)
466
- count += 1
467
- log.append(f"Extracted: {info.filename}")
468
- if count == 0:
469
- log.append("No supported audio in zip.")
470
- try:
471
- shutil.rmtree(temp_extract_dir)
472
- except Exception:
473
- pass
474
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
475
- return
476
- except pyzipper.BadZipFile:
477
- log.append("Invalid zip file.")
478
- try:
479
- shutil.rmtree(temp_extract_dir)
480
- except Exception:
481
- pass
482
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
483
- return
484
- except Exception as e:
485
- log.append(f"Zip processing error: {e}")
486
- try:
487
- shutil.rmtree(temp_extract_dir)
488
- except Exception:
489
- pass
490
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
491
- return
492
-
493
- # collect audio file paths
494
- paths = []
495
- if extracted_audio_paths:
496
- paths.extend(extracted_audio_paths)
497
- if audio_files:
498
- if isinstance(audio_files, (list, tuple)):
499
- for a in audio_files:
500
- if a:
501
- paths.append(a)
502
- elif isinstance(audio_files, str):
503
- paths.append(audio_files)
504
-
505
- if not paths:
506
- log.append("No audio files provided.")
507
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
508
- return
509
-
510
- # load model
511
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 5
512
- try:
513
- model = get_whisper_model(model_name)
514
- log.append(f"Loaded Whisper model: {model_name}")
515
- except Exception as e:
516
- log.append(f"Failed to load model {model_name}: {e}")
517
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
518
- return
519
-
520
- total = len(paths)
521
- idx = 0
522
- for p in paths:
523
- idx += 1
524
- log.append(f"Processing file ({idx}/{total}): {p}")
525
- yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
526
- 5 + (idx - 1) * 80 / max(1, total)
527
- )
528
-
529
- wav = None
530
- try:
531
- wav = convert_to_wav_if_needed(p)
532
- log.append(f"Converted to WAV: {wav}")
533
- except Exception as e:
534
- log.append(f"Conversion failed for {p}: {e}")
535
- transcripts.append(
536
- f"FILE: {os.path.basename(p)}\nERROR: Conversion failed: {e}"
537
- )
538
- yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
539
- 5 + idx * 80 / max(1, total)
540
- )
541
- continue
542
-
543
- try:
544
- whisper_opts = {}
545
- if isinstance(advanced_options, dict):
546
- whisper_opts.update(advanced_options)
547
-
548
- result = model.transcribe(wav, **whisper_opts)
549
- text = result.get("text", "").strip()
550
- log.append(f"Transcribed: {len(text)} chars")
551
-
552
- if enable_memory:
553
- text = memory_correct_text(text)
554
- text = postprocess_transcript(text)
555
- transcripts.append(
556
- f"FILE: {os.path.basename(p)}\n{text}\n"
557
- )
558
-
559
- if enable_memory:
560
- try:
561
- update_memory_with_transcript(text)
562
- log.append("Memory updated.")
563
- except Exception:
564
- pass
565
-
566
- yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
567
- 10 + idx * 85 / max(1, total)
568
- )
569
- except Exception as e:
570
- log.append(f"Transcription failed for {p}: {e}")
571
- transcripts.append(
572
- f"FILE: {os.path.basename(p)}\nERROR: Transcription failed: {e}"
573
- )
574
- yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
575
- 10 + idx * 85 / max(1, total)
576
- )
577
- continue
578
- finally:
579
- try:
580
- if wav and os.path.exists(wav):
581
- tmpdir = tempfile.gettempdir()
582
- if (
583
- os.path.commonpath([tmpdir, os.path.abspath(wav)])
584
- == tmpdir
585
- and not p.lower().endswith(".wav")
586
- ):
587
- os.unlink(wav)
588
- except Exception:
589
- pass
590
-
591
- # final merge option
592
- if merge_checkbox:
593
- try:
594
- merged_text = "\n\n".join(transcripts)
595
- word_file_path = save_as_word(merged_text)
596
- log.append(f"Merged transcript saved: {word_file_path}")
597
- except Exception as e:
598
- log.append(f"Failed to save merged file: {e}")
599
- word_file_path = None
600
-
601
- # final yield
602
- yield "\n\n".join(log), "\n\n".join(transcripts), word_file_path, 100
603
-
604
- # cleanup extracted dir
605
- try:
606
- if os.path.exists(temp_extract_dir):
607
- shutil.rmtree(temp_extract_dir)
608
- log.append("Cleaned temporary extraction dir.")
609
- except Exception:
610
- pass
611
-
612
-
613
- # ----------------------- Gradio wrapper (streaming) -----------------------
614
- def run_transcription_wrapper(
615
- files,
616
- model_name,
617
- merge,
618
- zip_file,
619
- zip_password,
620
- use_default_zip_pass,
621
- default_zip_password,
622
- enable_memory,
623
- advanced_options_state,
624
- ):
625
- """
626
- Gradio callback that streams results from transcribe_multiple.
627
- This MUST be a generator (use yield) so Gradio receives 4 outputs,
628
- not a single generator object.
629
- """
630
- try:
631
- audio_input = files
632
-
633
- # Normalize zip path
634
- zip_path = None
635
- if zip_file:
636
- if isinstance(zip_file, (str, os.PathLike)):
637
- zip_path = str(zip_file)
638
- elif hasattr(zip_file, "name"):
639
- zip_path = zip_file.name
640
- elif isinstance(zip_file, dict) and zip_file.get("name"):
641
- zip_path = zip_file["name"]
642
-
643
- # Decide final zip password
644
- if use_default_zip_pass and (not zip_password or zip_password.strip() == ""):
645
- final_zip_password = default_zip_password
646
- else:
647
- final_zip_password = zip_password
648
-
649
- adv = {} # placeholder for future advanced options
650
-
651
- # Stream results from the core generator
652
- for logs, transcripts, word_path, percent in transcribe_multiple(
653
- audio_input,
654
- model_name,
655
- adv,
656
- merge_checkbox=merge,
657
- zip_file=zip_path,
658
- zip_password=final_zip_password,
659
- enable_memory=enable_memory,
660
- ):
661
- yield logs, transcripts, word_path, percent
662
-
663
- except Exception:
664
- tb = traceback.format_exc()
665
- logs = f"EXCEPTION in run_transcription_wrapper:\n{tb}"
666
- transcripts = "ERROR: transcription did not start or failed unexpectedly."
667
- yield logs, transcripts, None, 100
668
-
669
-
670
- print("DEBUG: building Gradio Blocks", flush=True)
671
-
672
- # ---------- Minimal, easy-to-use UI ----------
673
- with gr.Blocks(title="Whisper Transcriber") as demo:
674
- gr.Markdown(
675
- "## Whisper Transcriber\n"
676
- "Upload audio files or a ZIP on the left and click **Transcribe**.\n"
677
- "Transcript, progress, download, and logs appear on the right."
678
- )
679
-
680
- with gr.Row():
681
- # LEFT: Inputs (small boxes, simple controls)
682
- with gr.Column(scale=1):
683
- gr.Markdown("### Input")
684
-
685
- file_input = gr.File(
686
- label="Audio files",
687
- file_count="multiple",
688
- type="filepath",
689
- height=60, # small box
690
- )
691
-
692
- zip_input = gr.File(
693
- label="ZIP with audio (optional)",
694
- file_count="single",
695
- type="filepath",
696
- height=60, # small box
697
- )
698
-
699
- use_default_zip_pass = gr.Checkbox(
700
- label="Use default ZIP password",
701
- value=False,
702
- )
703
-
704
- default_zip_password = gr.Textbox(
705
- label="Default ZIP password",
706
- value="dietcoke1", # you can change this
707
- interactive=True,
708
- )
709
-
710
- zip_password = gr.Textbox(
711
- label="ZIP password (override)",
712
- placeholder="If empty, default password will be used",
713
- )
714
-
715
- model_select = gr.Dropdown(
716
- choices=["small", "medium", "large", "base"],
717
- value="small",
718
- label="Whisper model",
719
- )
720
-
721
- merge_checkbox = gr.Checkbox(
722
- label="Merge all transcripts into one .docx",
723
- value=True,
724
- )
725
-
726
- memory_checkbox = gr.Checkbox(
727
- label="Enable correction memory",
728
- value=False,
729
- )
730
-
731
- submit = gr.Button("Transcribe", variant="primary")
732
-
733
- # RIGHT: Outputs (Transcript → Progress → Download → Logs)
734
- with gr.Column(scale=1):
735
- gr.Markdown("### Output")
736
-
737
- transcripts_out = gr.Textbox(
738
- label="Transcript",
739
- lines=18,
740
- interactive=False,
741
- )
742
-
743
- progress_num = gr.Slider(
744
- minimum=0,
745
- maximum=100,
746
- value=0,
747
- step=1,
748
- label="Progress (%)",
749
- interactive=False,
750
- )
751
-
752
- download_file = gr.File(
753
- label="Merged .docx (when available)"
754
- )
755
-
756
- logs = gr.Textbox(
757
- label="Logs",
758
- lines=10,
759
- interactive=False,
760
- )
761
-
762
- submit.click(
763
- fn=run_transcription_wrapper,
764
- inputs=[
765
- file_input,
766
- model_select,
767
- merge_checkbox,
768
- zip_input,
769
- zip_password,
770
- use_default_zip_pass,
771
- default_zip_password,
772
- memory_checkbox,
773
- gr.State({}),
774
- ],
775
- # The function yields: logs, transcripts, download_file, progress
776
- outputs=[logs, transcripts_out, download_file, progress_num],
777
- )
778
-
779
- # ---------- Launch ----------
780
- if __name__ == "__main__":
781
- port = int(os.environ.get("PORT", 7860))
782
- print("DEBUG: launching Gradio on port", port, flush=True)
783
- try:
784
- demo.queue().launch(server_name="0.0.0.0", server_port=port)
785
- except Exception as e:
786
- print("FATAL: demo.launch failed:", e, flush=True)
787
- traceback.print_exc()
788
- raise
789
- ChatGPT said:
790
-
791
- Thought for 7s
792
- Nice, thanks — this is a solid, well-structured transcription app. I ran a careful code review and made a few targeted fixes and improvements you can drop into your file to make it more robust and reliable (especially around ffmpeg fallback conversion, ZIP password handling, temp-file cleanup, and a couple of small safety/usability items).
793
-
794
- Below I list the important issues I found, why they matter, and then provide the concrete patched code snippets you can replace in your file. I kept changes minimal and backward-compatible.
795
-
796
- Major issues & fixes (summary)
797
- ffmpeg conversion command was fragile
798
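- You used -f <fmt> before -i, which tells ffmpeg that the input is raw audio in that format (s16le/pcm); this often fails for containerized inputs. Better: call ffmpeg -i input -ar <sr> -ac <ch> out.wav (let ffmpeg auto-detect the input). If you still want raw-format attempts, do them explicitly only when necessary. The patch keeps -f for the raw candidate formats (s16le/pcm_s16le/mulaw) and lets ffmpeg autodetect the input for any other fmt value, setting only the output sample rate/channels in that case. Note that every entry currently in FFMPEG_CANDIDATES uses one of those raw formats, so the autodetect path is only reached if a non-raw candidate is added.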
799
-
800
- Diagnostics/logging & cleanup
801
- If conversion fails we should remove partial files and write diagnostics safely. I preserved your diagnostics but improved removal and error messages.
802
-
803
- ZIP password handling with pyzipper
804
- zf.setpassword(...) simply sets a default password — it does not throw on set. Extraction will raise if the password is wrong. I changed the code to attempt each extraction and catch RuntimeError and pyzipper.BadZipFile where appropriate, logging a clearer "Password required or incorrect" message.
805
-
806
- Temporary WAV deletion logic
807
- The logic that deletes intermediate WAV relied on checking os.path.commonpath([tmpdir, os.path.abspath(wav)]) == tmpdir — that may raise if paths are on different drives. I hardened that check and wrapped in try/except.
808
-
809
- Model loading
810
- I kept your model cache but suggested an optional device override if you want to put it on CPU/GPU. (Small note: confirm whisper.load_model(name) is the right API for the whisper package/version you are using.)
811
-
812
- Minor UI suggestion
813
- Consider setting file_input's type="file" or keeping "filepath" with a fallback handler — your run_transcription_wrapper already copes with zip_file being file-like or path-like, so it's fine. (Left unchanged.)
814
-
815
- Replace these functions with the patched versions
816
- Paste these function definitions into your file to replace the originals for _ffmpeg_convert, convert_to_wav_if_needed, and the ZIP extraction block in transcribe_multiple. I preserved your variable names and diagnostics as much as possible.
817
-
818
- # ---------- improved ffmpeg convert ----------
819
- def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
820
- """
821
- Use ffmpeg to convert input_path -> out_path.
822
- We let ffmpeg autodetect input format (so don't force -f except if fmt == 'raw').
823
- If fmt is one of the raw formats (s16le/pcm_s16le/mulaw) we treat it as raw input;
824
- otherwise we let ffmpeg autodetect and only set sample rate/channels on output.
825
- """
826
- try:
827
- cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"]
828
-
829
- # if fmt indicates raw input format, we need to tell ffmpeg about it
830
- if fmt in ("s16le", "pcm_s16le", "mulaw"):
831
- # raw PCM input requires -f and input format description before -i
832
- cmd += ["-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path]
833
- # output will be WAV (ffmpeg determines container from out_path)
834
- cmd += [out_path]
835
- else:
836
- # For most candidate attempts, let ffmpeg autodetect input format.
837
- # Request output sample rate/channels explicitly.
838
- cmd += ["-i", input_path, "-ar", str(sr), "-ac", str(ch), out_path]
839
-
840
- proc = subprocess.run(cmd, capture_output=True, timeout=60, text=True)
841
- stdout_stderr = proc.stdout + proc.stderr
842
- if proc.returncode == 0 and os.path.exists(out_path) and os.path.getsize(out_path) > MIN_WAV_SIZE:
843
- return True, stdout_stderr
844
- else:
845
- # remove partial output if exists
846
- try:
847
- if os.path.exists(out_path):
848
- os.unlink(out_path)
849
- except Exception:
850
- pass
851
- return False, stdout_stderr
852
- except Exception as e:
853
- try:
854
- if os.path.exists(out_path):
855
- os.unlink(out_path)
856
- except Exception:
857
- pass
858
- return False, str(e)
859
-
860
-
861
- def convert_to_wav_if_needed(input_path):
862
- input_path = str(input_path)
863
- lower = input_path.lower()
864
- if lower.endswith(".wav"):
865
- return input_path
866
-
867
- auto_err = ""
868
- tmp = None
869
- try:
870
- tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
871
- tmp.close()
872
- # let pydub autodetect format for most files
873
- AudioSegment.from_file(input_path).export(tmp.name, format="wav")
874
- if os.path.exists(tmp.name) and os.path.getsize(tmp.name) > MIN_WAV_SIZE:
875
- return tmp.name
876
- else:
877
- # bad export, remove and fall through to ffmpeg tries
878
- try:
879
- os.unlink(tmp.name)
880
- except Exception:
881
- pass
882
- except Exception:
883
- auto_err = traceback.format_exc()
884
- try:
885
- if tmp and os.path.exists(tmp.name):
886
- os.unlink(tmp.name)
887
- except Exception:
888
- pass
889
-
890
- # ffmpeg fallback
891
- diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
892
- diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
893
- diagnostics = []
894
- for fmt, sr, ch in FFMPEG_CANDIDATES:
895
- out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
896
- out_wav.close()
897
- success, debug = _ffmpeg_convert(input_path, out_wav.name, fmt, sr, ch)
898
- diagnostics.append(f"TRY fmt={fmt} sr={sr} ch={ch} success={success}\n{debug}\n")
899
- if success:
900
- try:
901
- with open(diag_log, "w", encoding="utf-8") as fh:
902
- fh.write("pydub auto error:\n")
903
- fh.write(auto_err + "\n\n")
904
- fh.write("Successful ffmpeg candidate:\n")
905
- fh.write(f"fmt={fmt} sr={sr} ch={ch}\n\n")
906
- fh.write("Diagnostics:\n")
907
- fh.write("\n".join(diagnostics))
908
- except Exception:
909
- pass
910
- return out_wav.name
911
- else:
912
- try:
913
- if os.path.exists(out_wav.name):
914
- os.unlink(out_wav.name)
915
- except Exception:
916
- pass
917
-
918
- # final diagnostics: ffprobe + file head
919
- try:
920
- fp = subprocess.run(
921
- ["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
922
- capture_output=True,
923
- text=True,
924
- timeout=10,
925
- )
926
- diagnostics.append("FFPROBE:\n" + (fp.stdout.strip() or fp.stderr.strip()))
927
- except Exception as e:
928
- diagnostics.append("ffprobe failed: " + str(e))
929
- try:
930
- with open(input_path, "rb") as fh:
931
- head = fh.read(512)
932
- diagnostics.append("HEX PREVIEW:\n" + head.hex())
933
- except Exception as e:
934
- diagnostics.append("could not read head: " + str(e))
935
-
936
- try:
937
- with open(diag_log, "w", encoding="utf-8") as fh:
938
- fh.write("pydub auto error:\n")
939
- fh.write(auto_err + "\n\n")
940
- fh.write("Full diagnostics:\n\n")
941
- fh.write("\n\n".join(diagnostics))
942
- except Exception as e:
943
- # keep original behavior: surface a readable exception
944
- raise Exception(f"Conversion failed; diagnostics write error: {e}")
945
-
946
- raise Exception(f"Could not convert file to WAV. Diagnostics saved to: {diag_log}")
947
- Replace the ZIP extraction block inside transcribe_multiple with this improved version
948
- Find this earlier in transcribe_multiple:
949
-
950
- if zip_file:
951
- log.append(f"Processing zip: {zip_file}")
952
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 2
953
- try:
954
- os.makedirs(temp_extract_dir, exist_ok=True)
955
- with pyzipper.ZipFile(zip_file, "r") as zf:
956
- if zip_password:
957
- try:
958
- zf.setpassword(zip_password.encode())
959
- except Exception:
960
- log.append("Incorrect zip password")
961
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
962
- return
963
- exts = [...]
964
- ...
965
- Replace the whole if zip_file: block with this:
966
-
967
- if zip_file:
968
- log.append(f"Processing zip: {zip_file}")
969
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 2
970
- try:
971
- os.makedirs(temp_extract_dir, exist_ok=True)
972
- with pyzipper.ZipFile(zip_file, "r") as zf:
973
- # set password if provided (pyzipper accepts bytes)
974
- if zip_password:
975
- try:
976
- zf.setpassword(zip_password.encode())
977
- except Exception:
978
- # setpassword typically doesn't fail, but just in case
979
- log.append("Failed to set zip password (unexpected).")
980
- exts = [
981
- ".mp3",
982
- ".wav",
983
- ".aac",
984
- ".flac",
985
- ".ogg",
986
- ".m4a",
987
- ".dat",
988
- ".dct",
989
- ]
990
- count = 0
991
- for info in zf.infolist():
992
- if info.is_dir():
993
- continue
994
- _, ext = os.path.splitext(info.filename)
995
- if ext.lower() in exts:
996
- try:
997
- # attempt to extract each file; if wrong password, extraction will raise
998
- zf.extract(info, path=temp_extract_dir)
999
- except RuntimeError as e:
1000
- # pyzipper may raise RuntimeError on bad password
1001
- log.append(f"Password required or incorrect for {info.filename}: {e}")
1002
- continue
1003
- except pyzipper.BadZipFile:
1004
- log.append(f"Bad zip entry: {info.filename}")
1005
- continue
1006
- except Exception as e:
1007
- log.append(f"Error extracting {info.filename}: {e}")
1008
- continue
1009
- p = os.path.normpath(os.path.join(temp_extract_dir, info.filename))
1010
- if os.path.exists(p):
1011
- extracted_audio_paths.append(p)
1012
- count += 1
1013
- log.append(f"Extracted: {info.filename}")
1014
- if count == 0:
1015
- log.append("No supported audio in zip.")
1016
- try:
1017
- shutil.rmtree(temp_extract_dir)
1018
- except Exception:
1019
- pass
1020
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
1021
- return
1022
- except pyzipper.BadZipFile:
1023
- log.append("Invalid zip file.")
1024
- try:
1025
- shutil.rmtree(temp_extract_dir)
1026
- except Exception:
1027
- pass
1028
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
1029
- return
1030
- except Exception as e:
1031
- log.append(f"Zip processing error: {e}")
1032
- try:
1033
- shutil.rmtree(temp_extract_dir)
1034
- except Exception:
1035
- pass
1036
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
1037
- return
1038
- Small hardening for temporary WAV deletion (replace your existing finally block)
1039
- Replace the finally that deletes intermediate WAVs with this safer variant:
1040
-
1041
- finally:
1042
- try:
1043
- if wav and os.path.exists(wav):
1044
- tmpdir = tempfile.gettempdir()
1045
- try:
1046
- # Do not remove original .wav files that came from user (only remove conversion outputs)
1047
- common = os.path.commonpath([os.path.abspath(tmpdir), os.path.abspath(wav)])
1048
- if common == os.path.abspath(tmpdir) and not p.lower().endswith(".wav"):
1049
- os.unlink(wav)
1050
- except Exception:
1051
- # fallback: only remove if it's in tempdir substring
1052
- try:
1053
- if tmpdir in os.path.abspath(wav) and not p.lower().endswith(".wav"):
1054
- os.unlink(wav)
1055
- except Exception:
1056
- pass
1057
- except Exception:
1058
- pass
1059
- Optional suggestions (non-blocking)
1060
- Consider making MIN_WAV_SIZE a bit larger (e.g., 1024 bytes) — 200 bytes is tiny, and a file that small is likely partial or invalid.
1061
-
1062
- Consider exposing a device option for the Whisper model (e.g. cpu vs cuda) and pass that into whisper.load_model.
1063
-
1064
- default_zip_password is in your UI — don't keep real secrets there; make the default generic or blank.
1065
-
1066
- If you want more precise progress reporting, you could measure the duration of the actual model transcribe calls and update the percentage accordingly.
1067
-
1068
- If you run on Hugging Face Spaces, ensure the whisper package you installed matches your API expectations (some forks differ).
1069
-
1070
- Why these changes help (TL;DR)
1071
- Letting ffmpeg autodetect input prevents many conversion failures on real-world audio files (mp3, m4a, flac, etc.).
1072
-
1073
- Proper diagnostic capture + safer temp-file cleanup prevents stuck files and makes debugging easier.
1074
-
1075
- More robust zip/password extraction reduces silent failures and makes user feedback clearer.
1076
-
1077
- If you want, I can:
1078
-
1079
- produce a full patched app.py with the above replacements merged (ready to paste), or
1080
-
1081
- convert model loading to allow mobile/CPU-only fallback and add a small "device" dropdown in the UI, or
1082
-
1083
- update the UI to add a "select which extracted files to transcribe" step (you had requested that in earlier project notes).
1084
-
1085
- Tell me which of those you want and I’ll give the full code in one paste.
1086
-
1087
- You said:
1088
- full code with option to memories old words
1089
- ChatGPT said:
1090
-
1091
- Thought for 5s
1092
- Nice — below is a full, ready-to-paste app.py that:
1093
-
1094
- Includes the improved ffmpeg fallback conversion and safer temp cleanup from my earlier suggestions.
1095
-
1096
- Improves ZIP extraction & password handling.
1097
-
1098
- Adds Memory management UI and functions so you can:
1099
-
1100
- Enable/disable the correction memory (existing checkbox).
1101
-
1102
- Import an existing memory file (JSON or newline word list).
1103
-
1104
- Add a single word or phrase to memory manually.
1105
-
1106
- Clear the memory.
1107
-
1108
- View current memory (words & phrases counts).
1109
-
1110
- Keeps the original transcription flow and UI layout; I only extended the left-side controls to include the memory operations.
1111
-
1112
- Paste this into your repo (replace your current app.py). I preserved your original structure and most variable names.
1113
-
1114
  # app.py
1115
  # Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
1116
  # Clean, single-version file for Hugging Face Spaces.
@@ -1148,7 +35,7 @@ print("DEBUG: imports OK", flush=True)
1148
  # ---------- Config ----------
1149
  MEMORY_FILE = "memory.json"
1150
  MEMORY_LOCK = threading.Lock()
1151
- MIN_WAV_SIZE = 1024 # raised slightly from 200 for safety
1152
  FFMPEG_CANDIDATES = [
1153
  ("s16le", 16000, 1),
1154
  ("s16le", 44100, 2),
@@ -1164,7 +51,6 @@ def load_memory():
1164
  if os.path.exists(MEMORY_FILE):
1165
  with open(MEMORY_FILE, "r", encoding="utf-8") as fh:
1166
  data = json.load(fh)
1167
- # validate minimal structure
1168
  if not isinstance(data, dict):
1169
  raise ValueError("memory.json root not dict")
1170
  data.setdefault("words", {})
@@ -1197,7 +83,6 @@ print(
1197
  flush=True,
1198
  )
1199
 
1200
-
1201
  # ---------- Postprocessing ----------
1202
  MEDICAL_ABBREVIATIONS = {
1203
  "pt": "patient",
@@ -1283,11 +168,8 @@ def postprocess_transcript(text, format_soap=False):
1283
 
1284
  # ---------- Memory utilities ----------
1285
  def extract_words_and_phrases(text):
1286
- # basic tokenization for words; phrases = sentences
1287
  words = re.findall(r"[A-Za-z0-9\-']+", text)
1288
- sentences = [
1289
- s.strip() for s in re.split(r"(?<=[.?!])\s+", text) if s.strip()
1290
- ]
1291
  return [w for w in words if w.strip()], sentences
1292
 
1293
 
@@ -1326,9 +208,7 @@ def memory_correct_text(text, min_ratio=0.85):
1326
  lw = w.lower()
1327
  if lw in memory["words"]:
1328
  return w
1329
- candidates = get_close_matches(
1330
- lw, memory["words"].keys(), n=1, cutoff=min_ratio
1331
- )
1332
  if candidates:
1333
  cand = candidates[0]
1334
  if w and w[0].isupper():
@@ -1350,24 +230,15 @@ def memory_correct_text(text, min_ratio=0.85):
1350
  if len(low_phrase) < 8:
1351
  continue
1352
  if low_phrase in corrected.lower():
1353
- corrected = re.sub(
1354
- re.escape(phrase), phrase, corrected, flags=re.IGNORECASE
1355
- )
1356
  return corrected
1357
 
1358
 
1359
  # ---------- Memory management UI helpers ----------
1360
  def import_memory_file(uploaded):
1361
- """
1362
- Accepts an uploaded file object or filepath. Accepts:
1363
- - JSON of shape {"words": {...}, "phrases": {...}}
1364
- - Plain newline-separated words or CSV-like lines "word,count"
1365
- Returns status message.
1366
- """
1367
  global memory
1368
  if not uploaded:
1369
  return "No file provided."
1370
-
1371
  path = None
1372
  try:
1373
  if isinstance(uploaded, (str, os.PathLike)):
@@ -1378,17 +249,12 @@ def import_memory_file(uploaded):
1378
  path = uploaded["name"]
1379
  else:
1380
  return "Unable to determine uploaded file path."
1381
-
1382
- # read file
1383
  with open(path, "r", encoding="utf-8") as fh:
1384
  raw = fh.read()
1385
-
1386
- # try JSON first
1387
  try:
1388
  parsed = json.loads(raw)
1389
  if isinstance(parsed, dict):
1390
  with MEMORY_LOCK:
1391
- # merge words/phrases
1392
  parsed_words = parsed.get("words", {})
1393
  parsed_phrases = parsed.get("phrases", {})
1394
  for k, v in parsed_words.items():
@@ -1398,10 +264,7 @@ def import_memory_file(uploaded):
1398
  save_memory(memory)
1399
  return f"Imported JSON memory (words={len(parsed_words)}, phrases={len(parsed_phrases)})."
1400
  except Exception:
1401
- # not JSON, fallback to newline parse
1402
  pass
1403
-
1404
- # fallback: split lines, attempt "word,count" or just "word"
1405
  lines = [l.strip() for l in raw.splitlines() if l.strip()]
1406
  added_words = 0
1407
  added_phrases = 0
@@ -1417,7 +280,6 @@ def import_memory_file(uploaded):
1417
  memory["words"][key] = memory["words"].get(key, 0) + cnt
1418
  added_words += 1
1419
  else:
1420
- # treat as word (if short) else as phrase
1421
  if len(line.split()) <= 3:
1422
  key = line.lower()
1423
  memory["words"][key] = memory["words"].get(key, 0) + 1
@@ -1433,9 +295,6 @@ def import_memory_file(uploaded):
1433
 
1434
 
1435
  def add_memory_entry(entry):
1436
- """
1437
- Add a single 'word' or phrase. If entry is short (<=3 words) treat as word, else phrase.
1438
- """
1439
  global memory
1440
  if not entry or not entry.strip():
1441
  return "No entry provided."
@@ -1451,6 +310,7 @@ def add_memory_entry(entry):
1451
  save_memory(memory)
1452
  return f"Added/updated phrase: '{e}'."
1453
 
 
1454
  def clear_memory():
1455
  global memory
1456
  with MEMORY_LOCK:
@@ -1458,11 +318,8 @@ def clear_memory():
1458
  save_memory(memory)
1459
  return "Memory cleared."
1460
 
 
1461
  def view_memory(limit=2000):
1462
- """
1463
- Returns a text summary of memory (words sorted by count then phrases).
1464
- limit parameter caps returned characters for UI.
1465
- """
1466
  w = memory.get("words", {})
1467
  p = memory.get("phrases", {})
1468
  out_lines = []
@@ -1482,9 +339,7 @@ def view_memory(limit=2000):
1482
  # ---------- File utilities ----------
1483
  def save_as_word(text, filename=None):
1484
  if filename is None:
1485
- filename = os.path.join(
1486
- tempfile.gettempdir(), "merged_transcripts.docx"
1487
- )
1488
  doc = Document()
1489
  doc.add_paragraph(text)
1490
  doc.save(filename)
@@ -1493,20 +348,12 @@ def save_as_word(text, filename=None):
1493
 
1494
  # ---------- improved ffmpeg convert ----------
1495
  def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
1496
- """
1497
- Use ffmpeg to convert input_path -> out_path.
1498
- Let ffmpeg autodetect input unless fmt signals raw PCM.
1499
- """
1500
  try:
1501
  cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"]
1502
-
1503
  if fmt in ("s16le", "pcm_s16le", "mulaw"):
1504
- # raw input: specify input format and sample params before -i
1505
  cmd += ["-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path, out_path]
1506
  else:
1507
- # autodetect input, request output sample rate/channels
1508
  cmd += ["-i", input_path, "-ar", str(sr), "-ac", str(ch), out_path]
1509
-
1510
  proc = subprocess.run(cmd, capture_output=True, timeout=60, text=True)
1511
  stdout_stderr = (proc.stdout or "") + (proc.stderr or "")
1512
  if proc.returncode == 0 and os.path.exists(out_path) and os.path.getsize(out_path) > MIN_WAV_SIZE:
@@ -1554,7 +401,6 @@ def convert_to_wav_if_needed(input_path):
1554
  except Exception:
1555
  pass
1556
 
1557
- # ffmpeg fallback
1558
  diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
1559
  diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
1560
  diagnostics = []
@@ -1582,7 +428,6 @@ def convert_to_wav_if_needed(input_path):
1582
  except Exception:
1583
  pass
1584
 
1585
- # final diagnostics
1586
  try:
1587
  fp = subprocess.run(
1588
  ["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
@@ -1619,7 +464,6 @@ MODEL_CACHE = {}
1619
  def get_whisper_model(name):
1620
  if name not in MODEL_CACHE:
1621
  print(f"DEBUG: loading whisper model '{name}'", flush=True)
1622
- # You can set device by changing whisper.load_model(name, device="cpu") if needed.
1623
  MODEL_CACHE[name] = whisper.load_model(name)
1624
  return MODEL_CACHE[name]
1625
 
@@ -1634,19 +478,14 @@ def transcribe_multiple(
1634
  zip_password=None,
1635
  enable_memory=False,
1636
  ):
1637
- """
1638
- Generator yields (log_text, transcripts_text, merged_file_path_or_None, percent_int)
1639
- """
1640
  log = []
1641
  transcripts = []
1642
  word_file_path = None
1643
  temp_extract_dir = os.path.join(tempfile.gettempdir(), "extracted_audio")
1644
  extracted_audio_paths = []
1645
 
1646
- # initial yield
1647
  yield "", "", None, 0
1648
 
1649
- # cleanup previous temp dir
1650
  if os.path.exists(temp_extract_dir):
1651
  try:
1652
  shutil.rmtree(temp_extract_dir)
@@ -1654,7 +493,6 @@ def transcribe_multiple(
1654
  except Exception:
1655
  pass
1656
 
1657
- # handle zip
1658
  if zip_file:
1659
  log.append(f"Processing zip: {zip_file}")
1660
  yield "\n\n".join(log), "\n\n".join(transcripts), None, 2
@@ -1724,7 +562,6 @@ def transcribe_multiple(
1724
  yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
1725
  return
1726
 
1727
- # collect audio file paths
1728
  paths = []
1729
  if extracted_audio_paths:
1730
  paths.extend(extracted_audio_paths)
@@ -1741,7 +578,6 @@ def transcribe_multiple(
1741
  yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
1742
  return
1743
 
1744
- # load model
1745
  yield "\n\n".join(log), "\n\n".join(transcripts), None, 5
1746
  try:
1747
  model = get_whisper_model(model_name)
@@ -1756,9 +592,7 @@ def transcribe_multiple(
1756
  for p in paths:
1757
  idx += 1
1758
  log.append(f"Processing file ({idx}/{total}): {p}")
1759
- yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
1760
- 5 + (idx - 1) * 80 / max(1, total)
1761
- )
1762
 
1763
  wav = None
1764
  try:
@@ -1766,12 +600,8 @@ def transcribe_multiple(
1766
  log.append(f"Converted to WAV: {wav}")
1767
  except Exception as e:
1768
  log.append(f"Conversion failed for {p}: {e}")
1769
- transcripts.append(
1770
- f"FILE: {os.path.basename(p)}\nERROR: Conversion failed: {e}"
1771
- )
1772
- yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
1773
- 5 + idx * 80 / max(1, total)
1774
- )
1775
  continue
1776
 
1777
  try:
@@ -1786,9 +616,7 @@ def transcribe_multiple(
1786
  if enable_memory:
1787
  text = memory_correct_text(text)
1788
  text = postprocess_transcript(text)
1789
- transcripts.append(
1790
- f"FILE: {os.path.basename(p)}\n{text}\n"
1791
- )
1792
 
1793
  if enable_memory:
1794
  try:
@@ -1797,17 +625,11 @@ def transcribe_multiple(
1797
  except Exception:
1798
  pass
1799
 
1800
- yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
1801
- 10 + idx * 85 / max(1, total)
1802
- )
1803
  except Exception as e:
1804
  log.append(f"Transcription failed for {p}: {e}")
1805
- transcripts.append(
1806
- f"FILE: {os.path.basename(p)}\nERROR: Transcription failed: {e}"
1807
- )
1808
- yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
1809
- 10 + idx * 85 / max(1, total)
1810
- )
1811
  continue
1812
  finally:
1813
  try:
@@ -1826,7 +648,6 @@ def transcribe_multiple(
1826
  except Exception:
1827
  pass
1828
 
1829
- # final merge option
1830
  if merge_checkbox:
1831
  try:
1832
  merged_text = "\n\n".join(transcripts)
@@ -1836,10 +657,8 @@ def transcribe_multiple(
1836
  log.append(f"Failed to save merged file: {e}")
1837
  word_file_path = None
1838
 
1839
- # final yield
1840
  yield "\n\n".join(log), "\n\n".join(transcripts), word_file_path, 100
1841
 
1842
- # cleanup extracted dir
1843
  try:
1844
  if os.path.exists(temp_extract_dir):
1845
  shutil.rmtree(temp_extract_dir)
@@ -1860,15 +679,9 @@ def run_transcription_wrapper(
1860
  enable_memory,
1861
  advanced_options_state,
1862
  ):
1863
- """
1864
- Gradio callback that streams results from transcribe_multiple.
1865
- This MUST be a generator (use `yield`) so Gradio receives 4 outputs,
1866
- not a single generator object.
1867
- """
1868
  try:
1869
  audio_input = files
1870
 
1871
- # Normalize zip path
1872
  zip_path = None
1873
  if zip_file:
1874
  if isinstance(zip_file, (str, os.PathLike)):
@@ -1878,15 +691,13 @@ def run_transcription_wrapper(
1878
  elif isinstance(zip_file, dict) and zip_file.get("name"):
1879
  zip_path = zip_file["name"]
1880
 
1881
- # Decide final zip password
1882
  if use_default_zip_pass and (not zip_password or zip_password.strip() == ""):
1883
  final_zip_password = default_zip_password
1884
  else:
1885
  final_zip_password = zip_password
1886
 
1887
- adv = {} # placeholder for future advanced options
1888
 
1889
- # Stream results from the core generator
1890
  for logs, transcripts, word_path, percent in transcribe_multiple(
1891
  audio_input,
1892
  model_name,
@@ -1907,7 +718,6 @@ def run_transcription_wrapper(
1907
 
1908
  print("DEBUG: building Gradio Blocks", flush=True)
1909
 
1910
- # ---------- Minimal, easy-to-use UI ----------
1911
  with gr.Blocks(title="Whisper Transcriber") as demo:
1912
  gr.Markdown(
1913
  "## Whisper Transcriber\n"
@@ -1916,7 +726,6 @@ with gr.Blocks(title="Whisper Transcriber") as demo:
1916
  )
1917
 
1918
  with gr.Row():
1919
- # LEFT: Inputs (small boxes, simple controls)
1920
  with gr.Column(scale=1):
1921
  gr.Markdown("### Input")
1922
 
@@ -1924,14 +733,14 @@ with gr.Blocks(title="Whisper Transcriber") as demo:
1924
  label="Audio files",
1925
  file_count="multiple",
1926
  type="filepath",
1927
- height=60, # small box
1928
  )
1929
 
1930
  zip_input = gr.File(
1931
  label="ZIP with audio (optional)",
1932
  file_count="single",
1933
  type="filepath",
1934
- height=60, # small box
1935
  )
1936
 
1937
  use_default_zip_pass = gr.Checkbox(
@@ -1968,7 +777,6 @@ with gr.Blocks(title="Whisper Transcriber") as demo:
1968
 
1969
  submit = gr.Button("Transcribe", variant="primary")
1970
 
1971
- # Memory management UI
1972
  gr.Markdown("### Memory management")
1973
  mem_upload = gr.File(label="Import memory file (JSON or text)", file_count="single", type="file")
1974
  mem_import_btn = gr.Button("Import Memory File")
@@ -1976,9 +784,8 @@ with gr.Blocks(title="Whisper Transcriber") as demo:
1976
  mem_add_btn = gr.Button("Add to Memory")
1977
  mem_clear_btn = gr.Button("Clear Memory")
1978
  mem_view_btn = gr.Button("View Memory")
1979
- mem_status = gr.Textbox(label="Memory status", interactive=False, lines=4)
1980
 
1981
- # RIGHT: Outputs (Transcript → Progress → Download → Logs)
1982
  with gr.Column(scale=1):
1983
  gr.Markdown("### Output")
1984
 
@@ -2007,7 +814,6 @@ with gr.Blocks(title="Whisper Transcriber") as demo:
2007
  interactive=False,
2008
  )
2009
 
2010
- # Transcription click binding
2011
  submit.click(
2012
  fn=run_transcription_wrapper,
2013
  inputs=[
@@ -2021,11 +827,9 @@ with gr.Blocks(title="Whisper Transcriber") as demo:
2021
  memory_checkbox,
2022
  gr.State({}),
2023
  ],
2024
- # The function yields: logs, transcripts, download_file, progress
2025
  outputs=[logs, transcripts_out, download_file, progress_num],
2026
  )
2027
 
2028
- # Memory button bindings
2029
  def _import_memory(uploaded):
2030
  return import_memory_file(uploaded)
2031
 
@@ -2037,7 +841,6 @@ with gr.Blocks(title="Whisper Transcriber") as demo:
2037
 
2038
  mem_view_btn.click(fn=lambda: view_memory(), inputs=[], outputs=[mem_status])
2039
 
2040
- # ---------- Launch ----------
2041
  if __name__ == "__main__":
2042
  port = int(os.environ.get("PORT", 7860))
2043
  print("DEBUG: launching Gradio on port", port, flush=True)
@@ -2047,31 +850,3 @@ if __name__ == "__main__":
2047
  print("FATAL: demo.launch failed:", e, flush=True)
2048
  traceback.print_exc()
2049
  raise
2050
- Notes & usage tips
2051
- Importable memory formats:
2052
-
2053
- JSON with { "words": {"word": count, ...}, "phrases": {"some phrase": count, ...} }
2054
-
2055
- Plain text with one word (or word,count) per line; longer lines are treated as phrases.
2056
-
2057
- The "Enable correction memory" checkbox must be checked to apply memory-based corrections during transcription.
2058
-
2059
- View Memory shows the top 50 words/phrases (truncated for UI).
2060
-
2061
- Clear Memory removes all stored words & phrases (and overwrites memory.json).
2062
-
2063
- If you'd like I can:
2064
-
2065
- produce a version that also shows a small popup listing extracted files from a ZIP and lets the user choose which to transcribe (you mentioned that previously), or
2066
-
2067
- add a "device" dropdown (cpu/cuda) and pass to whisper.load_model, or
2068
-
2069
- create a minimal test script that runs conversion on a few sample audio files to validate ffmpeg fallbacks.
2070
-
2071
- Which follow-up would you like?
2072
-
2073
-
2074
-
2075
-
2076
- No file chosenNo file chosen
2077
- ChatGPT can make mistakes. Check important info. See Cookie Preferences.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # app.py
2
  # Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
3
  # Clean, single-version file for Hugging Face Spaces.
 
35
  # ---------- Config ----------
36
  MEMORY_FILE = "memory.json"
37
  MEMORY_LOCK = threading.Lock()
38
+ MIN_WAV_SIZE = 1024 # bytes
39
  FFMPEG_CANDIDATES = [
40
  ("s16le", 16000, 1),
41
  ("s16le", 44100, 2),
 
51
  if os.path.exists(MEMORY_FILE):
52
  with open(MEMORY_FILE, "r", encoding="utf-8") as fh:
53
  data = json.load(fh)
 
54
  if not isinstance(data, dict):
55
  raise ValueError("memory.json root not dict")
56
  data.setdefault("words", {})
 
83
  flush=True,
84
  )
85
 
 
86
  # ---------- Postprocessing ----------
87
  MEDICAL_ABBREVIATIONS = {
88
  "pt": "patient",
 
168
 
169
  # ---------- Memory utilities ----------
170
  def extract_words_and_phrases(text):
 
171
  words = re.findall(r"[A-Za-z0-9\-']+", text)
172
+ sentences = [s.strip() for s in re.split(r"(?<=[.?!])\s+", text) if s.strip()]
 
 
173
  return [w for w in words if w.strip()], sentences
174
 
175
 
 
208
  lw = w.lower()
209
  if lw in memory["words"]:
210
  return w
211
+ candidates = get_close_matches(lw, memory["words"].keys(), n=1, cutoff=min_ratio)
 
 
212
  if candidates:
213
  cand = candidates[0]
214
  if w and w[0].isupper():
 
230
  if len(low_phrase) < 8:
231
  continue
232
  if low_phrase in corrected.lower():
233
+ corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
 
 
234
  return corrected
235
 
236
 
237
  # ---------- Memory management UI helpers ----------
238
  def import_memory_file(uploaded):
 
 
 
 
 
 
239
  global memory
240
  if not uploaded:
241
  return "No file provided."
 
242
  path = None
243
  try:
244
  if isinstance(uploaded, (str, os.PathLike)):
 
249
  path = uploaded["name"]
250
  else:
251
  return "Unable to determine uploaded file path."
 
 
252
  with open(path, "r", encoding="utf-8") as fh:
253
  raw = fh.read()
 
 
254
  try:
255
  parsed = json.loads(raw)
256
  if isinstance(parsed, dict):
257
  with MEMORY_LOCK:
 
258
  parsed_words = parsed.get("words", {})
259
  parsed_phrases = parsed.get("phrases", {})
260
  for k, v in parsed_words.items():
 
264
  save_memory(memory)
265
  return f"Imported JSON memory (words={len(parsed_words)}, phrases={len(parsed_phrases)})."
266
  except Exception:
 
267
  pass
 
 
268
  lines = [l.strip() for l in raw.splitlines() if l.strip()]
269
  added_words = 0
270
  added_phrases = 0
 
280
  memory["words"][key] = memory["words"].get(key, 0) + cnt
281
  added_words += 1
282
  else:
 
283
  if len(line.split()) <= 3:
284
  key = line.lower()
285
  memory["words"][key] = memory["words"].get(key, 0) + 1
 
295
 
296
 
297
  def add_memory_entry(entry):
 
 
 
298
  global memory
299
  if not entry or not entry.strip():
300
  return "No entry provided."
 
310
  save_memory(memory)
311
  return f"Added/updated phrase: '{e}'."
312
 
313
+
314
  def clear_memory():
315
  global memory
316
  with MEMORY_LOCK:
 
318
  save_memory(memory)
319
  return "Memory cleared."
320
 
321
+
322
  def view_memory(limit=2000):
 
 
 
 
323
  w = memory.get("words", {})
324
  p = memory.get("phrases", {})
325
  out_lines = []
 
339
  # ---------- File utilities ----------
340
  def save_as_word(text, filename=None):
341
  if filename is None:
342
+ filename = os.path.join(tempfile.gettempdir(), "merged_transcripts.docx")
 
 
343
  doc = Document()
344
  doc.add_paragraph(text)
345
  doc.save(filename)
 
348
 
349
  # ---------- improved ffmpeg convert ----------
350
  def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
 
 
 
 
351
  try:
352
  cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"]
 
353
  if fmt in ("s16le", "pcm_s16le", "mulaw"):
 
354
  cmd += ["-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path, out_path]
355
  else:
 
356
  cmd += ["-i", input_path, "-ar", str(sr), "-ac", str(ch), out_path]
 
357
  proc = subprocess.run(cmd, capture_output=True, timeout=60, text=True)
358
  stdout_stderr = (proc.stdout or "") + (proc.stderr or "")
359
  if proc.returncode == 0 and os.path.exists(out_path) and os.path.getsize(out_path) > MIN_WAV_SIZE:
 
401
  except Exception:
402
  pass
403
 
 
404
  diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
405
  diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
406
  diagnostics = []
 
428
  except Exception:
429
  pass
430
 
 
431
  try:
432
  fp = subprocess.run(
433
  ["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
 
464
  def get_whisper_model(name):
465
  if name not in MODEL_CACHE:
466
  print(f"DEBUG: loading whisper model '{name}'", flush=True)
 
467
  MODEL_CACHE[name] = whisper.load_model(name)
468
  return MODEL_CACHE[name]
469
 
 
478
  zip_password=None,
479
  enable_memory=False,
480
  ):
 
 
 
481
  log = []
482
  transcripts = []
483
  word_file_path = None
484
  temp_extract_dir = os.path.join(tempfile.gettempdir(), "extracted_audio")
485
  extracted_audio_paths = []
486
 
 
487
  yield "", "", None, 0
488
 
 
489
  if os.path.exists(temp_extract_dir):
490
  try:
491
  shutil.rmtree(temp_extract_dir)
 
493
  except Exception:
494
  pass
495
 
 
496
  if zip_file:
497
  log.append(f"Processing zip: {zip_file}")
498
  yield "\n\n".join(log), "\n\n".join(transcripts), None, 2
 
562
  yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
563
  return
564
 
 
565
  paths = []
566
  if extracted_audio_paths:
567
  paths.extend(extracted_audio_paths)
 
578
  yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
579
  return
580
 
 
581
  yield "\n\n".join(log), "\n\n".join(transcripts), None, 5
582
  try:
583
  model = get_whisper_model(model_name)
 
592
  for p in paths:
593
  idx += 1
594
  log.append(f"Processing file ({idx}/{total}): {p}")
595
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, int(5 + (idx - 1) * 80 / max(1, total))
 
 
596
 
597
  wav = None
598
  try:
 
600
  log.append(f"Converted to WAV: {wav}")
601
  except Exception as e:
602
  log.append(f"Conversion failed for {p}: {e}")
603
+ transcripts.append(f"FILE: {os.path.basename(p)}\nERROR: Conversion failed: {e}")
604
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, int(5 + idx * 80 / max(1, total))
 
 
 
 
605
  continue
606
 
607
  try:
 
616
  if enable_memory:
617
  text = memory_correct_text(text)
618
  text = postprocess_transcript(text)
619
+ transcripts.append(f"FILE: {os.path.basename(p)}\n{text}\n")
 
 
620
 
621
  if enable_memory:
622
  try:
 
625
  except Exception:
626
  pass
627
 
628
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, int(10 + idx * 85 / max(1, total))
 
 
629
  except Exception as e:
630
  log.append(f"Transcription failed for {p}: {e}")
631
+ transcripts.append(f"FILE: {os.path.basename(p)}\nERROR: Transcription failed: {e}")
632
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, int(10 + idx * 85 / max(1, total))
 
 
 
 
633
  continue
634
  finally:
635
  try:
 
648
  except Exception:
649
  pass
650
 
 
651
  if merge_checkbox:
652
  try:
653
  merged_text = "\n\n".join(transcripts)
 
657
  log.append(f"Failed to save merged file: {e}")
658
  word_file_path = None
659
 
 
660
  yield "\n\n".join(log), "\n\n".join(transcripts), word_file_path, 100
661
 
 
662
  try:
663
  if os.path.exists(temp_extract_dir):
664
  shutil.rmtree(temp_extract_dir)
 
679
  enable_memory,
680
  advanced_options_state,
681
  ):
 
 
 
 
 
682
  try:
683
  audio_input = files
684
 
 
685
  zip_path = None
686
  if zip_file:
687
  if isinstance(zip_file, (str, os.PathLike)):
 
691
  elif isinstance(zip_file, dict) and zip_file.get("name"):
692
  zip_path = zip_file["name"]
693
 
 
694
  if use_default_zip_pass and (not zip_password or zip_password.strip() == ""):
695
  final_zip_password = default_zip_password
696
  else:
697
  final_zip_password = zip_password
698
 
699
+ adv = {}
700
 
 
701
  for logs, transcripts, word_path, percent in transcribe_multiple(
702
  audio_input,
703
  model_name,
 
718
 
719
  print("DEBUG: building Gradio Blocks", flush=True)
720
 
 
721
  with gr.Blocks(title="Whisper Transcriber") as demo:
722
  gr.Markdown(
723
  "## Whisper Transcriber\n"
 
726
  )
727
 
728
  with gr.Row():
 
729
  with gr.Column(scale=1):
730
  gr.Markdown("### Input")
731
 
 
733
  label="Audio files",
734
  file_count="multiple",
735
  type="filepath",
736
+ height=60,
737
  )
738
 
739
  zip_input = gr.File(
740
  label="ZIP with audio (optional)",
741
  file_count="single",
742
  type="filepath",
743
+ height=60,
744
  )
745
 
746
  use_default_zip_pass = gr.Checkbox(
 
777
 
778
  submit = gr.Button("Transcribe", variant="primary")
779
 
 
780
  gr.Markdown("### Memory management")
781
  mem_upload = gr.File(label="Import memory file (JSON or text)", file_count="single", type="file")
782
  mem_import_btn = gr.Button("Import Memory File")
 
784
  mem_add_btn = gr.Button("Add to Memory")
785
  mem_clear_btn = gr.Button("Clear Memory")
786
  mem_view_btn = gr.Button("View Memory")
787
+ mem_status = gr.Textbox(label="Memory status", interactive=False, lines=6)
788
 
 
789
  with gr.Column(scale=1):
790
  gr.Markdown("### Output")
791
 
 
814
  interactive=False,
815
  )
816
 
 
817
  submit.click(
818
  fn=run_transcription_wrapper,
819
  inputs=[
 
827
  memory_checkbox,
828
  gr.State({}),
829
  ],
 
830
  outputs=[logs, transcripts_out, download_file, progress_num],
831
  )
832
 
 
833
  def _import_memory(uploaded):
834
  return import_memory_file(uploaded)
835
 
 
841
 
842
  mem_view_btn.click(fn=lambda: view_memory(), inputs=[], outputs=[mem_status])
843
 
 
844
  if __name__ == "__main__":
845
  port = int(os.environ.get("PORT", 7860))
846
  print("DEBUG: launching Gradio on port", port, flush=True)
 
850
  print("FATAL: demo.launch failed:", e, flush=True)
851
  traceback.print_exc()
852
  raise