staraks committed on
Commit
41eb42e
·
verified ·
1 Parent(s): ef63fe4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +420 -5
app.py CHANGED
@@ -1,7 +1,6 @@
1
  # app.py
2
  # Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
3
- # Cleaned, debugged, and Spaces-ready.
4
- # Replace /app/app.py with this file and restart container.
5
 
6
  import os
7
  import sys
@@ -34,10 +33,9 @@ except Exception as e:
34
  print("DEBUG: imports OK", flush=True)
35
 
36
  # ---------- Config ----------
37
- MEMORY_FILE = "memory.json" # persistent memory in repo (will be written)
38
  MEMORY_LOCK = threading.Lock()
39
  MIN_WAV_SIZE = 200 # bytes
40
- # Fallback ffmpeg conversion candidates (short hybrid list)
41
  FFMPEG_CANDIDATES = [
42
  ("s16le", 16000, 1),
43
  ("s16le", 44100, 2),
@@ -71,6 +69,7 @@ def save_memory(mem):
71
  memory = load_memory()
72
  print("DEBUG: memory loaded (words=%d phrases=%d)" % (len(memory.get("words", {})), len(memory.get("phrases", {}))), flush=True)
73
 
 
74
  # ---------- Postprocessing ----------
75
  MEDICAL_ABBREVIATIONS = {
76
  "pt": "patient",
@@ -149,4 +148,420 @@ def postprocess_transcript(text, format_soap=False):
149
 
150
  # ---------- Memory utilities ----------
151
  def extract_words_and_phrases(text):
152
- words = re.findall(r"[A-Za-z0-]()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # app.py
2
  # Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
3
+ # Paste chunks 1/4 -> 2/4 -> 3/4 -> 4/4 in order into /app/app.py
 
4
 
5
  import os
6
  import sys
 
33
  print("DEBUG: imports OK", flush=True)
34
 
35
  # ---------- Config ----------
36
+ MEMORY_FILE = "memory.json"
37
  MEMORY_LOCK = threading.Lock()
38
  MIN_WAV_SIZE = 200 # bytes
 
39
  FFMPEG_CANDIDATES = [
40
  ("s16le", 16000, 1),
41
  ("s16le", 44100, 2),
 
69
  memory = load_memory()
70
  print("DEBUG: memory loaded (words=%d phrases=%d)" % (len(memory.get("words", {})), len(memory.get("phrases", {}))), flush=True)
71
 
72
+
73
  # ---------- Postprocessing ----------
74
  MEDICAL_ABBREVIATIONS = {
75
  "pt": "patient",
 
148
 
149
  # ---------- Memory utilities ----------
150
def extract_words_and_phrases(text):
    """Split *text* into word tokens and sentence-level phrases.

    Words are runs of ASCII letters, digits, hyphens and apostrophes that
    contain at least one letter or digit, so a stray "-" or "'" is not
    recorded as a word.  Phrases are the sentences obtained by splitting
    after ".", "?" or "!" followed by whitespace.

    Returns:
        A ``(words, sentences)`` tuple of lists.  Both lists are empty when
        *text* is empty or ``None``.
    """
    if not text:
        return [], []
    words = [
        tok
        for tok in re.findall(r"[A-Za-z0-9\-']+", text)
        # Drop tokens made only of hyphens/apostrophes (e.g. " - ").
        if any(ch.isalnum() for ch in tok)
    ]
    sentences = [s.strip() for s in re.split(r"(?<=[.?!])\s+", text) if s.strip()]
    return words, sentences
155
+
156
def update_memory_with_transcript(transcript):
    """Count the words and sentences of *transcript* in the persistent memory.

    Words are counted case-insensitively under ``memory["words"]`` and
    sentences verbatim under ``memory["phrases"]``.  The updated memory is
    then written to ``MEMORY_FILE``.  Saving is best-effort: a failure is
    reported on stdout and the in-process counts are kept.
    """
    if not transcript:
        return
    words, sentences = extract_words_and_phrases(transcript)
    if not words and not sentences:
        return

    with MEMORY_LOCK:
        # setdefault keeps this working even if the loaded memory lacks a key.
        word_counts = memory.setdefault("words", {})
        for w in words:
            lw = w.lower()
            word_counts[lw] = word_counts.get(lw, 0) + 1

        phrase_counts = memory.setdefault("phrases", {})
        for s in sentences:
            key = s.strip()
            phrase_counts[key] = phrase_counts.get(key, 0) + 1

        # Write to a sibling file and swap it in, so an interrupted write
        # cannot leave a truncated memory file behind.
        tmp_path = MEMORY_FILE + ".tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as fh:
                json.dump(memory, fh, ensure_ascii=False, indent=2)
            os.replace(tmp_path, MEMORY_FILE)
        except (OSError, TypeError, ValueError) as e:
            print(f"DEBUG: could not save memory to {MEMORY_FILE}: {e}", flush=True)
            try:
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
            except OSError:
                pass
181
+
182
def memory_correct_text(text, min_ratio=0.85, mem=None):
    """Correct *text* against previously seen words and phrases.

    Each word token (letters, digits, hyphens, apostrophes - the same token
    shape used when words are stored) that is not already known is replaced
    by its closest known word when the similarity is at least *min_ratio*.
    Afterwards every known phrase of 8+ characters that occurs in the text,
    ignoring case, is rewritten with the stored casing.

    Args:
        text: Transcript text; returned unchanged when empty or ``None``.
        min_ratio: ``difflib`` similarity cutoff in ``[0, 1]``.
        mem: Optional ``{"words": {...}, "phrases": {...}}`` mapping to
            correct against; defaults to the module-level ``memory``.

    Returns:
        The corrected text.
    """
    store = memory if mem is None else mem
    known_words = store.get("words") or {}
    known_phrases = store.get("phrases") or {}
    if not text or (not known_words and not known_phrases):
        return text

    vocabulary = list(known_words)

    def fix_word(match):
        w = match.group(0)
        lw = w.lower()
        if lw in known_words:
            return w
        # Never fuzzy-match tokens containing digits: turning "120" into a
        # remembered "1200" would silently change a value.
        if any(ch.isdigit() for ch in w):
            return w
        candidates = get_close_matches(lw, vocabulary, n=1, cutoff=min_ratio)
        if not candidates:
            return w
        cand = candidates[0]
        if len(w) > 1 and w.isupper():
            return cand.upper()
        if w[0].isupper():
            return cand.capitalize()
        return cand

    corrected = text
    if vocabulary:
        # Tokenize with the word pattern itself so hyphenated words and
        # contractions are looked up whole, exactly as they were stored.
        corrected = re.sub(r"[A-Za-z0-9\-']+", fix_word, text)

    for phrase in sorted(known_phrases, key=lambda s: -len(s)):
        low_phrase = phrase.lower()
        if len(low_phrase) < 8:
            continue
        if low_phrase in corrected.lower():
            # A callable replacement inserts the phrase literally; a plain
            # string would be parsed as a template and break on backslashes.
            corrected = re.sub(
                re.escape(phrase),
                lambda _m, literal=phrase: literal,
                corrected,
                flags=re.IGNORECASE,
            )
    return corrected
214
+
215
+ # ---------- File utilities ----------
216
def save_as_word(text, filename=None):
    """Write *text* as a single paragraph of a new .docx file.

    Args:
        text: Document body.
        filename: Destination path.  When omitted, a uniquely named
            ``merged_transcripts_*.docx`` file is created in the system temp
            directory, so simultaneous requests never overwrite (or serve)
            each other's transcript.

    Returns:
        The path of the saved document.
    """
    if filename is None:
        fd, filename = tempfile.mkstemp(prefix="merged_transcripts_", suffix=".docx")
        # Only the reserved name is needed; the document is saved by path.
        os.close(fd)
    doc = Document()
    doc.add_paragraph(text)
    doc.save(filename)
    return filename
223
+
224
+
225
+
226
+
227
+
228
+ # ---------- Hybrid conversion: pydub + small ffmpeg fallback ----------
229
+ def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
230
+ cmd = [
231
+ "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
232
+ "-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path, out_path
233
+ ]
234
+ try:
235
+ proc = subprocess.run(cmd, capture_output=True, timeout=30, text=True)
236
+ if proc.returncode == 0 and os.path.exists(out_path) and os.path.getsize(out_path) > MIN_WAV_SIZE:
237
+ return True, proc.stderr + proc.stdout
238
+ else:
239
+ try:
240
+ if os.path.exists(out_path):
241
+ os.unlink(out_path)
242
+ except Exception:
243
+ pass
244
+ return False, proc.stderr + proc.stdout
245
+ except Exception as e:
246
+ try:
247
+ if os.path.exists(out_path):
248
+ os.unlink(out_path)
249
+ except Exception:
250
+ pass
251
+ return False, str(e)
252
+
253
def convert_to_wav_if_needed(input_path):
    """Return the path of a WAV version of *input_path*.

    Paths ending in ".wav" (any case) are returned unchanged.  Otherwise the
    file is converted into a new temporary WAV file: first with pydub's
    automatic format detection, then by trying each raw-PCM layout in
    ``FFMPEG_CANDIDATES`` through ffmpeg.  The caller owns, and should
    delete, the returned temporary file.

    Raises:
        RuntimeError: when every attempt fails.  The message names a
            diagnostics file holding the pydub traceback, each ffmpeg
            attempt, ffprobe output and a hex preview of the input.
    """
    input_path = str(input_path)
    if input_path.lower().endswith(".wav"):
        return input_path

    def _new_wav_path():
        # Reserve a unique name; the converters write to it by path.
        handle = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        handle.close()
        return handle.name

    def _discard(path):
        try:
            if path and os.path.exists(path):
                os.unlink(path)
        except OSError:
            pass

    # --- Attempt 1: pydub auto-detection ---
    auto_err = ""
    wav_path = None
    try:
        wav_path = _new_wav_path()
        AudioSegment.from_file(input_path).export(wav_path, format="wav")
        if os.path.exists(wav_path) and os.path.getsize(wav_path) > MIN_WAV_SIZE:
            return wav_path
        auto_err = "pydub export produced a missing or too-small WAV file"
    except Exception:
        auto_err = traceback.format_exc()
    _discard(wav_path)

    # --- Attempt 2: ffmpeg with forced raw formats ---
    diagnostics = []
    for fmt, sr, ch in FFMPEG_CANDIDATES:
        out_wav = _new_wav_path()
        success, debug = _ffmpeg_convert(input_path, out_wav, fmt, sr, ch)
        diagnostics.append(f"TRY fmt={fmt} sr={sr} ch={ch} success={success}\n{debug}\n")
        if success:
            # Report the winning layout on stdout instead of leaving an
            # unreferenced diagnostics directory behind for each success.
            print(f"DEBUG: ffmpeg fallback converted {input_path} (fmt={fmt} sr={sr} ch={ch})", flush=True)
            return out_wav
        _discard(out_wav)

    # --- Everything failed: collect diagnostics for the error report ---
    try:
        fp = subprocess.run(["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
                            capture_output=True, text=True, timeout=10)
        diagnostics.append("FFPROBE:\n" + (fp.stdout.strip() or fp.stderr.strip()))
    except Exception as e:
        diagnostics.append("ffprobe failed: " + str(e))
    try:
        with open(input_path, "rb") as fh:
            head = fh.read(512)
        diagnostics.append("HEX PREVIEW:\n" + head.hex())
    except Exception as e:
        diagnostics.append("could not read head: " + str(e))

    try:
        diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
        diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
        with open(diag_log, "w", encoding="utf-8") as fh:
            fh.write("pydub auto error:\n")
            fh.write(auto_err + "\n\n")
            fh.write("Full diagnostics:\n\n")
            fh.write("\n\n".join(diagnostics))
    except Exception as e:
        raise RuntimeError(f"Conversion failed; diagnostics write error: {e}") from e

    raise RuntimeError(f"Could not convert file to WAV. Diagnostics saved to: {diag_log}")
330
+
331
+ # ---------- Whisper model cache ----------
332
# Loaded Whisper models, keyed by model name.
MODEL_CACHE = {}
# Serializes cache misses: without it, two simultaneous first requests for
# the same model would each load it into memory.
_MODEL_CACHE_LOCK = threading.Lock()


def get_whisper_model(name):
    """Return the Whisper model called *name*, loading it on first use.

    Later calls with the same name return the cached object.  Errors raised
    by ``whisper.load_model`` propagate and nothing is cached.
    """
    with _MODEL_CACHE_LOCK:
        if name not in MODEL_CACHE:
            print(f"DEBUG: loading whisper model '{name}'", flush=True)
            MODEL_CACHE[name] = whisper.load_model(name)
        return MODEL_CACHE[name]
339
+
340
+ # ---------- Main transcription generator ----------
341
def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbox, zip_file=None, zip_password=None, enable_memory=False):
    """Transcribe audio files (and audio inside an optional zip), streaming progress.

    This is a generator.  Every yielded item is a 4-tuple
    ``(log_text, transcripts_text, docx_path, progress)`` where the two texts
    are the accumulated entries joined by blank lines, *docx_path* is
    ``None`` until the final item, and *progress* is an int from 0 to 100.

    Args:
        audio_files: A path string, or a list/tuple of path strings (falsy
            entries are skipped), or ``None``.
        model_name: Whisper model name passed to ``get_whisper_model``.
        advanced_options: Dict of extra keyword arguments for
            ``model.transcribe``; anything that is not a dict is ignored.
        merge_checkbox: When true, all transcripts are saved into one .docx.
        zip_file: Optional path of a zip archive containing audio files.
        zip_password: Optional password for the archive.
        enable_memory: When true, apply memory correction to each transcript
            and feed the result back into the memory.

    Failures of single files are reported in the log/transcript text and do
    not stop the run.  Files extracted from the zip live in a directory
    private to this call, which is removed when the generator finishes or is
    closed.
    """
    log = []
    transcripts = []
    word_file_path = None
    extract_dir = None

    def status(progress, docx_path=None):
        return "\n\n".join(log), "\n\n".join(transcripts), docx_path, progress

    def remove_temp_wav(wav, source):
        # Delete the converted copy, but never a WAV the caller handed in:
        # only files inside the temp dir that came from a non-WAV source.
        try:
            if wav and os.path.exists(wav):
                tmpdir = tempfile.gettempdir()
                in_tmp = os.path.commonpath([tmpdir, os.path.abspath(wav)]) == tmpdir
                if in_tmp and not source.lower().endswith(".wav"):
                    os.unlink(wav)
        except Exception:
            pass

    # initial yield
    yield "", "", None, 0

    try:
        paths = []

        # handle zip
        if zip_file:
            log.append(f"Processing zip: {zip_file}")
            yield status(2)
            abort = False
            try:
                # A directory private to this call, so simultaneous runs
                # cannot delete each other's extracted files.
                extract_dir = tempfile.mkdtemp(prefix="extracted_audio_")
                with pyzipper.ZipFile(zip_file, "r") as zf:
                    if zip_password:
                        try:
                            zf.setpassword(zip_password.encode())
                        except Exception:
                            log.append("Incorrect zip password")
                            abort = True
                    if not abort:
                        exts = ['.mp3', '.wav', '.aac', '.flac', '.ogg', '.m4a', '.dat', '.dct']
                        for info in zf.infolist():
                            if info.is_dir():
                                continue
                            _, ext = os.path.splitext(info.filename)
                            if ext.lower() not in exts:
                                continue
                            try:
                                # extract() returns the path it actually
                                # wrote, which can differ from the raw
                                # member name after sanitizing.
                                extracted = zf.extract(info, path=extract_dir)
                            except Exception as e:
                                log.append(f"Error extracting {info.filename}: {e}")
                                continue
                            if os.path.exists(extracted):
                                paths.append(os.path.normpath(extracted))
                                log.append(f"Extracted: {info.filename}")
                        if not paths:
                            log.append("No supported audio in zip.")
                            abort = True
            except pyzipper.BadZipFile:
                log.append("Invalid zip file.")
                abort = True
            except Exception as e:
                log.append(f"Zip processing error: {e}")
                abort = True
            if abort:
                yield status(100)
                return

        # collect audio file paths
        if audio_files:
            if isinstance(audio_files, (list, tuple)):
                paths.extend(a for a in audio_files if a)
            elif isinstance(audio_files, str):
                paths.append(audio_files)

        if not paths:
            log.append("No audio files provided.")
            yield status(100)
            return

        # load model (on demand)
        yield status(5)
        try:
            model = get_whisper_model(model_name)
            log.append(f"Loaded Whisper model: {model_name}")
        except Exception as e:
            log.append(f"Failed to load model {model_name}: {e}")
            yield status(100)
            return

        whisper_opts = dict(advanced_options) if isinstance(advanced_options, dict) else {}
        total = len(paths)
        for idx, p in enumerate(paths, 1):
            log.append(f"Processing file ({idx}/{total}): {p}")
            yield status(int(5 + (idx - 1) * 80 / total))

            try:
                wav = convert_to_wav_if_needed(p)
                log.append(f"Converted to WAV: {wav}")
            except Exception as e:
                log.append(f"Conversion failed for {p}: {e}")
                transcripts.append(f"FILE: {os.path.basename(p)}\nERROR: Conversion failed: {e}")
                yield status(int(5 + idx * 80 / total))
                continue

            try:
                result = model.transcribe(wav, **whisper_opts)
                text = result.get("text", "").strip()
                log.append(f"Transcribed: {len(text)} chars")

                if enable_memory:
                    text = memory_correct_text(text)
                text = postprocess_transcript(text)
                transcripts.append(f"FILE: {os.path.basename(p)}\n{text}\n")

                if enable_memory:
                    # Memory is an optional extra; its failure must not
                    # discard a finished transcript.
                    try:
                        update_memory_with_transcript(text)
                        log.append("Memory updated.")
                    except Exception as e:
                        log.append(f"Memory update failed: {e}")
            except Exception as e:
                log.append(f"Transcription failed for {p}: {e}")
                transcripts.append(f"FILE: {os.path.basename(p)}\nERROR: Transcription failed: {e}")
            finally:
                remove_temp_wav(wav, p)

            yield status(int(10 + idx * 85 / total))

        if merge_checkbox:
            try:
                merged_text = "\n\n".join(transcripts)
                word_file_path = save_as_word(merged_text)
                log.append(f"Merged transcript saved: {word_file_path}")
            except Exception as e:
                log.append(f"Failed to save merged file: {e}")
                word_file_path = None

        yield status(100, word_file_path)
    finally:
        # Runs on normal completion, early return, and when the consumer
        # closes the generator mid-run.
        if extract_dir:
            shutil.rmtree(extract_dir, ignore_errors=True)
517
+
518
+ # ----------------------- Gradio UI -----------------------
519
def run_transcription_wrapper(files, model_name, merge, zip_file, zip_password, enable_memory, advanced_options_state):
    """Gradio click handler: normalize the inputs and stream ``transcribe_multiple``.

    This must itself be a generator function (hence ``yield from``): Gradio
    decides whether to stream by inspecting the handler, so a plain function
    that merely returns a generator object is not iterated.

    Args:
        files: Uploaded audio path(s), passed through unchanged.
        model_name: Whisper model name.
        merge: Whether to produce the merged .docx.
        zip_file: Uploaded zip as a path, an object with a ``name``
            attribute, or a dict with a ``"name"`` key; may be ``None``.
        zip_password: Optional zip password.
        enable_memory: Whether to use the word/phrase memory.
        advanced_options_state: Dict of extra Whisper options; anything that
            is not a dict is treated as no options.

    Yields:
        The ``(logs, transcripts, docx_path, progress)`` tuples produced by
        ``transcribe_multiple``.
    """
    zip_path = None
    if zip_file:
        if isinstance(zip_file, (str, os.PathLike)):
            zip_path = str(zip_file)
        elif hasattr(zip_file, "name"):
            zip_path = zip_file.name
        elif isinstance(zip_file, dict) and zip_file.get("name"):
            zip_path = zip_file["name"]

    adv = dict(advanced_options_state) if isinstance(advanced_options_state, dict) else {}
    yield from transcribe_multiple(
        files,
        model_name,
        adv,
        merge_checkbox=merge,
        zip_file=zip_path,
        zip_password=zip_password,
        enable_memory=enable_memory,
    )
531
+
532
print("DEBUG: building Gradio Blocks", flush=True)

# NOTE(review): nesting below is reconstructed from statement order (inputs in
# the left column, outputs in the right) - confirm against the deployed file.
with gr.Blocks() as demo:
    gr.Markdown("## Whisper Transcription (Spaces-ready)")
    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column(scale=2):
            file_input = gr.File(
                label="Upload audio files (or zip)",
                file_count="multiple",
                type="filepath",
            )
            zip_input = gr.File(
                label="Optional: Upload zip file containing audio",
                file_count="single",
                type="filepath",
            )
            zip_password = gr.Textbox(
                label="Zip password (if any)",
                placeholder="password (optional)",
            )
            model_select = gr.Dropdown(
                choices=["small", "medium", "large", "base"],
                value="small",
                label="Whisper model",
            )
            merge_checkbox = gr.Checkbox(
                label="Merge transcripts to a single .docx (downloadable)",
                value=True,
            )
            memory_checkbox = gr.Checkbox(
                label="Enable persistent memory (word/phrase correction)",
                value=False,
            )
            submit = gr.Button("Transcribe")
        # Right column: streamed outputs.
        with gr.Column(scale=3):
            logs = gr.Textbox(label="Logs (streaming)", lines=12)
            transcripts_out = gr.Textbox(label="Transcripts (streaming)", lines=12)
            download_file = gr.File(label="Merged .docx (when enabled)")
            progress_num = gr.Number(value=0, label="Progress (%)")

    # Empty per-session dict handed to the handler as its advanced options.
    advanced_state = gr.State({})
    handler_inputs = [
        file_input,
        model_select,
        merge_checkbox,
        zip_input,
        zip_password,
        memory_checkbox,
        advanced_state,
    ]
    handler_outputs = [logs, transcripts_out, download_file, progress_num]
    submit.click(
        fn=run_transcription_wrapper,
        inputs=handler_inputs,
        outputs=handler_outputs,
    )
557
+
558
+ # Launch
559
if __name__ == "__main__":
    # Use the PORT environment variable when set, else 7860.
    port = int(os.getenv("PORT", 7860))
    print("DEBUG: launching Gradio on port", port, flush=True)
    try:
        # queue() is enabled before launching.
        queued_app = demo.queue()
        queued_app.launch(server_name="0.0.0.0", server_port=port)
    except Exception as e:
        # Report the failure on stdout, then re-raise it.
        print("FATAL: demo.launch failed:", e, flush=True)
        traceback.print_exc()
        raise