staraks committed on
Commit
d366308
·
verified ·
1 Parent(s): a086336

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +429 -13
app.py CHANGED
@@ -1,8 +1,428 @@
1
- # ----------------------- CONTINUATION / APP LAUNCH -----------------------
2
- # Append this to the end of your app.py (after the previous code)
 
 
 
 
 
 
 
 
3
 
4
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
5
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  # collect audio file paths from either audio_files or extracted paths
8
  paths = []
@@ -111,28 +531,25 @@
111
  pass
112
 
113
  # ----------------------- Gradio UI -----------------------
114
- def run_transcription_wrapper(files, model_name, merge, zip_file, zip_password, enable_memory, advanced_options):
115
  """
116
- Gradio wrapper: accepts file upload(s) and zip file (single), returns final outputs.
117
- Because Gradio supports generator functions directly, we can return the generator.
118
  """
119
- # normalize inputs
120
  audio_input = files
121
  zip_path = None
122
  if zip_file:
123
- # gr.File will provide a dict-like or path depending on version; try to handle both
124
  if isinstance(zip_file, (str, os.PathLike)):
125
  zip_path = str(zip_file)
126
  elif hasattr(zip_file, "name"):
127
  zip_path = zip_file.name
128
  elif isinstance(zip_file, dict) and zip_file.get("name"):
129
  zip_path = zip_file["name"]
130
- # advanced options not used heavily here; keep empty dict if None
131
- adv = advanced_options or {}
132
  return transcribe_multiple(audio_input, model_name, adv, merge_checkbox=merge, zip_file=zip_path, zip_password=zip_password, enable_memory=enable_memory)
133
 
134
  # Build Blocks UI
135
- demo = gr.Blocks()
136
 
137
  with demo:
138
  gr.Markdown("## Whisper Transcription (Spaces-ready)")
@@ -151,7 +568,6 @@ with demo:
151
  download_file = gr.File(label="Merged .docx (when enabled)")
152
 
153
  # connect
154
- # Gradio supports generator functions directly; the outputs are (logs, transcripts, file, progress)
155
  submit.click(fn=run_transcription_wrapper,
156
  inputs=[file_input, model_select, merge_checkbox, zip_input, zip_password, memory_checkbox, gr.State({})],
157
  outputs=[logs, transcripts_out, download_file])
 
1
+ # app.py
2
+ # Full Whisper transcription app for Hugging Face Spaces
3
+ # - Advanced .dct conversion (ffmpeg heuristics + pydub)
4
+ # - Zip extraction (pyzipper)
5
+ # - Whisper transcription (cached)
6
+ # - Live progress & logs to Gradio (generator)
7
+ # - Persistent memory (word + phrase) with fuzzy correction
8
+ # - Simple medical post-processing (abbrev expansion)
9
+ # - Merge transcripts to .docx
10
+ # - Binds to 0.0.0.0:$PORT and uses demo.queue().launch()
11
 
12
+ import os
13
+ import json
14
+ import shutil
15
+ import tempfile
16
+ import subprocess
17
+ import traceback
18
+ import threading
19
+ import re
20
+ from difflib import get_close_matches
21
+ from pathlib import Path
22
+
23
+ from docx import Document
24
+ import whisper
25
+ import gradio as gr
26
+ import pyzipper
27
+ from pydub import AudioSegment
28
+
29
# ---------- Config ----------
MEMORY_FILE = "memory.json"  # persistent memory in repo (will be written)
MEMORY_LOCK = threading.Lock()  # serializes access to the memory dict and its file
DIAGNOSTICS_DIR_BASE = tempfile.gettempdir()  # NOTE(review): not referenced in the visible code — confirm before relying on it
MIN_WAV_SIZE = 200  # bytes; ffmpeg outputs at or below this size are treated as failed conversions
# ----------------------------
35
+
36
# make sure a memory store exists on first import
def load_memory():
    """Read the persisted memory store, falling back to (and seeding) defaults.

    Returns the parsed JSON from MEMORY_FILE when it exists and is readable;
    otherwise returns {"words": {}, "phrases": {}} and best-effort writes that
    default structure to disk.
    """
    try:
        if os.path.exists(MEMORY_FILE):
            with open(MEMORY_FILE, "r", encoding="utf-8") as handle:
                return json.load(handle)
    except Exception:
        pass  # unreadable/corrupt file: fall through to the default store
    default_store = {"words": {}, "phrases": {}}
    try:
        with open(MEMORY_FILE, "w", encoding="utf-8") as handle:
            json.dump(default_store, handle, ensure_ascii=False, indent=2)
    except Exception:
        pass  # e.g. read-only filesystem; keep the in-memory default
    return default_store
52
+
53
def save_memory(mem):
    """Serialize *mem* to MEMORY_FILE while holding the module lock."""
    with MEMORY_LOCK, open(MEMORY_FILE, "w", encoding="utf-8") as handle:
        json.dump(mem, handle, ensure_ascii=False, indent=2)
57
+
58
memory = load_memory()  # module-level memory store; default shape {"words": {}, "phrases": {}}
59
+
60
# ---------- Simple medical post-processing ----------
# Lowercased abbreviation -> expansion; consumers lowercase tokens and strip
# surrounding .,;: before looking them up here.
MEDICAL_ABBREVIATIONS = {
    "pt": "patient",
    "dx": "diagnosis",
    "hx": "history",
    "sx": "symptoms",
    "c/o": "complains of",
    "bp": "blood pressure",
    "hr": "heart rate",
    "o2": "oxygen",
    "r/o": "rule out",
    "adm": "admit",
    "disch": "discharge",
    # extend as needed
}
75
+
76
# Lowercase drug name -> canonical capitalization; matched case-insensitively
# as whole words when normalizing transcripts.
DRUG_NORMALIZATION = {
    "metformin": "Metformin",
    "aspirin": "Aspirin",
    "amoxicillin": "Amoxicillin",
}
81
+
82
+
83
def expand_abbreviations(text):
    """Expand known medical abbreviations in *text*, token by token.

    Whitespace is preserved exactly (the split keeps separator runs), and any
    trailing .,;: punctuation on the original token is re-attached after the
    expansion.
    """
    expanded = []
    for token in re.split(r'(\s+)', text):
        lookup = token.lower().strip(".,;:")
        if lookup not in MEDICAL_ABBREVIATIONS:
            expanded.append(token)
            continue
        tail = ''
        m = re.match(r'([A-Za-z0-9/]+)([.,;:]*)', token)
        if m:
            tail = m.group(2) or ''
        expanded.append(MEDICAL_ABBREVIATIONS[lookup] + tail)
    return ''.join(expanded)
97
+
98
+
99
def normalize_drugs(text):
    """Rewrite recognized drug names to their canonical spelling.

    Matching is whole-word and case-insensitive; replacements come from
    DRUG_NORMALIZATION.
    """
    for raw, canonical in DRUG_NORMALIZATION.items():
        text = re.sub(rf'\b{raw}\b', canonical, text, flags=re.IGNORECASE)
    return text
103
+
104
+
105
def punctuation_and_capitalization(text):
    """Ensure terminal punctuation and capitalize each sentence's first letter.

    Fix: the original used str.capitalize(), which also LOWERCASES the rest of
    the sentence — undoing acronyms ("BP") and the drug-name casing applied by
    normalize_drugs() ("Metformin"). Only the first character is uppercased now.

    Returns the adjusted text; empty/whitespace-only input comes back empty.
    """
    text = text.strip()
    if not text:
        return text
    # append a period when the text does not already end in . ? or !
    if not re.search(r'[.?!]\s*$', text):
        text = text.rstrip() + '.'
    # split keeps the sentence separators ([.?!] + whitespace) as list items
    parts = re.split(r'([.?!]\s+)', text)
    out = []
    for p in parts:
        if p and not re.match(r'[.?!]\s+', p):
            out.append(p[0].upper() + p[1:])  # uppercase first char only
        else:
            out.append(p)
    return ''.join(out)
119
+
120
+
121
def postprocess_transcript(text, format_soap=False):
    """Clean a raw transcript: collapse whitespace, expand abbreviations,
    normalize drug names, and fix punctuation/capitalization.

    When format_soap is True, returns a crude SOAP-note string instead of the
    cleaned text (first sentence -> S, second -> O).
    """
    if not text:
        return text
    # collapse whitespace runs before the token-level passes
    t = re.sub(r'\s+', ' ', text).strip()
    t = expand_abbreviations(t)
    t = normalize_drugs(t)
    t = punctuation_and_capitalization(t)
    if format_soap:
        sentences = re.split(r'(?<=[.?!])\s+', t)
        subj = sentences[0] if len(sentences) >= 1 else ""
        obj = sentences[1] if len(sentences) >= 2 else ""
        assessment = ""
        # any diagnosis-ish keyword anywhere in the text triggers an assessment
        for kw in ["diagnosis", "dx", "rule out", "r/o", "probable"]:
            if kw in t.lower():
                # NOTE(review): this produces "A: Assessment: <subj>" below
                # (double label) and reuses the subjective sentence — confirm
                # whether that is the intended output.
                assessment = "Assessment: " + subj
                break
        soap = f"S: {subj}\nO: {obj}\nA: {assessment}\nP: Plan: follow up as indicated."
        return soap
    return t
140
+
141
# ---------- Memory utilities (word + phrase) ----------
def extract_words_and_phrases(text):
    """Tokenize *text* into (words, sentences).

    Words are runs of letters/digits/hyphens/apostrophes; sentences are the
    non-empty chunks left after splitting on terminal punctuation.
    """
    tokens = re.findall(r"[A-Za-z0-9\-']+", text)
    sentences = []
    for chunk in re.split(r'(?<=[.?!])\s+', text):
        chunk = chunk.strip()
        if chunk:
            sentences.append(chunk)
    words = [tok for tok in tokens if tok.strip()]
    return words, sentences
147
+
148
+
149
def update_memory_with_transcript(transcript):
    """Fold every word and sentence of *transcript* into persistent memory.

    Word counts are keyed on the lowercased token; phrase counts on the
    stripped sentence. The memory file is rewritten whenever anything was
    recorded.

    Fix: the original only set its `changed` flag when a NEW word/phrase
    appeared, so count increments for already-known entries were never
    persisted across restarts.
    """
    global memory
    words, sentences = extract_words_and_phrases(transcript)
    if not words and not sentences:
        return  # nothing to record; skip the lock and the disk write
    with MEMORY_LOCK:
        for w in words:
            lw = w.lower()
            memory["words"][lw] = memory["words"].get(lw, 0) + 1
        for s in sentences:
            key = s.strip()
            memory["phrases"][key] = memory["phrases"].get(key, 0) + 1
        # Write inline instead of calling save_memory(): that helper acquires
        # MEMORY_LOCK itself and threading.Lock is not reentrant.
        try:
            with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
                json.dump(memory, fh, ensure_ascii=False, indent=2)
        except Exception:
            pass  # best-effort persistence (e.g. read-only filesystem)
174
+
175
+
176
def memory_correct_text(text, min_ratio=0.85):
    """
    Correct words/phrases in text using memory.
    - Word-level: uses difflib.get_close_matches against known memory words.
    - Phrase-level: tries to match stored phrases (exact or close substring).
    """
    # nothing to correct against when memory is empty
    if not text or (not memory.get("words") and not memory.get("phrases")):
        return text

    # word-level corrections
    def fix_word(w):
        lw = w.lower()
        if lw in memory["words"]:
            return w  # known exact
        # find close matches from memory words (keys)
        candidates = get_close_matches(lw, memory["words"].keys(), n=1, cutoff=min_ratio)
        if candidates:
            # preserve casing: if candidate is lower, capitalize if original was capitalized
            # NOTE(review): .capitalize() lowercases the rest of the candidate —
            # confirm that is acceptable for mixed-case memory entries.
            cand = candidates[0]
            if w[0].isupper():
                return cand.capitalize()
            return cand
        return w

    tokens = re.split(r'(\W+)', text)  # keep punctuation
    corrected_tokens = []
    for tok in tokens:
        # only pure word tokens go through fuzzy correction
        if re.match(r"^[A-Za-z0-9\-']+$", tok):
            corrected_tokens.append(fix_word(tok))
        else:
            corrected_tokens.append(tok)
    corrected = ''.join(corrected_tokens)

    # phrase-level: try to replace short substrings that closely match known phrases
    # naive approach: for each stored phrase, if it is short and a fuzzy substring of corrected, replace
    # (longest phrases first, so a long phrase wins over any of its sub-phrases)
    for phrase in sorted(memory.get("phrases", {}).keys(), key=lambda s: -len(s)):
        low_phrase = phrase.lower()
        # only replace if phrase length >= 8 chars to avoid noisy matches
        if len(low_phrase) < 8:
            continue
        if low_phrase in corrected.lower():
            # find exact location, replace preserving case roughly
            # NOTE(review): replacing the phrase with itself (IGNORECASE) only
            # normalizes casing to the stored phrase — no wording is changed.
            corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
    return corrected
220
+
221
# ---------- File utilities ----------
def save_as_word(text, filename=None):
    """Dump *text* into a one-paragraph Word document and return its path.

    When *filename* is None a fixed path in the system temp dir is used, so
    successive default-path calls overwrite the same file.
    """
    if filename is None:
        filename = os.path.join(tempfile.gettempdir(), "merged_transcripts.docx")
    document = Document()
    document.add_paragraph(text)
    document.save(filename)
    return filename
229
+
230
# ---------- Advanced conversion: pydub auto + ffmpeg heuristics ----------
def convert_to_wav_if_needed(input_path):
    """
    Advanced conversion:
    - pydub (AudioSegment.from_file) first
    - if that fails, exhaustive ffmpeg format/rate/channel grid
    - writes diagnostics to a temp folder if conversion fails entirely

    Returns the path to a playable WAV (the input itself when it already ends
    in .wav, otherwise a NamedTemporaryFile the caller owns). Raises Exception
    with the diagnostics-log path when every attempt fails.
    """
    input_path = str(input_path)
    lower = input_path.lower()
    if lower.endswith(".wav"):
        # already WAV by extension; no probing of the actual container is done
        return input_path

    # try pydub first
    auto_err = ""
    tmp = None
    try:
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp.close()
        AudioSegment.from_file(input_path).export(tmp.name, format="wav")
        return tmp.name
    except Exception as e:
        # keep the traceback for the diagnostics log, then discard the temp file
        auto_err = traceback.format_exc()
        try:
            if tmp:
                os.unlink(tmp.name)
        except Exception:
            pass

    # fallback grid
    # NOTE(review): ffmpeg's raw-audio demuxer (-f) names are 's16le'-style;
    # 'pcm_s16le'/'pcm_u8'/'pcm_u16le' look like *codec* names and those
    # attempts will likely always fail — confirm and prune.
    pcm_formats = ['s16le', 's32le', 's24le', 's8', 'u8', 's16be', 'pcm_s16le', 'pcm_u8', 'pcm_u16le']
    mulaw_alaw = ['mulaw', 'alaw']
    adpcm = ['adpcm_ima_wav', 'adpcm_ms']
    extra = ['gsm', 'g726', 'vorbis']
    formats = pcm_formats + mulaw_alaw + adpcm + extra
    sample_rates = [8000, 11025, 12000, 16000, 22050, 32000, 44100, 48000]
    channels = [1, 2]

    diagnostics = []
    diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
    diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")

    # brute-force every (format, sample-rate, channel) combination
    for fmt in formats:
        for sr in sample_rates:
            for ch in channels:
                out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
                out_wav.close()
                cmd = [
                    "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
                    "-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path, out_wav.name
                ]
                try:
                    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=45)
                except Exception as e_run:
                    # ffmpeg missing, timeout, etc. — record and try the next combo
                    diagnostics.append(f"RUN-EXC fmt={fmt} sr={sr} ch={ch} err={e_run}")
                    try: os.unlink(out_wav.name)
                    except Exception: pass
                    continue

                rc = proc.returncode
                stderr = proc.stderr.strip() if proc.stderr else ""
                stdout = proc.stdout.strip() if proc.stdout else ""
                diagnostics.append(f"ATTEMPT fmt={fmt} sr={sr} ch={ch} rc={rc}")
                if stdout:
                    diagnostics.append("STDOUT:")
                    diagnostics.append(stdout)
                if stderr:
                    diagnostics.append("STDERR:")
                    diagnostics.append(stderr)
                diagnostics.append("-" * 60)

                try:
                    # success = clean exit AND an output bigger than a bare WAV header
                    if rc == 0 and os.path.exists(out_wav.name) and os.path.getsize(out_wav.name) > MIN_WAV_SIZE:
                        # success
                        try:
                            with open(diag_log, "w", encoding="utf-8") as fh:
                                fh.write("pydub auto error:\n")
                                fh.write(auto_err + "\n\n")
                                fh.write("Successful guess:\n")
                                fh.write(f"fmt={fmt} sr={sr} ch={ch}\n\n")
                                fh.write("Diagnostics (last attempts):\n")
                                fh.write("\n".join(diagnostics[-1000:]))
                        except Exception:
                            pass  # diagnostics are best-effort; the WAV is what matters
                        return out_wav.name
                except Exception:
                    pass

                try: os.unlink(out_wav.name)
                except Exception: pass

    # ffprobe and hexdump preview
    try:
        fp = subprocess.run(["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
                            capture_output=True, text=True, timeout=15)
        diagnostics.append("FFPROBE:")
        diagnostics.append(fp.stdout.strip() or fp.stderr.strip())
    except Exception as e:
        diagnostics.append(f"ffprobe failed: {e}")

    try:
        with open(input_path, "rb") as fh:
            head = fh.read(256)
        diagnostics.append("HEX PREVIEW:")
        diagnostics.append(head.hex())
    except Exception as e:
        diagnostics.append(f"could not read head: {e}")

    try:
        with open(diag_log, "w", encoding="utf-8") as fh:
            fh.write("pydub auto error:\n")
            fh.write(auto_err + "\n\n")
            fh.write("Full diagnostics:\n\n")
            fh.write("\n".join(diagnostics))
    except Exception as e:
        raise Exception(f"Conversion failed; diagnostics could not be written: {e}")

    raise Exception(f"Could not convert file to WAV. Diagnostics saved to: {diag_log}\nSummary: {diagnostics[:6]}")
348
+
349
# ---------- Whisper model cache ----------
MODEL_CACHE = {}  # model name -> loaded Whisper model instance

def get_whisper_model(name):
    """Return the Whisper model for *name*, loading it once and caching it."""
    model = MODEL_CACHE.get(name)
    if model is None:
        model = whisper.load_model(name)
        MODEL_CACHE[name] = model
    return model
356
+
357
+ # ---------- Main transcription generator ----------
358
+ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbox, zip_file=None, zip_password=None, enable_memory=False):
359
+ """
360
+ Generator yields (log_text, transcripts_text, word_file_path_or_None, percent_int)
361
+ audio_files: path or list of paths (gr.File with type='filepath' gives file path string)
362
+ """
363
+ log = []
364
+ transcripts = []
365
+ word_file_path = None
366
+ temp_extract_dir = os.path.join(tempfile.gettempdir(), "extracted_audio")
367
+ extracted_audio_paths = []
368
+
369
+ # initial yield
370
+ yield "", "", None, 0
371
+
372
+ # cleanup
373
+ if os.path.exists(temp_extract_dir):
374
+ try:
375
+ shutil.rmtree(temp_extract_dir)
376
+ log.append(f"Cleaned previous temp dir: {temp_extract_dir}")
377
+ except Exception:
378
+ pass
379
+
380
+ # handle zip
381
+ if zip_file:
382
+ log.append(f"Processing zip: {zip_file}")
383
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, 2
384
+ try:
385
+ os.makedirs(temp_extract_dir, exist_ok=True)
386
+ with pyzipper.ZipFile(zip_file, "r") as zf:
387
+ if zip_password:
388
+ try: zf.setpassword(zip_password.encode())
389
+ except Exception:
390
+ log.append("Incorrect zip password")
391
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
392
+ return
393
+ exts = ['.mp3', '.wav', '.aac', '.flac', '.ogg', '.dat', '.dct']
394
+ count = 0
395
+ for info in zf.infolist():
396
+ if info.is_dir(): continue
397
+ _, ext = os.path.splitext(info.filename)
398
+ if ext.lower() in exts:
399
+ try:
400
+ zf.extract(info, path=temp_extract_dir)
401
+ p = os.path.normpath(os.path.join(temp_extract_dir, info.filename))
402
+ if os.path.exists(p):
403
+ extracted_audio_paths.append(p)
404
+ count += 1
405
+ log.append(f"Extracted: {info.filename}")
406
+ except Exception as e:
407
+ log.append(f"Error extracting {info.filename}: {e}")
408
+ if count == 0:
409
+ log.append("No supported audio in zip.")
410
+ try: shutil.rmtree(temp_extract_dir)
411
+ except Exception: pass
412
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
413
+ return
414
+ except pyzipper.BadZipFile:
415
+ log.append("Invalid zip file.")
416
+ try: shutil.rmtree(temp_extract_dir)
417
+ except Exception: pass
418
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
419
+ return
420
+ except Exception as e:
421
+ log.append(f"Zip processing error: {e}")
422
+ try: shutil.rmtree(temp_extract_dir)
423
+ except Exception: pass
424
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
425
+ return
426
 
427
  # collect audio file paths from either audio_files or extracted paths
428
  paths = []
 
531
  pass
532
 
533
  # ----------------------- Gradio UI -----------------------
534
def run_transcription_wrapper(files, model_name, merge, zip_file, zip_password, enable_memory, advanced_options_state):
    """
    Gradio wrapper: normalize the uploaded inputs and hand off to the
    transcription generator (Gradio streams generator yields directly).
    """
    def _zip_location(obj):
        # gr.File yields a plain path, a tempfile-like object with .name, or a
        # dict, depending on the Gradio version — probe each shape in turn.
        if isinstance(obj, (str, os.PathLike)):
            return str(obj)
        if hasattr(obj, "name"):
            return obj.name
        if isinstance(obj, dict) and obj.get("name"):
            return obj["name"]
        return None

    zip_path = _zip_location(zip_file) if zip_file else None
    adv = {}  # advanced options state is reserved for future parameters
    return transcribe_multiple(files, model_name, adv, merge_checkbox=merge,
                               zip_file=zip_path, zip_password=zip_password,
                               enable_memory=enable_memory)
550
 
551
  # Build Blocks UI
552
+ demo = gr.Blocks()
553
 
554
  with demo:
555
  gr.Markdown("## Whisper Transcription (Spaces-ready)")
 
568
  download_file = gr.File(label="Merged .docx (when enabled)")
569
 
570
  # connect
 
571
  submit.click(fn=run_transcription_wrapper,
572
  inputs=[file_input, model_select, merge_checkbox, zip_input, zip_password, memory_checkbox, gr.State({})],
573
  outputs=[logs, transcripts_out, download_file])