staraks committed on
Commit
ef63fe4
·
verified ·
1 Parent(s): 15a4432

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -186
app.py CHANGED
@@ -1,8 +1,10 @@
1
  # app.py
2
- # Full Whisper transcription app for Hugging Face Spaces
3
- # Cleaned & hardened for debugging in container logs.
 
4
 
5
  import os
 
6
  import json
7
  import shutil
8
  import tempfile
@@ -11,24 +13,41 @@ import traceback
11
  import threading
12
  import re
13
  from difflib import get_close_matches
14
- from pathlib import Path
15
 
16
- from docx import Document
17
- import whisper
18
- import gradio as gr
19
- import pyzipper
20
- from pydub import AudioSegment
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  # ---------- Config ----------
23
  MEMORY_FILE = "memory.json" # persistent memory in repo (will be written)
24
  MEMORY_LOCK = threading.Lock()
25
- DIAGNOSTICS_DIR_BASE = tempfile.gettempdir()
26
- MIN_WAV_SIZE = 200
 
 
 
 
 
 
 
27
  # ----------------------------
28
 
29
- print("app.py: starting up") # helpful in container logs
30
-
31
- # ensure memory file exists
32
  def load_memory():
33
  try:
34
  if os.path.exists(MEMORY_FILE):
@@ -36,7 +55,6 @@ def load_memory():
36
  return json.load(fh)
37
  except Exception:
38
  pass
39
- # default structure
40
  mem = {"words": {}, "phrases": {}}
41
  try:
42
  with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
@@ -51,8 +69,9 @@ def save_memory(mem):
51
  json.dump(mem, fh, ensure_ascii=False, indent=2)
52
 
53
  memory = load_memory()
 
54
 
55
- # ---------- Simple medical post-processing ----------
56
  MEDICAL_ABBREVIATIONS = {
57
  "pt": "patient",
58
  "dx": "diagnosis",
@@ -65,7 +84,6 @@ MEDICAL_ABBREVIATIONS = {
65
  "r/o": "rule out",
66
  "adm": "admit",
67
  "disch": "discharge",
68
- # extend as needed
69
  }
70
 
71
  DRUG_NORMALIZATION = {
@@ -74,7 +92,6 @@ DRUG_NORMALIZATION = {
74
  "amoxicillin": "Amoxicillin",
75
  }
76
 
77
-
78
  def expand_abbreviations(text):
79
  tokens = re.split(r'(\s+)', text)
80
  out = []
@@ -90,13 +107,11 @@ def expand_abbreviations(text):
90
  out.append(t)
91
  return ''.join(out)
92
 
93
-
94
  def normalize_drugs(text):
95
  for k, v in DRUG_NORMALIZATION.items():
96
  text = re.sub(rf'\b{k}\b', v, text, flags=re.IGNORECASE)
97
  return text
98
 
99
-
100
  def punctuation_and_capitalization(text):
101
  text = text.strip()
102
  if not text:
@@ -112,7 +127,6 @@ def punctuation_and_capitalization(text):
112
  out.append(p)
113
  return ''.join(out)
114
 
115
-
116
  def postprocess_transcript(text, format_soap=False):
117
  if not text:
118
  return text
@@ -133,170 +147,6 @@ def postprocess_transcript(text, format_soap=False):
133
  return soap
134
  return t
135
 
136
- # ---------- Memory utilities (word + phrase) ----------
137
  def extract_words_and_phrases(text):
138
- # basic tokenization for words; phrases = sentences
139
- words = re.findall(r"[A-Za-z0-9\-']+", text)
140
- sentences = [s.strip() for s in re.split(r'(?<=[.?!])\s+', text) if s.strip()]
141
- return [w for w in words if w.strip()], sentences
142
-
143
-
144
- def update_memory_with_transcript(transcript):
145
- global memory
146
- words, sentences = extract_words_and_phrases(transcript)
147
- changed = False
148
- with MEMORY_LOCK:
149
- for w in words:
150
- lw = w.lower()
151
- if lw in memory["words"]:
152
- memory["words"][lw] += 1
153
- else:
154
- memory["words"][lw] = 1
155
- changed = True
156
- for s in sentences:
157
- key = s.strip()
158
- if key in memory["phrases"]:
159
- memory["phrases"][key] += 1
160
- else:
161
- memory["phrases"][key] = 1
162
- changed = True
163
- if changed:
164
- try:
165
- with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
166
- json.dump(memory, fh, ensure_ascii=False, indent=2)
167
- except Exception:
168
- pass
169
-
170
-
171
- def memory_correct_text(text, min_ratio=0.85):
172
- """
173
- Correct words/phrases in text using memory.
174
- - Word-level: uses difflib.get_close_matches against known memory words.
175
- - Phrase-level: tries to match stored phrases (exact or close substring).
176
- """
177
- if not text or (not memory.get("words") and not memory.get("phrases")):
178
- return text
179
-
180
- # word-level corrections
181
- def fix_word(w):
182
- lw = w.lower()
183
- if lw in memory["words"]:
184
- return w # known exact
185
- # find close matches from memory words (keys)
186
- candidates = get_close_matches(lw, memory["words"].keys(), n=1, cutoff=min_ratio)
187
- if candidates:
188
- # preserve casing: if candidate is lower, capitalize if original was capitalized
189
- cand = candidates[0]
190
- if w and w[0].isupper():
191
- return cand.capitalize()
192
- return cand
193
- return w
194
-
195
- tokens = re.split(r'(\W+)', text) # keep punctuation
196
- corrected_tokens = []
197
- for tok in tokens:
198
- if re.match(r"^[A-Za-z0-9\-']+$", tok):
199
- corrected_tokens.append(fix_word(tok))
200
- else:
201
- corrected_tokens.append(tok)
202
- corrected = ''.join(corrected_tokens)
203
-
204
- # phrase-level: try to replace short substrings that closely match known phrases
205
- # naive approach: for each stored phrase, if it is short and a fuzzy substring of corrected, replace
206
- for phrase in sorted(memory.get("phrases", {}).keys(), key=lambda s: -len(s)):
207
- low_phrase = phrase.lower()
208
- # only replace if phrase length >= 8 chars to avoid noisy matches
209
- if len(low_phrase) < 8:
210
- continue
211
- if low_phrase in corrected.lower():
212
- # find exact location, replace preserving case roughly
213
- corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
214
- return corrected
215
-
216
- # ---------- File utilities ----------
217
- def save_as_word(text, filename=None):
218
- if filename is None:
219
- filename = os.path.join(tempfile.gettempdir(), "merged_transcripts.docx")
220
- doc = Document()
221
- doc.add_paragraph(text)
222
- doc.save(filename)
223
- return filename
224
-
225
- # ---------- Advanced conversion: pydub auto + ffmpeg heuristics ----------
226
- def convert_to_wav_if_needed(input_path):
227
- """
228
- Advanced conversion:
229
- - pydub (AudioSegment.from_file) first
230
- - if that fails, exhaustive ffmpeg format/rate/channel grid
231
- - writes diagnostics to a temp folder if conversion fails entirely
232
- """
233
- input_path = str(input_path)
234
- lower = input_path.lower()
235
- if lower.endswith(".wav"):
236
- return input_path
237
-
238
- # try pydub first
239
- auto_err = ""
240
- tmp = None
241
- try:
242
- tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
243
- tmp.close()
244
- AudioSegment.from_file(input_path).export(tmp.name, format="wav")
245
- return tmp.name
246
- except Exception as e:
247
- auto_err = traceback.format_exc()
248
- try:
249
- if tmp:
250
- os.unlink(tmp.name)
251
- except Exception:
252
- pass
253
-
254
- # fallback grid
255
- pcm_formats = ['s16le', 's32le', 's24le', 's8', 'u8', 's16be', 'pcm_s16le', 'pcm_u8', 'pcm_u16le']
256
- mulaw_alaw = ['mulaw', 'alaw']
257
- adpcm = ['adpcm_ima_wav', 'adpcm_ms']
258
- extra = ['gsm', 'g726', 'vorbis']
259
- formats = pcm_formats + mulaw_alaw + adpcm + extra
260
- sample_rates = [8000, 11025, 12000, 16000, 22050, 32000, 44100, 48000]
261
- channels = [1, 2]
262
-
263
- diagnostics = []
264
- diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
265
- diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
266
-
267
- for fmt in formats:
268
- for sr in sample_rates:
269
- for ch in channels:
270
- out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
271
- out_wav.close()
272
- cmd = [
273
- "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
274
- "-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path, out_wav.name
275
- ]
276
- try:
277
- proc = subprocess.run(cmd, capture_output=True, text=True, timeout=45)
278
- except Exception as e_run:
279
- diagnostics.append(f"RUN-EXC fmt={fmt} sr={sr} ch={ch} err={e_run}")
280
- try: os.unlink(out_wav.name)
281
- except Exception: pass
282
- continue
283
-
284
- rc = proc.returncode
285
- stderr = proc.stderr.strip() if proc.stderr else ""
286
- stdout = proc.stdout.strip() if proc.stdout else ""
287
- diagnostics.append(f"ATTEMPT fmt={fmt} sr={sr} ch={ch} rc={rc}")
288
- if stdout:
289
- diagnostics.append("STDOUT:")
290
- diagnostics.append(stdout)
291
- if stderr:
292
- diagnostics.append("STDERR:")
293
- diagnostics.append(stderr)
294
- diagnostics.append("-" * 60)
295
-
296
- try:
297
- if rc == 0 and os.path.exists(out_wav.name) and os.path.getsize(out_wav.name) > MIN_WAV_SIZE:
298
- # success
299
- try:
300
- with open(diag_log, "w", encoding="utf-8") as fh:
301
- fh.write("pydub auto error:\n")
302
- fh.write(auto_err + "\n\n")
 
1
  # app.py
2
+ # Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
3
+ # Cleaned, debugged, and Spaces-ready.
4
+ # Replace /app/app.py with this file and restart container.
5
 
6
  import os
7
+ import sys
8
  import json
9
  import shutil
10
  import tempfile
 
13
  import threading
14
  import re
15
  from difflib import get_close_matches
 
16
 
17
+ # Force unbuffered output so container logs show prints immediately
18
+ os.environ["PYTHONUNBUFFERED"] = "1"
19
+
20
+ print("DEBUG: app.py bootstrap starting", flush=True)
21
+
22
+ # Third-party imports (must be installed in the environment)
23
+ try:
24
+ from docx import Document
25
+ import whisper
26
+ import gradio as gr
27
+ import pyzipper
28
+ from pydub import AudioSegment
29
+ except Exception as e:
30
+ print("FATAL: import error for third-party libs:", e, flush=True)
31
+ traceback.print_exc()
32
+ raise
33
+
34
+ print("DEBUG: imports OK", flush=True)
35
 
36
  # ---------- Config ----------
37
  MEMORY_FILE = "memory.json" # persistent memory in repo (will be written)
38
  MEMORY_LOCK = threading.Lock()
39
+ MIN_WAV_SIZE = 200 # bytes
40
+ # Fallback ffmpeg conversion candidates (short hybrid list)
41
+ FFMPEG_CANDIDATES = [
42
+ ("s16le", 16000, 1),
43
+ ("s16le", 44100, 2),
44
+ ("pcm_s16le", 16000, 1),
45
+ ("pcm_s16le", 44100, 2),
46
+ ("mulaw", 8000, 1),
47
+ ]
48
  # ----------------------------
49
 
50
+ # ---------- Memory helpers ----------
 
 
51
  def load_memory():
52
  try:
53
  if os.path.exists(MEMORY_FILE):
 
55
  return json.load(fh)
56
  except Exception:
57
  pass
 
58
  mem = {"words": {}, "phrases": {}}
59
  try:
60
  with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
 
69
  json.dump(mem, fh, ensure_ascii=False, indent=2)
70
 
71
  memory = load_memory()
72
+ print("DEBUG: memory loaded (words=%d phrases=%d)" % (len(memory.get("words", {})), len(memory.get("phrases", {}))), flush=True)
73
 
74
+ # ---------- Postprocessing ----------
75
  MEDICAL_ABBREVIATIONS = {
76
  "pt": "patient",
77
  "dx": "diagnosis",
 
84
  "r/o": "rule out",
85
  "adm": "admit",
86
  "disch": "discharge",
 
87
  }
88
 
89
  DRUG_NORMALIZATION = {
 
92
  "amoxicillin": "Amoxicillin",
93
  }
94
 
 
95
  def expand_abbreviations(text):
96
  tokens = re.split(r'(\s+)', text)
97
  out = []
 
107
  out.append(t)
108
  return ''.join(out)
109
 
 
110
  def normalize_drugs(text):
111
  for k, v in DRUG_NORMALIZATION.items():
112
  text = re.sub(rf'\b{k}\b', v, text, flags=re.IGNORECASE)
113
  return text
114
 
 
115
  def punctuation_and_capitalization(text):
116
  text = text.strip()
117
  if not text:
 
127
  out.append(p)
128
  return ''.join(out)
129
 
 
130
  def postprocess_transcript(text, format_soap=False):
131
  if not text:
132
  return text
 
147
  return soap
148
  return t
149
 
150
+ # ---------- Memory utilities ----------
151
  def extract_words_and_phrases(text):
152
+ words = re.findall(r"[A-Za-z0-]()