Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
# app.py
|
| 2 |
-
#
|
| 3 |
-
# Cleaned
|
|
|
|
| 4 |
|
| 5 |
import os
|
|
|
|
| 6 |
import json
|
| 7 |
import shutil
|
| 8 |
import tempfile
|
|
@@ -11,24 +13,41 @@ import traceback
|
|
| 11 |
import threading
|
| 12 |
import re
|
| 13 |
from difflib import get_close_matches
|
| 14 |
-
from pathlib import Path
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
# ---------- Config ----------
|
| 23 |
MEMORY_FILE = "memory.json" # persistent memory in repo (will be written)
|
| 24 |
MEMORY_LOCK = threading.Lock()
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# ----------------------------
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
# ensure memory file exists
|
| 32 |
def load_memory():
|
| 33 |
try:
|
| 34 |
if os.path.exists(MEMORY_FILE):
|
|
@@ -36,7 +55,6 @@ def load_memory():
|
|
| 36 |
return json.load(fh)
|
| 37 |
except Exception:
|
| 38 |
pass
|
| 39 |
-
# default structure
|
| 40 |
mem = {"words": {}, "phrases": {}}
|
| 41 |
try:
|
| 42 |
with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
|
|
@@ -51,8 +69,9 @@ def save_memory(mem):
|
|
| 51 |
json.dump(mem, fh, ensure_ascii=False, indent=2)
|
| 52 |
|
| 53 |
memory = load_memory()
|
|
|
|
| 54 |
|
| 55 |
-
# ----------
|
| 56 |
MEDICAL_ABBREVIATIONS = {
|
| 57 |
"pt": "patient",
|
| 58 |
"dx": "diagnosis",
|
|
@@ -65,7 +84,6 @@ MEDICAL_ABBREVIATIONS = {
|
|
| 65 |
"r/o": "rule out",
|
| 66 |
"adm": "admit",
|
| 67 |
"disch": "discharge",
|
| 68 |
-
# extend as needed
|
| 69 |
}
|
| 70 |
|
| 71 |
DRUG_NORMALIZATION = {
|
|
@@ -74,7 +92,6 @@ DRUG_NORMALIZATION = {
|
|
| 74 |
"amoxicillin": "Amoxicillin",
|
| 75 |
}
|
| 76 |
|
| 77 |
-
|
| 78 |
def expand_abbreviations(text):
|
| 79 |
tokens = re.split(r'(\s+)', text)
|
| 80 |
out = []
|
|
@@ -90,13 +107,11 @@ def expand_abbreviations(text):
|
|
| 90 |
out.append(t)
|
| 91 |
return ''.join(out)
|
| 92 |
|
| 93 |
-
|
| 94 |
def normalize_drugs(text):
|
| 95 |
for k, v in DRUG_NORMALIZATION.items():
|
| 96 |
text = re.sub(rf'\b{k}\b', v, text, flags=re.IGNORECASE)
|
| 97 |
return text
|
| 98 |
|
| 99 |
-
|
| 100 |
def punctuation_and_capitalization(text):
|
| 101 |
text = text.strip()
|
| 102 |
if not text:
|
|
@@ -112,7 +127,6 @@ def punctuation_and_capitalization(text):
|
|
| 112 |
out.append(p)
|
| 113 |
return ''.join(out)
|
| 114 |
|
| 115 |
-
|
| 116 |
def postprocess_transcript(text, format_soap=False):
|
| 117 |
if not text:
|
| 118 |
return text
|
|
@@ -133,170 +147,6 @@ def postprocess_transcript(text, format_soap=False):
|
|
| 133 |
return soap
|
| 134 |
return t
|
| 135 |
|
| 136 |
-
# ---------- Memory utilities
|
| 137 |
def extract_words_and_phrases(text):
|
| 138 |
-
|
| 139 |
-
words = re.findall(r"[A-Za-z0-9\-']+", text)
|
| 140 |
-
sentences = [s.strip() for s in re.split(r'(?<=[.?!])\s+', text) if s.strip()]
|
| 141 |
-
return [w for w in words if w.strip()], sentences
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
def update_memory_with_transcript(transcript):
|
| 145 |
-
global memory
|
| 146 |
-
words, sentences = extract_words_and_phrases(transcript)
|
| 147 |
-
changed = False
|
| 148 |
-
with MEMORY_LOCK:
|
| 149 |
-
for w in words:
|
| 150 |
-
lw = w.lower()
|
| 151 |
-
if lw in memory["words"]:
|
| 152 |
-
memory["words"][lw] += 1
|
| 153 |
-
else:
|
| 154 |
-
memory["words"][lw] = 1
|
| 155 |
-
changed = True
|
| 156 |
-
for s in sentences:
|
| 157 |
-
key = s.strip()
|
| 158 |
-
if key in memory["phrases"]:
|
| 159 |
-
memory["phrases"][key] += 1
|
| 160 |
-
else:
|
| 161 |
-
memory["phrases"][key] = 1
|
| 162 |
-
changed = True
|
| 163 |
-
if changed:
|
| 164 |
-
try:
|
| 165 |
-
with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
|
| 166 |
-
json.dump(memory, fh, ensure_ascii=False, indent=2)
|
| 167 |
-
except Exception:
|
| 168 |
-
pass
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
def memory_correct_text(text, min_ratio=0.85):
|
| 172 |
-
"""
|
| 173 |
-
Correct words/phrases in text using memory.
|
| 174 |
-
- Word-level: uses difflib.get_close_matches against known memory words.
|
| 175 |
-
- Phrase-level: tries to match stored phrases (exact or close substring).
|
| 176 |
-
"""
|
| 177 |
-
if not text or (not memory.get("words") and not memory.get("phrases")):
|
| 178 |
-
return text
|
| 179 |
-
|
| 180 |
-
# word-level corrections
|
| 181 |
-
def fix_word(w):
|
| 182 |
-
lw = w.lower()
|
| 183 |
-
if lw in memory["words"]:
|
| 184 |
-
return w # known exact
|
| 185 |
-
# find close matches from memory words (keys)
|
| 186 |
-
candidates = get_close_matches(lw, memory["words"].keys(), n=1, cutoff=min_ratio)
|
| 187 |
-
if candidates:
|
| 188 |
-
# preserve casing: if candidate is lower, capitalize if original was capitalized
|
| 189 |
-
cand = candidates[0]
|
| 190 |
-
if w and w[0].isupper():
|
| 191 |
-
return cand.capitalize()
|
| 192 |
-
return cand
|
| 193 |
-
return w
|
| 194 |
-
|
| 195 |
-
tokens = re.split(r'(\W+)', text) # keep punctuation
|
| 196 |
-
corrected_tokens = []
|
| 197 |
-
for tok in tokens:
|
| 198 |
-
if re.match(r"^[A-Za-z0-9\-']+$", tok):
|
| 199 |
-
corrected_tokens.append(fix_word(tok))
|
| 200 |
-
else:
|
| 201 |
-
corrected_tokens.append(tok)
|
| 202 |
-
corrected = ''.join(corrected_tokens)
|
| 203 |
-
|
| 204 |
-
# phrase-level: try to replace short substrings that closely match known phrases
|
| 205 |
-
# naive approach: for each stored phrase, if it is short and a fuzzy substring of corrected, replace
|
| 206 |
-
for phrase in sorted(memory.get("phrases", {}).keys(), key=lambda s: -len(s)):
|
| 207 |
-
low_phrase = phrase.lower()
|
| 208 |
-
# only replace if phrase length >= 8 chars to avoid noisy matches
|
| 209 |
-
if len(low_phrase) < 8:
|
| 210 |
-
continue
|
| 211 |
-
if low_phrase in corrected.lower():
|
| 212 |
-
# find exact location, replace preserving case roughly
|
| 213 |
-
corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
|
| 214 |
-
return corrected
|
| 215 |
-
|
| 216 |
-
# ---------- File utilities ----------
|
| 217 |
-
def save_as_word(text, filename=None):
|
| 218 |
-
if filename is None:
|
| 219 |
-
filename = os.path.join(tempfile.gettempdir(), "merged_transcripts.docx")
|
| 220 |
-
doc = Document()
|
| 221 |
-
doc.add_paragraph(text)
|
| 222 |
-
doc.save(filename)
|
| 223 |
-
return filename
|
| 224 |
-
|
| 225 |
-
# ---------- Advanced conversion: pydub auto + ffmpeg heuristics ----------
|
| 226 |
-
def convert_to_wav_if_needed(input_path):
|
| 227 |
-
"""
|
| 228 |
-
Advanced conversion:
|
| 229 |
-
- pydub (AudioSegment.from_file) first
|
| 230 |
-
- if that fails, exhaustive ffmpeg format/rate/channel grid
|
| 231 |
-
- writes diagnostics to a temp folder if conversion fails entirely
|
| 232 |
-
"""
|
| 233 |
-
input_path = str(input_path)
|
| 234 |
-
lower = input_path.lower()
|
| 235 |
-
if lower.endswith(".wav"):
|
| 236 |
-
return input_path
|
| 237 |
-
|
| 238 |
-
# try pydub first
|
| 239 |
-
auto_err = ""
|
| 240 |
-
tmp = None
|
| 241 |
-
try:
|
| 242 |
-
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
| 243 |
-
tmp.close()
|
| 244 |
-
AudioSegment.from_file(input_path).export(tmp.name, format="wav")
|
| 245 |
-
return tmp.name
|
| 246 |
-
except Exception as e:
|
| 247 |
-
auto_err = traceback.format_exc()
|
| 248 |
-
try:
|
| 249 |
-
if tmp:
|
| 250 |
-
os.unlink(tmp.name)
|
| 251 |
-
except Exception:
|
| 252 |
-
pass
|
| 253 |
-
|
| 254 |
-
# fallback grid
|
| 255 |
-
pcm_formats = ['s16le', 's32le', 's24le', 's8', 'u8', 's16be', 'pcm_s16le', 'pcm_u8', 'pcm_u16le']
|
| 256 |
-
mulaw_alaw = ['mulaw', 'alaw']
|
| 257 |
-
adpcm = ['adpcm_ima_wav', 'adpcm_ms']
|
| 258 |
-
extra = ['gsm', 'g726', 'vorbis']
|
| 259 |
-
formats = pcm_formats + mulaw_alaw + adpcm + extra
|
| 260 |
-
sample_rates = [8000, 11025, 12000, 16000, 22050, 32000, 44100, 48000]
|
| 261 |
-
channels = [1, 2]
|
| 262 |
-
|
| 263 |
-
diagnostics = []
|
| 264 |
-
diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
|
| 265 |
-
diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
|
| 266 |
-
|
| 267 |
-
for fmt in formats:
|
| 268 |
-
for sr in sample_rates:
|
| 269 |
-
for ch in channels:
|
| 270 |
-
out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
| 271 |
-
out_wav.close()
|
| 272 |
-
cmd = [
|
| 273 |
-
"ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
|
| 274 |
-
"-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path, out_wav.name
|
| 275 |
-
]
|
| 276 |
-
try:
|
| 277 |
-
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=45)
|
| 278 |
-
except Exception as e_run:
|
| 279 |
-
diagnostics.append(f"RUN-EXC fmt={fmt} sr={sr} ch={ch} err={e_run}")
|
| 280 |
-
try: os.unlink(out_wav.name)
|
| 281 |
-
except Exception: pass
|
| 282 |
-
continue
|
| 283 |
-
|
| 284 |
-
rc = proc.returncode
|
| 285 |
-
stderr = proc.stderr.strip() if proc.stderr else ""
|
| 286 |
-
stdout = proc.stdout.strip() if proc.stdout else ""
|
| 287 |
-
diagnostics.append(f"ATTEMPT fmt={fmt} sr={sr} ch={ch} rc={rc}")
|
| 288 |
-
if stdout:
|
| 289 |
-
diagnostics.append("STDOUT:")
|
| 290 |
-
diagnostics.append(stdout)
|
| 291 |
-
if stderr:
|
| 292 |
-
diagnostics.append("STDERR:")
|
| 293 |
-
diagnostics.append(stderr)
|
| 294 |
-
diagnostics.append("-" * 60)
|
| 295 |
-
|
| 296 |
-
try:
|
| 297 |
-
if rc == 0 and os.path.exists(out_wav.name) and os.path.getsize(out_wav.name) > MIN_WAV_SIZE:
|
| 298 |
-
# success
|
| 299 |
-
try:
|
| 300 |
-
with open(diag_log, "w", encoding="utf-8") as fh:
|
| 301 |
-
fh.write("pydub auto error:\n")
|
| 302 |
-
fh.write(auto_err + "\n\n")
|
|
|
|
| 1 |
# app.py
|
| 2 |
+
# Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
|
| 3 |
+
# Cleaned, debugged, and Spaces-ready.
|
| 4 |
+
# Replace /app/app.py with this file and restart container.
|
| 5 |
|
| 6 |
import os
|
| 7 |
+
import sys
|
| 8 |
import json
|
| 9 |
import shutil
|
| 10 |
import tempfile
|
|
|
|
| 13 |
import threading
|
| 14 |
import re
|
| 15 |
from difflib import get_close_matches
|
|
|
|
| 16 |
|
| 17 |
+
# Force unbuffered output so container logs show prints immediately
|
| 18 |
+
os.environ["PYTHONUNBUFFERED"] = "1"
|
| 19 |
+
|
| 20 |
+
print("DEBUG: app.py bootstrap starting", flush=True)
|
| 21 |
+
|
| 22 |
+
# Third-party imports (must be installed in the environment)
|
| 23 |
+
try:
|
| 24 |
+
from docx import Document
|
| 25 |
+
import whisper
|
| 26 |
+
import gradio as gr
|
| 27 |
+
import pyzipper
|
| 28 |
+
from pydub import AudioSegment
|
| 29 |
+
except Exception as e:
|
| 30 |
+
print("FATAL: import error for third-party libs:", e, flush=True)
|
| 31 |
+
traceback.print_exc()
|
| 32 |
+
raise
|
| 33 |
+
|
| 34 |
+
print("DEBUG: imports OK", flush=True)
|
| 35 |
|
| 36 |
# ---------- Config ----------
|
| 37 |
MEMORY_FILE = "memory.json" # persistent memory in repo (will be written)
|
| 38 |
MEMORY_LOCK = threading.Lock()
|
| 39 |
+
MIN_WAV_SIZE = 200 # bytes
|
| 40 |
+
# Fallback ffmpeg conversion candidates (short hybrid list)
|
| 41 |
+
FFMPEG_CANDIDATES = [
|
| 42 |
+
("s16le", 16000, 1),
|
| 43 |
+
("s16le", 44100, 2),
|
| 44 |
+
("pcm_s16le", 16000, 1),
|
| 45 |
+
("pcm_s16le", 44100, 2),
|
| 46 |
+
("mulaw", 8000, 1),
|
| 47 |
+
]
|
| 48 |
# ----------------------------
|
| 49 |
|
| 50 |
+
# ---------- Memory helpers ----------
|
|
|
|
|
|
|
| 51 |
def load_memory():
|
| 52 |
try:
|
| 53 |
if os.path.exists(MEMORY_FILE):
|
|
|
|
| 55 |
return json.load(fh)
|
| 56 |
except Exception:
|
| 57 |
pass
|
|
|
|
| 58 |
mem = {"words": {}, "phrases": {}}
|
| 59 |
try:
|
| 60 |
with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
|
|
|
|
| 69 |
json.dump(mem, fh, ensure_ascii=False, indent=2)
|
| 70 |
|
| 71 |
memory = load_memory()
|
| 72 |
+
print("DEBUG: memory loaded (words=%d phrases=%d)" % (len(memory.get("words", {})), len(memory.get("phrases", {}))), flush=True)
|
| 73 |
|
| 74 |
+
# ---------- Postprocessing ----------
|
| 75 |
MEDICAL_ABBREVIATIONS = {
|
| 76 |
"pt": "patient",
|
| 77 |
"dx": "diagnosis",
|
|
|
|
| 84 |
"r/o": "rule out",
|
| 85 |
"adm": "admit",
|
| 86 |
"disch": "discharge",
|
|
|
|
| 87 |
}
|
| 88 |
|
| 89 |
DRUG_NORMALIZATION = {
|
|
|
|
| 92 |
"amoxicillin": "Amoxicillin",
|
| 93 |
}
|
| 94 |
|
|
|
|
| 95 |
def expand_abbreviations(text):
|
| 96 |
tokens = re.split(r'(\s+)', text)
|
| 97 |
out = []
|
|
|
|
| 107 |
out.append(t)
|
| 108 |
return ''.join(out)
|
| 109 |
|
|
|
|
| 110 |
def normalize_drugs(text):
|
| 111 |
for k, v in DRUG_NORMALIZATION.items():
|
| 112 |
text = re.sub(rf'\b{k}\b', v, text, flags=re.IGNORECASE)
|
| 113 |
return text
|
| 114 |
|
|
|
|
| 115 |
def punctuation_and_capitalization(text):
|
| 116 |
text = text.strip()
|
| 117 |
if not text:
|
|
|
|
| 127 |
out.append(p)
|
| 128 |
return ''.join(out)
|
| 129 |
|
|
|
|
| 130 |
def postprocess_transcript(text, format_soap=False):
|
| 131 |
if not text:
|
| 132 |
return text
|
|
|
|
| 147 |
return soap
|
| 148 |
return t
|
| 149 |
|
| 150 |
+
# ---------- Memory utilities ----------
|
| 151 |
def extract_words_and_phrases(text):
|
| 152 |
+
words = re.findall(r"[A-Za-z0-]()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|