Spaces:

staraks
/

arvind

Sleeping

App Files Files Community

staraks committed on Nov 14, 2025

Commit

ef63fe4

verified ·

1 Parent(s): 15a4432

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -186

app.py CHANGED Viewed

@@ -1,8 +1,10 @@
 # app.py
-# Full Whisper transcription app for Hugging Face Spaces
-# Cleaned & hardened for debugging in container logs.
 import os
 import json
 import shutil
 import tempfile
@@ -11,24 +13,41 @@ import traceback
 import threading
 import re
 from difflib import get_close_matches
-from pathlib import Path
-from docx import Document
-import whisper
-import gradio as gr
-import pyzipper
-from pydub import AudioSegment
 # ---------- Config ----------
 MEMORY_FILE = "memory.json"   # persistent memory in repo (will be written)
 MEMORY_LOCK = threading.Lock()
-DIAGNOSTICS_DIR_BASE = tempfile.gettempdir()
-MIN_WAV_SIZE = 200
 # ----------------------------
-print("app.py: starting up")  # helpful in container logs
-# ensure memory file exists
 def load_memory():
     try:
         if os.path.exists(MEMORY_FILE):
@@ -36,7 +55,6 @@ def load_memory():
                 return json.load(fh)
     except Exception:
         pass
-    # default structure
     mem = {"words": {}, "phrases": {}}
     try:
         with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
@@ -51,8 +69,9 @@ def save_memory(mem):
             json.dump(mem, fh, ensure_ascii=False, indent=2)
 memory = load_memory()
-# ---------- Simple medical post-processing ----------
 MEDICAL_ABBREVIATIONS = {
     "pt": "patient",
     "dx": "diagnosis",
@@ -65,7 +84,6 @@ MEDICAL_ABBREVIATIONS = {
     "r/o": "rule out",
     "adm": "admit",
     "disch": "discharge",
-    # extend as needed
 }
 DRUG_NORMALIZATION = {
@@ -74,7 +92,6 @@ DRUG_NORMALIZATION = {
     "amoxicillin": "Amoxicillin",
 }
 def expand_abbreviations(text):
     tokens = re.split(r'(\s+)', text)
     out = []
@@ -90,13 +107,11 @@ def expand_abbreviations(text):
             out.append(t)
     return ''.join(out)
 def normalize_drugs(text):
     """Rewrite known drug names in ``text`` to their canonical spelling.

     Every key of ``DRUG_NORMALIZATION`` is searched for case-insensitively
     between word boundaries and replaced with the mapped value.

     Args:
         text: Transcript text to normalize.

     Returns:
         The text with each matched drug name replaced.
     """
     for k, v in DRUG_NORMALIZATION.items():
         # Escape the key so characters such as "." or "+" in a drug name are
         # matched literally instead of being read as regex syntax.
         pattern = rf'\b{re.escape(k)}\b'
         # A callable replacement inserts v verbatim; a plain string would have
         # backslashes and group references expanded by re.sub.
         text = re.sub(pattern, lambda _m, v=v: v, text, flags=re.IGNORECASE)
     return text
 def punctuation_and_capitalization(text):
     text = text.strip()
     if not text:
@@ -112,7 +127,6 @@ def punctuation_and_capitalization(text):
             out.append(p)
     return ''.join(out)
 def postprocess_transcript(text, format_soap=False):
     if not text:
         return text
@@ -133,170 +147,6 @@ def postprocess_transcript(text, format_soap=False):
         return soap
     return t
-# ---------- Memory utilities (word + phrase) ----------
 def extract_words_and_phrases(text):
-    # basic tokenization for words; phrases = sentences
-    words = re.findall(r"[A-Za-z0-9\-']+", text)
-    sentences = [s.strip() for s in re.split(r'(?<=[.?!])\s+', text) if s.strip()]
-    return [w for w in words if w.strip()], sentences
-def update_memory_with_transcript(transcript):
-    global memory
-    words, sentences = extract_words_and_phrases(transcript)
-    changed = False
-    with MEMORY_LOCK:
-        for w in words:
-            lw = w.lower()
-            if lw in memory["words"]:
-                memory["words"][lw] += 1
-            else:
-                memory["words"][lw] = 1
-                changed = True
-        for s in sentences:
-            key = s.strip()
-            if key in memory["phrases"]:
-                memory["phrases"][key] += 1
-            else:
-                memory["phrases"][key] = 1
-                changed = True
-        if changed:
-            try:
-                with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
-                    json.dump(memory, fh, ensure_ascii=False, indent=2)
-            except Exception:
-                pass
-def memory_correct_text(text, min_ratio=0.85):
-    """
-    Correct words/phrases in text using memory.
-    - Word-level: uses difflib.get_close_matches against known memory words.
-    - Phrase-level: tries to match stored phrases (exact or close substring).
-    """
-    if not text or (not memory.get("words") and not memory.get("phrases")):
-        return text
-    # word-level corrections
-    def fix_word(w):
-        lw = w.lower()
-        if lw in memory["words"]:
-            return w  # known exact
-        # find close matches from memory words (keys)
-        candidates = get_close_matches(lw, memory["words"].keys(), n=1, cutoff=min_ratio)
-        if candidates:
-            # preserve casing: if candidate is lower, capitalize if original was capitalized
-            cand = candidates[0]
-            if w and w[0].isupper():
-                return cand.capitalize()
-            return cand
-        return w
-    tokens = re.split(r'(\W+)', text)  # keep punctuation
-    corrected_tokens = []
-    for tok in tokens:
-        if re.match(r"^[A-Za-z0-9\-']+$", tok):
-            corrected_tokens.append(fix_word(tok))
-        else:
-            corrected_tokens.append(tok)
-    corrected = ''.join(corrected_tokens)
-    # phrase-level: try to replace short substrings that closely match known phrases
-    # naive approach: for each stored phrase, if it is short and a fuzzy substring of corrected, replace
-    for phrase in sorted(memory.get("phrases", {}).keys(), key=lambda s: -len(s)):
-        low_phrase = phrase.lower()
-        # only replace if phrase length >= 8 chars to avoid noisy matches
-        if len(low_phrase) < 8:
-            continue
-        if low_phrase in corrected.lower():
-            # find exact location, replace preserving case roughly
-            corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
-    return corrected
-# ---------- File utilities ----------
-def save_as_word(text, filename=None):
-    if filename is None:
-        filename = os.path.join(tempfile.gettempdir(), "merged_transcripts.docx")
-    doc = Document()
-    doc.add_paragraph(text)
-    doc.save(filename)
-    return filename
-# ---------- Advanced conversion: pydub auto + ffmpeg heuristics ----------
-def convert_to_wav_if_needed(input_path):
-    """
-    Advanced conversion:
-    - pydub (AudioSegment.from_file) first
-    - if that fails, exhaustive ffmpeg format/rate/channel grid
-    - writes diagnostics to a temp folder if conversion fails entirely
-    """
-    input_path = str(input_path)
-    lower = input_path.lower()
-    if lower.endswith(".wav"):
-        return input_path
-    # try pydub first
-    auto_err = ""
-    tmp = None
-    try:
-        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-        tmp.close()
-        AudioSegment.from_file(input_path).export(tmp.name, format="wav")
-        return tmp.name
-    except Exception as e:
-        auto_err = traceback.format_exc()
-        try:
-            if tmp:
-                os.unlink(tmp.name)
-        except Exception:
-            pass
-    # fallback grid
-    pcm_formats = ['s16le', 's32le', 's24le', 's8', 'u8', 's16be', 'pcm_s16le', 'pcm_u8', 'pcm_u16le']
-    mulaw_alaw = ['mulaw', 'alaw']
-    adpcm = ['adpcm_ima_wav', 'adpcm_ms']
-    extra = ['gsm', 'g726', 'vorbis']
-    formats = pcm_formats + mulaw_alaw + adpcm + extra
-    sample_rates = [8000, 11025, 12000, 16000, 22050, 32000, 44100, 48000]
-    channels = [1, 2]
-    diagnostics = []
-    diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
-    diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
-    for fmt in formats:
-        for sr in sample_rates:
-            for ch in channels:
-                out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-                out_wav.close()
-                cmd = [
-                    "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
-                    "-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path, out_wav.name
-                ]
-                try:
-                    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=45)
-                except Exception as e_run:
-                    diagnostics.append(f"RUN-EXC fmt={fmt} sr={sr} ch={ch} err={e_run}")
-                    try: os.unlink(out_wav.name)
-                    except Exception: pass
-                    continue
-                rc = proc.returncode
-                stderr = proc.stderr.strip() if proc.stderr else ""
-                stdout = proc.stdout.strip() if proc.stdout else ""
-                diagnostics.append(f"ATTEMPT fmt={fmt} sr={sr} ch={ch} rc={rc}")
-                if stdout:
-                    diagnostics.append("STDOUT:")
-                    diagnostics.append(stdout)
-                if stderr:
-                    diagnostics.append("STDERR:")
-                    diagnostics.append(stderr)
-                diagnostics.append("-" * 60)
-                try:
-                    if rc == 0 and os.path.exists(out_wav.name) and os.path.getsize(out_wav.name) > MIN_WAV_SIZE:
-                        # success
-                        try:
-                            with open(diag_log, "w", encoding="utf-8") as fh:
-                                fh.write("pydub auto error:\n")
-                                fh.write(auto_err + "\n\n")

 # app.py
+# Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
+# Cleaned, debugged, and Spaces-ready.
+# Replace /app/app.py with this file and restart container.
 import os
+import sys
 import json
 import shutil
 import tempfile
 import threading
 import re
 from difflib import get_close_matches
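+# Request unbuffered output for any child Python processes. This interpreter has
+# already started, so the prints below rely on flush=True to reach container logs immediately.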
+os.environ["PYTHONUNBUFFERED"] = "1"
+print("DEBUG: app.py bootstrap starting", flush=True)
+# Third-party imports (must be installed in the environment)
+try:
+    from docx import Document
+    import whisper
+    import gradio as gr
+    import pyzipper
+    from pydub import AudioSegment
+except Exception as e:
+    print("FATAL: import error for third-party libs:", e, flush=True)
+    traceback.print_exc()
+    raise
+print("DEBUG: imports OK", flush=True)
 # ---------- Config ----------
 MEMORY_FILE = "memory.json"   # persistent memory in repo (will be written)
 MEMORY_LOCK = threading.Lock()
+MIN_WAV_SIZE = 200            # bytes
+# Fallback ffmpeg conversion candidates (short hybrid list)
+FFMPEG_CANDIDATES = [
+    ("s16le", 16000, 1),
+    ("s16le", 44100, 2),
+    ("pcm_s16le", 16000, 1),
+    ("pcm_s16le", 44100, 2),
+    ("mulaw", 8000, 1),
+]
 # ----------------------------
+# ---------- Memory helpers ----------
 def load_memory():
     try:
         if os.path.exists(MEMORY_FILE):
                 return json.load(fh)
     except Exception:
         pass
     mem = {"words": {}, "phrases": {}}
     try:
         with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
             json.dump(mem, fh, ensure_ascii=False, indent=2)
 memory = load_memory()
+print("DEBUG: memory loaded (words=%d phrases=%d)" % (len(memory.get("words", {})), len(memory.get("phrases", {}))), flush=True)
+# ---------- Postprocessing ----------
 MEDICAL_ABBREVIATIONS = {
     "pt": "patient",
     "dx": "diagnosis",
     "r/o": "rule out",
     "adm": "admit",
     "disch": "discharge",
 }
 DRUG_NORMALIZATION = {
     "amoxicillin": "Amoxicillin",
 }
 def expand_abbreviations(text):
     tokens = re.split(r'(\s+)', text)
     out = []
             out.append(t)
     return ''.join(out)
 def normalize_drugs(text):
     """Rewrite known drug names in ``text`` to their canonical spelling.

     Every key of ``DRUG_NORMALIZATION`` is searched for case-insensitively
     between word boundaries and replaced with the mapped value.

     Args:
         text: Transcript text to normalize.

     Returns:
         The text with each matched drug name replaced.
     """
     for k, v in DRUG_NORMALIZATION.items():
         # Escape the key so characters such as "." or "+" in a drug name are
         # matched literally instead of being read as regex syntax.
         pattern = rf'\b{re.escape(k)}\b'
         # A callable replacement inserts v verbatim; a plain string would have
         # backslashes and group references expanded by re.sub.
         text = re.sub(pattern, lambda _m, v=v: v, text, flags=re.IGNORECASE)
     return text
 def punctuation_and_capitalization(text):
     text = text.strip()
     if not text:
             out.append(p)
     return ''.join(out)
 def postprocess_transcript(text, format_soap=False):
     if not text:
         return text
         return soap
     return t
+# ---------- Memory utilities ----------
 def extract_words_and_phrases(text):
+    words = re.findall(r"[A-Za-z0-]()