staraks committed on
Commit
a086336
·
verified ·
1 Parent(s): f461a9b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -397
app.py CHANGED
@@ -1,415 +1,163 @@
1
- # app.py
2
- # Full Whisper transcription app for Hugging Face Spaces
3
- # - Advanced .dct conversion (ffmpeg heuristics + pydub)
4
- # - Zip extraction (pyzipper)
5
- # - Whisper transcription (cached)
6
- # - Live progress & logs to Gradio (generator)
7
- # - Persistent memory (word + phrase) with fuzzy correction
8
- # - Simple medical post-processing (abbrev expansion)
9
- # - Merge transcripts to .docx
10
- # - Binds to 0.0.0.0:$PORT and uses demo.queue().launch()
11
-
12
- import os
13
- import json
14
- import shutil
15
- import tempfile
16
- import subprocess
17
- import traceback
18
- import threading
19
- import re
20
- from difflib import get_close_matches
21
- from pathlib import Path
22
-
23
- from docx import Document
24
- import whisper
25
- import gradio as gr
26
- import pyzipper
27
- from pydub import AudioSegment
28
-
29
- # ---------- Config ----------
30
- MEMORY_FILE = "memory.json" # persistent memory in repo (will be written)
31
- MEMORY_LOCK = threading.Lock()
32
- DIAGNOSTICS_DIR_BASE = tempfile.gettempdir()
33
- MIN_WAV_SIZE = 200
34
- # ----------------------------
35
-
36
- # ensure memory file exists
37
- def load_memory():
38
  try:
39
- if os.path.exists(MEMORY_FILE):
40
- with open(MEMORY_FILE, "r", encoding="utf-8") as fh:
41
- return json.load(fh)
42
- except Exception:
43
- pass
44
- # default structure
45
- mem = {"words": {}, "phrases": {}}
46
- try:
47
- with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
48
- json.dump(mem, fh, ensure_ascii=False, indent=2)
49
- except Exception:
50
- pass
51
- return mem
52
-
53
def save_memory(mem):
    """Persist *mem* to MEMORY_FILE as pretty-printed JSON."""
    # Hold the lock for the whole write so concurrent updates don't interleave.
    with MEMORY_LOCK, open(MEMORY_FILE, "w", encoding="utf-8") as fh:
        json.dump(mem, fh, ensure_ascii=False, indent=2)
57
-
58
- memory = load_memory()
59
-
60
# ---------- Simple medical post-processing ----------
# Common clinical shorthand expanded to full words during post-processing.
MEDICAL_ABBREVIATIONS = {
    "pt": "patient",
    "dx": "diagnosis",
    "hx": "history",
    "sx": "symptoms",
    "c/o": "complains of",
    "bp": "blood pressure",
    "hr": "heart rate",
    "o2": "oxygen",
    "r/o": "rule out",
    "adm": "admit",
    "disch": "discharge",
    # extend as needed
}

# Lower-case drug names mapped to their conventional capitalization.
DRUG_NORMALIZATION = {
    "metformin": "Metformin",
    "aspirin": "Aspirin",
    "amoxicillin": "Amoxicillin",
}

def expand_abbreviations(text):
    """Replace known medical abbreviations with their full expansions.

    Whitespace between tokens is preserved exactly, and trailing punctuation
    on an abbreviated token (e.g. "pt.") is kept after the expansion.
    """
    pieces = re.split(r'(\s+)', text)  # keep whitespace runs as list items
    expanded = []
    for piece in pieces:
        key = piece.lower().strip(".,;:")
        if key not in MEDICAL_ABBREVIATIONS:
            expanded.append(piece)
            continue
        # Carry over any trailing punctuation from the original token.
        tail = ''
        match = re.match(r'([A-Za-z0-9/]+)([.,;:]*)', piece)
        if match:
            tail = match.group(2) or ''
        expanded.append(MEDICAL_ABBREVIATIONS[key] + tail)
    return ''.join(expanded)

def normalize_drugs(text):
    """Normalize capitalization of known drug names (case-insensitive match)."""
    for name, canonical in DRUG_NORMALIZATION.items():
        text = re.sub(rf'\b{name}\b', canonical, text, flags=re.IGNORECASE)
    return text
101
-
102
def punctuation_and_capitalization(text):
    """Ensure *text* ends with terminal punctuation and each sentence starts
    with an uppercase letter.

    Only the first character of each sentence is changed: the original used
    str.capitalize(), which lowercases the REST of the sentence and thereby
    undid normalize_drugs() ("Metformin" -> "metformin") and mangled acronyms.
    """
    text = text.strip()
    if not text:
        return text
    # Guarantee a terminal '.', '?' or '!'.
    if not re.search(r'[.?!]\s*$', text):
        text = text.rstrip() + '.'
    # The capturing group keeps sentence separators as their own list items.
    parts = re.split(r'([.?!]\s+)', text)
    out = []
    for p in parts:
        if p and not re.match(r'[.?!]\s+', p):
            # Uppercase only the first character; leave the rest untouched.
            out.append(p[0].upper() + p[1:])
        else:
            out.append(p)
    return ''.join(out)
116
-
117
def postprocess_transcript(text, format_soap=False):
    """Clean up a raw transcript: collapse whitespace, expand medical
    abbreviations, normalize drug names, and fix punctuation/capitalization.

    When *format_soap* is True, the cleaned text is reshaped into a rough
    SOAP note (Subjective/Objective/Assessment/Plan) instead of plain prose.
    """
    if not text:
        return text
    cleaned = re.sub(r'\s+', ' ', text).strip()
    cleaned = expand_abbreviations(cleaned)
    cleaned = normalize_drugs(cleaned)
    cleaned = punctuation_and_capitalization(cleaned)
    if not format_soap:
        return cleaned
    # Crude SOAP layout: first sentence -> S, second -> O; A is filled in
    # only when a diagnostic keyword appears anywhere in the text.
    sentences = re.split(r'(?<=[.?!])\s+', cleaned)
    subjective = sentences[0] if sentences else ""
    objective = sentences[1] if len(sentences) > 1 else ""
    assessment = ""
    lowered = cleaned.lower()
    if any(kw in lowered for kw in ("diagnosis", "dx", "rule out", "r/o", "probable")):
        assessment = "Assessment: " + subjective
    return f"S: {subjective}\nO: {objective}\nA: {assessment}\nP: Plan: follow up as indicated."
136
-
137
# ---------- Memory utilities (word + phrase) ----------
def extract_words_and_phrases(text):
    """Return (words, sentences) extracted from *text*.

    Words are alphanumeric runs (hyphens/apostrophes allowed); sentences come
    from splitting on terminal punctuation followed by whitespace.
    """
    tokens = re.findall(r"[A-Za-z0-9\-']+", text)
    phrases = [part.strip() for part in re.split(r'(?<=[.?!])\s+', text) if part.strip()]
    return [tok for tok in tokens if tok.strip()], phrases
143
-
144
def update_memory_with_transcript(transcript):
    """Fold a transcript's words and sentences into persistent memory.

    Increments the per-word and per-phrase counters and persists the result.
    Bug fixed: previously `changed` was set only when a NEW key was added, so
    increments to already-known words/phrases were never written to disk; any
    extracted content now triggers a save.
    """
    global memory
    words, sentences = extract_words_and_phrases(transcript)
    changed = False
    with MEMORY_LOCK:
        for w in words:
            lw = w.lower()
            memory["words"][lw] = memory["words"].get(lw, 0) + 1
            changed = True
        for s in sentences:
            key = s.strip()
            memory["phrases"][key] = memory["phrases"].get(key, 0) + 1
            changed = True
        if changed:
            # Write directly instead of calling save_memory(): MEMORY_LOCK is
            # a plain (non-reentrant) threading.Lock and is already held here.
            try:
                with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
                    json.dump(memory, fh, ensure_ascii=False, indent=2)
            except Exception:
                # Best effort: failure just means memory won't persist.
                pass
169
-
170
def memory_correct_text(text, min_ratio=0.85):
    """
    Correct words/phrases in text using memory.

    - Word-level: fuzzy-matches each word against known memory words via
      difflib.get_close_matches (cutoff = min_ratio).
    - Phrase-level: re-applies the stored casing of known phrases that occur
      (case-insensitively) in the corrected text.
    """
    if not text or (not memory.get("words") and not memory.get("phrases")):
        return text

    known_words = memory["words"]

    def fix_word(word):
        lowered = word.lower()
        if lowered in known_words:
            return word  # exact hit, keep as-is
        match = get_close_matches(lowered, known_words.keys(), n=1, cutoff=min_ratio)
        if not match:
            return word
        candidate = match[0]
        # Roughly preserve the original casing of the input word.
        return candidate.capitalize() if word[0].isupper() else candidate

    # Split keeps punctuation/whitespace pieces so they survive the rebuild.
    word_shape = re.compile(r"^[A-Za-z0-9\-']+$")
    corrected = ''.join(
        fix_word(tok) if word_shape.match(tok) else tok
        for tok in re.split(r'(\W+)', text)
    )

    # Phrase-level pass: longest stored phrases first; skip short phrases
    # (< 8 chars) to avoid noisy matches.
    for phrase in sorted(memory.get("phrases", {}), key=len, reverse=True):
        if len(phrase) < 8:
            continue
        if phrase.lower() in corrected.lower():
            # Re-impose the stored phrase's casing at the matched location.
            corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
    return corrected
214
-
215
# ---------- File utilities ----------
def save_as_word(text, filename=None):
    """Write *text* into a single-paragraph .docx file and return its path.

    When *filename* is omitted, a default path inside the system temp
    directory is used.
    """
    target = filename
    if target is None:
        target = os.path.join(tempfile.gettempdir(), "merged_transcripts.docx")
    document = Document()
    document.add_paragraph(text)
    document.save(target)
    return target
223
-
224
- # ---------- Advanced conversion: pydub auto + ffmpeg heuristics ----------
225
- def convert_to_wav_if_needed(input_path):
226
- """
227
- Advanced conversion:
228
- - pydub (AudioSegment.from_file) first
229
- - if that fails, exhaustive ffmpeg format/rate/channel grid
230
- - writes diagnostics to a temp folder if conversion fails entirely
231
- """
232
- input_path = str(input_path)
233
- lower = input_path.lower()
234
- if lower.endswith(".wav"):
235
- return input_path
236
 
237
- # try pydub first
238
- auto_err = ""
239
- try:
240
- tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
241
- tmp.close()
242
- AudioSegment.from_file(input_path).export(tmp.name, format="wav")
243
- return tmp.name
244
- except Exception as e:
245
- auto_err = traceback.format_exc()
246
  try:
247
- os.unlink(tmp.name)
248
- except Exception:
249
- pass
250
-
251
- # fallback grid
252
- pcm_formats = ['s16le', 's32le', 's24le', 's8', 'u8', 's16be', 'pcm_s16le', 'pcm_u8', 'pcm_u16le']
253
- mulaw_alaw = ['mulaw', 'alaw']
254
- adpcm = ['adpcm_ima_wav', 'adpcm_ms']
255
- extra = ['gsm', 'g726', 'vorbis']
256
- formats = pcm_formats + mulaw_alaw + adpcm + extra
257
- sample_rates = [8000, 11025, 12000, 16000, 22050, 32000, 44100, 48000]
258
- channels = [1, 2]
259
-
260
- diagnostics = []
261
- diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
262
- diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
263
-
264
- for fmt in formats:
265
- for sr in sample_rates:
266
- for ch in channels:
267
- out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
268
- out_wav.close()
269
- cmd = [
270
- "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
271
- "-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path, out_wav.name
272
- ]
273
  try:
274
- proc = subprocess.run(cmd, capture_output=True, text=True, timeout=45)
275
- except Exception as e_run:
276
- diagnostics.append(f"RUN-EXC fmt={fmt} sr={sr} ch={ch} err={e_run}")
277
- try: os.unlink(out_wav.name)
278
- except Exception: pass
279
- continue
280
-
281
- rc = proc.returncode
282
- stderr = proc.stderr.strip() if proc.stderr else ""
283
- stdout = proc.stdout.strip() if proc.stdout else ""
284
- diagnostics.append(f"ATTEMPT fmt={fmt} sr={sr} ch={ch} rc={rc}")
285
- if stdout:
286
- diagnostics.append("STDOUT:")
287
- diagnostics.append(stdout)
288
- if stderr:
289
- diagnostics.append("STDERR:")
290
- diagnostics.append(stderr)
291
- diagnostics.append("-" * 60)
292
-
293
- try:
294
- if rc == 0 and os.path.exists(out_wav.name) and os.path.getsize(out_wav.name) > MIN_WAV_SIZE:
295
- # success
296
- try:
297
- with open(diag_log, "w", encoding="utf-8") as fh:
298
- fh.write("pydub auto error:\n")
299
- fh.write(auto_err + "\n\n")
300
- fh.write("Successful guess:\n")
301
- fh.write(f"fmt={fmt} sr={sr} ch={ch}\n\n")
302
- fh.write("Diagnostics (last attempts):\n")
303
- fh.write("\n".join(diagnostics[-1000:]))
304
- except Exception:
305
- pass
306
- return out_wav.name
307
  except Exception:
308
  pass
309
 
310
- try: os.unlink(out_wav.name)
311
- except Exception: pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
 
313
- # ffprobe and hexdump preview
314
- try:
315
- fp = subprocess.run(["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
316
- capture_output=True, text=True, timeout=15)
317
- diagnostics.append("FFPROBE:")
318
- diagnostics.append(fp.stdout.strip() or fp.stderr.strip())
319
- except Exception as e:
320
- diagnostics.append(f"ffprobe failed: {e}")
 
321
 
322
- try:
323
- with open(input_path, "rb") as fh:
324
- head = fh.read(256)
325
- diagnostics.append("HEX PREVIEW:")
326
- diagnostics.append(head.hex())
327
- except Exception as e:
328
- diagnostics.append(f"could not read head: {e}")
329
 
 
330
  try:
331
- with open(diag_log, "w", encoding="utf-8") as fh:
332
- fh.write("pydub auto error:\n")
333
- fh.write(auto_err + "\n\n")
334
- fh.write("Full diagnostics:\n\n")
335
- fh.write("\n".join(diagnostics))
336
- except Exception as e:
337
- raise Exception(f"Conversion failed; diagnostics could not be written: {e}")
338
-
339
- raise Exception(f"Could not convert file to WAV. Diagnostics saved to: {diag_log}\nSummary: {diagnostics[:6]}")
340
-
341
# ---------- Whisper model cache ----------
# Loaded Whisper models keyed by model name, so repeated requests reuse them.
MODEL_CACHE = {}

def get_whisper_model(name):
    """Return the Whisper model *name*, loading and caching it on first use."""
    try:
        return MODEL_CACHE[name]
    except KeyError:
        model = whisper.load_model(name)
        MODEL_CACHE[name] = model
        return model
348
 
349
- # ---------- Main transcription generator ----------
350
- def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbox, zip_file=None, zip_password=None, enable_memory=False):
351
  """
352
- Generator yields (log_text, transcripts_text, word_file_path_or_None, percent_int)
353
- audio_files: path or list of paths (gr.File with type='filepath' gives file path string)
354
  """
355
- log = []
356
- transcripts = []
357
- word_file_path = None
358
- temp_extract_dir = os.path.join(tempfile.gettempdir(), "extracted_audio")
359
- extracted_audio_paths = []
360
-
361
- yield "", "", None, 0
362
-
363
- # cleanup
364
- if os.path.exists(temp_extract_dir):
365
- try:
366
- shutil.rmtree(temp_extract_dir)
367
- log.append(f"Cleaned previous temp dir: {temp_extract_dir}")
368
- except Exception:
369
- pass
370
-
371
- # handle zip
372
  if zip_file:
373
- log.append(f"Processing zip: {zip_file}")
374
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 2
375
- try:
376
- os.makedirs(temp_extract_dir, exist_ok=True)
377
- with pyzipper.ZipFile(zip_file, "r") as zf:
378
- if zip_password:
379
- try: zf.setpassword(zip_password.encode())
380
- except Exception:
381
- log.append("Incorrect zip password")
382
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
383
- return
384
- exts = ['.mp3', '.wav', '.aac', '.flac', '.ogg', '.dat', '.dct']
385
- count = 0
386
- for info in zf.infolist():
387
- if info.is_dir(): continue
388
- _, ext = os.path.splitext(info.filename)
389
- if ext.lower() in exts:
390
- try:
391
- zf.extract(info, path=temp_extract_dir)
392
- p = os.path.normpath(os.path.join(temp_extract_dir, info.filename))
393
- if os.path.exists(p):
394
- extracted_audio_paths.append(p)
395
- count += 1
396
- log.append(f"Extracted: {info.filename}")
397
- except Exception as e:
398
- log.append(f"Error extracting {info.filename}: {e}")
399
- if count == 0:
400
- log.append("No supported audio in zip.")
401
- try: shutil.rmtree(temp_extract_dir)
402
- except Exception: pass
403
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
404
- return
405
- except pyzipper.BadZipFile:
406
- log.append("Invalid zip file.")
407
- try: shutil.rmtree(temp_extract_dir)
408
- except Exception: pass
409
- yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
410
- return
411
- except Exception as e:
412
- log.append(f"Zip processing error: {e}")
413
- try: shutil.rmtree(temp_extract_dir)
414
- except Exception: pass
415
- yield "\n\n".join(log), "\n\n".join(transcripts), None
 
1
+ # ----------------------- CONTINUATION / APP LAUNCH -----------------------
2
+ # Append this to the end of your app.py (after the previous code)
3
+
4
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
5
+ return
6
+
7
+ # collect audio file paths from either audio_files or extracted paths
8
+ paths = []
9
+ if extracted_audio_paths:
10
+ paths.extend(extracted_audio_paths)
11
+ if audio_files:
12
+ # audio_files may be a single path string or list of paths (gr.File gives str)
13
+ if isinstance(audio_files, (list, tuple)):
14
+ for a in audio_files:
15
+ if a:
16
+ paths.append(a)
17
+ elif isinstance(audio_files, str):
18
+ paths.append(audio_files)
19
+
20
+ if not paths:
21
+ log.append("No audio files provided.")
22
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
23
+ return
24
+
25
+ # load model
26
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, 5
 
 
 
 
 
 
 
 
 
 
 
27
  try:
28
+ model = get_whisper_model(model_name)
29
+ log.append(f"Loaded Whisper model: {model_name}")
30
+ except Exception as e:
31
+ log.append(f"Failed to load model {model_name}: {e}")
32
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
33
+ return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
+ total = len(paths)
36
+ idx = 0
37
+ for p in paths:
38
+ idx += 1
39
+ log.append(f"Processing file ({idx}/{total}): {p}")
40
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, int(5 + (idx-1) * 80 / max(1, total))
 
 
41
 
42
+ try:
43
+ wav = convert_to_wav_if_needed(p)
44
+ log.append(f"Converted to WAV: {wav}")
45
+ except Exception as e:
46
+ log.append(f"Conversion failed for {p}: {e}")
47
+ transcripts.append(f"FILE: {os.path.basename(p)}\nERROR: Conversion failed: {e}")
48
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, int(5 + idx * 80 / max(1, total))
49
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
+ # whisper transcription
 
 
 
 
 
 
 
 
52
  try:
53
+ # advanced_options can be used to pass whisper params later
54
+ whisper_opts = {}
55
+ # e.g., advanced_options might contain 'language' or 'task'; keep simple for now
56
+ if isinstance(advanced_options, dict):
57
+ whisper_opts.update(advanced_options)
58
+
59
+ result = model.transcribe(wav, **whisper_opts)
60
+ text = result.get("text", "").strip()
61
+ log.append(f"Transcribed: {len(text)} chars")
62
+ # memory correction & postprocess
63
+ if enable_memory:
64
+ text = memory_correct_text(text)
65
+ text = postprocess_transcript(text)
66
+ transcripts.append(f"FILE: {os.path.basename(p)}\n{text}\n")
67
+ # update persistent memory
68
+ if enable_memory:
 
 
 
 
 
 
 
 
 
 
69
  try:
70
+ update_memory_with_transcript(text)
71
+ log.append("Memory updated.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  except Exception:
73
  pass
74
 
75
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, int(10 + idx * 85 / max(1, total))
76
+ except Exception as e:
77
+ log.append(f"Transcription failed for {p}: {e}")
78
+ transcripts.append(f"FILE: {os.path.basename(p)}\nERROR: Transcription failed: {e}")
79
+ yield "\n\n".join(log), "\n\n".join(transcripts), None, int(10 + idx * 85 / max(1, total))
80
+ continue
81
+ finally:
82
+ # cleanup intermediate wav if it was created in tempdir and not original .wav
83
+ try:
84
+ if wav and os.path.exists(wav) and (not p.lower().endswith(".wav")):
85
+ # only remove if it's a tmp file in tmpdir
86
+ tmpdir = tempfile.gettempdir()
87
+ if os.path.commonpath([tmpdir, os.path.abspath(wav)]) == tmpdir:
88
+ os.unlink(wav)
89
+ except Exception:
90
+ pass
91
 
92
+ # final merge option
93
+ if merge_checkbox:
94
+ try:
95
+ merged_text = "\n\n".join(transcripts)
96
+ word_file_path = save_as_word(merged_text)
97
+ log.append(f"Merged transcript saved: {word_file_path}")
98
+ except Exception as e:
99
+ log.append(f"Failed to save merged file: {e}")
100
+ word_file_path = None
101
 
102
+ # final yield
103
+ yield "\n\n".join(log), "\n\n".join(transcripts), word_file_path, 100
 
 
 
 
 
104
 
105
+ # cleanup extracted dir
106
  try:
107
+ if os.path.exists(temp_extract_dir):
108
+ shutil.rmtree(temp_extract_dir)
109
+ log.append("Cleaned temporary extraction dir.")
110
+ except Exception:
111
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
# ----------------------- Gradio UI -----------------------
def run_transcription_wrapper(files, model_name, merge, zip_file, zip_password, enable_memory, advanced_options):
    """
    Gradio wrapper: accepts file upload(s) and an optional zip, and returns
    the transcribe_multiple generator (Gradio streams generator output).
    """
    zip_path = None
    if zip_file:
        # Depending on the Gradio version, gr.File yields a path string, an
        # object exposing .name, or a dict — handle all three shapes.
        if isinstance(zip_file, (str, os.PathLike)):
            zip_path = str(zip_file)
        elif hasattr(zip_file, "name"):
            zip_path = zip_file.name
        elif isinstance(zip_file, dict) and zip_file.get("name"):
            zip_path = zip_file["name"]
    # Advanced options are not used heavily yet; default to an empty dict.
    return transcribe_multiple(
        files,
        model_name,
        advanced_options or {},
        merge_checkbox=merge,
        zip_file=zip_path,
        zip_password=zip_password,
        enable_memory=enable_memory,
    )
133
+
134
# Build Blocks UI
demo = gr.Blocks()

with demo:
    gr.Markdown("## Whisper Transcription (Spaces-ready)")
    with gr.Row():
        with gr.Column(scale=2):
            file_input = gr.File(label="Upload audio files (or zip)", file_count="multiple", type="filepath")
            zip_input = gr.File(label="Optional: Upload zip file containing audio", file_count="single", type="filepath")
            zip_password = gr.Textbox(label="Zip password (if any)", placeholder="password (optional)")
            model_select = gr.Dropdown(choices=["small","medium","large","base"], value="small", label="Whisper model")
            merge_checkbox = gr.Checkbox(label="Merge transcripts to a single .docx (downloadable)", value=True)
            memory_checkbox = gr.Checkbox(label="Enable persistent memory (word/phrase correction)", value=False)
            submit = gr.Button("Transcribe")
        with gr.Column(scale=3):
            logs = gr.Textbox(label="Logs (streaming)", lines=12)
            transcripts_out = gr.Textbox(label="Transcripts (streaming)", lines=12)
            download_file = gr.File(label="Merged .docx (when enabled)")
            # Bug fix: the transcription generator yields FOUR values
            # (logs, transcripts, file, progress) but only three outputs were
            # wired up, which makes Gradio fail on the output-count mismatch.
            # Add a progress component as the fourth output.
            progress_out = gr.Number(label="Progress (%)")

    # connect
    # Gradio supports generator functions directly; the outputs are
    # (logs, transcripts, file, progress).
    submit.click(fn=run_transcription_wrapper,
                 inputs=[file_input, model_select, merge_checkbox, zip_input, zip_password, memory_checkbox, gr.State({})],
                 outputs=[logs, transcripts_out, download_file, progress_out])

# Ensure we queue and bind to PORT (for Spaces/containers)
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    # Use queue to support longer-running jobs
    demo.queue().launch(server_name="0.0.0.0", server_port=port)