Spaces:

staraks
/

arvind

Running

App Files Files Community

staraks committed on Nov 15, 2025

Commit

2171026

verified ·

1 Parent(s): 1910810

Update app.py

Browse files

Files changed (1) hide show

app.py +265 -123

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # app.py
 # Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
-# Cleaned, debugged, and Spaces-ready.
 import os
 import sys
@@ -32,12 +32,12 @@ except Exception as e:
 print("DEBUG: imports OK", flush=True)
 # ---------- Config ----------
 MEMORY_FILE = "memory.json"
 MEMORY_LOCK = threading.Lock()
-MIN_WAV_SIZE = 200            # bytes
 FFMPEG_CANDIDATES = [
     ("s16le", 16000, 1),
     ("s16le", 44100, 2),
@@ -63,15 +63,19 @@ def load_memory():
         pass
     return mem
 def save_memory(mem):
     with MEMORY_LOCK:
         with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
             json.dump(mem, fh, ensure_ascii=False, indent=2)
-memory = load_memory()
-print("DEBUG: memory loaded (words=%d phrases=%d)" % (len(memory.get("words", {})), len(memory.get("phrases", {}))), flush=True)
 # ---------- Postprocessing ----------
 MEDICAL_ABBREVIATIONS = {
@@ -94,55 +98,54 @@ DRUG_NORMALIZATION = {
     "amoxicillin": "Amoxicillin",
 }
 def expand_abbreviations(text):
-    tokens = re.split(r'(\s+)', text)
     out = []
     for t in tokens:
         key = t.lower().strip(".,;:")
         if key in MEDICAL_ABBREVIATIONS:
-            trailing = ''
-            m = re.match(r'([A-Za-z0-9/]+)([.,;:]*)', t)
             if m:
-                trailing = m.group(2) or ''
             out.append(MEDICAL_ABBREVIATIONS[key] + trailing)
         else:
             out.append(t)
-    return ''.join(out)
 def normalize_drugs(text):
     for k, v in DRUG_NORMALIZATION.items():
-        text = re.sub(rf'\b{k}\b', v, text, flags=re.IGNORECASE)
     return text
 def punctuation_and_capitalization(text):
     text = text.strip()
     if not text:
         return text
-    if not re.search(r'[.?!]\s*$', text):
-        text = text.rstrip() + '.'
-    parts = re.split(r'([.?!]\s+)', text)
     out = []
     for p in parts:
-        if p and not re.match(r'[.?!]\s+', p):
             out.append(p.capitalize())
         else:
             out.append(p)
-    return ''.join(out)
 def postprocess_transcript(text, format_soap=False):
     if not text:
         return text
-    t = re.sub(r'\s+', ' ', text).strip()
     t = expand_abbreviations(t)
     t = normalize_drugs(t)
     t = punctuation_and_capitalization(t)
     if format_soap:
-        sentences = re.split(r'(?<=[.?!])\s+', t)
         subj = sentences[0] if len(sentences) >= 1 else ""
         obj = sentences[1] if len(sentences) >= 2 else ""
         assessment = ""
@@ -150,22 +153,23 @@ def postprocess_transcript(text, format_soap=False):
             if kw in t.lower():
                 assessment = "Assessment: " + subj
                 break
-        soap = f"S: {subj}\nO: {obj}\nA: {assessment}\nP: Plan: follow up as indicated."
         return soap
     return t
 # ---------- Memory utilities ----------
 def extract_words_and_phrases(text):
     # basic tokenization for words; phrases = sentences
     words = re.findall(r"[A-Za-z0-9\-']+", text)
-    sentences = [s.strip() for s in re.split(r'(?<=[.?!])\s+', text) if s.strip()]
     return [w for w in words if w.strip()], sentences
 def update_memory_with_transcript(transcript):
     global memory
     words, sentences = extract_words_and_phrases(transcript)
@@ -193,9 +197,6 @@ def update_memory_with_transcript(transcript):
                 pass
 def memory_correct_text(text, min_ratio=0.85):
     if not text or (not memory.get("words") and not memory.get("phrases")):
         return text
@@ -204,7 +205,9 @@ def memory_correct_text(text, min_ratio=0.85):
         lw = w.lower()
         if lw in memory["words"]:
             return w
-        candidates = get_close_matches(lw, memory["words"].keys(), n=1, cutoff=min_ratio)
         if candidates:
             cand = candidates[0]
             if w and w[0].isupper():
@@ -212,46 +215,63 @@ def memory_correct_text(text, min_ratio=0.85):
             return cand
         return w
-    tokens = re.split(r'(\W+)', text)
     corrected_tokens = []
     for tok in tokens:
         if re.match(r"^[A-Za-z0-9\-']+$", tok):
             corrected_tokens.append(fix_word(tok))
         else:
             corrected_tokens.append(tok)
-    corrected = ''.join(corrected_tokens)
     for phrase in sorted(memory.get("phrases", {}).keys(), key=lambda s: -len(s)):
         low_phrase = phrase.lower()
         if len(low_phrase) < 8:
             continue
         if low_phrase in corrected.lower():
-            corrected = re.sub(re.escape(phrase), phrase, corrected, flags=re.IGNORECASE)
     return corrected
 # ---------- File utilities ----------
 def save_as_word(text, filename=None):
     if filename is None:
-        filename = os.path.join(tempfile.gettempdir(), "merged_transcripts.docx")
     doc = Document()
     doc.add_paragraph(text)
     doc.save(filename)
     return filename
 # ---------- Hybrid conversion: pydub + small ffmpeg fallback ----------
 def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
     cmd = [
-        "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
-        "-f", fmt, "-ar", str(sr), "-ac", str(ch), "-i", input_path, out_path
     ]
     try:
         proc = subprocess.run(cmd, capture_output=True, timeout=30, text=True)
-        if proc.returncode == 0 and os.path.exists(out_path) and os.path.getsize(out_path) > MIN_WAV_SIZE:
             return True, proc.stderr + proc.stdout
         else:
             try:
@@ -268,6 +288,7 @@ def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
             pass
         return False, str(e)
 def convert_to_wav_if_needed(input_path):
     input_path = str(input_path)
     lower = input_path.lower()
@@ -295,10 +316,7 @@ def convert_to_wav_if_needed(input_path):
         except Exception:
             pass
     diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
     diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
     diagnostics = []
@@ -306,7 +324,9 @@ def convert_to_wav_if_needed(input_path):
         out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
         out_wav.close()
         success, debug = _ffmpeg_convert(input_path, out_wav.name, fmt, sr, ch)
-        diagnostics.append(f"TRY fmt={fmt} sr={sr} ch={ch} success={success}\n{debug}\n")
         if success:
             try:
                 with open(diag_log, "w", encoding="utf-8") as fh:
@@ -326,9 +346,14 @@ def convert_to_wav_if_needed(input_path):
             except Exception:
                 pass
     try:
-        fp = subprocess.run(["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
-                            capture_output=True, text=True, timeout=10)
         diagnostics.append("FFPROBE:\n" + (fp.stdout.strip() or fp.stderr.strip()))
     except Exception as e:
         diagnostics.append("ffprobe failed: " + str(e))
@@ -348,19 +373,35 @@ def convert_to_wav_if_needed(input_path):
     except Exception as e:
         raise Exception(f"Conversion failed; diagnostics write error: {e}")
-    raise Exception(f"Could not convert file to WAV. Diagnostics saved to: {diag_log}")
 # ---------- Whisper model cache ----------
 MODEL_CACHE = {}
 def get_whisper_model(name):
     if name not in MODEL_CACHE:
         print(f"DEBUG: loading whisper model '{name}'", flush=True)
         MODEL_CACHE[name] = whisper.load_model(name)
     return MODEL_CACHE[name]
 # ---------- Main transcription generator ----------
-def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbox, zip_file=None, zip_password=None, enable_memory=False):
     log = []
     transcripts = []
     word_file_path = None
@@ -370,7 +411,7 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
     # initial yield
     yield "", "", None, 0
-    # cleanup previous
     if os.path.exists(temp_extract_dir):
         try:
             shutil.rmtree(temp_extract_dir)
@@ -392,7 +433,16 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
                         log.append("Incorrect zip password")
                         yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
                         return
-                exts = ['.mp3', '.wav', '.aac', '.flac', '.ogg', '.m4a', '.dat', '.dct']
                 count = 0
                 for info in zf.infolist():
                     if info.is_dir():
@@ -404,7 +454,9 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
                         except Exception as e:
                             log.append(f"Error extracting {info.filename}: {e}")
                             continue
-                        p = os.path.normpath(os.path.join(temp_extract_dir, info.filename))
                         if os.path.exists(p):
                             extracted_audio_paths.append(p)
                             count += 1
@@ -451,7 +503,7 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
         yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
         return
-    # load model (on demand)
     yield "\n\n".join(log), "\n\n".join(transcripts), None, 5
     try:
         model = get_whisper_model(model_name)
@@ -466,7 +518,9 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
     for p in paths:
         idx += 1
         log.append(f"Processing file ({idx}/{total}): {p}")
-        yield "\n\n".join(log), "\n\n".join(transcripts), None, int(5 + (idx-1) * 80 / max(1, total))
         wav = None
         try:
@@ -474,46 +528,165 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
             log.append(f"Converted to WAV: {wav}")
         except Exception as e:
             log.append(f"Conversion failed for {p}: {e}")
-            transcripts.append(f"FILE: {os.path.basename(p)}\nERROR: Conversion failed: {e}")
-            yield "\n\n".join(log), "\n\n".join(transcripts), None, int(5 + idx * 80 / max(1, total))
             continue
-    # continue... (UI + launch in next message)
-# Defensive wrapper to surface exceptions into the Logs textbox
-def run_transcription_wrapper(files, model_name, merge, zip_file, zip_password, enable_memory, advanced_options_state):
-    import traceback, io
-    try:
-        audio_input = files
-        zip_path = None
-        if zip_file:
-            if isinstance(zip_file, (str, os.PathLike)):
-                zip_path = str(zip_file)
-            elif hasattr(zip_file, "name"):
-                zip_path = zip_file.name
-            elif isinstance(zip_file, dict) and zip_file.get("name"):
-                zip_path = zip_file["name"]
-        adv = {}
-        # return the generator directly (transcribe_multiple yields tuples)
-        return transcribe_multiple(audio_input, model_name, adv, merge_checkbox=merge, zip_file=zip_path, zip_password=zip_password, enable_memory=enable_memory)
-    except Exception as e:
-        # If anything raises before generator returned, produce a generator that yields the traceback
-        buf = io.StringIO()
-        traceback.print_exc(file=buf)
-        tb = buf.getvalue()
-        logs = f"EXCEPTION in run_transcription_wrapper:\n{tb}"
-        transcripts = "ERROR: transcription did not start due to exception."
-        # Yield once with logs and final 100% to stop spinner
-        def error_gen():
-            yield logs, transcripts, None, 100
-        return error_gen()
-# Launch
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", 7860))
     print("DEBUG: launching Gradio on port", port, flush=True)
@@ -523,34 +696,3 @@ if __name__ == "__main__":
         print("FATAL: demo.launch failed:", e, flush=True)
         traceback.print_exc()
         raise
-# Safe launch: only launch if demo exists
-if __name__ == "__main__":
-    port = int(os.environ.get("PORT", 7860))
-    print("DEBUG: preparing to launch Gradio on port", port, flush=True)
-    try:
-        if 'demo' in globals() and demo is not None:
-            print("DEBUG: demo object found. launching...", flush=True)
-            demo.queue().launch(server_name="0.0.0.0", server_port=port)
-        else:
-            print("FATAL: 'demo' not found. The Gradio UI block may be missing or failed to create.", flush=True)
-            # show the tail of the file so you can inspect quickly in logs
-            try:
-                import inspect
-                import pathlib
-                print("DEBUG: last 60 lines of /app/app.py for inspection:", flush=True)
-                with open("/app/app.py", "r", encoding="utf-8") as fh:
-                    all_lines = fh.read().splitlines()
-                    for ln in all_lines[-60:]:
-                        print(ln)
-            except Exception:
-                pass
-            # Exit non-zero so platform reports failure clearly
-            sys.exit(1)
-    except Exception as e:
-        print("FATAL: demo.launch failed:", e, flush=True)
-        traceback.print_exc()
-        raise

 # app.py
 # Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
+# Clean, single-version file for Hugging Face Spaces.
 import os
 import sys
 print("DEBUG: imports OK", flush=True)
 # ---------- Config ----------
 MEMORY_FILE = "memory.json"
 MEMORY_LOCK = threading.Lock()
+MIN_WAV_SIZE = 200  # bytes
+# Small ffmpeg fallback grid (hybrid conversion)
 FFMPEG_CANDIDATES = [
     ("s16le", 16000, 1),
     ("s16le", 44100, 2),
         pass
     return mem
 def save_memory(mem):
     """Persist the memory dictionary to MEMORY_FILE as UTF-8 JSON.

     The data is serialized in full and written to a sibling temporary file,
     which is then moved over MEMORY_FILE with os.replace. A serialization
     error or a crash in the middle of the write therefore cannot leave a
     truncated, unreadable memory file behind.

     Args:
         mem: JSON-serializable object to store; non-ASCII text is written
             unescaped.

     Raises:
         TypeError, ValueError: if mem cannot be serialized to JSON.
         OSError: if the temporary file cannot be written or moved into place.
     """
     tmp_path = MEMORY_FILE + ".tmp"
     with MEMORY_LOCK:
         # Serialize before opening anything so a bad value never touches disk.
         payload = json.dumps(mem, ensure_ascii=False, indent=2)
         with open(tmp_path, "w", encoding="utf-8") as fh:
             fh.write(payload)
         # Swap the finished file in; the old content stays intact until here.
         os.replace(tmp_path, MEMORY_FILE)
+memory = load_memory()  # module-level store, loaded at import; read and updated by the memory helpers below
+print(
+    "DEBUG: memory loaded (words=%d phrases=%d)"
+    % (len(memory.get("words", {})), len(memory.get("phrases", {}))),
+    flush=True,
+)
 # ---------- Postprocessing ----------
 MEDICAL_ABBREVIATIONS = {
     "amoxicillin": "Amoxicillin",
 }
 def expand_abbreviations(text):
     """Replace known medical abbreviations in *text* with their expansions.

     The text is split on whitespace runs (the runs themselves are kept). Each
     token is looked up in MEDICAL_ABBREVIATIONS in lowercase with ".,;:"
     stripped from both ends; a hit is replaced by its expansion followed by
     any ".,;:" characters that come right after the token's leading
     letter/digit/slash run.
     """
     def _expand(token):
         lookup = token.lower().strip(".,;:")
         if lookup not in MEDICAL_ABBREVIATIONS:
             return token
         # Recover the punctuation that followed the abbreviation, if any.
         found = re.match(r"([A-Za-z0-9/]+)([.,;:]*)", token)
         suffix = (found.group(2) or "") if found else ""
         return MEDICAL_ABBREVIATIONS[lookup] + suffix

     return "".join(_expand(piece) for piece in re.split(r"(\s+)", text))
 def normalize_drugs(text):
     """Rewrite drug names in *text* to their canonical spelling.

     Each key of DRUG_NORMALIZATION is matched case-insensitively between word
     boundaries and replaced with the mapped value.

     Args:
         text: transcript text to normalize.

     Returns:
         The text with every matched drug name replaced.
     """
     for name, canonical in DRUG_NORMALIZATION.items():
         # Escape the key so regex metacharacters in a drug name match
         # literally instead of changing the pattern.
         pattern = rf"\b{re.escape(name)}\b"
         # A function replacement inserts the canonical form verbatim; a plain
         # string would have backslashes interpreted as group references.
         text = re.sub(
             pattern, lambda _match, out=canonical: out, text, flags=re.IGNORECASE
         )
     return text
 def punctuation_and_capitalization(text):
     """Ensure terminal punctuation and capitalize the start of each sentence.

     Surrounding whitespace is removed, a "." is appended when the text does
     not already end in ".", "?" or "!", and the first character of every
     sentence is upper-cased. The rest of each sentence is left exactly as
     given, so canonical drug names, acronyms and proper nouns produced by
     earlier steps keep their casing.

     Args:
         text: transcript text.

     Returns:
         The cleaned text, or an empty string when *text* is blank.
     """
     text = text.strip()
     if not text:
         return text
     if not re.search(r"[.?!]\s*$", text):
         text = text.rstrip() + "."
     # The capturing group keeps the separators so the text can be rebuilt.
     parts = re.split(r"([.?!]\s+)", text)
     return "".join(
         # str.capitalize() would lowercase the rest of the sentence, so only
         # the first character is touched.
         part if re.match(r"[.?!]\s+", part) else part[:1].upper() + part[1:]
         for part in parts
     )
 def postprocess_transcript(text, format_soap=False):
     if not text:
         return text
+    t = re.sub(r"\s+", " ", text).strip()
     t = expand_abbreviations(t)
     t = normalize_drugs(t)
     t = punctuation_and_capitalization(t)
     if format_soap:
+        sentences = re.split(r"(?<=[.?!])\s+", t)
         subj = sentences[0] if len(sentences) >= 1 else ""
         obj = sentences[1] if len(sentences) >= 2 else ""
         assessment = ""
             if kw in t.lower():
                 assessment = "Assessment: " + subj
                 break
+        soap = (
+            f"S: {subj}\nO: {obj}\nA: {assessment}\nP: Plan: follow up as indicated."
+        )
         return soap
     return t
 # ---------- Memory utilities ----------
 def extract_words_and_phrases(text):
     """Split *text* into word tokens and sentence-level phrases.

     Returns:
         A (words, sentences) tuple. Words are runs of letters, digits,
         hyphens and apostrophes. Sentences are the non-empty, stripped pieces
         obtained by splitting after ".", "?" or "!" followed by whitespace.
     """
     word_tokens = list(filter(str.strip, re.findall(r"[A-Za-z0-9\-']+", text)))
     phrases = []
     for chunk in re.split(r"(?<=[.?!])\s+", text):
         trimmed = chunk.strip()
         if trimmed:
             phrases.append(trimmed)
     return word_tokens, phrases
 def update_memory_with_transcript(transcript):
     global memory
     words, sentences = extract_words_and_phrases(transcript)
                 pass
 def memory_correct_text(text, min_ratio=0.85):
     if not text or (not memory.get("words") and not memory.get("phrases")):
         return text
         lw = w.lower()
         if lw in memory["words"]:
             return w
+        candidates = get_close_matches(
+            lw, memory["words"].keys(), n=1, cutoff=min_ratio
+        )
         if candidates:
             cand = candidates[0]
             if w and w[0].isupper():
             return cand
         return w
+    tokens = re.split(r"(\W+)", text)
     corrected_tokens = []
     for tok in tokens:
         if re.match(r"^[A-Za-z0-9\-']+$", tok):
             corrected_tokens.append(fix_word(tok))
         else:
             corrected_tokens.append(tok)
+    corrected = "".join(corrected_tokens)
     for phrase in sorted(memory.get("phrases", {}).keys(), key=lambda s: -len(s)):
         low_phrase = phrase.lower()
         if len(low_phrase) < 8:
             continue
         if low_phrase in corrected.lower():
+            corrected = re.sub(
+                re.escape(phrase), phrase, corrected, flags=re.IGNORECASE
+            )
     return corrected
 # ---------- File utilities ----------
 def save_as_word(text, filename=None):
     """Write *text* as one paragraph into a .docx file and return its path.

     When *filename* is None the document is saved as
     "merged_transcripts.docx" inside the system temporary directory.
     """
     target = filename
     if target is None:
         target = os.path.join(tempfile.gettempdir(), "merged_transcripts.docx")
     document = Document()
     document.add_paragraph(text)
     document.save(target)
     return target
 # ---------- Hybrid conversion: pydub + small ffmpeg fallback ----------
 def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
     cmd = [
+        "ffmpeg",
+        "-hide_banner",
+        "-loglevel",
+        "error",
+        "-y",
+        "-f",
+        fmt,
+        "-ar",
+        str(sr),
+        "-ac",
+        str(ch),
+        "-i",
+        input_path,
+        out_path,
     ]
     try:
         proc = subprocess.run(cmd, capture_output=True, timeout=30, text=True)
+        if (
+            proc.returncode == 0
+            and os.path.exists(out_path)
+            and os.path.getsize(out_path) > MIN_WAV_SIZE
+        ):
             return True, proc.stderr + proc.stdout
         else:
             try:
             pass
         return False, str(e)
 def convert_to_wav_if_needed(input_path):
     input_path = str(input_path)
     lower = input_path.lower()
         except Exception:
             pass
+    # ffmpeg fallback
     diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
     diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
     diagnostics = []
         out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
         out_wav.close()
         success, debug = _ffmpeg_convert(input_path, out_wav.name, fmt, sr, ch)
+        diagnostics.append(
+            f"TRY fmt={fmt} sr={sr} ch={ch} success={success}\n{debug}\n"
+        )
         if success:
             try:
                 with open(diag_log, "w", encoding="utf-8") as fh:
             except Exception:
                 pass
+    # final diagnostics
     try:
+        fp = subprocess.run(
+            ["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
         diagnostics.append("FFPROBE:\n" + (fp.stdout.strip() or fp.stderr.strip()))
     except Exception as e:
         diagnostics.append("ffprobe failed: " + str(e))
     except Exception as e:
         raise Exception(f"Conversion failed; diagnostics write error: {e}")
+    raise Exception(
+        f"Could not convert file to WAV. Diagnostics saved to: {diag_log}"
+    )
 # ---------- Whisper model cache ----------
 MODEL_CACHE = {}
 def get_whisper_model(name):
     """Return the Whisper model called *name*, loading it on first request.

     Loaded models are kept in MODEL_CACHE so later calls with the same name
     reuse the existing object.
     """
     try:
         return MODEL_CACHE[name]
     except KeyError:
         pass
     print(f"DEBUG: loading whisper model '{name}'", flush=True)
     loaded = whisper.load_model(name)
     MODEL_CACHE[name] = loaded
     return loaded
 # ---------- Main transcription generator ----------
+def transcribe_multiple(
+    audio_files,
+    model_name,
+    advanced_options,
+    merge_checkbox,
+    zip_file=None,
+    zip_password=None,
+    enable_memory=False,
+):
+    """
+    Generator yields (log_text, transcripts_text, merged_file_path_or_None, percent_int)
+    """
     log = []
     transcripts = []
     word_file_path = None
     # initial yield
     yield "", "", None, 0
+    # cleanup previous temp dir
     if os.path.exists(temp_extract_dir):
         try:
             shutil.rmtree(temp_extract_dir)
                         log.append("Incorrect zip password")
                         yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
                         return
+                exts = [
+                    ".mp3",
+                    ".wav",
+                    ".aac",
+                    ".flac",
+                    ".ogg",
+                    ".m4a",
+                    ".dat",
+                    ".dct",
+                ]
                 count = 0
                 for info in zf.infolist():
                     if info.is_dir():
                         except Exception as e:
                             log.append(f"Error extracting {info.filename}: {e}")
                             continue
+                        p = os.path.normpath(
+                            os.path.join(temp_extract_dir, info.filename)
+                        )
                         if os.path.exists(p):
                             extracted_audio_paths.append(p)
                             count += 1
         yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
         return
+    # load model
     yield "\n\n".join(log), "\n\n".join(transcripts), None, 5
     try:
         model = get_whisper_model(model_name)
     for p in paths:
         idx += 1
         log.append(f"Processing file ({idx}/{total}): {p}")
+        yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
+            5 + (idx - 1) * 80 / max(1, total)
+        )
         wav = None
         try:
             log.append(f"Converted to WAV: {wav}")
         except Exception as e:
             log.append(f"Conversion failed for {p}: {e}")
+            transcripts.append(
+                f"FILE: {os.path.basename(p)}\nERROR: Conversion failed: {e}"
+            )
+            yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
+                5 + idx * 80 / max(1, total)
+            )
             continue
+        try:
+            whisper_opts = {}
+            if isinstance(advanced_options, dict):
+                whisper_opts.update(advanced_options)
+            result = model.transcribe(wav, **whisper_opts)
+            text = result.get("text", "").strip()
+            log.append(f"Transcribed: {len(text)} chars")
+            if enable_memory:
+                text = memory_correct_text(text)
+            text = postprocess_transcript(text)
+            transcripts.append(
+                f"FILE: {os.path.basename(p)}\n{text}\n"
+            )
+            if enable_memory:
+                try:
+                    update_memory_with_transcript(text)
+                    log.append("Memory updated.")
+                except Exception:
+                    pass
+            yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
+                10 + idx * 85 / max(1, total)
+            )
+        except Exception as e:
+            log.append(f"Transcription failed for {p}: {e}")
+            transcripts.append(
+                f"FILE: {os.path.basename(p)}\nERROR: Transcription failed: {e}"
+            )
+            yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
+                10 + idx * 85 / max(1, total)
+            )
+            continue
+        finally:
+            try:
+                if wav and os.path.exists(wav):
+                    tmpdir = tempfile.gettempdir()
+                    if (
+                        os.path.commonpath([tmpdir, os.path.abspath(wav)])
+                        == tmpdir
+                        and not p.lower().endswith(".wav")
+                    ):
+                        os.unlink(wav)
+            except Exception:
+                pass
+    # final merge option
+    if merge_checkbox:
+        try:
+            merged_text = "\n\n".join(transcripts)
+            word_file_path = save_as_word(merged_text)
+            log.append(f"Merged transcript saved: {word_file_path}")
+        except Exception as e:
+            log.append(f"Failed to save merged file: {e}")
+            word_file_path = None
+    # final yield
+    yield "\n\n".join(log), "\n\n".join(transcripts), word_file_path, 100
+    # cleanup extracted dir
+    try:
+        if os.path.exists(temp_extract_dir):
+            shutil.rmtree(temp_extract_dir)
+            log.append("Cleaned temporary extraction dir.")
+    except Exception:
+        pass
+# ----------------------- Gradio UI -----------------------
+def run_transcription_wrapper(
+    files, model_name, merge, zip_file, zip_password, enable_memory, advanced_options_state
+):
+    audio_input = files
+    zip_path = None
+    if zip_file:
+        if isinstance(zip_file, (str, os.PathLike)):
+            zip_path = str(zip_file)
+        elif hasattr(zip_file, "name"):
+            zip_path = zip_file.name
+        elif isinstance(zip_file, dict) and zip_file.get("name"):
+            zip_path = zip_file["name"]
+    adv = {}
+    return transcribe_multiple(
+        audio_input,
+        model_name,
+        adv,
+        merge_checkbox=merge,
+        zip_file=zip_path,
+        zip_password=zip_password,
+        enable_memory=enable_memory,
+    )
+print("DEBUG: building Gradio Blocks", flush=True)
+demo = gr.Blocks()  # root UI container; components are declared inside "with demo:"
+with demo:
+    gr.Markdown("## Whisper Transcription (Spaces-ready)")
+    with gr.Row():
+        with gr.Column(scale=2):  # left column: inputs and options
+            file_input = gr.File(
+                label="Upload audio files (or zip)",
+                file_count="multiple",
+                type="filepath",
+            )
+            zip_input = gr.File(
+                label="Optional: Upload zip file containing audio",
+                file_count="single",
+                type="filepath",
+            )
+            zip_password = gr.Textbox(
+                label="Zip password (if any)",
+                placeholder="password (optional)",
+            )
+            model_select = gr.Dropdown(
+                choices=["small", "medium", "large", "base"],
+                value="small",
+                label="Whisper model",
+            )
+            merge_checkbox = gr.Checkbox(
+                label="Merge transcripts to a single .docx (downloadable)",
+                value=True,
+            )
+            memory_checkbox = gr.Checkbox(
+                label="Enable persistent memory (word/phrase correction)",
+                value=False,
+            )
+            submit = gr.Button("Transcribe")
+        with gr.Column(scale=3):  # right column: logs, transcripts, download, progress
+            logs = gr.Textbox(label="Logs (streaming)", lines=12)
+            transcripts_out = gr.Textbox(label="Transcripts (streaming)", lines=12)
+            download_file = gr.File(label="Merged .docx (when enabled)")
+            progress_num = gr.Number(value=0, label="Progress (%)")
+    submit.click(  # inputs are listed in the parameter order of run_transcription_wrapper
+        fn=run_transcription_wrapper,
+        inputs=[
+            file_input,
+            model_select,
+            merge_checkbox,
+            zip_input,
+            zip_password,
+            memory_checkbox,
+            gr.State({}),  # fills the wrapper's advanced_options_state parameter
+        ],
+        outputs=[logs, transcripts_out, download_file, progress_num],  # same order as the 4-tuples yielded by transcribe_multiple
+    )
+# ---------- Launch ----------
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", 7860))
     print("DEBUG: launching Gradio on port", port, flush=True)
         print("FATAL: demo.launch failed:", e, flush=True)
         traceback.print_exc()
         raise