frankmcmahen committed on
Commit
d0f8862
·
verified ·
1 Parent(s): 73a7a7e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -203
app.py CHANGED
@@ -1,17 +1,22 @@
1
- import os, time, re, shutil, zipfile, subprocess, json
2
  import gradio as gr
3
  from faster_whisper import WhisperModel
4
 
5
- # ===== Hardware & model selection =====
6
  DEVICE = "cuda" if os.path.exists("/dev/nvidia0") else "cpu"
7
  COMPUTE = "float16" if DEVICE == "cuda" else "int8"
8
  MODEL_ID = os.getenv(
9
  "VOXO_MODEL",
10
  "Systran/faster-whisper-large-v3" if DEVICE == "cuda" else "Systran/faster-whisper-small"
11
  )
12
- model = WhisperModel(MODEL_ID, device=DEVICE, compute_type=COMPUTE)
13
-
14
- # ===== Helpers =====
 
 
 
 
 
15
  def _ts(t: float) -> str:
16
  m, s = divmod(int(t), 60); h, m = divmod(m, 60)
17
  return f"{h:02d}:{m:02d}:{s:02d}"
@@ -30,8 +35,7 @@ def _fmt_bytes(n: int) -> str:
30
  n /= 1024
31
 
32
  def _safe(name: str) -> str:
33
- base = os.path.basename(name)
34
- return re.sub(r"[^A-Za-z0-9._-]+", "_", base)
35
 
36
  def _duration_secs(path: str) -> float:
37
  try:
@@ -44,17 +48,17 @@ def _duration_secs(path: str) -> float:
44
  except Exception:
45
  return 0.0
46
 
47
- # ===== Core transcription =====
48
  def transcribe(audio_path, language="auto", timestamps=True):
49
  if not audio_path:
50
  return ""
51
  lang = None if language == "auto" else language
52
- segments, _info = model.transcribe(
53
  audio_path,
54
  language=lang,
55
  vad_filter=True,
56
  vad_parameters=dict(min_silence_duration_ms=500),
57
- beam_size=1, # fast; bump to 3–5 if you want extra accuracy
58
  best_of=1,
59
  condition_on_previous_text=False,
60
  no_speech_threshold=0.3,
@@ -63,7 +67,23 @@ def transcribe(audio_path, language="auto", timestamps=True):
63
  if timestamps else [s.text.strip() for s in segments])
64
  return "\n".join(lines)
65
 
66
- # ===== Batch with streaming ETA =====
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progress=gr.Progress(track_tqdm=True)):
68
  if not file_paths:
69
  yield "No files selected.", None
@@ -78,11 +98,11 @@ def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progre
78
 
79
  summary_parts, processed_audio, completed = [], 0.0, 0
80
 
81
- def status_md(extra_note: str = "") -> str:
82
  elapsed = time.time() - start
83
  rtf = elapsed / processed_audio if processed_audio > 0 else 0.0
84
- remaining_audio = max(0.0, total_audio - processed_audio)
85
- eta = remaining_audio * rtf if processed_audio > 0 else 0.0
86
  header = [
87
  "### Batch Progress",
88
  f"- Files: **{completed}/{n}**",
@@ -91,7 +111,7 @@ def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progre
91
  f"- Est. RTF: **{rtf:.2f}**" if processed_audio else "- Est. RTF: **…**",
92
  f"- ETA: **{_fmt_hms(eta)}**" if processed_audio else "- ETA: **…**",
93
  ]
94
- if extra_note: header.append(f"\n{extra_note}")
95
  tail = "\n".join(summary_parts[-2:]) if summary_parts else ""
96
  return "\n".join(header) + ("\n\n" + tail if tail else "")
97
 
@@ -110,11 +130,9 @@ def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progre
110
  f.write(text)
111
 
112
  wall = time.time() - t0
113
- per_file = (
114
- f"#### ✅ {name}\n- Audio: {_fmt_hms(file_dur)} | "
115
- f"Wall: {_fmt_hms(wall)} | RTF: {(wall/max(1e-6,file_dur)):.2f}\n\n{text}\n"
116
  )
117
- summary_parts.append(per_file)
118
 
119
  processed_audio += file_dur
120
  completed += 1
@@ -122,6 +140,7 @@ def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progre
122
 
123
  yield status_md(), None
124
 
 
125
  combined_path = os.path.join(workdir, "_ALL_TRANSCRIPTS.txt")
126
  with open(combined_path, "w", encoding="utf-8") as f:
127
  f.write("\n\n".join(summary_parts))
@@ -134,208 +153,44 @@ def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progre
134
 
135
  yield status_md("All done. Download the ZIP for every transcript."), zip_path
136
 
137
- # ===== Progress uploader adoption =====
138
- UPLOAD_ROOT = "/tmp/voxo_progress_uploads"
139
-
140
- def adopt_uploaded(json_paths: str, session_id: str):
141
- """Take server paths from the custom uploader and prep status + list for batch."""
142
- try:
143
- paths = json.loads(json_paths) if json_paths else []
144
- except Exception:
145
- paths = []
146
- safe_paths, total_size, total_audio = [], 0, 0.0
147
- base = os.path.join(UPLOAD_ROOT, _safe(session_id))
148
- for p in paths:
149
- if not p: continue
150
- p = os.path.abspath(p)
151
- if not p.startswith(base): # sandbox check
152
- continue
153
- if os.path.exists(p):
154
- safe_paths.append(p)
155
- total_size += os.path.getsize(p)
156
- total_audio += _duration_secs(p)
157
- status = (
158
- "### Files staged\n" +
159
- "\n".join([f"- ✅ **{_safe(p)}** — {_fmt_hms(_duration_secs(p))} | {_fmt_bytes(os.path.getsize(p))}" for p in safe_paths]) +
160
- (f"\n\n**Total:** {len(safe_paths)} files — {_fmt_hms(total_audio)} — {_fmt_bytes(total_size)}" if safe_paths else "\n\nNo valid files.")
161
- )
162
- return status, safe_paths, gr.update(interactive=bool(safe_paths))
163
-
164
- # ===== UI =====
165
  with gr.Blocks(title="Voxo – Audio to Text") as demo:
166
  gr.Markdown("# 🎧 Voxo\nDrop audio, get text. GPU = fast, CPU = free.")
167
 
168
  with gr.Tabs():
169
- # --- Single file ---
170
  with gr.Tab("Single file"):
171
  with gr.Row():
172
  audio = gr.Audio(sources=["upload","microphone"], type="filepath", label="Audio (mp3/wav)")
173
- lang = gr.Dropdown(
174
- ["auto","en","es","fr","de","it","pt","ja","ko","zh"],
175
- value="auto", label="Language"
176
- )
177
  ts = gr.Checkbox(value=True, label="Show timestamps")
178
  btn = gr.Button("Transcribe", variant="primary")
179
  out = gr.Textbox(lines=20, label="Transcript", show_copy_button=True)
180
  btn.click(transcribe, inputs=[audio, lang, ts], outputs=out, concurrency_limit=1)
181
 
182
- # --- Batch (Progress Uploads) replaces old Batch entirely ---
183
  with gr.Tab("Batch"):
184
- gr.Markdown("**Upload with real progress bars, then run the batch.**")
185
-
186
- # Hidden wiring to carry results from JS → Python
187
- uploaded_json = gr.Textbox(visible=False)
188
- session_box = gr.Textbox(visible=False)
189
- staged_files = gr.State([]) # python list[str] of server paths
190
-
191
- # Custom HTML + JS uploader with true progress bars
192
- uploader = gr.HTML("""
193
- <div id="vx_uploader_wrap" style="border:1px dashed #7c3aed;padding:14px;border-radius:12px">
194
- <input id="vx_input" type="file" multiple accept="audio/*" style="margin-bottom:8px"/>
195
- <div id="vx_hint" style="font-size:12px;opacity:.7;margin-bottom:8px">Select multiple audio files. Upload starts immediately.</div>
196
- <div id="vx_progress_list" style="display:flex;flex-direction:column;gap:6px"></div>
197
- <div id="vx_totals" style="margin-top:8px;font-size:12px;opacity:.8"></div>
198
- </div>
199
- <script>
200
- (function(){
201
- const uploadUrl = "/voxo-upload";
202
- const input = document.getElementById("vx_input");
203
- const list = document.getElementById("vx_progress_list");
204
- const totals = document.getElementById("vx_totals");
205
- window.voxoUploadedPaths = [];
206
- window.voxoSession = String(Date.now());
207
-
208
- function fmtBytes(n){const u=["B","KB","MB","GB","TB"];let i=0;while(n>=1024&&i<u.length-1){n/=1024;i++;}return (i?n.toFixed(1):n)+" "+u[i];}
209
- function fmtHMS(sec){sec=Math.max(0,sec|0);let m=sec/60|0,s=sec%60,h=m/60|0;m%=60;return h?`${h}h ${String(m).padStart(2,'0')}m ${String(s).padStart(2,'0')}s`:m?`${m}m ${String(s).padStart(2,'0')}s`:`${s}s`;}
210
-
211
- function makeRow(name,size){
212
- const row=document.createElement("div");
213
- row.style="display:flex;align-items:center;gap:8px;white-space:nowrap";
214
- row.innerHTML = `
215
- <span style="flex:1;overflow:hidden;text-overflow:ellipsis">${name}</span>
216
- <span style="width:78px;text-align:right;font-size:12px;opacity:.7">${fmtBytes(size)}</span>
217
- <progress value="0" max="100" style="flex:0 0 160px;height:10px"></progress>
218
- <span class="pct" style="width:40px;text-align:right;font-size:12px">0%</span>
219
- <span class="spd" style="width:90px;text-align:right;font-size:12px;opacity:.7"></span>
220
- `;
221
- return row;
222
- }
223
-
224
- function postOne(file){
225
- return new Promise((resolve)=>{
226
- const row = makeRow(file.name, file.size);
227
- const bar = row.querySelector("progress");
228
- const pct = row.querySelector(".pct");
229
- const spd = row.querySelector(".spd");
230
- list.appendChild(row);
231
-
232
- const xhr = new XMLHttpRequest();
233
- xhr.open("POST", uploadUrl);
234
- const t0 = performance.now();
235
-
236
- xhr.upload.onprogress = (e)=>{
237
- if(e.lengthComputable){
238
- const p = Math.round(100*e.loaded/e.total);
239
- bar.value = p; pct.textContent = p+"%";
240
- const sec = (performance.now()-t0)/1000;
241
- const rate = e.loaded / Math.max(1e-6, sec); // B/s
242
- const remain = (e.total - e.loaded) / Math.max(1e-6, rate); // s
243
- spd.textContent = (rate<1024?`${rate|0} B/s` : rate<1024*1024?`${(rate/1024).toFixed(1)} KB/s` : `${(rate/1024/1024).toFixed(1)} MB/s`) + " · " + fmtHMS(remain);
244
- }
245
- };
246
- xhr.onload = ()=>{
247
- if(xhr.status===200){
248
- try{
249
- const resp = JSON.parse(xhr.responseText);
250
- window.voxoUploadedPaths.push(resp.path);
251
- bar.value = 100; pct.textContent = "✓"; spd.textContent = "";
252
- }catch(e){ pct.textContent = "err"; }
253
- }else{ pct.textContent = "err"; }
254
- const done = list.querySelectorAll("progress[value='100']").length;
255
- totals.textContent = done + " / " + list.children.length + " uploaded";
256
- resolve();
257
- };
258
- const form = new FormData();
259
- form.append("session", window.voxoSession);
260
- form.append("file", file, file.name);
261
- xhr.send(form);
262
- });
263
- }
264
-
265
- input.addEventListener("change", async ()=>{
266
- list.innerHTML = ""; totals.textContent = "";
267
- window.voxoUploadedPaths = [];
268
- const files = Array.from(input.files||[]);
269
- for (const f of files){ await postOne(f); } // sequential for reliability
270
- });
271
- })();
272
- </script>
273
- """)
274
-
275
- # Bridge: JS -> Python
276
- adopt_btn = gr.Button("Use uploaded files", variant="primary")
277
- uploaded_json = gr.Textbox(visible=False)
278
- session_box = gr.Textbox(visible=False)
279
- adopt_btn.click(
280
- fn=None,
281
- inputs=[],
282
- outputs=[uploaded_json, session_box],
283
- js="() => [JSON.stringify(window.voxoUploadedPaths||[]), window.voxoSession||'default']"
284
- )
285
-
286
- # Stage for batch
287
- upload_summary = gr.Markdown("No uploads yet.")
288
- staged_files = gr.State([])
289
- stage_btn = gr.Button("Confirm & Stage", variant="secondary", interactive=True)
290
- stage_btn.click(
291
- adopt_uploaded,
292
- inputs=[uploaded_json, session_box],
293
- outputs=[upload_summary, staged_files, stage_btn],
294
- concurrency_limit=1
295
- )
296
-
297
  with gr.Row():
298
- lang3 = gr.Dropdown(
299
- ["auto","en","es","fr","de","it","pt","ja","ko","zh"],
300
- value="auto", label="Language"
301
- )
302
- ts3 = gr.Checkbox(value=True, label="Show timestamps")
303
- run_batch = gr.Button("Run Batch", variant="primary", interactive=False)
304
  batch_out = gr.Markdown("Ready.")
305
- zip_out = gr.File(label="Download transcripts (ZIP)")
306
- run_batch.click(
 
 
 
 
 
307
  batch_transcribe_stream,
308
- inputs=[staged_files, lang3, ts3],
309
  outputs=[batch_out, zip_out],
310
  concurrency_limit=1
311
  )
312
 
313
- gr.Markdown(
314
- f"**Engine**: `{MODEL_ID}` on `{DEVICE}` ({COMPUTE}). "
315
- "Tip: Use an L4 GPU for large-v3 fast runs; switch back to CPU Basic to save dollars."
316
- )
317
-
318
- # Queue for Gradio events (uploads handled by FastAPI below)
319
- demo.queue(default_concurrency_limit=1)
320
-
321
- # ===== FastAPI: real upload endpoint =====
322
- from fastapi import FastAPI, UploadFile, File as _FAFile, Form
323
- from fastapi.responses import JSONResponse
324
-
325
- api = FastAPI()
326
-
327
- @api.post("/voxo-upload")
328
- async def voxo_upload(file: UploadFile = _FAFile(...), session: str = Form("default")):
329
- session_dir = os.path.join(UPLOAD_ROOT, _safe(session))
330
- os.makedirs(session_dir, exist_ok=True)
331
- dest = os.path.join(session_dir, _safe(file.filename))
332
- with open(dest, "wb") as out:
333
- while True:
334
- chunk = await file.read(1024 * 1024) # 1MB chunks
335
- if not chunk:
336
- break
337
- out.write(chunk)
338
- return JSONResponse({"path": dest, "name": os.path.basename(dest)})
339
 
340
- # 👇 Export a single ASGI app for Spaces to serve
341
- app = gr.mount_gradio_app(api, demo, path="/")
 
1
+ import os, time, re, shutil, zipfile, subprocess
2
  import gradio as gr
3
  from faster_whisper import WhisperModel
4
 
5
# ---------- Device & Model (lazy load so startup is instant) ----------
# NOTE(review): presence of /dev/nvidia0 is used as the GPU heuristic — confirm
# this holds on the target Spaces hardware.
DEVICE = "cuda" if os.path.exists("/dev/nvidia0") else "cpu"
COMPUTE = "float16" if DEVICE == "cuda" else "int8"
MODEL_ID = os.getenv(
    "VOXO_MODEL",
    "Systran/faster-whisper-large-v3" if DEVICE == "cuda" else "Systran/faster-whisper-small"
)

# Module-level cache; populated on the first transcription request.
_model = None

def get_model():
    """Return the shared WhisperModel instance, creating it on first use.

    Lazy construction keeps app startup instant; the (potentially slow)
    model download/load cost is paid only when a transcription runs.
    """
    global _model
    if _model is not None:
        return _model
    _model = WhisperModel(MODEL_ID, device=DEVICE, compute_type=COMPUTE)
    return _model
18
+
19
+ # ---------- Helpers ----------
20
  def _ts(t: float) -> str:
21
  m, s = divmod(int(t), 60); h, m = divmod(m, 60)
22
  return f"{h:02d}:{m:02d}:{s:02d}"
 
35
  n /= 1024
36
 
37
  def _safe(name: str) -> str:
38
+ return re.sub(r"[^A-Za-z0-9._-]+", "_", os.path.basename(name))
 
39
 
40
  def _duration_secs(path: str) -> float:
41
  try:
 
48
  except Exception:
49
  return 0.0
50
 
51
+ # ---------- Core Transcribe ----------
52
  def transcribe(audio_path, language="auto", timestamps=True):
53
  if not audio_path:
54
  return ""
55
  lang = None if language == "auto" else language
56
+ segments, _info = get_model().transcribe(
57
  audio_path,
58
  language=lang,
59
  vad_filter=True,
60
  vad_parameters=dict(min_silence_duration_ms=500),
61
+ beam_size=1, # fast; bump to 3–5 for more accuracy
62
  best_of=1,
63
  condition_on_previous_text=False,
64
  no_speech_threshold=0.3,
 
67
  if timestamps else [s.text.strip() for s in segments])
68
  return "\n".join(lines)
69
 
70
+ # ---------- Batch with live ETA (streams updates) ----------
71
def files_added_status(file_paths, progress=gr.Progress(track_tqdm=True)):
    """Summarize newly selected files and gate the Run-Batch button.

    Scans each path for size and audio duration (with a progress bar),
    builds a markdown summary, and returns it together with a gr.update()
    that enables the batch button only when at least one file is present.
    """
    if not file_paths:
        return "No files yet. Add some audio to get started.", gr.update(interactive=False)

    size_sum = 0
    audio_sum = 0.0
    md = ["### Files added"]
    count = len(file_paths)
    for idx, path in enumerate(file_paths, 1):
        name = _safe(path)
        progress(idx / count, desc=f"Scanning {name}")
        # Missing files contribute zero bytes rather than raising.
        size = os.path.getsize(path) if os.path.exists(path) else 0
        dur = _duration_secs(path)
        size_sum += size
        audio_sum += dur
        md.append(f"- ✅ **{name}** — {(_fmt_hms(dur) if dur else '…')} | {_fmt_bytes(size)}")
    md.append("")
    md.append(f"**Total:** {count} files — {_fmt_hms(audio_sum)} audio — {_fmt_bytes(size_sum)}")
    md.append("Ready to run the batch.")
    return "\n".join(md), gr.update(interactive=True)
86
+
87
  def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progress=gr.Progress(track_tqdm=True)):
88
  if not file_paths:
89
  yield "No files selected.", None
 
98
 
99
  summary_parts, processed_audio, completed = [], 0.0, 0
100
 
101
+ def status_md(note: str = "") -> str:
102
  elapsed = time.time() - start
103
  rtf = elapsed / processed_audio if processed_audio > 0 else 0.0
104
+ remaining = max(0.0, total_audio - processed_audio)
105
+ eta = remaining * rtf if processed_audio > 0 else 0.0
106
  header = [
107
  "### Batch Progress",
108
  f"- Files: **{completed}/{n}**",
 
111
  f"- Est. RTF: **{rtf:.2f}**" if processed_audio else "- Est. RTF: **…**",
112
  f"- ETA: **{_fmt_hms(eta)}**" if processed_audio else "- ETA: **…**",
113
  ]
114
+ if note: header.append(f"\n{note}")
115
  tail = "\n".join(summary_parts[-2:]) if summary_parts else ""
116
  return "\n".join(header) + ("\n\n" + tail if tail else "")
117
 
 
130
  f.write(text)
131
 
132
  wall = time.time() - t0
133
+ summary_parts.append(
134
+ f"#### ✅ {name}\n- Audio: {_fmt_hms(file_dur)} | Wall: {_fmt_hms(wall)} | RTF: {(wall/max(1e-6,file_dur)):.2f}\n\n{text}\n"
 
135
  )
 
136
 
137
  processed_audio += file_dur
138
  completed += 1
 
140
 
141
  yield status_md(), None
142
 
143
+ # combined + zip
144
  combined_path = os.path.join(workdir, "_ALL_TRANSCRIPTS.txt")
145
  with open(combined_path, "w", encoding="utf-8") as f:
146
  f.write("\n\n".join(summary_parts))
 
153
 
154
  yield status_md("All done. Download the ZIP for every transcript."), zip_path
155
 
156
# ---------- UI ----------
with gr.Blocks(title="Voxo – Audio to Text") as demo:
    gr.Markdown("# 🎧 Voxo\nDrop audio, get text. GPU = fast, CPU = free.")

    with gr.Tabs():
        # Single-file tab: one upload/recording in, one transcript box out.
        with gr.Tab("Single file"):
            with gr.Row():
                audio = gr.Audio(sources=["upload","microphone"], type="filepath", label="Audio (mp3/wav)")
                lang = gr.Dropdown(["auto","en","es","fr","de","it","pt","ja","ko","zh"], value="auto", label="Language")
            ts = gr.Checkbox(value=True, label="Show timestamps")
            btn = gr.Button("Transcribe", variant="primary")
            out = gr.Textbox(lines=20, label="Transcript", show_copy_button=True)
            btn.click(transcribe, inputs=[audio, lang, ts], outputs=out, concurrency_limit=1)

        # Batch tab: plain multi-file picker, streamed ETA, ZIP download.
        with gr.Tab("Batch"):
            files = gr.File(file_count="multiple", type="filepath", file_types=["audio"], label="Select multiple audio files")
            upload_status = gr.Markdown("No files yet. Add some audio.")
            with gr.Row():
                batch_lang = gr.Dropdown(["auto","en","es","fr","de","it","pt","ja","ko","zh"], value="auto", label="Language")
                batch_ts = gr.Checkbox(value=True, label="Show timestamps")
                batch_btn = gr.Button("Run Batch", variant="primary", interactive=False)
            batch_out = gr.Markdown("Ready.")
            zip_out = gr.File(label="Download transcripts (ZIP)")

            # Selecting files shows a summary and unlocks the Run button.
            files.change(files_added_status, inputs=[files], outputs=[upload_status, batch_btn])

            # Generator endpoint: streams progress markdown, then the ZIP.
            batch_btn.click(
                batch_transcribe_stream,
                inputs=[files, batch_lang, batch_ts],
                outputs=[batch_out, zip_out],
                concurrency_limit=1
            )

    gr.Markdown(f"**Engine**: `{MODEL_ID}` on `{DEVICE}` ({COMPUTE}). Tip: Use an L4 GPU for large-v3 fast runs; switch back to CPU Basic to save dollars.")

# Start Gradio server (Spaces-friendly)
demo.queue(default_concurrency_limit=1).launch()