frankmcmahen committed on
Commit
d0f8862
·
verified ·
1 Parent(s): 73a7a7e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -203
app.py CHANGED
@@ -1,17 +1,22 @@
1
- import os, time, re, shutil, zipfile, subprocess, json
2
  import gradio as gr
3
  from faster_whisper import WhisperModel
4
 
5
- # ===== Hardware & model selection =====
6
  DEVICE = "cuda" if os.path.exists("/dev/nvidia0") else "cpu"
7
  COMPUTE = "float16" if DEVICE == "cuda" else "int8"
8
  MODEL_ID = os.getenv(
9
  "VOXO_MODEL",
10
  "Systran/faster-whisper-large-v3" if DEVICE == "cuda" else "Systran/faster-whisper-small"
11
  )
12
- model = WhisperModel(MODEL_ID, device=DEVICE, compute_type=COMPUTE)
13
-
14
- # ===== Helpers =====
 
 
 
 
 
15
  def _ts(t: float) -> str:
16
  m, s = divmod(int(t), 60); h, m = divmod(m, 60)
17
  return f"{h:02d}:{m:02d}:{s:02d}"
@@ -30,8 +35,7 @@ def _fmt_bytes(n: int) -> str:
30
  n /= 1024
31
 
32
  def _safe(name: str) -> str:
33
- base = os.path.basename(name)
34
- return re.sub(r"[^A-Za-z0-9._-]+", "_", base)
35
 
36
  def _duration_secs(path: str) -> float:
37
  try:
@@ -44,17 +48,17 @@ def _duration_secs(path: str) -> float:
44
  except Exception:
45
  return 0.0
46
 
47
- # ===== Core transcription =====
48
  def transcribe(audio_path, language="auto", timestamps=True):
49
  if not audio_path:
50
  return ""
51
  lang = None if language == "auto" else language
52
- segments, _info = model.transcribe(
53
  audio_path,
54
  language=lang,
55
  vad_filter=True,
56
  vad_parameters=dict(min_silence_duration_ms=500),
57
- beam_size=1, # fast; bump to 3–5 if you want extra accuracy
58
  best_of=1,
59
  condition_on_previous_text=False,
60
  no_speech_threshold=0.3,
@@ -63,7 +67,23 @@ def transcribe(audio_path, language="auto", timestamps=True):
63
  if timestamps else [s.text.strip() for s in segments])
64
  return "\n".join(lines)
65
 
66
- # ===== Batch with streaming ETA =====
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progress=gr.Progress(track_tqdm=True)):
68
  if not file_paths:
69
  yield "No files selected.", None
@@ -78,11 +98,11 @@ def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progre
78
 
79
  summary_parts, processed_audio, completed = [], 0.0, 0
80
 
81
- def status_md(extra_note: str = "") -> str:
82
  elapsed = time.time() - start
83
  rtf = elapsed / processed_audio if processed_audio > 0 else 0.0
84
- remaining_audio = max(0.0, total_audio - processed_audio)
85
- eta = remaining_audio * rtf if processed_audio > 0 else 0.0
86
  header = [
87
  "### Batch Progress",
88
  f"- Files: **{completed}/{n}**",
@@ -91,7 +111,7 @@ def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progre
91
  f"- Est. RTF: **{rtf:.2f}**" if processed_audio else "- Est. RTF: **…**",
92
  f"- ETA: **{_fmt_hms(eta)}**" if processed_audio else "- ETA: **…**",
93
  ]
94
- if extra_note: header.append(f"\n{extra_note}")
95
  tail = "\n".join(summary_parts[-2:]) if summary_parts else ""
96
  return "\n".join(header) + ("\n\n" + tail if tail else "")
97
 
@@ -110,11 +130,9 @@ def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progre
110
  f.write(text)
111
 
112
  wall = time.time() - t0
113
- per_file = (
114
- f"#### ✅ {name}\n- Audio: {_fmt_hms(file_dur)} | "
115
- f"Wall: {_fmt_hms(wall)} | RTF: {(wall/max(1e-6,file_dur)):.2f}\n\n{text}\n"
116
  )
117
- summary_parts.append(per_file)
118
 
119
  processed_audio += file_dur
120
  completed += 1
@@ -122,6 +140,7 @@ def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progre
122
 
123
  yield status_md(), None
124
 
 
125
  combined_path = os.path.join(workdir, "_ALL_TRANSCRIPTS.txt")
126
  with open(combined_path, "w", encoding="utf-8") as f:
127
  f.write("\n\n".join(summary_parts))
@@ -134,208 +153,44 @@ def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progre
134
 
135
  yield status_md("All done. Download the ZIP for every transcript."), zip_path
136
 
137
- # ===== Progress uploader adoption =====
138
- UPLOAD_ROOT = "/tmp/voxo_progress_uploads"
139
-
140
- def adopt_uploaded(json_paths: str, session_id: str):
141
- """Take server paths from the custom uploader and prep status + list for batch."""
142
- try:
143
- paths = json.loads(json_paths) if json_paths else []
144
- except Exception:
145
- paths = []
146
- safe_paths, total_size, total_audio = [], 0, 0.0
147
- base = os.path.join(UPLOAD_ROOT, _safe(session_id))
148
- for p in paths:
149
- if not p: continue
150
- p = os.path.abspath(p)
151
- if not p.startswith(base): # sandbox check
152
- continue
153
- if os.path.exists(p):
154
- safe_paths.append(p)
155
- total_size += os.path.getsize(p)
156
- total_audio += _duration_secs(p)
157
- status = (
158
- "### Files staged\n" +
159
- "\n".join([f"- ✅ **{_safe(p)}** — {_fmt_hms(_duration_secs(p))} | {_fmt_bytes(os.path.getsize(p))}" for p in safe_paths]) +
160
- (f"\n\n**Total:** {len(safe_paths)} files — {_fmt_hms(total_audio)} — {_fmt_bytes(total_size)}" if safe_paths else "\n\nNo valid files.")
161
- )
162
- return status, safe_paths, gr.update(interactive=bool(safe_paths))
163
-
164
- # ===== UI =====
165
  with gr.Blocks(title="Voxo – Audio to Text") as demo:
166
  gr.Markdown("# 🎧 Voxo\nDrop audio, get text. GPU = fast, CPU = free.")
167
 
168
  with gr.Tabs():
169
- # --- Single file ---
170
  with gr.Tab("Single file"):
171
  with gr.Row():
172
  audio = gr.Audio(sources=["upload","microphone"], type="filepath", label="Audio (mp3/wav)")
173
- lang = gr.Dropdown(
174
- ["auto","en","es","fr","de","it","pt","ja","ko","zh"],
175
- value="auto", label="Language"
176
- )
177
  ts = gr.Checkbox(value=True, label="Show timestamps")
178
  btn = gr.Button("Transcribe", variant="primary")
179
  out = gr.Textbox(lines=20, label="Transcript", show_copy_button=True)
180
  btn.click(transcribe, inputs=[audio, lang, ts], outputs=out, concurrency_limit=1)
181
 
182
- # --- Batch (Progress Uploads) replaces old Batch entirely ---
183
  with gr.Tab("Batch"):
184
- gr.Markdown("**Upload with real progress bars, then run the batch.**")
185
-
186
- # Hidden wiring to carry results from JS → Python
187
- uploaded_json = gr.Textbox(visible=False)
188
- session_box = gr.Textbox(visible=False)
189
- staged_files = gr.State([]) # python list[str] of server paths
190
-
191
- # Custom HTML + JS uploader with true progress bars
192
- uploader = gr.HTML("""
193
- <div id="vx_uploader_wrap" style="border:1px dashed #7c3aed;padding:14px;border-radius:12px">
194
- <input id="vx_input" type="file" multiple accept="audio/*" style="margin-bottom:8px"/>
195
- <div id="vx_hint" style="font-size:12px;opacity:.7;margin-bottom:8px">Select multiple audio files. Upload starts immediately.</div>
196
- <div id="vx_progress_list" style="display:flex;flex-direction:column;gap:6px"></div>
197
- <div id="vx_totals" style="margin-top:8px;font-size:12px;opacity:.8"></div>
198
- </div>
199
- <script>
200
- (function(){
201
- const uploadUrl = "/voxo-upload";
202
- const input = document.getElementById("vx_input");
203
- const list = document.getElementById("vx_progress_list");
204
- const totals = document.getElementById("vx_totals");
205
- window.voxoUploadedPaths = [];
206
- window.voxoSession = String(Date.now());
207
-
208
- function fmtBytes(n){const u=["B","KB","MB","GB","TB"];let i=0;while(n>=1024&&i<u.length-1){n/=1024;i++;}return (i?n.toFixed(1):n)+" "+u[i];}
209
- function fmtHMS(sec){sec=Math.max(0,sec|0);let m=sec/60|0,s=sec%60,h=m/60|0;m%=60;return h?`${h}h ${String(m).padStart(2,'0')}m ${String(s).padStart(2,'0')}s`:m?`${m}m ${String(s).padStart(2,'0')}s`:`${s}s`;}
210
-
211
- function makeRow(name,size){
212
- const row=document.createElement("div");
213
- row.style="display:flex;align-items:center;gap:8px;white-space:nowrap";
214
- row.innerHTML = `
215
- <span style="flex:1;overflow:hidden;text-overflow:ellipsis">${name}</span>
216
- <span style="width:78px;text-align:right;font-size:12px;opacity:.7">${fmtBytes(size)}</span>
217
- <progress value="0" max="100" style="flex:0 0 160px;height:10px"></progress>
218
- <span class="pct" style="width:40px;text-align:right;font-size:12px">0%</span>
219
- <span class="spd" style="width:90px;text-align:right;font-size:12px;opacity:.7"></span>
220
- `;
221
- return row;
222
- }
223
-
224
- function postOne(file){
225
- return new Promise((resolve)=>{
226
- const row = makeRow(file.name, file.size);
227
- const bar = row.querySelector("progress");
228
- const pct = row.querySelector(".pct");
229
- const spd = row.querySelector(".spd");
230
- list.appendChild(row);
231
-
232
- const xhr = new XMLHttpRequest();
233
- xhr.open("POST", uploadUrl);
234
- const t0 = performance.now();
235
-
236
- xhr.upload.onprogress = (e)=>{
237
- if(e.lengthComputable){
238
- const p = Math.round(100*e.loaded/e.total);
239
- bar.value = p; pct.textContent = p+"%";
240
- const sec = (performance.now()-t0)/1000;
241
- const rate = e.loaded / Math.max(1e-6, sec); // B/s
242
- const remain = (e.total - e.loaded) / Math.max(1e-6, rate); // s
243
- spd.textContent = (rate<1024?`${rate|0} B/s` : rate<1024*1024?`${(rate/1024).toFixed(1)} KB/s` : `${(rate/1024/1024).toFixed(1)} MB/s`) + " · " + fmtHMS(remain);
244
- }
245
- };
246
- xhr.onload = ()=>{
247
- if(xhr.status===200){
248
- try{
249
- const resp = JSON.parse(xhr.responseText);
250
- window.voxoUploadedPaths.push(resp.path);
251
- bar.value = 100; pct.textContent = "✓"; spd.textContent = "";
252
- }catch(e){ pct.textContent = "err"; }
253
- }else{ pct.textContent = "err"; }
254
- const done = list.querySelectorAll("progress[value='100']").length;
255
- totals.textContent = done + " / " + list.children.length + " uploaded";
256
- resolve();
257
- };
258
- const form = new FormData();
259
- form.append("session", window.voxoSession);
260
- form.append("file", file, file.name);
261
- xhr.send(form);
262
- });
263
- }
264
-
265
- input.addEventListener("change", async ()=>{
266
- list.innerHTML = ""; totals.textContent = "";
267
- window.voxoUploadedPaths = [];
268
- const files = Array.from(input.files||[]);
269
- for (const f of files){ await postOne(f); } // sequential for reliability
270
- });
271
- })();
272
- </script>
273
- """)
274
-
275
- # Bridge: JS -> Python
276
- adopt_btn = gr.Button("Use uploaded files", variant="primary")
277
- uploaded_json = gr.Textbox(visible=False)
278
- session_box = gr.Textbox(visible=False)
279
- adopt_btn.click(
280
- fn=None,
281
- inputs=[],
282
- outputs=[uploaded_json, session_box],
283
- js="() => [JSON.stringify(window.voxoUploadedPaths||[]), window.voxoSession||'default']"
284
- )
285
-
286
- # Stage for batch
287
- upload_summary = gr.Markdown("No uploads yet.")
288
- staged_files = gr.State([])
289
- stage_btn = gr.Button("Confirm & Stage", variant="secondary", interactive=True)
290
- stage_btn.click(
291
- adopt_uploaded,
292
- inputs=[uploaded_json, session_box],
293
- outputs=[upload_summary, staged_files, stage_btn],
294
- concurrency_limit=1
295
- )
296
-
297
  with gr.Row():
298
- lang3 = gr.Dropdown(
299
- ["auto","en","es","fr","de","it","pt","ja","ko","zh"],
300
- value="auto", label="Language"
301
- )
302
- ts3 = gr.Checkbox(value=True, label="Show timestamps")
303
- run_batch = gr.Button("Run Batch", variant="primary", interactive=False)
304
  batch_out = gr.Markdown("Ready.")
305
- zip_out = gr.File(label="Download transcripts (ZIP)")
306
- run_batch.click(
 
 
 
 
 
307
  batch_transcribe_stream,
308
- inputs=[staged_files, lang3, ts3],
309
  outputs=[batch_out, zip_out],
310
  concurrency_limit=1
311
  )
312
 
313
- gr.Markdown(
314
- f"**Engine**: `{MODEL_ID}` on `{DEVICE}` ({COMPUTE}). "
315
- "Tip: Use an L4 GPU for large-v3 fast runs; switch back to CPU Basic to save dollars."
316
- )
317
-
318
- # Queue for Gradio events (uploads handled by FastAPI below)
319
- demo.queue(default_concurrency_limit=1)
320
-
321
- # ===== FastAPI: real upload endpoint =====
322
- from fastapi import FastAPI, UploadFile, File as _FAFile, Form
323
- from fastapi.responses import JSONResponse
324
-
325
- api = FastAPI()
326
-
327
- @api.post("/voxo-upload")
328
- async def voxo_upload(file: UploadFile = _FAFile(...), session: str = Form("default")):
329
- session_dir = os.path.join(UPLOAD_ROOT, _safe(session))
330
- os.makedirs(session_dir, exist_ok=True)
331
- dest = os.path.join(session_dir, _safe(file.filename))
332
- with open(dest, "wb") as out:
333
- while True:
334
- chunk = await file.read(1024 * 1024) # 1MB chunks
335
- if not chunk:
336
- break
337
- out.write(chunk)
338
- return JSONResponse({"path": dest, "name": os.path.basename(dest)})
339
 
340
- # 👇 Export a single ASGI app for Spaces to serve
341
- app = gr.mount_gradio_app(api, demo, path="/")
 
1
+ import os, time, re, shutil, zipfile, subprocess
2
  import gradio as gr
3
  from faster_whisper import WhisperModel
4
 
5
# ---------- Device & Model (lazy load so startup is instant) ----------
# NOTE(review): presence of /dev/nvidia0 is used as the GPU heuristic — confirm
# this holds on the target Spaces hardware.
DEVICE = "cuda" if os.path.exists("/dev/nvidia0") else "cpu"
COMPUTE = "float16" if DEVICE == "cuda" else "int8"
MODEL_ID = os.getenv(
    "VOXO_MODEL",
    "Systran/faster-whisper-large-v3" if DEVICE == "cuda" else "Systran/faster-whisper-small"
)

# Module-level cache; populated on the first transcription request.
_model = None

def get_model():
    """Return the shared WhisperModel instance, creating it on first use.

    Lazy construction keeps app startup instant; the (potentially slow)
    model download/load cost is paid only when a transcription runs.
    """
    global _model
    if _model is not None:
        return _model
    _model = WhisperModel(MODEL_ID, device=DEVICE, compute_type=COMPUTE)
    return _model
18
+
19
+ # ---------- Helpers ----------
20
  def _ts(t: float) -> str:
21
  m, s = divmod(int(t), 60); h, m = divmod(m, 60)
22
  return f"{h:02d}:{m:02d}:{s:02d}"
 
35
  n /= 1024
36
 
37
  def _safe(name: str) -> str:
38
+ return re.sub(r"[^A-Za-z0-9._-]+", "_", os.path.basename(name))
 
39
 
40
  def _duration_secs(path: str) -> float:
41
  try:
 
48
  except Exception:
49
  return 0.0
50
 
51
+ # ---------- Core Transcribe ----------
52
  def transcribe(audio_path, language="auto", timestamps=True):
53
  if not audio_path:
54
  return ""
55
  lang = None if language == "auto" else language
56
+ segments, _info = get_model().transcribe(
57
  audio_path,
58
  language=lang,
59
  vad_filter=True,
60
  vad_parameters=dict(min_silence_duration_ms=500),
61
+ beam_size=1, # fast; bump to 3–5 for more accuracy
62
  best_of=1,
63
  condition_on_previous_text=False,
64
  no_speech_threshold=0.3,
 
67
  if timestamps else [s.text.strip() for s in segments])
68
  return "\n".join(lines)
69
 
70
+ # ---------- Batch with live ETA (streams updates) ----------
71
def files_added_status(file_paths, progress=gr.Progress(track_tqdm=True)):
    """Summarize newly selected files and gate the Run-Batch button.

    Scans each path for size and audio duration (with a progress bar),
    builds a markdown summary, and returns it together with a gr.update()
    that enables the batch button only when at least one file is present.
    """
    if not file_paths:
        return "No files yet. Add some audio to get started.", gr.update(interactive=False)

    size_sum = 0
    audio_sum = 0.0
    md = ["### Files added"]
    count = len(file_paths)
    for idx, path in enumerate(file_paths, 1):
        name = _safe(path)
        progress(idx / count, desc=f"Scanning {name}")
        # Missing files contribute zero bytes rather than raising.
        size = os.path.getsize(path) if os.path.exists(path) else 0
        dur = _duration_secs(path)
        size_sum += size
        audio_sum += dur
        md.append(f"- ✅ **{name}** — {(_fmt_hms(dur) if dur else '…')} | {_fmt_bytes(size)}")
    md.append("")
    md.append(f"**Total:** {count} files — {_fmt_hms(audio_sum)} audio — {_fmt_bytes(size_sum)}")
    md.append("Ready to run the batch.")
    return "\n".join(md), gr.update(interactive=True)
86
+
87
  def batch_transcribe_stream(file_paths, language="auto", timestamps=True, progress=gr.Progress(track_tqdm=True)):
88
  if not file_paths:
89
  yield "No files selected.", None
 
98
 
99
  summary_parts, processed_audio, completed = [], 0.0, 0
100
 
101
+ def status_md(note: str = "") -> str:
102
  elapsed = time.time() - start
103
  rtf = elapsed / processed_audio if processed_audio > 0 else 0.0
104
+ remaining = max(0.0, total_audio - processed_audio)
105
+ eta = remaining * rtf if processed_audio > 0 else 0.0
106
  header = [
107
  "### Batch Progress",
108
  f"- Files: **{completed}/{n}**",
 
111
  f"- Est. RTF: **{rtf:.2f}**" if processed_audio else "- Est. RTF: **…**",
112
  f"- ETA: **{_fmt_hms(eta)}**" if processed_audio else "- ETA: **…**",
113
  ]
114
+ if note: header.append(f"\n{note}")
115
  tail = "\n".join(summary_parts[-2:]) if summary_parts else ""
116
  return "\n".join(header) + ("\n\n" + tail if tail else "")
117
 
 
130
  f.write(text)
131
 
132
  wall = time.time() - t0
133
+ summary_parts.append(
134
+ f"#### ✅ {name}\n- Audio: {_fmt_hms(file_dur)} | Wall: {_fmt_hms(wall)} | RTF: {(wall/max(1e-6,file_dur)):.2f}\n\n{text}\n"
 
135
  )
 
136
 
137
  processed_audio += file_dur
138
  completed += 1
 
140
 
141
  yield status_md(), None
142
 
143
+ # combined + zip
144
  combined_path = os.path.join(workdir, "_ALL_TRANSCRIPTS.txt")
145
  with open(combined_path, "w", encoding="utf-8") as f:
146
  f.write("\n\n".join(summary_parts))
 
153
 
154
  yield status_md("All done. Download the ZIP for every transcript."), zip_path
155
 
156
# ---------- UI ----------
with gr.Blocks(title="Voxo – Audio to Text") as demo:
    gr.Markdown("# 🎧 Voxo\nDrop audio, get text. GPU = fast, CPU = free.")

    with gr.Tabs():
        # Single-file tab: one upload/recording in, one transcript box out.
        with gr.Tab("Single file"):
            with gr.Row():
                audio = gr.Audio(sources=["upload","microphone"], type="filepath", label="Audio (mp3/wav)")
                lang = gr.Dropdown(["auto","en","es","fr","de","it","pt","ja","ko","zh"], value="auto", label="Language")
            ts = gr.Checkbox(value=True, label="Show timestamps")
            btn = gr.Button("Transcribe", variant="primary")
            out = gr.Textbox(lines=20, label="Transcript", show_copy_button=True)
            btn.click(transcribe, inputs=[audio, lang, ts], outputs=out, concurrency_limit=1)

        # Batch tab: plain multi-file picker, streamed ETA, ZIP download.
        with gr.Tab("Batch"):
            files = gr.File(file_count="multiple", type="filepath", file_types=["audio"], label="Select multiple audio files")
            upload_status = gr.Markdown("No files yet. Add some audio.")
            with gr.Row():
                batch_lang = gr.Dropdown(["auto","en","es","fr","de","it","pt","ja","ko","zh"], value="auto", label="Language")
                batch_ts = gr.Checkbox(value=True, label="Show timestamps")
                batch_btn = gr.Button("Run Batch", variant="primary", interactive=False)
            batch_out = gr.Markdown("Ready.")
            zip_out = gr.File(label="Download transcripts (ZIP)")

            # Selecting files shows a summary and unlocks the Run button.
            files.change(files_added_status, inputs=[files], outputs=[upload_status, batch_btn])

            # Generator endpoint: streams progress markdown, then the ZIP.
            batch_btn.click(
                batch_transcribe_stream,
                inputs=[files, batch_lang, batch_ts],
                outputs=[batch_out, zip_out],
                concurrency_limit=1
            )

    gr.Markdown(f"**Engine**: `{MODEL_ID}` on `{DEVICE}` ({COMPUTE}). Tip: Use an L4 GPU for large-v3 fast runs; switch back to CPU Basic to save dollars.")

# Start Gradio server (Spaces-friendly)
demo.queue(default_concurrency_limit=1).launch()