Spaces:

BinKhoaLe1812
/

WhisperAPI

Running on Zero

App Files Files Community

LiamKhoaLe committed on Oct 29, 2025

Commit

4c24458

1 Parent(s): 4727f1c

Upd abort time and smart chunk-batcher #3

Browse files

Files changed (1) hide show

app.py +72 -16

app.py CHANGED Viewed

@@ -36,6 +36,20 @@ def _concat_text(chunks):
     return " ".join([c.strip() for c in chunks if c and c.strip()])
 def _robust_transcribe_array(audio_array: np.ndarray, sr: int, task: str) -> str:
     """Transcribe long/large audio by chunking sequentially to minimize GPU memory.
@@ -49,22 +63,44 @@ def _robust_transcribe_array(audio_array: np.ndarray, sr: int, task: str) -> str
     win = int(chunk_s * sr)
     texts = []
     if len(audio_array) <= win:
-        inputs = {"array": audio_array, "sampling_rate": sr}
-        out = pipe(inputs, batch_size=1, generate_kwargs={"task": task})
-        return out["text"]
     start = 0
     while start < len(audio_array):
         end = min(start + win, len(audio_array))
         chunk = audio_array[start:end]
-        inputs = {"array": chunk, "sampling_rate": sr}
-        out = pipe(inputs, batch_size=1, generate_kwargs={"task": task})
-        texts.append(out["text"])
         if end == len(audio_array):
             break
         start += step
     return _concat_text(texts)
 def _robust_transcribe_path(path: str, task: str) -> str:
     sr = pipe.feature_extractor.sampling_rate
     # ffmpeg_read expects raw bytes, not a file path
@@ -97,22 +133,42 @@ def _robust_transcribe_path(path: str, task: str) -> str:
 def transcribe(inputs, task, summarize=False):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
         if isinstance(inputs, str):
-            text = _robust_transcribe_path(inputs, task)
         elif isinstance(inputs, dict) and "array" in inputs:
-            text = _robust_transcribe_array(inputs["array"], inputs.get("sampling_rate", pipe.feature_extractor.sampling_rate), task)
         else:
-            text = pipe(inputs, batch_size=1, generate_kwargs={"task": task})["text"]
     except Exception as e:
         raise gr.Error(f"Transcription failed: {e}")
-    if summarize:
-        try:
-            summary = summarize_with_gemini(text)
-        except Exception as e:
-            summary = f"Summary error: {e}"
-        return text, summary
-    return text, ""
 def _return_yt_html_embed(yt_url):

     return " ".join([c.strip() for c in chunks if c and c.strip()])
+def _transcribe_chunk(chunk: np.ndarray, sr: int, task: str, max_retries: int = 3) -> str:
+    """Transcribe a single chunk with retries and simple backoff."""
+    delay = 2.0
+    for attempt in range(max_retries):
+        try:
+            out = pipe({"array": chunk, "sampling_rate": sr}, batch_size=1, generate_kwargs={"task": task})
+            return out["text"]
+        except Exception:
+            if attempt == max_retries - 1:
+                raise
+            time.sleep(delay)
+            delay *= 1.8
 def _robust_transcribe_array(audio_array: np.ndarray, sr: int, task: str) -> str:
     """Transcribe long/large audio by chunking sequentially to minimize GPU memory.
     win = int(chunk_s * sr)
     texts = []
     if len(audio_array) <= win:
+        return _transcribe_chunk(audio_array, sr, task)
     start = 0
     while start < len(audio_array):
         end = min(start + win, len(audio_array))
         chunk = audio_array[start:end]
+        txt = _transcribe_chunk(chunk, sr, task)
+        texts.append(txt)
         if end == len(audio_array):
             break
         start += step
     return _concat_text(texts)
+def _robust_transcribe_array_stream(audio_array: np.ndarray, sr: int, task: str):
+    """Generator: yields cumulative transcription after each chunk."""
+    if audio_array.ndim > 1:
+        audio_array = np.mean(audio_array, axis=1)
+    chunk_s = 20
+    overlap_s = 2
+    step = int((chunk_s - overlap_s) * sr)
+    win = int(chunk_s * sr)
+    texts = []
+    if len(audio_array) <= win:
+        texts.append(_transcribe_chunk(audio_array, sr, task))
+        yield _concat_text(texts)
+        return
+    start = 0
+    while start < len(audio_array):
+        end = min(start + win, len(audio_array))
+        chunk = audio_array[start:end]
+        txt = _transcribe_chunk(chunk, sr, task)
+        texts.append(txt)
+        yield _concat_text(texts)
+        if end == len(audio_array):
+            break
+        start += step
 def _robust_transcribe_path(path: str, task: str) -> str:
     sr = pipe.feature_extractor.sampling_rate
     # ffmpeg_read expects raw bytes, not a file path
 def transcribe(inputs, task, summarize=False):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+    # Stream outputs incrementally: yield tuples (transcription_so_far, summary_so_far)
+    def _stream(gen):
+        running_text = ""
+        running_summary = ""
+        for partial in gen:
+            running_text = partial
+            if summarize and partial.strip():
+                try:
+                    running_summary += ("\n\n" if running_summary else "") + summarize_with_gemini(partial)
+                except Exception:
+                    pass
+            yield running_text, (running_summary if summarize else "")
     try:
         if isinstance(inputs, str):
+            # File path handed by Gradio
+            sr = pipe.feature_extractor.sampling_rate
+            with open(inputs, "rb") as _f:
+                payload = _f.read()
+            audio = ffmpeg_read(payload, sr)
+            return _stream(_robust_transcribe_array_stream(audio, sr, task))
         elif isinstance(inputs, dict) and "array" in inputs:
+            sr = inputs.get("sampling_rate", pipe.feature_extractor.sampling_rate)
+            return _stream(_robust_transcribe_array_stream(inputs["array"], sr, task))
         else:
+            # Fallback single shot
+            out = pipe(inputs, batch_size=1, generate_kwargs={"task": task})["text"]
+            if summarize:
+                try:
+                    summ = summarize_with_gemini(out)
+                except Exception as e:
+                    summ = f"Summary error: {e}"
+                return out, summ
+            return out, ""
     except Exception as e:
         raise gr.Error(f"Transcription failed: {e}")
 def _return_yt_html_embed(yt_url):