Spaces:

BinKhoaLe1812
/

WhisperAPI

Running on Zero

App Files Files Community

LiamKhoaLe committed on Oct 29, 2025

Commit

e47935f

1 Parent(s): c0167f3

Upd abort time and smart chunk-batcher

Browse files

Files changed (1) hide show

app.py +71 -10

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ from transformers.pipelines.audio_utils import ffmpeg_read
 import tempfile
 import os
 import time
 import google.generativeai as genai
 from dotenv import load_dotenv
@@ -26,16 +27,80 @@ pipe = pipeline(
     model=MODEL_NAME,
     device=device,
     ignore_warning=True,
-    model_kwargs={"torch_dtype": torch.float16} if torch.cuda.is_available() else {}
 )
-@spaces.GPU
 def transcribe(inputs, task, summarize=False):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
-        text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
     except Exception as e:
         raise gr.Error(f"Transcription failed: {e}")
     if summarize:
@@ -100,7 +165,7 @@ def yt_transcribe(yt_url, task, summarize=False, max_filesize=75.0):
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
     try:
-        text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
     except Exception as e:
         raise gr.Error(f"Transcription failed: {e}")
     summary = ""
@@ -139,9 +204,7 @@ file_transcribe = gr.Interface(
     outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")],
     title="Whisper Large V3: Audio file",
     description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-        " of arbitrary length."
     ),
     flagging_mode="never",
 )
@@ -156,9 +219,7 @@ yt_transcribe = gr.Interface(
     outputs=["html", gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")],
     title="Whisper Large V3: Transcribe YouTube",
     description=(
-        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
-        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
-        " arbitrary length."
     ),
     flagging_mode="never",
 )

 import tempfile
 import os
 import time
+import numpy as np
 import google.generativeai as genai
 from dotenv import load_dotenv
     model=MODEL_NAME,
     device=device,
     ignore_warning=True,
+    model_kwargs={"torch_dtype": torch.float16} if torch.cuda.is_available() else {},
+    chunk_length_s=20,  # small chunks to fit ZeroGPU
 )
+def _concat_text(chunks):
+    """Join transcript fragments into a single space-separated string.
+    Each fragment is stripped of surrounding whitespace; fragments that are
+    falsy (e.g. ``None`` or ``""``) or blank after stripping are dropped.
+    """
+    cleaned = []
+    for piece in chunks:
+        if not piece:
+            continue
+        trimmed = piece.strip()
+        if trimmed:
+            cleaned.append(trimmed)
+    return " ".join(cleaned)
+def _robust_transcribe_array(audio_array: np.ndarray, sr: int, task: str) -> str:
+    """Transcribe an in-memory waveform with the module-level ``pipe``, window by window.
+    Input with more than one dimension is first averaged along axis 1
+    (presumably a (samples, channels) layout -- confirm against callers).
+    A signal that fits in one 20 s window goes to ``pipe`` in a single call and
+    its text is returned as-is. Longer signals are cut into 20 s windows that
+    advance by 18 s (2 s overlap), transcribed one at a time with
+    ``batch_size=1`` and joined with ``_concat_text``; text falling inside the
+    overlap is not de-duplicated.
+    """
+    waveform = np.mean(audio_array, axis=1) if audio_array.ndim > 1 else audio_array
+    window_s, overlap_s = 20, 2
+    hop_len = int((window_s - overlap_s) * sr)
+    window_len = int(window_s * sr)
+    def _run(segment):
+        # One sequential pipeline call per segment.
+        result = pipe({"array": segment, "sampling_rate": sr}, batch_size=1, generate_kwargs={"task": task})
+        return result["text"]
+    # Short input: a single call, no windowing and no post-processing of the text.
+    if len(waveform) <= window_len:
+        return _run(waveform)
+    pieces = []
+    offset = 0
+    while offset < len(waveform):
+        stop = min(offset + window_len, len(waveform))
+        pieces.append(_run(waveform[offset:stop]))
+        # The window that reaches the end of the signal is the last one.
+        if stop == len(waveform):
+            break
+        offset += hop_len
+    return _concat_text(pieces)
+def _robust_transcribe_path(path: str, task: str) -> str:
+    """Decode the audio file at ``path`` and transcribe it, retrying with smaller windows.
+    The file is read as bytes and decoded by ``ffmpeg_read`` at the sampling
+    rate of ``pipe.feature_extractor``. The waveform is first handed to
+    ``_robust_transcribe_array``; if that raises, the audio is re-transcribed
+    in 10 s windows advancing by 8 s, one ``pipe`` call per window, and the
+    texts are joined with ``_concat_text``.
+    Raises:
+        gr.Error: if the small-window retry also fails.
+        Errors from opening or decoding the file propagate unchanged.
+    """
+    sr = pipe.feature_extractor.sampling_rate
+    # ffmpeg_read takes the encoded file contents (bytes) and pipes them to
+    # ffmpeg's stdin; it does not accept a filename, so read the file first.
+    with open(path, "rb") as fh:
+        payload = fh.read()
+    audio = ffmpeg_read(payload, sr)
+    try:
+        return _robust_transcribe_array(audio, sr, task)
+    except Exception:
+        # Deliberate best-effort: any failure of the first pass triggers one
+        # last attempt with smaller windows before giving up.
+        try:
+            win = int(10 * sr)
+            step = int(8 * sr)
+            texts = []
+            pos = 0
+            while pos < len(audio):
+                sub = audio[pos:pos + win]
+                out = pipe({"array": sub, "sampling_rate": sr}, batch_size=1, generate_kwargs={"task": task})
+                texts.append(out["text"])
+                # Stop once the current window reaches the end of the audio.
+                if pos + win >= len(audio):
+                    break
+                pos += step
+            return _concat_text(texts)
+        except Exception as e2:
+            raise gr.Error(f"Transcription failed after retries: {e2}") from e2
+@spaces.GPU(duration=2400) # 40 minutes
 def transcribe(inputs, task, summarize=False):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
+        if isinstance(inputs, str):
+            text = _robust_transcribe_path(inputs, task)
+        elif isinstance(inputs, dict) and "array" in inputs:
+            text = _robust_transcribe_array(inputs["array"], inputs.get("sampling_rate", pipe.feature_extractor.sampling_rate), task)
+        else:
+            text = pipe(inputs, batch_size=1, generate_kwargs={"task": task})["text"]
     except Exception as e:
         raise gr.Error(f"Transcription failed: {e}")
     if summarize:
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
     try:
+        text = _robust_transcribe_array(inputs["array"], inputs["sampling_rate"], task)
     except Exception as e:
         raise gr.Error(f"Transcription failed: {e}")
     summary = ""
     outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")],
     title="Whisper Large V3: Audio file",
     description=(
+        "Transcribe long-form microphone or audio inputs."
     ),
     flagging_mode="never",
 )
     outputs=["html", gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")],
     title="Whisper Large V3: Transcribe YouTube",
     description=(
+        "Transcribe long-form YouTube videos."
     ),
     flagging_mode="never",
 )