Ratnesh-dev committed on
Commit
1db40b9
·
1 Parent(s): a8f3b8c

Refactor To Only Use Whisper Turbo And Parakeet

Browse files
README.md CHANGED
@@ -9,19 +9,19 @@ python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
  license: mit
12
- short_description: Multi-model ASR APIs with word-level timestamps
13
  ---
14
 
15
  This Space is optimized for API usage on ZeroGPU.
16
 
17
  Supported models (word-level timestamp capable):
18
- - Whisper Large V3
19
  - Whisper Large V3 Turbo
20
- - Whisper.cpp (large)
21
- - Whisper faster (large)
22
  - NVIDIA Parakeet v3
23
 
24
  Omitted:
 
 
 
25
  - IBM Granite Speech 3.3 8B (no stable, documented word-level timestamp output in standard inference APIs)
26
 
27
  Every transcription response returns:
@@ -29,23 +29,16 @@ Every transcription response returns:
29
  - `zerogpu_timing.gpu_window_seconds`
30
  - `zerogpu_timing.inference_seconds`
31
 
32
- Whisper.cpp notes:
33
- - Requires a whisper.cpp binary and a model file.
34
- - Configure with env vars:
35
- - `WHISPER_CPP_BIN` (default: `whisper-cli`)
36
- - `WHISPER_CPP_MODEL_LARGE` (path to ggml model)
37
-
38
  API endpoints:
39
  - `/transcribe_selected`
40
- - `/transcribe_whisper_large_v3`
41
  - `/transcribe_whisper_large_v3_turbo`
42
- - `/transcribe_whisper_cpp_large`
43
- - `/transcribe_whisper_faster_large`
44
  - `/transcribe_parakeet_v3`
45
 
46
  Local benchmark script (run in IPython):
47
  - `local_api_benchmark.py`
48
- - Calls each model-specific endpoint sequentially and returns all raw outputs + timings.
 
 
49
  - Example:
50
  ```python
51
  from local_api_benchmark import run_all_model_apis
@@ -57,7 +50,21 @@ res = run_all_model_apis(
57
  language=None,
58
  initial_prompt=None,
59
  postprocess_prompt=None,
60
- model_options={"beam_size": 5, "temperature": 0.0},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  save_outputs=True,
62
  output_dir="benchmark_outputs",
63
  )
 
9
  app_file: app.py
10
  pinned: false
11
  license: mit
12
+ short_description: Turbo + Parakeet ASR APIs with word-level timestamps
13
  ---
14
 
15
  This Space is optimized for API usage on ZeroGPU.
16
 
17
  Supported models (word-level timestamp capable):
 
18
  - Whisper Large V3 Turbo
 
 
19
  - NVIDIA Parakeet v3
20
 
21
  Omitted:
22
+ - Whisper Large V3 (removed from this benchmark-focused app)
23
+ - Whisper.cpp (large) (removed from this benchmark-focused app)
24
+ - Whisper faster (large) (removed from this benchmark-focused app)
25
  - IBM Granite Speech 3.3 8B (no stable, documented word-level timestamp output in standard inference APIs)
26
 
27
  Every transcription response returns:
 
29
  - `zerogpu_timing.gpu_window_seconds`
30
  - `zerogpu_timing.inference_seconds`
31
 
 
 
 
 
 
 
32
  API endpoints:
33
  - `/transcribe_selected`
 
34
  - `/transcribe_whisper_large_v3_turbo`
 
 
35
  - `/transcribe_parakeet_v3`
36
 
37
  Local benchmark script (run in IPython):
38
  - `local_api_benchmark.py`
39
+ - Calls only these two endpoints sequentially and returns all raw outputs + timings:
40
+ - `/transcribe_whisper_large_v3_turbo`
41
+ - `/transcribe_parakeet_v3`
42
  - Example:
43
  ```python
44
  from local_api_benchmark import run_all_model_apis
 
50
  language=None,
51
  initial_prompt=None,
52
  postprocess_prompt=None,
53
+ model_options_by_model={
54
+ "Whisper Large V3 Turbo": {
55
+ "chunk_length_s": 30,
56
+ "batch_size": 16,
57
+ "long_audio_threshold_seconds": 120,
58
+ "num_beams": 1,
59
+ },
60
+ "NVIDIA Parakeet v3": {
61
+ "batch_size": 1,
62
+ "long_audio_threshold_seconds": 480,
63
+ "local_attention_left": 256,
64
+ "local_attention_right": 256,
65
+ "subsampling_conv_chunking_factor": 1,
66
+ },
67
+ },
68
  save_outputs=True,
69
  output_dir="benchmark_outputs",
70
  )
app.py CHANGED
@@ -1,15 +1,16 @@
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import spaces
3
 
4
- from src.constants import (
5
- OMITTED_MODELS,
6
- PARAKEET_V3,
7
- SUPPORTED_MODELS,
8
- WHISPER_CPP_LARGE,
9
- WHISPER_FASTER_LARGE,
10
- WHISPER_LARGE_V3,
11
- WHISPER_LARGE_V3_TURBO,
12
- )
13
  from src.transcription_service import dispatch_transcription
14
 
15
 
@@ -34,26 +35,6 @@ def transcribe_selected_model(
34
  )
35
 
36
 
37
- @spaces.GPU
38
- def transcribe_whisper_large_v3(
39
- audio_file,
40
- task,
41
- language,
42
- initial_prompt,
43
- postprocess_prompt,
44
- model_options_json,
45
- ):
46
- return dispatch_transcription(
47
- audio_file,
48
- WHISPER_LARGE_V3,
49
- task,
50
- language,
51
- initial_prompt,
52
- postprocess_prompt,
53
- model_options_json,
54
- )
55
-
56
-
57
  @spaces.GPU
58
  def transcribe_whisper_large_v3_turbo(
59
  audio_file,
@@ -74,46 +55,6 @@ def transcribe_whisper_large_v3_turbo(
74
  )
75
 
76
 
77
- @spaces.GPU
78
- def transcribe_whisper_cpp_large(
79
- audio_file,
80
- task,
81
- language,
82
- initial_prompt,
83
- postprocess_prompt,
84
- model_options_json,
85
- ):
86
- return dispatch_transcription(
87
- audio_file,
88
- WHISPER_CPP_LARGE,
89
- task,
90
- language,
91
- initial_prompt,
92
- postprocess_prompt,
93
- model_options_json,
94
- )
95
-
96
-
97
- @spaces.GPU
98
- def transcribe_whisper_faster_large(
99
- audio_file,
100
- task,
101
- language,
102
- initial_prompt,
103
- postprocess_prompt,
104
- model_options_json,
105
- ):
106
- return dispatch_transcription(
107
- audio_file,
108
- WHISPER_FASTER_LARGE,
109
- task,
110
- language,
111
- initial_prompt,
112
- postprocess_prompt,
113
- model_options_json,
114
- )
115
-
116
-
117
  @spaces.GPU
118
  def transcribe_parakeet_v3(
119
  audio_file,
@@ -134,10 +75,10 @@ def transcribe_parakeet_v3(
134
  )
135
 
136
 
137
- with gr.Blocks(title="Multi-model ASR (ZeroGPU)") as demo:
138
  gr.Markdown(
139
- "# Multi-model transcription APIs (ZeroGPU)\n"
140
- "API-first design with one endpoint per model and full raw outputs (including word-level timestamps)."
141
  )
142
 
143
  with gr.Row():
@@ -150,7 +91,7 @@ with gr.Blocks(title="Multi-model ASR (ZeroGPU)") as demo:
150
  with gr.Row():
151
  model_label = gr.Dropdown(
152
  choices=SUPPORTED_MODELS,
153
- value=WHISPER_LARGE_V3,
154
  label="Model",
155
  )
156
  task = gr.Radio(
@@ -170,7 +111,7 @@ with gr.Blocks(title="Multi-model ASR (ZeroGPU)") as demo:
170
 
171
  model_options_json = gr.Textbox(
172
  label="Model options JSON (optional)",
173
- placeholder='{"beam_size": 5, "temperature": 0.0, "vad_filter": true}',
174
  lines=3,
175
  )
176
 
@@ -196,36 +137,15 @@ with gr.Blocks(title="Multi-model ASR (ZeroGPU)") as demo:
196
 
197
  # Hidden controls used only to expose dedicated API routes per model.
198
  with gr.Row(visible=False):
199
- api_btn_wlv3 = gr.Button("transcribe_whisper_large_v3")
200
  api_btn_wlv3t = gr.Button("transcribe_whisper_large_v3_turbo")
201
- api_btn_wcpp = gr.Button("transcribe_whisper_cpp_large")
202
- api_btn_fw = gr.Button("transcribe_whisper_faster_large")
203
  api_btn_parakeet = gr.Button("transcribe_parakeet_v3")
204
 
205
- api_btn_wlv3.click(
206
- fn=transcribe_whisper_large_v3,
207
- inputs=shared_inputs,
208
- outputs=output,
209
- api_name="transcribe_whisper_large_v3",
210
- )
211
  api_btn_wlv3t.click(
212
  fn=transcribe_whisper_large_v3_turbo,
213
  inputs=shared_inputs,
214
  outputs=output,
215
  api_name="transcribe_whisper_large_v3_turbo",
216
  )
217
- api_btn_wcpp.click(
218
- fn=transcribe_whisper_cpp_large,
219
- inputs=shared_inputs,
220
- outputs=output,
221
- api_name="transcribe_whisper_cpp_large",
222
- )
223
- api_btn_fw.click(
224
- fn=transcribe_whisper_faster_large,
225
- inputs=shared_inputs,
226
- outputs=output,
227
- api_name="transcribe_whisper_faster_large",
228
- )
229
  api_btn_parakeet.click(
230
  fn=transcribe_parakeet_v3,
231
  inputs=shared_inputs,
 
1
+ import warnings
2
+
3
+ # Suppress a known deprecation warning emitted by a transitive dependency in spaces.
4
+ warnings.filterwarnings(
5
+ "ignore",
6
+ message=r"`torch\.distributed\.reduce_op` is deprecated, please use `torch\.distributed\.ReduceOp` instead",
7
+ category=FutureWarning,
8
+ )
9
+
10
  import gradio as gr
11
  import spaces
12
 
13
+ from src.constants import OMITTED_MODELS, PARAKEET_V3, SUPPORTED_MODELS, WHISPER_LARGE_V3_TURBO
 
 
 
 
 
 
 
 
14
  from src.transcription_service import dispatch_transcription
15
 
16
 
 
35
  )
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  @spaces.GPU
39
  def transcribe_whisper_large_v3_turbo(
40
  audio_file,
 
55
  )
56
 
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  @spaces.GPU
59
  def transcribe_parakeet_v3(
60
  audio_file,
 
75
  )
76
 
77
 
78
+ with gr.Blocks(title="Dual-model ASR (ZeroGPU)") as demo:
79
  gr.Markdown(
80
+ "# Turbo + Parakeet transcription APIs (ZeroGPU)\n"
81
+ "Focused benchmark app exposing only Whisper Large V3 Turbo and NVIDIA Parakeet v3."
82
  )
83
 
84
  with gr.Row():
 
91
  with gr.Row():
92
  model_label = gr.Dropdown(
93
  choices=SUPPORTED_MODELS,
94
+ value=WHISPER_LARGE_V3_TURBO,
95
  label="Model",
96
  )
97
  task = gr.Radio(
 
111
 
112
  model_options_json = gr.Textbox(
113
  label="Model options JSON (optional)",
114
+ placeholder='{"chunk_length_s": 30, "batch_size": 16}',
115
  lines=3,
116
  )
117
 
 
137
 
138
  # Hidden controls used only to expose dedicated API routes per model.
139
  with gr.Row(visible=False):
 
140
  api_btn_wlv3t = gr.Button("transcribe_whisper_large_v3_turbo")
 
 
141
  api_btn_parakeet = gr.Button("transcribe_parakeet_v3")
142
 
 
 
 
 
 
 
143
  api_btn_wlv3t.click(
144
  fn=transcribe_whisper_large_v3_turbo,
145
  inputs=shared_inputs,
146
  outputs=output,
147
  api_name="transcribe_whisper_large_v3_turbo",
148
  )
 
 
 
 
 
 
 
 
 
 
 
 
149
  api_btn_parakeet.click(
150
  fn=transcribe_parakeet_v3,
151
  inputs=shared_inputs,
local_api_benchmark.py CHANGED
@@ -8,17 +8,11 @@ from gradio_client import Client, handle_file
8
 
9
  from src.constants import (
10
  PARAKEET_V3,
11
- WHISPER_CPP_LARGE,
12
- WHISPER_FASTER_LARGE,
13
- WHISPER_LARGE_V3,
14
  WHISPER_LARGE_V3_TURBO,
15
  )
16
 
17
  MODEL_API_BY_LABEL = {
18
- WHISPER_LARGE_V3: "/transcribe_whisper_large_v3",
19
  WHISPER_LARGE_V3_TURBO: "/transcribe_whisper_large_v3_turbo",
20
- WHISPER_CPP_LARGE: "/transcribe_whisper_cpp_large",
21
- WHISPER_FASTER_LARGE: "/transcribe_whisper_faster_large",
22
  PARAKEET_V3: "/transcribe_parakeet_v3",
23
  }
24
 
@@ -110,6 +104,7 @@ def run_all_model_apis(
110
  initial_prompt: str | None = None,
111
  postprocess_prompt: str | None = None,
112
  model_options: str | dict[str, Any] | None = None,
 
113
  models: list[str] | None = None,
114
  hf_token: str | None = None,
115
  save_outputs: bool = True,
@@ -118,6 +113,7 @@ def run_all_model_apis(
118
  """Run each model-specific API endpoint one by one and collect full outputs.
119
 
120
  Designed for use from IPython notebooks/scripts.
 
121
  """
122
  if models is None:
123
  model_sequence = list(MODEL_API_BY_LABEL.keys())
@@ -135,6 +131,9 @@ def run_all_model_apis(
135
 
136
  for model in model_sequence:
137
  api_name = MODEL_API_BY_LABEL[model]
 
 
 
138
  call_start = time.perf_counter()
139
  try:
140
  response = client.predict(
@@ -143,7 +142,7 @@ def run_all_model_apis(
143
  language=language,
144
  initial_prompt=initial_prompt,
145
  postprocess_prompt=postprocess_prompt,
146
- model_options_json=options_json,
147
  api_name=api_name,
148
  )
149
  call_end = time.perf_counter()
@@ -153,6 +152,7 @@ def run_all_model_apis(
153
  "api_name": api_name,
154
  "status": "ok",
155
  "client_wall_clock_seconds": round(call_end - call_start, 4),
 
156
  "result": response,
157
  }
158
  )
@@ -164,6 +164,7 @@ def run_all_model_apis(
164
  "api_name": api_name,
165
  "status": "error",
166
  "client_wall_clock_seconds": round(call_end - call_start, 4),
 
167
  "error": str(exc),
168
  }
169
  )
@@ -178,6 +179,7 @@ def run_all_model_apis(
178
  "initial_prompt": initial_prompt,
179
  "postprocess_prompt": postprocess_prompt,
180
  "model_options_json": options_json,
 
181
  "models": model_sequence,
182
  "benchmark_timing": {
183
  "total_client_wall_clock_seconds": round(finished_at - started_at, 4),
 
8
 
9
  from src.constants import (
10
  PARAKEET_V3,
 
 
 
11
  WHISPER_LARGE_V3_TURBO,
12
  )
13
 
14
  MODEL_API_BY_LABEL = {
 
15
  WHISPER_LARGE_V3_TURBO: "/transcribe_whisper_large_v3_turbo",
 
 
16
  PARAKEET_V3: "/transcribe_parakeet_v3",
17
  }
18
 
 
104
  initial_prompt: str | None = None,
105
  postprocess_prompt: str | None = None,
106
  model_options: str | dict[str, Any] | None = None,
107
+ model_options_by_model: dict[str, str | dict[str, Any]] | None = None,
108
  models: list[str] | None = None,
109
  hf_token: str | None = None,
110
  save_outputs: bool = True,
 
113
  """Run each model-specific API endpoint one by one and collect full outputs.
114
 
115
  Designed for use from IPython notebooks/scripts.
116
+ Use model_options_by_model for per-model tuning in a single benchmark run.
117
  """
118
  if models is None:
119
  model_sequence = list(MODEL_API_BY_LABEL.keys())
 
131
 
132
  for model in model_sequence:
133
  api_name = MODEL_API_BY_LABEL[model]
134
+ effective_options_json = options_json
135
+ if model_options_by_model and model in model_options_by_model:
136
+ effective_options_json = _to_model_options_json(model_options_by_model[model])
137
  call_start = time.perf_counter()
138
  try:
139
  response = client.predict(
 
142
  language=language,
143
  initial_prompt=initial_prompt,
144
  postprocess_prompt=postprocess_prompt,
145
+ model_options_json=effective_options_json,
146
  api_name=api_name,
147
  )
148
  call_end = time.perf_counter()
 
152
  "api_name": api_name,
153
  "status": "ok",
154
  "client_wall_clock_seconds": round(call_end - call_start, 4),
155
+ "effective_model_options_json": effective_options_json,
156
  "result": response,
157
  }
158
  )
 
164
  "api_name": api_name,
165
  "status": "error",
166
  "client_wall_clock_seconds": round(call_end - call_start, 4),
167
+ "effective_model_options_json": effective_options_json,
168
  "error": str(exc),
169
  }
170
  )
 
179
  "initial_prompt": initial_prompt,
180
  "postprocess_prompt": postprocess_prompt,
181
  "model_options_json": options_json,
182
+ "model_options_by_model": model_options_by_model,
183
  "models": model_sequence,
184
  "benchmark_timing": {
185
  "total_client_wall_clock_seconds": round(finished_at - started_at, 4),
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  torch>=2.3.0
2
- openai-whisper>=20250625
3
- faster-whisper>=1.1.0
4
  nemo_toolkit[asr]>=2.0.0
 
1
  torch>=2.3.0
2
+ transformers>=4.46.0
3
+ accelerate>=1.1.0
4
  nemo_toolkit[asr]>=2.0.0
src/constants.py CHANGED
@@ -1,20 +1,15 @@
1
- BATCH_SIZE = 8
2
-
3
- WHISPER_LARGE_V3 = "Whisper Large V3"
4
  WHISPER_LARGE_V3_TURBO = "Whisper Large V3 Turbo"
5
- WHISPER_CPP_LARGE = "Whisper.cpp (large)"
6
- WHISPER_FASTER_LARGE = "Whisper faster (large)"
7
  PARAKEET_V3 = "NVIDIA Parakeet v3"
8
 
9
  SUPPORTED_MODELS = [
10
- WHISPER_LARGE_V3,
11
  WHISPER_LARGE_V3_TURBO,
12
- WHISPER_CPP_LARGE,
13
- WHISPER_FASTER_LARGE,
14
  PARAKEET_V3,
15
  ]
16
 
17
  OMITTED_MODELS = {
 
 
 
18
  "IBM Granite Speech 3.3 8B": (
19
  "Omitted because a stable, documented word-level timestamp interface is not available "
20
  "in standard inference usage."
@@ -22,7 +17,6 @@ OMITTED_MODELS = {
22
  }
23
 
24
  MODEL_IDS = {
25
- WHISPER_LARGE_V3: "openai/whisper-large-v3",
26
  WHISPER_LARGE_V3_TURBO: "openai/whisper-large-v3-turbo",
27
  PARAKEET_V3: "nvidia/parakeet-tdt-0.6b-v3",
28
  }
 
 
 
 
1
  WHISPER_LARGE_V3_TURBO = "Whisper Large V3 Turbo"
 
 
2
  PARAKEET_V3 = "NVIDIA Parakeet v3"
3
 
4
  SUPPORTED_MODELS = [
 
5
  WHISPER_LARGE_V3_TURBO,
 
 
6
  PARAKEET_V3,
7
  ]
8
 
9
  OMITTED_MODELS = {
10
+ "Whisper Large V3": "Removed from this benchmark-focused app per configuration.",
11
+ "Whisper.cpp (large)": "Removed from this benchmark-focused app per configuration.",
12
+ "Whisper faster (large)": "Removed from this benchmark-focused app per configuration.",
13
  "IBM Granite Speech 3.3 8B": (
14
  "Omitted because a stable, documented word-level timestamp interface is not available "
15
  "in standard inference usage."
 
17
  }
18
 
19
  MODEL_IDS = {
 
20
  WHISPER_LARGE_V3_TURBO: "openai/whisper-large-v3-turbo",
21
  PARAKEET_V3: "nvidia/parakeet-tdt-0.6b-v3",
22
  }
src/models/faster_whisper_model.py DELETED
@@ -1,98 +0,0 @@
1
- import time
2
- from typing import Any
3
-
4
- import gradio as gr
5
- import torch
6
-
7
- from src.utils import serialize
8
-
9
- _FASTER_WHISPER_MODELS: dict[str, Any] = {}
10
-
11
-
12
- def _get_faster_whisper_model(model_options: dict[str, Any]):
13
- model_size = model_options.get("model_size", "large-v3")
14
- compute_type = model_options.get(
15
- "compute_type",
16
- "float16" if torch.cuda.is_available() else "int8",
17
- )
18
- cache_key = f"{model_size}:{compute_type}"
19
- if cache_key in _FASTER_WHISPER_MODELS:
20
- return _FASTER_WHISPER_MODELS[cache_key], model_size, compute_type
21
-
22
- try:
23
- from faster_whisper import WhisperModel
24
- except Exception as exc:
25
- raise gr.Error(
26
- "faster-whisper backend requested but package is missing. "
27
- "Add faster-whisper to requirements.txt"
28
- ) from exc
29
-
30
- device = "cuda" if torch.cuda.is_available() else "cpu"
31
- model = WhisperModel(model_size, device=device, compute_type=compute_type)
32
- _FASTER_WHISPER_MODELS[cache_key] = model
33
- return model, model_size, compute_type
34
-
35
-
36
- def run_faster_whisper(
37
- audio_file: str,
38
- task: str,
39
- language: str,
40
- initial_prompt: str,
41
- model_options: dict[str, Any],
42
- ) -> dict[str, Any]:
43
- model, model_size, compute_type = _get_faster_whisper_model(model_options)
44
- beam_size = int(model_options.get("beam_size", 5))
45
- temperature = float(model_options.get("temperature", 0.0))
46
- vad_filter = bool(model_options.get("vad_filter", True))
47
-
48
- infer_start = time.perf_counter()
49
- segments, info = model.transcribe(
50
- audio_file,
51
- task=task,
52
- language=language or None,
53
- initial_prompt=initial_prompt or None,
54
- word_timestamps=True,
55
- beam_size=beam_size,
56
- temperature=temperature,
57
- vad_filter=vad_filter,
58
- )
59
- segments_list = list(segments)
60
- infer_end = time.perf_counter()
61
-
62
- raw_output = {
63
- "info": serialize(info),
64
- "segments": [
65
- {
66
- "id": seg.id,
67
- "seek": seg.seek,
68
- "start": seg.start,
69
- "end": seg.end,
70
- "text": seg.text,
71
- "tokens": list(seg.tokens) if seg.tokens is not None else None,
72
- "avg_logprob": seg.avg_logprob,
73
- "compression_ratio": seg.compression_ratio,
74
- "no_speech_prob": seg.no_speech_prob,
75
- "words": [
76
- {
77
- "start": w.start,
78
- "end": w.end,
79
- "word": w.word,
80
- "probability": w.probability,
81
- }
82
- for w in (seg.words or [])
83
- ],
84
- }
85
- for seg in segments_list
86
- ],
87
- "runtime": {
88
- "model_size": model_size,
89
- "compute_type": compute_type,
90
- },
91
- }
92
-
93
- return {
94
- "raw_output": serialize(raw_output),
95
- "timing": {
96
- "inference_seconds": round(infer_end - infer_start, 4),
97
- },
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/parakeet_model.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
4
  import torch
5
 
6
  from src.constants import MODEL_IDS, PARAKEET_V3
7
- from src.utils import serialize
8
 
9
  _PARAKEET_MODEL = None
10
 
@@ -36,9 +36,44 @@ def run_parakeet(
36
  ) -> dict:
37
  model = _get_parakeet_model()
38
  batch_size = int(model_options.get("batch_size", 1))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  infer_start = time.perf_counter()
41
- outputs = model.transcribe([audio_file], batch_size=batch_size, timestamps=True)
 
 
 
 
 
 
 
 
 
 
 
 
42
  infer_end = time.perf_counter()
43
 
44
  item = outputs[0] if outputs else None
@@ -46,6 +81,18 @@ def run_parakeet(
46
  "output": serialize(item),
47
  "timestamp_hint": "word timestamps available in output.timestamp['word'] when provided by NeMo",
48
  "language_hint": language or "auto",
 
 
 
 
 
 
 
 
 
 
 
 
49
  }
50
 
51
  return {
 
4
  import torch
5
 
6
  from src.constants import MODEL_IDS, PARAKEET_V3
7
+ from src.utils import get_audio_duration_seconds, serialize
8
 
9
  _PARAKEET_MODEL = None
10
 
 
36
  ) -> dict:
37
  model = _get_parakeet_model()
38
  batch_size = int(model_options.get("batch_size", 1))
39
+ long_audio_threshold_seconds = float(model_options.get("long_audio_threshold_seconds", 480))
40
+ local_attention_left = int(model_options.get("local_attention_left", 256))
41
+ local_attention_right = int(model_options.get("local_attention_right", 256))
42
+ subsampling_conv_chunking_factor = int(model_options.get("subsampling_conv_chunking_factor", 1))
43
+ enable_long_audio_optimizations = bool(model_options.get("enable_long_audio_optimizations", True))
44
+
45
+ duration_seconds = get_audio_duration_seconds(audio_file)
46
+ is_long_audio = duration_seconds is not None and duration_seconds > long_audio_threshold_seconds
47
+ applied_local_attention = False
48
+ applied_subsampling_chunking = False
49
+ optimization_errors: list[str] = []
50
+
51
+ if enable_long_audio_optimizations and is_long_audio:
52
+ try:
53
+ model.change_attention_model("rel_pos_local_attn", [local_attention_left, local_attention_right])
54
+ applied_local_attention = True
55
+ except Exception as exc:
56
+ optimization_errors.append(f"change_attention_model failed: {exc}")
57
+ try:
58
+ model.change_subsampling_conv_chunking_factor(subsampling_conv_chunking_factor)
59
+ applied_subsampling_chunking = True
60
+ except Exception as exc:
61
+ optimization_errors.append(f"change_subsampling_conv_chunking_factor failed: {exc}")
62
 
63
  infer_start = time.perf_counter()
64
+ try:
65
+ outputs = model.transcribe([audio_file], batch_size=batch_size, timestamps=True)
66
+ finally:
67
+ if applied_local_attention:
68
+ try:
69
+ model.change_attention_model("rel_pos")
70
+ except Exception:
71
+ pass
72
+ if applied_subsampling_chunking:
73
+ try:
74
+ model.change_subsampling_conv_chunking_factor(-1)
75
+ except Exception:
76
+ pass
77
  infer_end = time.perf_counter()
78
 
79
  item = outputs[0] if outputs else None
 
81
  "output": serialize(item),
82
  "timestamp_hint": "word timestamps available in output.timestamp['word'] when provided by NeMo",
83
  "language_hint": language or "auto",
84
+ "long_audio_settings": {
85
+ "duration_seconds": duration_seconds,
86
+ "is_long_audio": is_long_audio,
87
+ "threshold_seconds": long_audio_threshold_seconds,
88
+ "enable_long_audio_optimizations": enable_long_audio_optimizations,
89
+ "applied_local_attention": applied_local_attention,
90
+ "applied_subsampling_chunking": applied_subsampling_chunking,
91
+ "local_attention_left": local_attention_left,
92
+ "local_attention_right": local_attention_right,
93
+ "subsampling_conv_chunking_factor": subsampling_conv_chunking_factor,
94
+ "optimization_errors": optimization_errors,
95
+ },
96
  }
97
 
98
  return {
src/models/whisper_cpp_model.py DELETED
@@ -1,77 +0,0 @@
1
- import json
2
- import os
3
- import subprocess
4
- import tempfile
5
- import time
6
- from pathlib import Path
7
-
8
- import gradio as gr
9
-
10
- from src.utils import serialize
11
-
12
-
13
- def run_whisper_cpp(
14
- audio_file: str,
15
- task: str,
16
- language: str,
17
- initial_prompt: str,
18
- model_options: dict,
19
- ) -> dict:
20
- whisper_cpp_bin = model_options.get("whisper_cpp_bin") or os.getenv("WHISPER_CPP_BIN", "whisper-cli")
21
- whisper_cpp_model = model_options.get("whisper_cpp_model") or os.getenv("WHISPER_CPP_MODEL_LARGE")
22
- if not whisper_cpp_model:
23
- raise gr.Error(
24
- "Whisper.cpp requires model path. Set WHISPER_CPP_MODEL_LARGE or pass "
25
- "model_options_json={\"whisper_cpp_model\":\"/path/to/ggml-large-v3.bin\"}."
26
- )
27
-
28
- with tempfile.TemporaryDirectory() as tmpdir:
29
- output_prefix = str(Path(tmpdir) / "whispercpp")
30
- cmd = [
31
- whisper_cpp_bin,
32
- "-m",
33
- whisper_cpp_model,
34
- "-f",
35
- audio_file,
36
- "-oj",
37
- "-ml",
38
- "1",
39
- "-of",
40
- output_prefix,
41
- ]
42
-
43
- if language:
44
- cmd.extend(["-l", language])
45
- if initial_prompt:
46
- cmd.extend(["--prompt", initial_prompt])
47
- if task == "translate":
48
- cmd.append("-tr")
49
-
50
- infer_start = time.perf_counter()
51
- proc = subprocess.run(cmd, capture_output=True, text=True)
52
- infer_end = time.perf_counter()
53
-
54
- if proc.returncode != 0:
55
- raise gr.Error(
56
- "whisper.cpp transcription failed. "
57
- f"exit={proc.returncode} stderr={proc.stderr[-1500:]}"
58
- )
59
-
60
- json_path = Path(f"{output_prefix}.json")
61
- if not json_path.exists():
62
- raise gr.Error(
63
- "whisper.cpp did not produce JSON output. "
64
- "Ensure your whisper.cpp binary supports -oj and word timestamps (-ml 1)."
65
- )
66
-
67
- raw_output = json.loads(json_path.read_text())
68
-
69
- return {
70
- "raw_output": {
71
- "result": serialize(raw_output),
72
- "stderr": proc.stderr,
73
- },
74
- "timing": {
75
- "inference_seconds": round(infer_end - infer_start, 4),
76
- },
77
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/whisper_openai_model.py DELETED
@@ -1,79 +0,0 @@
1
- import time
2
- from typing import Any
3
-
4
- import gradio as gr
5
- import torch
6
- import whisper
7
-
8
- from src.constants import WHISPER_LARGE_V3, WHISPER_LARGE_V3_TURBO
9
- from src.utils import serialize
10
-
11
- _WHISPER_MODELS: dict[str, Any] = {}
12
-
13
- _OPENAI_MODEL_NAMES = {
14
- WHISPER_LARGE_V3: "large-v3",
15
- WHISPER_LARGE_V3_TURBO: "turbo",
16
- }
17
-
18
-
19
- def _get_whisper_model(model_label: str):
20
- if model_label in _WHISPER_MODELS:
21
- return _WHISPER_MODELS[model_label]
22
-
23
- model_name = _OPENAI_MODEL_NAMES.get(model_label)
24
- if model_name is None:
25
- raise gr.Error(f"Unsupported Whisper model label: {model_label}")
26
-
27
- device = "cuda" if torch.cuda.is_available() else "cpu"
28
- model = whisper.load_model(model_name, device=device)
29
- _WHISPER_MODELS[model_label] = model
30
- return model
31
-
32
-
33
- def run_whisper_openai(
34
- model_label: str,
35
- audio_file: str,
36
- task: str,
37
- language: str,
38
- initial_prompt: str,
39
- model_options: dict[str, Any],
40
- ) -> dict[str, Any]:
41
- model = _get_whisper_model(model_label)
42
-
43
- decode_kwargs: dict[str, Any] = {"task": task, "word_timestamps": True}
44
- if language:
45
- decode_kwargs["language"] = language
46
-
47
- if initial_prompt:
48
- decode_kwargs["initial_prompt"] = initial_prompt
49
-
50
- if "temperature" in model_options:
51
- decode_kwargs["temperature"] = float(model_options["temperature"])
52
- if "beam_size" in model_options:
53
- decode_kwargs["beam_size"] = int(model_options["beam_size"])
54
- if "best_of" in model_options:
55
- decode_kwargs["best_of"] = int(model_options["best_of"])
56
- if "patience" in model_options:
57
- decode_kwargs["patience"] = float(model_options["patience"])
58
- if "condition_on_previous_text" in model_options:
59
- decode_kwargs["condition_on_previous_text"] = bool(model_options["condition_on_previous_text"])
60
- if "suppress_tokens" in model_options:
61
- decode_kwargs["suppress_tokens"] = model_options["suppress_tokens"]
62
-
63
- # Ensure expected precision behavior when GPU is available.
64
- decode_kwargs["fp16"] = bool(torch.cuda.is_available())
65
-
66
- infer_start = time.perf_counter()
67
- raw_output = model.transcribe(audio_file, **decode_kwargs)
68
- infer_end = time.perf_counter()
69
-
70
- return {
71
- "raw_output": {
72
- "backend": "openai-whisper",
73
- "model_name": _OPENAI_MODEL_NAMES[model_label],
74
- "result": serialize(raw_output),
75
- },
76
- "timing": {
77
- "inference_seconds": round(infer_end - infer_start, 4),
78
- },
79
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/whisper_turbo_model.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import warnings
3
+ from typing import Any
4
+
5
+ import gradio as gr
6
+ import torch
7
+
8
+ from src.constants import MODEL_IDS, WHISPER_LARGE_V3_TURBO
9
+ from src.utils import get_audio_duration_seconds, serialize
10
+
11
+ _TURBO_PIPELINES: dict[str, Any] = {}
12
+
13
+
14
+ def _get_turbo_pipeline(chunk_length_s: float):
15
+ cache_key = f"chunk:{chunk_length_s}"
16
+ if cache_key in _TURBO_PIPELINES:
17
+ return _TURBO_PIPELINES[cache_key]
18
+
19
+ try:
20
+ from transformers import pipeline
21
+ except Exception as exc:
22
+ raise gr.Error(
23
+ "transformers is required for Whisper Turbo long-audio chunked inference. "
24
+ "Add transformers and accelerate to requirements.txt."
25
+ ) from exc
26
+
27
+ dtype = torch.float16 if torch.cuda.is_available() else torch.float32
28
+ pipe = pipeline(
29
+ task="automatic-speech-recognition",
30
+ model=MODEL_IDS[WHISPER_LARGE_V3_TURBO],
31
+ device=0 if torch.cuda.is_available() else "cpu",
32
+ torch_dtype=dtype,
33
+ model_kwargs={"low_cpu_mem_usage": True, "use_safetensors": True},
34
+ )
35
+ _TURBO_PIPELINES[cache_key] = pipe
36
+ return pipe
37
+
38
+
39
+ def run_whisper_turbo(
40
+ audio_file: str,
41
+ task: str,
42
+ language: str,
43
+ initial_prompt: str,
44
+ model_options: dict[str, Any],
45
+ ) -> dict[str, Any]:
46
+ chunk_length_s = float(model_options.get("chunk_length_s", 30))
47
+ batch_size = int(model_options.get("batch_size", 16))
48
+ long_audio_threshold_seconds = float(model_options.get("long_audio_threshold_seconds", 120))
49
+ duration_seconds = get_audio_duration_seconds(audio_file)
50
+
51
+ pipe = _get_turbo_pipeline(chunk_length_s=chunk_length_s)
52
+ generate_kwargs: dict[str, Any] = {"task": task, "num_beams": int(model_options.get("num_beams", 1))}
53
+ if language:
54
+ generate_kwargs["language"] = language
55
+ if initial_prompt:
56
+ generate_kwargs["prompt"] = initial_prompt
57
+ if "temperature" in model_options:
58
+ generate_kwargs["temperature"] = float(model_options["temperature"])
59
+
60
+ is_long_audio = duration_seconds is not None and duration_seconds > long_audio_threshold_seconds
61
+
62
+ infer_start = time.perf_counter()
63
+ with warnings.catch_warnings():
64
+ warnings.filterwarnings("ignore", message=r".*chunk_length_s.*experimental.*", category=Warning)
65
+ warnings.filterwarnings("ignore", message=r".*input name `inputs` is deprecated.*", category=FutureWarning)
66
+ call_kwargs: dict[str, Any] = {
67
+ "return_timestamps": "word",
68
+ "batch_size": batch_size,
69
+ "generate_kwargs": generate_kwargs,
70
+ }
71
+ if is_long_audio:
72
+ call_kwargs["chunk_length_s"] = chunk_length_s
73
+ raw_output = pipe(audio_file, **call_kwargs)
74
+ infer_end = time.perf_counter()
75
+
76
+ return {
77
+ "raw_output": {
78
+ "backend": "transformers-whisper-turbo",
79
+ "model_name": MODEL_IDS[WHISPER_LARGE_V3_TURBO],
80
+ "long_audio_settings": {
81
+ "duration_seconds": duration_seconds,
82
+ "is_long_audio": is_long_audio,
83
+ "long_audio_threshold_seconds": long_audio_threshold_seconds,
84
+ "chunk_length_s": chunk_length_s,
85
+ "batch_size": batch_size,
86
+ },
87
+ "result": serialize(raw_output),
88
+ },
89
+ "timing": {
90
+ "inference_seconds": round(infer_end - infer_start, 4),
91
+ },
92
+ }
src/transcription_service.py CHANGED
@@ -5,15 +5,10 @@ import gradio as gr
5
  from src.constants import (
6
  PARAKEET_V3,
7
  SUPPORTED_MODELS,
8
- WHISPER_CPP_LARGE,
9
- WHISPER_FASTER_LARGE,
10
- WHISPER_LARGE_V3,
11
  WHISPER_LARGE_V3_TURBO,
12
  )
13
- from src.models.faster_whisper_model import run_faster_whisper
14
  from src.models.parakeet_model import run_parakeet
15
- from src.models.whisper_openai_model import run_whisper_openai
16
- from src.models.whisper_cpp_model import run_whisper_cpp
17
  from src.utils import parse_model_options
18
 
19
 
@@ -56,25 +51,8 @@ def dispatch_transcription_with_options(
56
  ) -> dict:
57
  gpu_start = time.perf_counter()
58
 
59
- if model_label in {WHISPER_LARGE_V3, WHISPER_LARGE_V3_TURBO}:
60
- result = run_whisper_openai(
61
- model_label=model_label,
62
- audio_file=audio_file,
63
- task=task,
64
- language=language,
65
- initial_prompt=initial_prompt,
66
- model_options=model_options,
67
- )
68
- elif model_label == WHISPER_FASTER_LARGE:
69
- result = run_faster_whisper(
70
- audio_file=audio_file,
71
- task=task,
72
- language=language,
73
- initial_prompt=initial_prompt,
74
- model_options=model_options,
75
- )
76
- elif model_label == WHISPER_CPP_LARGE:
77
- result = run_whisper_cpp(
78
  audio_file=audio_file,
79
  task=task,
80
  language=language,
@@ -107,4 +85,3 @@ def dispatch_transcription_with_options(
107
  "raw_output": result["raw_output"],
108
  "timestamp_granularity": "word",
109
  }
110
-
 
5
  from src.constants import (
6
  PARAKEET_V3,
7
  SUPPORTED_MODELS,
 
 
 
8
  WHISPER_LARGE_V3_TURBO,
9
  )
 
10
  from src.models.parakeet_model import run_parakeet
11
+ from src.models.whisper_turbo_model import run_whisper_turbo
 
12
  from src.utils import parse_model_options
13
 
14
 
 
51
  ) -> dict:
52
  gpu_start = time.perf_counter()
53
 
54
+ if model_label == WHISPER_LARGE_V3_TURBO:
55
+ result = run_whisper_turbo(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  audio_file=audio_file,
57
  task=task,
58
  language=language,
 
85
  "raw_output": result["raw_output"],
86
  "timestamp_granularity": "word",
87
  }
 
src/utils.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
  from pathlib import Path
3
  from typing import Any
4
 
@@ -39,3 +40,23 @@ def parse_model_options(raw: str | None) -> dict[str, Any]:
39
  if not isinstance(parsed, dict):
40
  raise gr.Error("model_options_json must decode to a JSON object")
41
  return parsed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
+ import subprocess
3
  from pathlib import Path
4
  from typing import Any
5
 
 
40
  if not isinstance(parsed, dict):
41
  raise gr.Error("model_options_json must decode to a JSON object")
42
  return parsed
43
+
44
+
45
+ def get_audio_duration_seconds(audio_file: str) -> float | None:
46
+ cmd = [
47
+ "ffprobe",
48
+ "-v",
49
+ "error",
50
+ "-show_entries",
51
+ "format=duration",
52
+ "-of",
53
+ "default=noprint_wrappers=1:nokey=1",
54
+ audio_file,
55
+ ]
56
+ proc = subprocess.run(cmd, capture_output=True, text=True)
57
+ if proc.returncode != 0:
58
+ return None
59
+ try:
60
+ return float(proc.stdout.strip())
61
+ except Exception:
62
+ return None