Spaces:
Build error
Build error
Commit ·
1db40b9
1
Parent(s): a8f3b8c
Refactor To Only Use Whisper Turbo And Parakeet
Browse files- README.md +22 -15
- app.py +15 -95
- local_api_benchmark.py +9 -7
- requirements.txt +2 -2
- src/constants.py +3 -9
- src/models/faster_whisper_model.py +0 -98
- src/models/parakeet_model.py +49 -2
- src/models/whisper_cpp_model.py +0 -77
- src/models/whisper_openai_model.py +0 -79
- src/models/whisper_turbo_model.py +92 -0
- src/transcription_service.py +3 -26
- src/utils.py +21 -0
README.md
CHANGED
|
@@ -9,19 +9,19 @@ python_version: '3.12'
|
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
| 11 |
license: mit
|
| 12 |
-
short_description:
|
| 13 |
---
|
| 14 |
|
| 15 |
This Space is optimized for API usage on ZeroGPU.
|
| 16 |
|
| 17 |
Supported models (word-level timestamp capable):
|
| 18 |
-
- Whisper Large V3
|
| 19 |
- Whisper Large V3 Turbo
|
| 20 |
-
- Whisper.cpp (large)
|
| 21 |
-
- Whisper faster (large)
|
| 22 |
- NVIDIA Parakeet v3
|
| 23 |
|
| 24 |
Omitted:
|
|
|
|
|
|
|
|
|
|
| 25 |
- IBM Granite Speech 3.3 8B (no stable, documented word-level timestamp output in standard inference APIs)
|
| 26 |
|
| 27 |
Every transcription response returns:
|
|
@@ -29,23 +29,16 @@ Every transcription response returns:
|
|
| 29 |
- `zerogpu_timing.gpu_window_seconds`
|
| 30 |
- `zerogpu_timing.inference_seconds`
|
| 31 |
|
| 32 |
-
Whisper.cpp notes:
|
| 33 |
-
- Requires a whisper.cpp binary and a model file.
|
| 34 |
-
- Configure with env vars:
|
| 35 |
-
- `WHISPER_CPP_BIN` (default: `whisper-cli`)
|
| 36 |
-
- `WHISPER_CPP_MODEL_LARGE` (path to ggml model)
|
| 37 |
-
|
| 38 |
API endpoints:
|
| 39 |
- `/transcribe_selected`
|
| 40 |
-
- `/transcribe_whisper_large_v3`
|
| 41 |
- `/transcribe_whisper_large_v3_turbo`
|
| 42 |
-
- `/transcribe_whisper_cpp_large`
|
| 43 |
-
- `/transcribe_whisper_faster_large`
|
| 44 |
- `/transcribe_parakeet_v3`
|
| 45 |
|
| 46 |
Local benchmark script (run in IPython):
|
| 47 |
- `local_api_benchmark.py`
|
| 48 |
-
- Calls
|
|
|
|
|
|
|
| 49 |
- Example:
|
| 50 |
```python
|
| 51 |
from local_api_benchmark import run_all_model_apis
|
|
@@ -57,7 +50,21 @@ res = run_all_model_apis(
|
|
| 57 |
language=None,
|
| 58 |
initial_prompt=None,
|
| 59 |
postprocess_prompt=None,
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
save_outputs=True,
|
| 62 |
output_dir="benchmark_outputs",
|
| 63 |
)
|
|
|
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
| 11 |
license: mit
|
| 12 |
+
short_description: Turbo + Parakeet ASR APIs with word-level timestamps
|
| 13 |
---
|
| 14 |
|
| 15 |
This Space is optimized for API usage on ZeroGPU.
|
| 16 |
|
| 17 |
Supported models (word-level timestamp capable):
|
|
|
|
| 18 |
- Whisper Large V3 Turbo
|
|
|
|
|
|
|
| 19 |
- NVIDIA Parakeet v3
|
| 20 |
|
| 21 |
Omitted:
|
| 22 |
+
- Whisper Large V3 (removed from this benchmark-focused app)
|
| 23 |
+
- Whisper.cpp (large) (removed from this benchmark-focused app)
|
| 24 |
+
- Whisper faster (large) (removed from this benchmark-focused app)
|
| 25 |
- IBM Granite Speech 3.3 8B (no stable, documented word-level timestamp output in standard inference APIs)
|
| 26 |
|
| 27 |
Every transcription response returns:
|
|
|
|
| 29 |
- `zerogpu_timing.gpu_window_seconds`
|
| 30 |
- `zerogpu_timing.inference_seconds`
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
API endpoints:
|
| 33 |
- `/transcribe_selected`
|
|
|
|
| 34 |
- `/transcribe_whisper_large_v3_turbo`
|
|
|
|
|
|
|
| 35 |
- `/transcribe_parakeet_v3`
|
| 36 |
|
| 37 |
Local benchmark script (run in IPython):
|
| 38 |
- `local_api_benchmark.py`
|
| 39 |
+
- Calls only these two endpoints sequentially and returns all raw outputs + timings:
|
| 40 |
+
- `/transcribe_whisper_large_v3_turbo`
|
| 41 |
+
- `/transcribe_parakeet_v3`
|
| 42 |
- Example:
|
| 43 |
```python
|
| 44 |
from local_api_benchmark import run_all_model_apis
|
|
|
|
| 50 |
language=None,
|
| 51 |
initial_prompt=None,
|
| 52 |
postprocess_prompt=None,
|
| 53 |
+
model_options_by_model={
|
| 54 |
+
"Whisper Large V3 Turbo": {
|
| 55 |
+
"chunk_length_s": 30,
|
| 56 |
+
"batch_size": 16,
|
| 57 |
+
"long_audio_threshold_seconds": 120,
|
| 58 |
+
"num_beams": 1,
|
| 59 |
+
},
|
| 60 |
+
"NVIDIA Parakeet v3": {
|
| 61 |
+
"batch_size": 1,
|
| 62 |
+
"long_audio_threshold_seconds": 480,
|
| 63 |
+
"local_attention_left": 256,
|
| 64 |
+
"local_attention_right": 256,
|
| 65 |
+
"subsampling_conv_chunking_factor": 1,
|
| 66 |
+
},
|
| 67 |
+
},
|
| 68 |
save_outputs=True,
|
| 69 |
output_dir="benchmark_outputs",
|
| 70 |
)
|
app.py
CHANGED
|
@@ -1,15 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import spaces
|
| 3 |
|
| 4 |
-
from src.constants import
|
| 5 |
-
OMITTED_MODELS,
|
| 6 |
-
PARAKEET_V3,
|
| 7 |
-
SUPPORTED_MODELS,
|
| 8 |
-
WHISPER_CPP_LARGE,
|
| 9 |
-
WHISPER_FASTER_LARGE,
|
| 10 |
-
WHISPER_LARGE_V3,
|
| 11 |
-
WHISPER_LARGE_V3_TURBO,
|
| 12 |
-
)
|
| 13 |
from src.transcription_service import dispatch_transcription
|
| 14 |
|
| 15 |
|
|
@@ -34,26 +35,6 @@ def transcribe_selected_model(
|
|
| 34 |
)
|
| 35 |
|
| 36 |
|
| 37 |
-
@spaces.GPU
|
| 38 |
-
def transcribe_whisper_large_v3(
|
| 39 |
-
audio_file,
|
| 40 |
-
task,
|
| 41 |
-
language,
|
| 42 |
-
initial_prompt,
|
| 43 |
-
postprocess_prompt,
|
| 44 |
-
model_options_json,
|
| 45 |
-
):
|
| 46 |
-
return dispatch_transcription(
|
| 47 |
-
audio_file,
|
| 48 |
-
WHISPER_LARGE_V3,
|
| 49 |
-
task,
|
| 50 |
-
language,
|
| 51 |
-
initial_prompt,
|
| 52 |
-
postprocess_prompt,
|
| 53 |
-
model_options_json,
|
| 54 |
-
)
|
| 55 |
-
|
| 56 |
-
|
| 57 |
@spaces.GPU
|
| 58 |
def transcribe_whisper_large_v3_turbo(
|
| 59 |
audio_file,
|
|
@@ -74,46 +55,6 @@ def transcribe_whisper_large_v3_turbo(
|
|
| 74 |
)
|
| 75 |
|
| 76 |
|
| 77 |
-
@spaces.GPU
|
| 78 |
-
def transcribe_whisper_cpp_large(
|
| 79 |
-
audio_file,
|
| 80 |
-
task,
|
| 81 |
-
language,
|
| 82 |
-
initial_prompt,
|
| 83 |
-
postprocess_prompt,
|
| 84 |
-
model_options_json,
|
| 85 |
-
):
|
| 86 |
-
return dispatch_transcription(
|
| 87 |
-
audio_file,
|
| 88 |
-
WHISPER_CPP_LARGE,
|
| 89 |
-
task,
|
| 90 |
-
language,
|
| 91 |
-
initial_prompt,
|
| 92 |
-
postprocess_prompt,
|
| 93 |
-
model_options_json,
|
| 94 |
-
)
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
@spaces.GPU
|
| 98 |
-
def transcribe_whisper_faster_large(
|
| 99 |
-
audio_file,
|
| 100 |
-
task,
|
| 101 |
-
language,
|
| 102 |
-
initial_prompt,
|
| 103 |
-
postprocess_prompt,
|
| 104 |
-
model_options_json,
|
| 105 |
-
):
|
| 106 |
-
return dispatch_transcription(
|
| 107 |
-
audio_file,
|
| 108 |
-
WHISPER_FASTER_LARGE,
|
| 109 |
-
task,
|
| 110 |
-
language,
|
| 111 |
-
initial_prompt,
|
| 112 |
-
postprocess_prompt,
|
| 113 |
-
model_options_json,
|
| 114 |
-
)
|
| 115 |
-
|
| 116 |
-
|
| 117 |
@spaces.GPU
|
| 118 |
def transcribe_parakeet_v3(
|
| 119 |
audio_file,
|
|
@@ -134,10 +75,10 @@ def transcribe_parakeet_v3(
|
|
| 134 |
)
|
| 135 |
|
| 136 |
|
| 137 |
-
with gr.Blocks(title="
|
| 138 |
gr.Markdown(
|
| 139 |
-
"#
|
| 140 |
-
"
|
| 141 |
)
|
| 142 |
|
| 143 |
with gr.Row():
|
|
@@ -150,7 +91,7 @@ with gr.Blocks(title="Multi-model ASR (ZeroGPU)") as demo:
|
|
| 150 |
with gr.Row():
|
| 151 |
model_label = gr.Dropdown(
|
| 152 |
choices=SUPPORTED_MODELS,
|
| 153 |
-
value=
|
| 154 |
label="Model",
|
| 155 |
)
|
| 156 |
task = gr.Radio(
|
|
@@ -170,7 +111,7 @@ with gr.Blocks(title="Multi-model ASR (ZeroGPU)") as demo:
|
|
| 170 |
|
| 171 |
model_options_json = gr.Textbox(
|
| 172 |
label="Model options JSON (optional)",
|
| 173 |
-
placeholder='{"
|
| 174 |
lines=3,
|
| 175 |
)
|
| 176 |
|
|
@@ -196,36 +137,15 @@ with gr.Blocks(title="Multi-model ASR (ZeroGPU)") as demo:
|
|
| 196 |
|
| 197 |
# Hidden controls used only to expose dedicated API routes per model.
|
| 198 |
with gr.Row(visible=False):
|
| 199 |
-
api_btn_wlv3 = gr.Button("transcribe_whisper_large_v3")
|
| 200 |
api_btn_wlv3t = gr.Button("transcribe_whisper_large_v3_turbo")
|
| 201 |
-
api_btn_wcpp = gr.Button("transcribe_whisper_cpp_large")
|
| 202 |
-
api_btn_fw = gr.Button("transcribe_whisper_faster_large")
|
| 203 |
api_btn_parakeet = gr.Button("transcribe_parakeet_v3")
|
| 204 |
|
| 205 |
-
api_btn_wlv3.click(
|
| 206 |
-
fn=transcribe_whisper_large_v3,
|
| 207 |
-
inputs=shared_inputs,
|
| 208 |
-
outputs=output,
|
| 209 |
-
api_name="transcribe_whisper_large_v3",
|
| 210 |
-
)
|
| 211 |
api_btn_wlv3t.click(
|
| 212 |
fn=transcribe_whisper_large_v3_turbo,
|
| 213 |
inputs=shared_inputs,
|
| 214 |
outputs=output,
|
| 215 |
api_name="transcribe_whisper_large_v3_turbo",
|
| 216 |
)
|
| 217 |
-
api_btn_wcpp.click(
|
| 218 |
-
fn=transcribe_whisper_cpp_large,
|
| 219 |
-
inputs=shared_inputs,
|
| 220 |
-
outputs=output,
|
| 221 |
-
api_name="transcribe_whisper_cpp_large",
|
| 222 |
-
)
|
| 223 |
-
api_btn_fw.click(
|
| 224 |
-
fn=transcribe_whisper_faster_large,
|
| 225 |
-
inputs=shared_inputs,
|
| 226 |
-
outputs=output,
|
| 227 |
-
api_name="transcribe_whisper_faster_large",
|
| 228 |
-
)
|
| 229 |
api_btn_parakeet.click(
|
| 230 |
fn=transcribe_parakeet_v3,
|
| 231 |
inputs=shared_inputs,
|
|
|
|
| 1 |
+
import warnings
|
| 2 |
+
|
| 3 |
+
# Suppress a known deprecation warning emitted by a transitive dependency in spaces.
|
| 4 |
+
warnings.filterwarnings(
|
| 5 |
+
"ignore",
|
| 6 |
+
message=r"`torch\.distributed\.reduce_op` is deprecated, please use `torch\.distributed\.ReduceOp` instead",
|
| 7 |
+
category=FutureWarning,
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
import gradio as gr
|
| 11 |
import spaces
|
| 12 |
|
| 13 |
+
from src.constants import OMITTED_MODELS, PARAKEET_V3, SUPPORTED_MODELS, WHISPER_LARGE_V3_TURBO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
from src.transcription_service import dispatch_transcription
|
| 15 |
|
| 16 |
|
|
|
|
| 35 |
)
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
@spaces.GPU
|
| 39 |
def transcribe_whisper_large_v3_turbo(
|
| 40 |
audio_file,
|
|
|
|
| 55 |
)
|
| 56 |
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
@spaces.GPU
|
| 59 |
def transcribe_parakeet_v3(
|
| 60 |
audio_file,
|
|
|
|
| 75 |
)
|
| 76 |
|
| 77 |
|
| 78 |
+
with gr.Blocks(title="Dual-model ASR (ZeroGPU)") as demo:
|
| 79 |
gr.Markdown(
|
| 80 |
+
"# Turbo + Parakeet transcription APIs (ZeroGPU)\n"
|
| 81 |
+
"Focused benchmark app exposing only Whisper Large V3 Turbo and NVIDIA Parakeet v3."
|
| 82 |
)
|
| 83 |
|
| 84 |
with gr.Row():
|
|
|
|
| 91 |
with gr.Row():
|
| 92 |
model_label = gr.Dropdown(
|
| 93 |
choices=SUPPORTED_MODELS,
|
| 94 |
+
value=WHISPER_LARGE_V3_TURBO,
|
| 95 |
label="Model",
|
| 96 |
)
|
| 97 |
task = gr.Radio(
|
|
|
|
| 111 |
|
| 112 |
model_options_json = gr.Textbox(
|
| 113 |
label="Model options JSON (optional)",
|
| 114 |
+
placeholder='{"chunk_length_s": 30, "batch_size": 16}',
|
| 115 |
lines=3,
|
| 116 |
)
|
| 117 |
|
|
|
|
| 137 |
|
| 138 |
# Hidden controls used only to expose dedicated API routes per model.
|
| 139 |
with gr.Row(visible=False):
|
|
|
|
| 140 |
api_btn_wlv3t = gr.Button("transcribe_whisper_large_v3_turbo")
|
|
|
|
|
|
|
| 141 |
api_btn_parakeet = gr.Button("transcribe_parakeet_v3")
|
| 142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
api_btn_wlv3t.click(
|
| 144 |
fn=transcribe_whisper_large_v3_turbo,
|
| 145 |
inputs=shared_inputs,
|
| 146 |
outputs=output,
|
| 147 |
api_name="transcribe_whisper_large_v3_turbo",
|
| 148 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
api_btn_parakeet.click(
|
| 150 |
fn=transcribe_parakeet_v3,
|
| 151 |
inputs=shared_inputs,
|
local_api_benchmark.py
CHANGED
|
@@ -8,17 +8,11 @@ from gradio_client import Client, handle_file
|
|
| 8 |
|
| 9 |
from src.constants import (
|
| 10 |
PARAKEET_V3,
|
| 11 |
-
WHISPER_CPP_LARGE,
|
| 12 |
-
WHISPER_FASTER_LARGE,
|
| 13 |
-
WHISPER_LARGE_V3,
|
| 14 |
WHISPER_LARGE_V3_TURBO,
|
| 15 |
)
|
| 16 |
|
| 17 |
MODEL_API_BY_LABEL = {
|
| 18 |
-
WHISPER_LARGE_V3: "/transcribe_whisper_large_v3",
|
| 19 |
WHISPER_LARGE_V3_TURBO: "/transcribe_whisper_large_v3_turbo",
|
| 20 |
-
WHISPER_CPP_LARGE: "/transcribe_whisper_cpp_large",
|
| 21 |
-
WHISPER_FASTER_LARGE: "/transcribe_whisper_faster_large",
|
| 22 |
PARAKEET_V3: "/transcribe_parakeet_v3",
|
| 23 |
}
|
| 24 |
|
|
@@ -110,6 +104,7 @@ def run_all_model_apis(
|
|
| 110 |
initial_prompt: str | None = None,
|
| 111 |
postprocess_prompt: str | None = None,
|
| 112 |
model_options: str | dict[str, Any] | None = None,
|
|
|
|
| 113 |
models: list[str] | None = None,
|
| 114 |
hf_token: str | None = None,
|
| 115 |
save_outputs: bool = True,
|
|
@@ -118,6 +113,7 @@ def run_all_model_apis(
|
|
| 118 |
"""Run each model-specific API endpoint one by one and collect full outputs.
|
| 119 |
|
| 120 |
Designed for use from IPython notebooks/scripts.
|
|
|
|
| 121 |
"""
|
| 122 |
if models is None:
|
| 123 |
model_sequence = list(MODEL_API_BY_LABEL.keys())
|
|
@@ -135,6 +131,9 @@ def run_all_model_apis(
|
|
| 135 |
|
| 136 |
for model in model_sequence:
|
| 137 |
api_name = MODEL_API_BY_LABEL[model]
|
|
|
|
|
|
|
|
|
|
| 138 |
call_start = time.perf_counter()
|
| 139 |
try:
|
| 140 |
response = client.predict(
|
|
@@ -143,7 +142,7 @@ def run_all_model_apis(
|
|
| 143 |
language=language,
|
| 144 |
initial_prompt=initial_prompt,
|
| 145 |
postprocess_prompt=postprocess_prompt,
|
| 146 |
-
model_options_json=
|
| 147 |
api_name=api_name,
|
| 148 |
)
|
| 149 |
call_end = time.perf_counter()
|
|
@@ -153,6 +152,7 @@ def run_all_model_apis(
|
|
| 153 |
"api_name": api_name,
|
| 154 |
"status": "ok",
|
| 155 |
"client_wall_clock_seconds": round(call_end - call_start, 4),
|
|
|
|
| 156 |
"result": response,
|
| 157 |
}
|
| 158 |
)
|
|
@@ -164,6 +164,7 @@ def run_all_model_apis(
|
|
| 164 |
"api_name": api_name,
|
| 165 |
"status": "error",
|
| 166 |
"client_wall_clock_seconds": round(call_end - call_start, 4),
|
|
|
|
| 167 |
"error": str(exc),
|
| 168 |
}
|
| 169 |
)
|
|
@@ -178,6 +179,7 @@ def run_all_model_apis(
|
|
| 178 |
"initial_prompt": initial_prompt,
|
| 179 |
"postprocess_prompt": postprocess_prompt,
|
| 180 |
"model_options_json": options_json,
|
|
|
|
| 181 |
"models": model_sequence,
|
| 182 |
"benchmark_timing": {
|
| 183 |
"total_client_wall_clock_seconds": round(finished_at - started_at, 4),
|
|
|
|
| 8 |
|
| 9 |
from src.constants import (
|
| 10 |
PARAKEET_V3,
|
|
|
|
|
|
|
|
|
|
| 11 |
WHISPER_LARGE_V3_TURBO,
|
| 12 |
)
|
| 13 |
|
| 14 |
MODEL_API_BY_LABEL = {
|
|
|
|
| 15 |
WHISPER_LARGE_V3_TURBO: "/transcribe_whisper_large_v3_turbo",
|
|
|
|
|
|
|
| 16 |
PARAKEET_V3: "/transcribe_parakeet_v3",
|
| 17 |
}
|
| 18 |
|
|
|
|
| 104 |
initial_prompt: str | None = None,
|
| 105 |
postprocess_prompt: str | None = None,
|
| 106 |
model_options: str | dict[str, Any] | None = None,
|
| 107 |
+
model_options_by_model: dict[str, str | dict[str, Any]] | None = None,
|
| 108 |
models: list[str] | None = None,
|
| 109 |
hf_token: str | None = None,
|
| 110 |
save_outputs: bool = True,
|
|
|
|
| 113 |
"""Run each model-specific API endpoint one by one and collect full outputs.
|
| 114 |
|
| 115 |
Designed for use from IPython notebooks/scripts.
|
| 116 |
+
Use model_options_by_model for per-model tuning in a single benchmark run.
|
| 117 |
"""
|
| 118 |
if models is None:
|
| 119 |
model_sequence = list(MODEL_API_BY_LABEL.keys())
|
|
|
|
| 131 |
|
| 132 |
for model in model_sequence:
|
| 133 |
api_name = MODEL_API_BY_LABEL[model]
|
| 134 |
+
effective_options_json = options_json
|
| 135 |
+
if model_options_by_model and model in model_options_by_model:
|
| 136 |
+
effective_options_json = _to_model_options_json(model_options_by_model[model])
|
| 137 |
call_start = time.perf_counter()
|
| 138 |
try:
|
| 139 |
response = client.predict(
|
|
|
|
| 142 |
language=language,
|
| 143 |
initial_prompt=initial_prompt,
|
| 144 |
postprocess_prompt=postprocess_prompt,
|
| 145 |
+
model_options_json=effective_options_json,
|
| 146 |
api_name=api_name,
|
| 147 |
)
|
| 148 |
call_end = time.perf_counter()
|
|
|
|
| 152 |
"api_name": api_name,
|
| 153 |
"status": "ok",
|
| 154 |
"client_wall_clock_seconds": round(call_end - call_start, 4),
|
| 155 |
+
"effective_model_options_json": effective_options_json,
|
| 156 |
"result": response,
|
| 157 |
}
|
| 158 |
)
|
|
|
|
| 164 |
"api_name": api_name,
|
| 165 |
"status": "error",
|
| 166 |
"client_wall_clock_seconds": round(call_end - call_start, 4),
|
| 167 |
+
"effective_model_options_json": effective_options_json,
|
| 168 |
"error": str(exc),
|
| 169 |
}
|
| 170 |
)
|
|
|
|
| 179 |
"initial_prompt": initial_prompt,
|
| 180 |
"postprocess_prompt": postprocess_prompt,
|
| 181 |
"model_options_json": options_json,
|
| 182 |
+
"model_options_by_model": model_options_by_model,
|
| 183 |
"models": model_sequence,
|
| 184 |
"benchmark_timing": {
|
| 185 |
"total_client_wall_clock_seconds": round(finished_at - started_at, 4),
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
torch>=2.3.0
|
| 2 |
-
|
| 3 |
-
|
| 4 |
nemo_toolkit[asr]>=2.0.0
|
|
|
|
| 1 |
torch>=2.3.0
|
| 2 |
+
transformers>=4.46.0
|
| 3 |
+
accelerate>=1.1.0
|
| 4 |
nemo_toolkit[asr]>=2.0.0
|
src/constants.py
CHANGED
|
@@ -1,20 +1,15 @@
|
|
| 1 |
-
BATCH_SIZE = 8
|
| 2 |
-
|
| 3 |
-
WHISPER_LARGE_V3 = "Whisper Large V3"
|
| 4 |
WHISPER_LARGE_V3_TURBO = "Whisper Large V3 Turbo"
|
| 5 |
-
WHISPER_CPP_LARGE = "Whisper.cpp (large)"
|
| 6 |
-
WHISPER_FASTER_LARGE = "Whisper faster (large)"
|
| 7 |
PARAKEET_V3 = "NVIDIA Parakeet v3"
|
| 8 |
|
| 9 |
SUPPORTED_MODELS = [
|
| 10 |
-
WHISPER_LARGE_V3,
|
| 11 |
WHISPER_LARGE_V3_TURBO,
|
| 12 |
-
WHISPER_CPP_LARGE,
|
| 13 |
-
WHISPER_FASTER_LARGE,
|
| 14 |
PARAKEET_V3,
|
| 15 |
]
|
| 16 |
|
| 17 |
OMITTED_MODELS = {
|
|
|
|
|
|
|
|
|
|
| 18 |
"IBM Granite Speech 3.3 8B": (
|
| 19 |
"Omitted because a stable, documented word-level timestamp interface is not available "
|
| 20 |
"in standard inference usage."
|
|
@@ -22,7 +17,6 @@ OMITTED_MODELS = {
|
|
| 22 |
}
|
| 23 |
|
| 24 |
MODEL_IDS = {
|
| 25 |
-
WHISPER_LARGE_V3: "openai/whisper-large-v3",
|
| 26 |
WHISPER_LARGE_V3_TURBO: "openai/whisper-large-v3-turbo",
|
| 27 |
PARAKEET_V3: "nvidia/parakeet-tdt-0.6b-v3",
|
| 28 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
WHISPER_LARGE_V3_TURBO = "Whisper Large V3 Turbo"
|
|
|
|
|
|
|
| 2 |
PARAKEET_V3 = "NVIDIA Parakeet v3"
|
| 3 |
|
| 4 |
SUPPORTED_MODELS = [
|
|
|
|
| 5 |
WHISPER_LARGE_V3_TURBO,
|
|
|
|
|
|
|
| 6 |
PARAKEET_V3,
|
| 7 |
]
|
| 8 |
|
| 9 |
OMITTED_MODELS = {
|
| 10 |
+
"Whisper Large V3": "Removed from this benchmark-focused app per configuration.",
|
| 11 |
+
"Whisper.cpp (large)": "Removed from this benchmark-focused app per configuration.",
|
| 12 |
+
"Whisper faster (large)": "Removed from this benchmark-focused app per configuration.",
|
| 13 |
"IBM Granite Speech 3.3 8B": (
|
| 14 |
"Omitted because a stable, documented word-level timestamp interface is not available "
|
| 15 |
"in standard inference usage."
|
|
|
|
| 17 |
}
|
| 18 |
|
| 19 |
MODEL_IDS = {
|
|
|
|
| 20 |
WHISPER_LARGE_V3_TURBO: "openai/whisper-large-v3-turbo",
|
| 21 |
PARAKEET_V3: "nvidia/parakeet-tdt-0.6b-v3",
|
| 22 |
}
|
src/models/faster_whisper_model.py
DELETED
|
@@ -1,98 +0,0 @@
|
|
| 1 |
-
import time
|
| 2 |
-
from typing import Any
|
| 3 |
-
|
| 4 |
-
import gradio as gr
|
| 5 |
-
import torch
|
| 6 |
-
|
| 7 |
-
from src.utils import serialize
|
| 8 |
-
|
| 9 |
-
_FASTER_WHISPER_MODELS: dict[str, Any] = {}
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
def _get_faster_whisper_model(model_options: dict[str, Any]):
|
| 13 |
-
model_size = model_options.get("model_size", "large-v3")
|
| 14 |
-
compute_type = model_options.get(
|
| 15 |
-
"compute_type",
|
| 16 |
-
"float16" if torch.cuda.is_available() else "int8",
|
| 17 |
-
)
|
| 18 |
-
cache_key = f"{model_size}:{compute_type}"
|
| 19 |
-
if cache_key in _FASTER_WHISPER_MODELS:
|
| 20 |
-
return _FASTER_WHISPER_MODELS[cache_key], model_size, compute_type
|
| 21 |
-
|
| 22 |
-
try:
|
| 23 |
-
from faster_whisper import WhisperModel
|
| 24 |
-
except Exception as exc:
|
| 25 |
-
raise gr.Error(
|
| 26 |
-
"faster-whisper backend requested but package is missing. "
|
| 27 |
-
"Add faster-whisper to requirements.txt"
|
| 28 |
-
) from exc
|
| 29 |
-
|
| 30 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 31 |
-
model = WhisperModel(model_size, device=device, compute_type=compute_type)
|
| 32 |
-
_FASTER_WHISPER_MODELS[cache_key] = model
|
| 33 |
-
return model, model_size, compute_type
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def run_faster_whisper(
|
| 37 |
-
audio_file: str,
|
| 38 |
-
task: str,
|
| 39 |
-
language: str,
|
| 40 |
-
initial_prompt: str,
|
| 41 |
-
model_options: dict[str, Any],
|
| 42 |
-
) -> dict[str, Any]:
|
| 43 |
-
model, model_size, compute_type = _get_faster_whisper_model(model_options)
|
| 44 |
-
beam_size = int(model_options.get("beam_size", 5))
|
| 45 |
-
temperature = float(model_options.get("temperature", 0.0))
|
| 46 |
-
vad_filter = bool(model_options.get("vad_filter", True))
|
| 47 |
-
|
| 48 |
-
infer_start = time.perf_counter()
|
| 49 |
-
segments, info = model.transcribe(
|
| 50 |
-
audio_file,
|
| 51 |
-
task=task,
|
| 52 |
-
language=language or None,
|
| 53 |
-
initial_prompt=initial_prompt or None,
|
| 54 |
-
word_timestamps=True,
|
| 55 |
-
beam_size=beam_size,
|
| 56 |
-
temperature=temperature,
|
| 57 |
-
vad_filter=vad_filter,
|
| 58 |
-
)
|
| 59 |
-
segments_list = list(segments)
|
| 60 |
-
infer_end = time.perf_counter()
|
| 61 |
-
|
| 62 |
-
raw_output = {
|
| 63 |
-
"info": serialize(info),
|
| 64 |
-
"segments": [
|
| 65 |
-
{
|
| 66 |
-
"id": seg.id,
|
| 67 |
-
"seek": seg.seek,
|
| 68 |
-
"start": seg.start,
|
| 69 |
-
"end": seg.end,
|
| 70 |
-
"text": seg.text,
|
| 71 |
-
"tokens": list(seg.tokens) if seg.tokens is not None else None,
|
| 72 |
-
"avg_logprob": seg.avg_logprob,
|
| 73 |
-
"compression_ratio": seg.compression_ratio,
|
| 74 |
-
"no_speech_prob": seg.no_speech_prob,
|
| 75 |
-
"words": [
|
| 76 |
-
{
|
| 77 |
-
"start": w.start,
|
| 78 |
-
"end": w.end,
|
| 79 |
-
"word": w.word,
|
| 80 |
-
"probability": w.probability,
|
| 81 |
-
}
|
| 82 |
-
for w in (seg.words or [])
|
| 83 |
-
],
|
| 84 |
-
}
|
| 85 |
-
for seg in segments_list
|
| 86 |
-
],
|
| 87 |
-
"runtime": {
|
| 88 |
-
"model_size": model_size,
|
| 89 |
-
"compute_type": compute_type,
|
| 90 |
-
},
|
| 91 |
-
}
|
| 92 |
-
|
| 93 |
-
return {
|
| 94 |
-
"raw_output": serialize(raw_output),
|
| 95 |
-
"timing": {
|
| 96 |
-
"inference_seconds": round(infer_end - infer_start, 4),
|
| 97 |
-
},
|
| 98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/models/parakeet_model.py
CHANGED
|
@@ -4,7 +4,7 @@ import gradio as gr
|
|
| 4 |
import torch
|
| 5 |
|
| 6 |
from src.constants import MODEL_IDS, PARAKEET_V3
|
| 7 |
-
from src.utils import serialize
|
| 8 |
|
| 9 |
_PARAKEET_MODEL = None
|
| 10 |
|
|
@@ -36,9 +36,44 @@ def run_parakeet(
|
|
| 36 |
) -> dict:
|
| 37 |
model = _get_parakeet_model()
|
| 38 |
batch_size = int(model_options.get("batch_size", 1))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
infer_start = time.perf_counter()
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
infer_end = time.perf_counter()
|
| 43 |
|
| 44 |
item = outputs[0] if outputs else None
|
|
@@ -46,6 +81,18 @@ def run_parakeet(
|
|
| 46 |
"output": serialize(item),
|
| 47 |
"timestamp_hint": "word timestamps available in output.timestamp['word'] when provided by NeMo",
|
| 48 |
"language_hint": language or "auto",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
}
|
| 50 |
|
| 51 |
return {
|
|
|
|
| 4 |
import torch
|
| 5 |
|
| 6 |
from src.constants import MODEL_IDS, PARAKEET_V3
|
| 7 |
+
from src.utils import get_audio_duration_seconds, serialize
|
| 8 |
|
| 9 |
_PARAKEET_MODEL = None
|
| 10 |
|
|
|
|
| 36 |
) -> dict:
|
| 37 |
model = _get_parakeet_model()
|
| 38 |
batch_size = int(model_options.get("batch_size", 1))
|
| 39 |
+
long_audio_threshold_seconds = float(model_options.get("long_audio_threshold_seconds", 480))
|
| 40 |
+
local_attention_left = int(model_options.get("local_attention_left", 256))
|
| 41 |
+
local_attention_right = int(model_options.get("local_attention_right", 256))
|
| 42 |
+
subsampling_conv_chunking_factor = int(model_options.get("subsampling_conv_chunking_factor", 1))
|
| 43 |
+
enable_long_audio_optimizations = bool(model_options.get("enable_long_audio_optimizations", True))
|
| 44 |
+
|
| 45 |
+
duration_seconds = get_audio_duration_seconds(audio_file)
|
| 46 |
+
is_long_audio = duration_seconds is not None and duration_seconds > long_audio_threshold_seconds
|
| 47 |
+
applied_local_attention = False
|
| 48 |
+
applied_subsampling_chunking = False
|
| 49 |
+
optimization_errors: list[str] = []
|
| 50 |
+
|
| 51 |
+
if enable_long_audio_optimizations and is_long_audio:
|
| 52 |
+
try:
|
| 53 |
+
model.change_attention_model("rel_pos_local_attn", [local_attention_left, local_attention_right])
|
| 54 |
+
applied_local_attention = True
|
| 55 |
+
except Exception as exc:
|
| 56 |
+
optimization_errors.append(f"change_attention_model failed: {exc}")
|
| 57 |
+
try:
|
| 58 |
+
model.change_subsampling_conv_chunking_factor(subsampling_conv_chunking_factor)
|
| 59 |
+
applied_subsampling_chunking = True
|
| 60 |
+
except Exception as exc:
|
| 61 |
+
optimization_errors.append(f"change_subsampling_conv_chunking_factor failed: {exc}")
|
| 62 |
|
| 63 |
infer_start = time.perf_counter()
|
| 64 |
+
try:
|
| 65 |
+
outputs = model.transcribe([audio_file], batch_size=batch_size, timestamps=True)
|
| 66 |
+
finally:
|
| 67 |
+
if applied_local_attention:
|
| 68 |
+
try:
|
| 69 |
+
model.change_attention_model("rel_pos")
|
| 70 |
+
except Exception:
|
| 71 |
+
pass
|
| 72 |
+
if applied_subsampling_chunking:
|
| 73 |
+
try:
|
| 74 |
+
model.change_subsampling_conv_chunking_factor(-1)
|
| 75 |
+
except Exception:
|
| 76 |
+
pass
|
| 77 |
infer_end = time.perf_counter()
|
| 78 |
|
| 79 |
item = outputs[0] if outputs else None
|
|
|
|
| 81 |
"output": serialize(item),
|
| 82 |
"timestamp_hint": "word timestamps available in output.timestamp['word'] when provided by NeMo",
|
| 83 |
"language_hint": language or "auto",
|
| 84 |
+
"long_audio_settings": {
|
| 85 |
+
"duration_seconds": duration_seconds,
|
| 86 |
+
"is_long_audio": is_long_audio,
|
| 87 |
+
"threshold_seconds": long_audio_threshold_seconds,
|
| 88 |
+
"enable_long_audio_optimizations": enable_long_audio_optimizations,
|
| 89 |
+
"applied_local_attention": applied_local_attention,
|
| 90 |
+
"applied_subsampling_chunking": applied_subsampling_chunking,
|
| 91 |
+
"local_attention_left": local_attention_left,
|
| 92 |
+
"local_attention_right": local_attention_right,
|
| 93 |
+
"subsampling_conv_chunking_factor": subsampling_conv_chunking_factor,
|
| 94 |
+
"optimization_errors": optimization_errors,
|
| 95 |
+
},
|
| 96 |
}
|
| 97 |
|
| 98 |
return {
|
src/models/whisper_cpp_model.py
DELETED
|
@@ -1,77 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import os
|
| 3 |
-
import subprocess
|
| 4 |
-
import tempfile
|
| 5 |
-
import time
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
|
| 8 |
-
import gradio as gr
|
| 9 |
-
|
| 10 |
-
from src.utils import serialize
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def run_whisper_cpp(
|
| 14 |
-
audio_file: str,
|
| 15 |
-
task: str,
|
| 16 |
-
language: str,
|
| 17 |
-
initial_prompt: str,
|
| 18 |
-
model_options: dict,
|
| 19 |
-
) -> dict:
|
| 20 |
-
whisper_cpp_bin = model_options.get("whisper_cpp_bin") or os.getenv("WHISPER_CPP_BIN", "whisper-cli")
|
| 21 |
-
whisper_cpp_model = model_options.get("whisper_cpp_model") or os.getenv("WHISPER_CPP_MODEL_LARGE")
|
| 22 |
-
if not whisper_cpp_model:
|
| 23 |
-
raise gr.Error(
|
| 24 |
-
"Whisper.cpp requires model path. Set WHISPER_CPP_MODEL_LARGE or pass "
|
| 25 |
-
"model_options_json={\"whisper_cpp_model\":\"/path/to/ggml-large-v3.bin\"}."
|
| 26 |
-
)
|
| 27 |
-
|
| 28 |
-
with tempfile.TemporaryDirectory() as tmpdir:
|
| 29 |
-
output_prefix = str(Path(tmpdir) / "whispercpp")
|
| 30 |
-
cmd = [
|
| 31 |
-
whisper_cpp_bin,
|
| 32 |
-
"-m",
|
| 33 |
-
whisper_cpp_model,
|
| 34 |
-
"-f",
|
| 35 |
-
audio_file,
|
| 36 |
-
"-oj",
|
| 37 |
-
"-ml",
|
| 38 |
-
"1",
|
| 39 |
-
"-of",
|
| 40 |
-
output_prefix,
|
| 41 |
-
]
|
| 42 |
-
|
| 43 |
-
if language:
|
| 44 |
-
cmd.extend(["-l", language])
|
| 45 |
-
if initial_prompt:
|
| 46 |
-
cmd.extend(["--prompt", initial_prompt])
|
| 47 |
-
if task == "translate":
|
| 48 |
-
cmd.append("-tr")
|
| 49 |
-
|
| 50 |
-
infer_start = time.perf_counter()
|
| 51 |
-
proc = subprocess.run(cmd, capture_output=True, text=True)
|
| 52 |
-
infer_end = time.perf_counter()
|
| 53 |
-
|
| 54 |
-
if proc.returncode != 0:
|
| 55 |
-
raise gr.Error(
|
| 56 |
-
"whisper.cpp transcription failed. "
|
| 57 |
-
f"exit={proc.returncode} stderr={proc.stderr[-1500:]}"
|
| 58 |
-
)
|
| 59 |
-
|
| 60 |
-
json_path = Path(f"{output_prefix}.json")
|
| 61 |
-
if not json_path.exists():
|
| 62 |
-
raise gr.Error(
|
| 63 |
-
"whisper.cpp did not produce JSON output. "
|
| 64 |
-
"Ensure your whisper.cpp binary supports -oj and word timestamps (-ml 1)."
|
| 65 |
-
)
|
| 66 |
-
|
| 67 |
-
raw_output = json.loads(json_path.read_text())
|
| 68 |
-
|
| 69 |
-
return {
|
| 70 |
-
"raw_output": {
|
| 71 |
-
"result": serialize(raw_output),
|
| 72 |
-
"stderr": proc.stderr,
|
| 73 |
-
},
|
| 74 |
-
"timing": {
|
| 75 |
-
"inference_seconds": round(infer_end - infer_start, 4),
|
| 76 |
-
},
|
| 77 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/models/whisper_openai_model.py
DELETED
|
@@ -1,79 +0,0 @@
|
|
| 1 |
-
import time
|
| 2 |
-
from typing import Any
|
| 3 |
-
|
| 4 |
-
import gradio as gr
|
| 5 |
-
import torch
|
| 6 |
-
import whisper
|
| 7 |
-
|
| 8 |
-
from src.constants import WHISPER_LARGE_V3, WHISPER_LARGE_V3_TURBO
|
| 9 |
-
from src.utils import serialize
|
| 10 |
-
|
| 11 |
-
_WHISPER_MODELS: dict[str, Any] = {}
|
| 12 |
-
|
| 13 |
-
_OPENAI_MODEL_NAMES = {
|
| 14 |
-
WHISPER_LARGE_V3: "large-v3",
|
| 15 |
-
WHISPER_LARGE_V3_TURBO: "turbo",
|
| 16 |
-
}
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def _get_whisper_model(model_label: str):
|
| 20 |
-
if model_label in _WHISPER_MODELS:
|
| 21 |
-
return _WHISPER_MODELS[model_label]
|
| 22 |
-
|
| 23 |
-
model_name = _OPENAI_MODEL_NAMES.get(model_label)
|
| 24 |
-
if model_name is None:
|
| 25 |
-
raise gr.Error(f"Unsupported Whisper model label: {model_label}")
|
| 26 |
-
|
| 27 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 28 |
-
model = whisper.load_model(model_name, device=device)
|
| 29 |
-
_WHISPER_MODELS[model_label] = model
|
| 30 |
-
return model
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def run_whisper_openai(
    model_label: str,
    audio_file: str,
    task: str,
    language: str,
    initial_prompt: str,
    model_options: dict[str, Any],
) -> dict[str, Any]:
    """Transcribe ``audio_file`` with an openai-whisper model and word timestamps.

    Args:
        model_label: Public label resolved through ``_get_whisper_model``.
        audio_file: Path handed to ``model.transcribe``.
        task: Whisper task value forwarded as ``task``.
        language: Forwarded as ``language`` only when non-empty.
        initial_prompt: Forwarded as ``initial_prompt`` only when non-empty.
        model_options: Optional decoding overrides; recognised keys are
            ``temperature``, ``beam_size``, ``best_of``, ``patience``,
            ``condition_on_previous_text`` and ``suppress_tokens``.

    Returns:
        A dict with ``raw_output`` (backend name, model name, serialized result)
        and ``timing`` (``inference_seconds`` rounded to 4 decimals).
    """
    whisper_model = _get_whisper_model(model_label)

    # Options that are coerced before being forwarded, in forwarding order.
    option_casts = (
        ("temperature", float),
        ("beam_size", int),
        ("best_of", int),
        ("patience", float),
        ("condition_on_previous_text", bool),
    )

    transcribe_args: dict[str, Any] = {"task": task, "word_timestamps": True}
    if language:
        transcribe_args["language"] = language
    if initial_prompt:
        transcribe_args["initial_prompt"] = initial_prompt

    for option_name, cast in option_casts:
        if option_name in model_options:
            transcribe_args[option_name] = cast(model_options[option_name])

    # ``suppress_tokens`` is passed through untouched.
    if "suppress_tokens" in model_options:
        transcribe_args["suppress_tokens"] = model_options["suppress_tokens"]

    # Ensure expected precision behavior when GPU is available.
    transcribe_args["fp16"] = bool(torch.cuda.is_available())

    started = time.perf_counter()
    transcription = whisper_model.transcribe(audio_file, **transcribe_args)
    finished = time.perf_counter()

    raw_section = {
        "backend": "openai-whisper",
        "model_name": _OPENAI_MODEL_NAMES[model_label],
        "result": serialize(transcription),
    }
    timing_section = {"inference_seconds": round(finished - started, 4)}
    return {"raw_output": raw_section, "timing": timing_section}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/models/whisper_turbo_model.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import warnings
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
from src.constants import MODEL_IDS, WHISPER_LARGE_V3_TURBO
|
| 9 |
+
from src.utils import get_audio_duration_seconds, serialize
|
| 10 |
+
|
| 11 |
+
# Cache for the Whisper Turbo pipeline; holds at most one entry (see below).
_TURBO_PIPELINES: dict[str, Any] = {}


def _get_turbo_pipeline(chunk_length_s: float):
    """Return the shared Whisper Turbo ASR pipeline, building it on first use.

    Args:
        chunk_length_s: Kept for backward compatibility. It is not used when
            constructing the pipeline, so it must not take part in the cache
            key: keying on it would load one more identical copy of the model
            for every distinct chunk length requested.

    Returns:
        The cached ``transformers`` automatic-speech-recognition pipeline for
        ``MODEL_IDS[WHISPER_LARGE_V3_TURBO]``.

    Raises:
        gr.Error: If ``transformers`` cannot be imported.
    """
    # Single fixed key: the pipeline is identical regardless of chunk length.
    cache_key = "shared"
    if cache_key in _TURBO_PIPELINES:
        return _TURBO_PIPELINES[cache_key]

    try:
        from transformers import pipeline
    except Exception as exc:
        raise gr.Error(
            "transformers is required for Whisper Turbo long-audio chunked inference. "
            "Add transformers and accelerate to requirements.txt."
        ) from exc

    cuda_available = torch.cuda.is_available()
    # Half precision on GPU, full precision on CPU.
    dtype = torch.float16 if cuda_available else torch.float32
    pipe = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_IDS[WHISPER_LARGE_V3_TURBO],
        device=0 if cuda_available else "cpu",
        torch_dtype=dtype,
        model_kwargs={"low_cpu_mem_usage": True, "use_safetensors": True},
    )
    _TURBO_PIPELINES[cache_key] = pipe
    return pipe
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def run_whisper_turbo(
    audio_file: str,
    task: str,
    language: str,
    initial_prompt: str,
    model_options: dict[str, Any],
) -> dict[str, Any]:
    """Transcribe ``audio_file`` with Whisper Large V3 Turbo and word timestamps.

    Args:
        audio_file: Path passed to the ASR pipeline.
        task: Forwarded to generation as ``task``.
        language: Forwarded to generation as ``language`` only when non-empty.
        initial_prompt: Optional text prompt; when non-empty it is tokenized and
            forwarded to generation as ``prompt_ids``.
        model_options: Optional overrides. Recognised keys: ``chunk_length_s``
            (default 30), ``batch_size`` (default 16),
            ``long_audio_threshold_seconds`` (default 120), ``num_beams``
            (default 1) and ``temperature``.

    Returns:
        A dict with ``raw_output`` (backend, model name, the long-audio settings
        that were applied, serialized pipeline result) and ``timing``
        (``inference_seconds`` rounded to 4 decimals).
    """
    chunk_length_s = float(model_options.get("chunk_length_s", 30))
    batch_size = int(model_options.get("batch_size", 16))
    long_audio_threshold_seconds = float(model_options.get("long_audio_threshold_seconds", 120))
    duration_seconds = get_audio_duration_seconds(audio_file)

    pipe = _get_turbo_pipeline(chunk_length_s=chunk_length_s)
    generate_kwargs: dict[str, Any] = {"task": task, "num_beams": int(model_options.get("num_beams", 1))}
    if language:
        generate_kwargs["language"] = language
    if initial_prompt:
        # Whisper's generate() takes the prompt as token ids via ``prompt_ids``;
        # a raw ``prompt`` string is not a recognised generation argument.
        prompt_ids = pipe.tokenizer.get_prompt_ids(initial_prompt, return_tensors="pt")
        generate_kwargs["prompt_ids"] = prompt_ids.to(pipe.model.device)
    if "temperature" in model_options:
        generate_kwargs["temperature"] = float(model_options["temperature"])

    # Chunking is only enabled when the duration is known and exceeds the threshold.
    is_long_audio = duration_seconds is not None and duration_seconds > long_audio_threshold_seconds

    call_kwargs: dict[str, Any] = {
        "return_timestamps": "word",
        "batch_size": batch_size,
        "generate_kwargs": generate_kwargs,
    }
    if is_long_audio:
        call_kwargs["chunk_length_s"] = chunk_length_s

    infer_start = time.perf_counter()
    with warnings.catch_warnings():
        # Silence the known, non-actionable pipeline warnings for this call only.
        warnings.filterwarnings("ignore", message=r".*chunk_length_s.*experimental.*", category=Warning)
        warnings.filterwarnings("ignore", message=r".*input name `inputs` is deprecated.*", category=FutureWarning)
        raw_output = pipe(audio_file, **call_kwargs)
    infer_end = time.perf_counter()

    return {
        "raw_output": {
            "backend": "transformers-whisper-turbo",
            "model_name": MODEL_IDS[WHISPER_LARGE_V3_TURBO],
            "long_audio_settings": {
                "duration_seconds": duration_seconds,
                "is_long_audio": is_long_audio,
                "long_audio_threshold_seconds": long_audio_threshold_seconds,
                "chunk_length_s": chunk_length_s,
                "batch_size": batch_size,
            },
            "result": serialize(raw_output),
        },
        "timing": {
            "inference_seconds": round(infer_end - infer_start, 4),
        },
    }
|
src/transcription_service.py
CHANGED
|
@@ -5,15 +5,10 @@ import gradio as gr
|
|
| 5 |
from src.constants import (
|
| 6 |
PARAKEET_V3,
|
| 7 |
SUPPORTED_MODELS,
|
| 8 |
-
WHISPER_CPP_LARGE,
|
| 9 |
-
WHISPER_FASTER_LARGE,
|
| 10 |
-
WHISPER_LARGE_V3,
|
| 11 |
WHISPER_LARGE_V3_TURBO,
|
| 12 |
)
|
| 13 |
-
from src.models.faster_whisper_model import run_faster_whisper
|
| 14 |
from src.models.parakeet_model import run_parakeet
|
| 15 |
-
from src.models.
|
| 16 |
-
from src.models.whisper_cpp_model import run_whisper_cpp
|
| 17 |
from src.utils import parse_model_options
|
| 18 |
|
| 19 |
|
|
@@ -56,25 +51,8 @@ def dispatch_transcription_with_options(
|
|
| 56 |
) -> dict:
|
| 57 |
gpu_start = time.perf_counter()
|
| 58 |
|
| 59 |
-
if model_label
|
| 60 |
-
result =
|
| 61 |
-
model_label=model_label,
|
| 62 |
-
audio_file=audio_file,
|
| 63 |
-
task=task,
|
| 64 |
-
language=language,
|
| 65 |
-
initial_prompt=initial_prompt,
|
| 66 |
-
model_options=model_options,
|
| 67 |
-
)
|
| 68 |
-
elif model_label == WHISPER_FASTER_LARGE:
|
| 69 |
-
result = run_faster_whisper(
|
| 70 |
-
audio_file=audio_file,
|
| 71 |
-
task=task,
|
| 72 |
-
language=language,
|
| 73 |
-
initial_prompt=initial_prompt,
|
| 74 |
-
model_options=model_options,
|
| 75 |
-
)
|
| 76 |
-
elif model_label == WHISPER_CPP_LARGE:
|
| 77 |
-
result = run_whisper_cpp(
|
| 78 |
audio_file=audio_file,
|
| 79 |
task=task,
|
| 80 |
language=language,
|
|
@@ -107,4 +85,3 @@ def dispatch_transcription_with_options(
|
|
| 107 |
"raw_output": result["raw_output"],
|
| 108 |
"timestamp_granularity": "word",
|
| 109 |
}
|
| 110 |
-
|
|
|
|
| 5 |
from src.constants import (
|
| 6 |
PARAKEET_V3,
|
| 7 |
SUPPORTED_MODELS,
|
|
|
|
|
|
|
|
|
|
| 8 |
WHISPER_LARGE_V3_TURBO,
|
| 9 |
)
|
|
|
|
| 10 |
from src.models.parakeet_model import run_parakeet
|
| 11 |
+
from src.models.whisper_turbo_model import run_whisper_turbo
|
|
|
|
| 12 |
from src.utils import parse_model_options
|
| 13 |
|
| 14 |
|
|
|
|
| 51 |
) -> dict:
|
| 52 |
gpu_start = time.perf_counter()
|
| 53 |
|
| 54 |
+
if model_label == WHISPER_LARGE_V3_TURBO:
|
| 55 |
+
result = run_whisper_turbo(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
audio_file=audio_file,
|
| 57 |
task=task,
|
| 58 |
language=language,
|
|
|
|
| 85 |
"raw_output": result["raw_output"],
|
| 86 |
"timestamp_granularity": "word",
|
| 87 |
}
|
|
|
src/utils.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import json
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
from typing import Any
|
| 4 |
|
|
@@ -39,3 +40,23 @@ def parse_model_options(raw: str | None) -> dict[str, Any]:
|
|
| 39 |
if not isinstance(parsed, dict):
|
| 40 |
raise gr.Error("model_options_json must decode to a JSON object")
|
| 41 |
return parsed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
+
import subprocess
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Any
|
| 5 |
|
|
|
|
| 40 |
if not isinstance(parsed, dict):
|
| 41 |
raise gr.Error("model_options_json must decode to a JSON object")
|
| 42 |
return parsed
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def get_audio_duration_seconds(audio_file: str) -> float | None:
|
| 46 |
+
cmd = [
|
| 47 |
+
"ffprobe",
|
| 48 |
+
"-v",
|
| 49 |
+
"error",
|
| 50 |
+
"-show_entries",
|
| 51 |
+
"format=duration",
|
| 52 |
+
"-of",
|
| 53 |
+
"default=noprint_wrappers=1:nokey=1",
|
| 54 |
+
audio_file,
|
| 55 |
+
]
|
| 56 |
+
proc = subprocess.run(cmd, capture_output=True, text=True)
|
| 57 |
+
if proc.returncode != 0:
|
| 58 |
+
return None
|
| 59 |
+
try:
|
| 60 |
+
return float(proc.stdout.strip())
|
| 61 |
+
except Exception:
|
| 62 |
+
return None
|