Ratnesh-dev committed on
Commit
96ec5c3
·
1 Parent(s): 06277f0

Add Multi-Model API For Transcription

Browse files
.gitignore CHANGED
@@ -1 +1,4 @@
1
  *.mp3
 
 
 
 
1
  *.mp3
2
+
3
+ __pycache__/
4
+ *.pyc
README.md CHANGED
@@ -9,7 +9,48 @@ python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
  license: mit
12
- short_description: Transcribe files with speaker diarization
13
  ---
14
 
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  app_file: app.py
10
  pinned: false
11
  license: mit
12
+ short_description: Multi-model ASR benchmarking with word-level timestamps
13
  ---
14
 
15
+ This Space is optimized for API usage and benchmarking on ZeroGPU.
16
+
17
+ Supported models (word-level timestamp capable):
18
+ - Whisper Large V3
19
+ - Whisper Large V3 Turbo
20
+ - Whisper.cpp (large)
21
+ - Whisper faster (large)
22
+ - NVIDIA Parakeet v3
23
+
24
+ Omitted:
25
+ - IBM Granite Speech 3.3 8B (no stable, documented word-level timestamp output in standard inference APIs)
26
+
27
+ Every transcription response returns:
28
+ - raw model output object
29
+ - `zerogpu_timing.gpu_window_seconds`
30
+ - `zerogpu_timing.inference_seconds`
31
+
32
+ Benchmark response (`/benchmark_all_models`) returns:
33
+ - one item per supported model with `status` (`ok` or `error`)
34
+ - each successful model's full raw output + timing
35
+ - benchmark-level wall clock summary and speed leaderboard
36
+
37
+ Whisper.cpp notes:
38
+ - Requires a whisper.cpp binary and a model file.
39
+ - Configure with env vars:
40
+ - `WHISPER_CPP_BIN` (default: `whisper-cli`)
41
+ - `WHISPER_CPP_MODEL_LARGE` (path to ggml model)
42
+
43
+ API endpoints:
44
+ - `/transcribe_selected`
45
+ - `/benchmark_all_models`
46
+ - `/transcribe_whisper_large_v3`
47
+ - `/transcribe_whisper_large_v3_turbo`
48
+ - `/transcribe_whisper_cpp_large`
49
+ - `/transcribe_whisper_faster_large`
50
+ - `/transcribe_parakeet_v3`
51
+
52
+ Code structure:
53
+ - `app.py`: Gradio wiring and API routes
54
+ - `src/transcription_service.py`: dispatch + benchmark orchestration
55
+ - `src/utils.py`: shared JSON/serialization helpers
56
+ - `src/models/`: model-specific backend implementations
app.py CHANGED
@@ -1,50 +1,269 @@
 
1
  import spaces
2
- import torch
3
 
4
- import gradio as gr
5
- from transformers import pipeline
 
 
 
 
 
 
 
 
 
6
 
7
 
8
- MODEL_NAME = "openai/whisper-large-v3"
9
- BATCH_SIZE = 8
10
- FILE_LIMIT_MB = 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- device = 0 if torch.cuda.is_available() else "cpu"
13
 
14
- pipe = pipeline(
15
- task="automatic-speech-recognition",
16
- model=MODEL_NAME,
17
- chunk_length_s=30,
18
- device=device,
19
- )
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  @spaces.GPU
23
- def transcribe(inputs, task):
24
- if inputs is None:
25
- raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
28
- return text
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- demo = gr.Blocks()
32
 
33
- with demo:
34
- gr.Interface(
35
- fn=transcribe,
36
- inputs=[
37
- gr.Audio(sources="upload", type="filepath", label="Audio file"),
38
- gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
39
- ],
40
- outputs=gr.JSON(label="transcription"),
41
- title="Whisper Large V3: Transcribe Audio",
42
- description=(
43
- "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
44
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
45
- " of arbitrary length."
46
- ),
47
  )
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- demo.queue().launch(theme=gr.themes.Ocean(), ssr_mode=False)
 
1
import gradio as gr
import spaces

from src.constants import (
    FILE_LIMIT_MB,
    OMITTED_MODELS,
    PARAKEET_V3,
    SUPPORTED_MODELS,
    WHISPER_CPP_LARGE,
    WHISPER_FASTER_LARGE,
    WHISPER_LARGE_V3,
    WHISPER_LARGE_V3_TURBO,
)
from src.transcription_service import benchmark_all_models, dispatch_transcription


@spaces.GPU
def transcribe_selected_model(
    audio_file,
    model_label,
    task,
    language,
    initial_prompt,
    postprocess_prompt,
    model_options_json,
):
    """Transcribe with the model chosen via `model_label`.

    Thin ZeroGPU entry point; validation and routing live in
    `dispatch_transcription`.
    """
    return dispatch_transcription(
        audio_file,
        model_label,
        task,
        language,
        initial_prompt,
        postprocess_prompt,
        model_options_json,
    )


@spaces.GPU
def transcribe_whisper_large_v3(
    audio_file,
    task,
    language,
    initial_prompt,
    postprocess_prompt,
    model_options_json,
):
    """Dedicated API route: always runs Whisper Large V3."""
    return dispatch_transcription(
        audio_file,
        WHISPER_LARGE_V3,
        task,
        language,
        initial_prompt,
        postprocess_prompt,
        model_options_json,
    )


@spaces.GPU
def transcribe_whisper_large_v3_turbo(
    audio_file,
    task,
    language,
    initial_prompt,
    postprocess_prompt,
    model_options_json,
):
    """Dedicated API route: always runs Whisper Large V3 Turbo."""
    return dispatch_transcription(
        audio_file,
        WHISPER_LARGE_V3_TURBO,
        task,
        language,
        initial_prompt,
        postprocess_prompt,
        model_options_json,
    )


@spaces.GPU
def transcribe_whisper_cpp_large(
    audio_file,
    task,
    language,
    initial_prompt,
    postprocess_prompt,
    model_options_json,
):
    """Dedicated API route: always runs the whisper.cpp (large) backend."""
    return dispatch_transcription(
        audio_file,
        WHISPER_CPP_LARGE,
        task,
        language,
        initial_prompt,
        postprocess_prompt,
        model_options_json,
    )


@spaces.GPU
def transcribe_whisper_faster_large(
    audio_file,
    task,
    language,
    initial_prompt,
    postprocess_prompt,
    model_options_json,
):
    """Dedicated API route: always runs the faster-whisper (large) backend."""
    return dispatch_transcription(
        audio_file,
        WHISPER_FASTER_LARGE,
        task,
        language,
        initial_prompt,
        postprocess_prompt,
        model_options_json,
    )


@spaces.GPU
def transcribe_parakeet_v3(
    audio_file,
    task,
    language,
    initial_prompt,
    postprocess_prompt,
    model_options_json,
):
    """Dedicated API route: always runs NVIDIA Parakeet v3."""
    return dispatch_transcription(
        audio_file,
        PARAKEET_V3,
        task,
        language,
        initial_prompt,
        postprocess_prompt,
        model_options_json,
    )


@spaces.GPU
def benchmark_models(
    audio_file,
    task,
    language,
    initial_prompt,
    postprocess_prompt,
    model_options_json,
):
    """Run every supported model on the same audio and return the comparison.

    Delegates to `benchmark_all_models`, which tolerates per-model failures.
    """
    return benchmark_all_models(
        audio_file,
        task,
        language,
        initial_prompt,
        postprocess_prompt,
        model_options_json,
    )
with gr.Blocks(theme=gr.themes.Ocean(), title="Multi-model ASR benchmark (ZeroGPU)") as demo:
    gr.Markdown(
        "# Multi-model transcription benchmark (ZeroGPU)\n"
        "API-first design with one endpoint per model and full raw outputs (including word-level timestamps)."
    )

    with gr.Row():
        # FIX: the original passed max_length=FILE_LIMIT_MB to gr.Audio, but
        # `max_length` is a *duration cap in seconds*, not a file size in MB —
        # it silently rejected any upload longer than 1000 s (~16.7 min).
        # gr.Audio has no size-limit parameter, so the FILE_LIMIT_MB cap must
        # be enforced server-side if a hard limit is required.
        audio_file = gr.Audio(
            sources=["upload"],
            type="filepath",
            label="Audio file",
        )

    with gr.Row():
        model_label = gr.Dropdown(
            choices=SUPPORTED_MODELS,
            value=WHISPER_LARGE_V3,
            label="Model",
        )
        task = gr.Radio(
            choices=["transcribe", "translate"],
            value="transcribe",
            label="Task",
        )

    with gr.Row():
        language = gr.Textbox(label="Language code (optional)", placeholder="e.g. en")
        initial_prompt = gr.Textbox(label="Initial prompt (optional)")

    postprocess_prompt = gr.Textbox(
        label="Post-processing prompt/instruction (optional, recorded in output metadata)",
        lines=2,
    )

    model_options_json = gr.Textbox(
        label="Model options JSON (optional)",
        placeholder='{"beam_size": 5, "temperature": 0.0, "vad_filter": true}',
        lines=3,
    )

    run_btn = gr.Button("Run selected model")
    benchmark_btn = gr.Button("Benchmark all supported models")

    output = gr.JSON(label="Raw transcription output + timing")

    # Positional inputs shared by every route; the per-model routes reuse all
    # of them, while the selected-model route splices the dropdown in at
    # position 1 to match dispatch_transcription's signature.
    shared_inputs = [
        audio_file,
        task,
        language,
        initial_prompt,
        postprocess_prompt,
        model_options_json,
    ]

    run_btn.click(
        fn=transcribe_selected_model,
        inputs=[audio_file, model_label, *shared_inputs[1:]],
        outputs=output,
        api_name="transcribe_selected",
    )

    benchmark_btn.click(
        fn=benchmark_models,
        inputs=shared_inputs,
        outputs=output,
        api_name="benchmark_all_models",
    )

    # Hidden controls used only to expose dedicated API routes per model.
    with gr.Row(visible=False):
        api_btn_wlv3 = gr.Button("transcribe_whisper_large_v3")
        api_btn_wlv3t = gr.Button("transcribe_whisper_large_v3_turbo")
        api_btn_wcpp = gr.Button("transcribe_whisper_cpp_large")
        api_btn_fw = gr.Button("transcribe_whisper_faster_large")
        api_btn_parakeet = gr.Button("transcribe_parakeet_v3")

    api_btn_wlv3.click(
        fn=transcribe_whisper_large_v3,
        inputs=shared_inputs,
        outputs=output,
        api_name="transcribe_whisper_large_v3",
    )
    api_btn_wlv3t.click(
        fn=transcribe_whisper_large_v3_turbo,
        inputs=shared_inputs,
        outputs=output,
        api_name="transcribe_whisper_large_v3_turbo",
    )
    api_btn_wcpp.click(
        fn=transcribe_whisper_cpp_large,
        inputs=shared_inputs,
        outputs=output,
        api_name="transcribe_whisper_cpp_large",
    )
    api_btn_fw.click(
        fn=transcribe_whisper_faster_large,
        inputs=shared_inputs,
        outputs=output,
        api_name="transcribe_whisper_faster_large",
    )
    api_btn_parakeet.click(
        fn=transcribe_parakeet_v3,
        inputs=shared_inputs,
        outputs=output,
        api_name="transcribe_parakeet_v3",
    )

    # Surface the deliberately-excluded models and the reason for each.
    omitted = "\n".join([f"- {k}: {v}" for k, v in OMITTED_MODELS.items()])
    gr.Markdown(f"## Omitted models\n{omitted}")


demo.queue().launch(ssr_mode=False)
requirements.txt CHANGED
@@ -1,2 +1,5 @@
1
- transformers
2
-
 
 
 
 
1
+ transformers>=4.46.0
2
+ accelerate>=1.1.0
3
+ torch>=2.3.0
4
+ faster-whisper>=1.1.0
5
+ nemo_toolkit[asr]>=2.0.0
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Package marker.
src/constants.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Shared labels, model IDs, and limits for the multi-model transcription app.

# Batch size used by the transformers ASR pipeline.
BATCH_SIZE = 8
# Intended upload size cap in megabytes (enforcement is up to the caller).
FILE_LIMIT_MB = 1000

# Human-readable model labels; used both as UI choices and as dispatch keys.
WHISPER_LARGE_V3 = "Whisper Large V3"
WHISPER_LARGE_V3_TURBO = "Whisper Large V3 Turbo"
WHISPER_CPP_LARGE = "Whisper.cpp (large)"
WHISPER_FASTER_LARGE = "Whisper faster (large)"
PARAKEET_V3 = "NVIDIA Parakeet v3"

# Every model the app exposes (all are word-level-timestamp capable).
SUPPORTED_MODELS = [
    WHISPER_LARGE_V3,
    WHISPER_LARGE_V3_TURBO,
    WHISPER_CPP_LARGE,
    WHISPER_FASTER_LARGE,
    PARAKEET_V3,
]

# Models deliberately excluded, mapped to the reason shown in the UI.
OMITTED_MODELS = {
    "IBM Granite Speech 3.3 8B": (
        "Omitted because a stable, documented word-level timestamp interface is not available "
        "in standard inference usage."
    )
}

# Hugging Face model IDs for backends loaded by name.
# whisper.cpp and faster-whisper resolve their own model files, so they are
# intentionally absent from this mapping.
MODEL_IDS = {
    WHISPER_LARGE_V3: "openai/whisper-large-v3",
    WHISPER_LARGE_V3_TURBO: "openai/whisper-large-v3-turbo",
    PARAKEET_V3: "nvidia/parakeet-tdt-0.6b-v3",
}
src/models/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Model backend package.
src/models/faster_whisper_model.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time
from typing import Any

import gradio as gr
import torch

from src.utils import serialize

# Process-wide cache of loaded faster-whisper models, keyed by
# "model_size:compute_type" so different option combinations can coexist.
_FASTER_WHISPER_MODELS: dict[str, Any] = {}


def _get_faster_whisper_model(model_options: dict[str, Any]):
    """Return (model, model_size, compute_type), loading and caching on first use.

    `model_options` may override "model_size" (default "large-v3") and
    "compute_type" (default "float16" on CUDA, "int8" on CPU). Raises
    gr.Error when the faster-whisper package is not installed.
    """
    model_size = model_options.get("model_size", "large-v3")
    compute_type = model_options.get(
        "compute_type",
        "float16" if torch.cuda.is_available() else "int8",
    )
    cache_key = f"{model_size}:{compute_type}"
    if cache_key in _FASTER_WHISPER_MODELS:
        return _FASTER_WHISPER_MODELS[cache_key], model_size, compute_type

    # Imported lazily so the rest of the app still works without the package.
    try:
        from faster_whisper import WhisperModel
    except Exception as exc:
        raise gr.Error(
            "faster-whisper backend requested but package is missing. "
            "Add faster-whisper to requirements.txt"
        ) from exc

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = WhisperModel(model_size, device=device, compute_type=compute_type)
    _FASTER_WHISPER_MODELS[cache_key] = model
    return model, model_size, compute_type


def run_faster_whisper(
    audio_file: str,
    task: str,
    language: str,
    initial_prompt: str,
    model_options: dict[str, Any],
) -> dict[str, Any]:
    """Transcribe `audio_file` with faster-whisper, word timestamps enabled.

    `model_options` may set "beam_size" (default 5), "temperature" (default
    0.0) and "vad_filter" (default True). Returns a dict with "raw_output"
    (serialized info + per-segment data including words) and
    "timing.inference_seconds".
    """
    model, model_size, compute_type = _get_faster_whisper_model(model_options)
    beam_size = int(model_options.get("beam_size", 5))
    temperature = float(model_options.get("temperature", 0.0))
    vad_filter = bool(model_options.get("vad_filter", True))

    infer_start = time.perf_counter()
    segments, info = model.transcribe(
        audio_file,
        task=task,
        language=language or None,  # empty string means auto-detect
        initial_prompt=initial_prompt or None,
        word_timestamps=True,
        beam_size=beam_size,
        temperature=temperature,
        vad_filter=vad_filter,
    )
    # Segments are materialized *before* the timer stops so the decoding work
    # is counted (faster-whisper yields segments lazily — see its docs).
    segments_list = list(segments)
    infer_end = time.perf_counter()

    raw_output = {
        "info": serialize(info),
        "segments": [
            {
                "id": seg.id,
                "seek": seg.seek,
                "start": seg.start,
                "end": seg.end,
                "text": seg.text,
                "tokens": list(seg.tokens) if seg.tokens is not None else None,
                "avg_logprob": seg.avg_logprob,
                "compression_ratio": seg.compression_ratio,
                "no_speech_prob": seg.no_speech_prob,
                "words": [
                    {
                        "start": w.start,
                        "end": w.end,
                        "word": w.word,
                        "probability": w.probability,
                    }
                    # seg.words can be None when word timing is unavailable.
                    for w in (seg.words or [])
                ],
            }
            for seg in segments_list
        ],
        # Record which model/precision actually served the request.
        "runtime": {
            "model_size": model_size,
            "compute_type": compute_type,
        },
    }

    return {
        "raw_output": serialize(raw_output),
        "timing": {
            "inference_seconds": round(infer_end - infer_start, 4),
        },
    }
src/models/parakeet_model.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time

import gradio as gr
import torch

from src.constants import MODEL_IDS, PARAKEET_V3
from src.utils import serialize

# Module-level singleton: the NeMo ASR model, loaded on first request.
_PARAKEET_MODEL = None


def _get_parakeet_model():
    """Load (once) and return the NVIDIA Parakeet NeMo ASR model.

    Raises gr.Error when nemo_toolkit[asr] is not installed. Moves the model
    to CUDA when available.
    """
    global _PARAKEET_MODEL
    if _PARAKEET_MODEL is not None:
        return _PARAKEET_MODEL

    # Imported lazily so the rest of the app still works without NeMo.
    try:
        import nemo.collections.asr as nemo_asr
    except Exception as exc:
        raise gr.Error(
            "NVIDIA Parakeet backend requested but NeMo ASR package is missing. "
            "Add nemo_toolkit[asr] to requirements.txt"
        ) from exc

    model = nemo_asr.models.ASRModel.from_pretrained(model_name=MODEL_IDS[PARAKEET_V3])
    if torch.cuda.is_available():
        model = model.to("cuda")
    _PARAKEET_MODEL = model
    return _PARAKEET_MODEL


def run_parakeet(
    audio_file: str,
    language: str,
    model_options: dict,
) -> dict:
    """Transcribe `audio_file` with Parakeet, requesting NeMo timestamps.

    `language` is NOT passed to the model — it is only echoed back in the
    output as "language_hint". `model_options` may set "batch_size"
    (default 1). Returns {"raw_output": ..., "timing": ...}.
    """
    model = _get_parakeet_model()
    batch_size = int(model_options.get("batch_size", 1))

    infer_start = time.perf_counter()
    outputs = model.transcribe([audio_file], batch_size=batch_size, timestamps=True)
    infer_end = time.perf_counter()

    # A single path was submitted, so only the first result is relevant.
    item = outputs[0] if outputs else None
    raw_output = {
        "output": serialize(item),
        "timestamp_hint": "word timestamps available in output.timestamp['word'] when provided by NeMo",
        "language_hint": language or "auto",
    }

    return {
        "raw_output": raw_output,
        "timing": {
            "inference_seconds": round(infer_end - infer_start, 4),
        },
    }
src/models/whisper_cpp_model.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import subprocess
import tempfile
import time
from pathlib import Path

import gradio as gr

from src.utils import serialize


def run_whisper_cpp(
    audio_file: str,
    task: str,
    language: str,
    initial_prompt: str,
    model_options: dict,
) -> dict:
    """Transcribe `audio_file` by shelling out to a whisper.cpp CLI binary.

    The binary and model path come from `model_options` ("whisper_cpp_bin",
    "whisper_cpp_model"), falling back to env vars WHISPER_CPP_BIN (default
    "whisper-cli") and WHISPER_CPP_MODEL_LARGE. Raises gr.Error when no model
    path is configured, when the process exits non-zero, or when no JSON
    output file is produced. Returns the parsed JSON plus stderr and timing.
    """
    whisper_cpp_bin = model_options.get("whisper_cpp_bin") or os.getenv("WHISPER_CPP_BIN", "whisper-cli")
    whisper_cpp_model = model_options.get("whisper_cpp_model") or os.getenv("WHISPER_CPP_MODEL_LARGE")
    if not whisper_cpp_model:
        raise gr.Error(
            "Whisper.cpp requires model path. Set WHISPER_CPP_MODEL_LARGE or pass "
            "model_options_json={\"whisper_cpp_model\":\"/path/to/ggml-large-v3.bin\"}."
        )

    # The JSON result is written into a temp dir that is removed on exit.
    with tempfile.TemporaryDirectory() as tmpdir:
        output_prefix = str(Path(tmpdir) / "whispercpp")
        # -oj: emit JSON; -ml 1: cap segments at one token (word-level
        # timing, per the error message below); -of: output file prefix
        # (the binary appends ".json").
        cmd = [
            whisper_cpp_bin,
            "-m",
            whisper_cpp_model,
            "-f",
            audio_file,
            "-oj",
            "-ml",
            "1",
            "-of",
            output_prefix,
        ]

        if language:
            cmd.extend(["-l", language])
        if initial_prompt:
            cmd.extend(["--prompt", initial_prompt])
        if task == "translate":
            cmd.append("-tr")

        infer_start = time.perf_counter()
        # NOTE(review): no timeout is set — a hung binary blocks the request.
        proc = subprocess.run(cmd, capture_output=True, text=True)
        infer_end = time.perf_counter()

        if proc.returncode != 0:
            # Surface only the tail of stderr to keep the error readable.
            raise gr.Error(
                "whisper.cpp transcription failed. "
                f"exit={proc.returncode} stderr={proc.stderr[-1500:]}"
            )

        json_path = Path(f"{output_prefix}.json")
        if not json_path.exists():
            raise gr.Error(
                "whisper.cpp did not produce JSON output. "
                "Ensure your whisper.cpp binary supports -oj and word timestamps (-ml 1)."
            )

        raw_output = json.loads(json_path.read_text())

        return {
            "raw_output": {
                "result": serialize(raw_output),
                # Full stderr kept for debugging (progress/model-load logs).
                "stderr": proc.stderr,
            },
            "timing": {
                "inference_seconds": round(infer_end - infer_start, 4),
            },
        }
src/models/whisper_transformers.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time
from typing import Any

import torch
from transformers import pipeline

from src.constants import BATCH_SIZE, MODEL_IDS
from src.utils import serialize

# Process-wide cache of transformers ASR pipelines, keyed by model label.
_TRANSFORMERS_PIPES: dict[str, Any] = {}


def _device_for_transformers() -> int | str:
    """Device spec for transformers.pipeline: GPU index 0 if CUDA, else "cpu"."""
    return 0 if torch.cuda.is_available() else "cpu"


def _get_whisper_pipeline(model_label: str):
    """Build (once) and cache the ASR pipeline for `model_label`.

    `model_label` must be a key of MODEL_IDS. Uses fp16 on CUDA / fp32 on
    CPU, with 30 s chunking for long-form audio.
    """
    if model_label in _TRANSFORMERS_PIPES:
        return _TRANSFORMERS_PIPES[model_label]

    model_name = MODEL_IDS[model_label]
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    pipe = pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        batch_size=BATCH_SIZE,
        device=_device_for_transformers(),
        model_kwargs={"torch_dtype": dtype, "low_cpu_mem_usage": True},
    )
    _TRANSFORMERS_PIPES[model_label] = pipe
    return pipe


def run_whisper_transformers(
    model_label: str,
    audio_file: str,
    task: str,
    language: str,
    initial_prompt: str,
    model_options: dict[str, Any],
) -> dict[str, Any]:
    """Transcribe `audio_file` with a transformers Whisper pipeline.

    Word-level timestamps are requested via return_timestamps="word".
    `model_options` may override "temperature" and "num_beams". Returns
    {"raw_output": ..., "timing": {"inference_seconds": ...}}.
    """
    pipe = _get_whisper_pipeline(model_label)

    generate_kwargs: dict[str, Any] = {"task": task}
    if language:
        generate_kwargs["language"] = language

    if initial_prompt:
        # Preferred path: tokenize the prompt into prompt_ids for generate().
        # NOTE(review): the except-branch falls back to a raw "prompt" key,
        # which may not be an accepted generate kwarg on all transformers
        # versions — confirm before relying on it.
        try:
            prompt_ids = pipe.tokenizer.get_prompt_ids(initial_prompt, return_tensors="pt")
            if hasattr(prompt_ids, "to") and torch.cuda.is_available():
                prompt_ids = prompt_ids.to("cuda")
            generate_kwargs["prompt_ids"] = prompt_ids
        except Exception:
            generate_kwargs["prompt"] = initial_prompt

    if "temperature" in model_options:
        generate_kwargs["temperature"] = model_options["temperature"]
    if "num_beams" in model_options:
        generate_kwargs["num_beams"] = model_options["num_beams"]

    infer_start = time.perf_counter()
    raw_output = pipe(
        audio_file,
        return_timestamps="word",
        generate_kwargs=generate_kwargs,
    )
    infer_end = time.perf_counter()

    return {
        "raw_output": serialize(raw_output),
        "timing": {
            "inference_seconds": round(infer_end - infer_start, 4),
        },
    }
src/transcription_service.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time

import gradio as gr

from src.constants import (
    PARAKEET_V3,
    SUPPORTED_MODELS,
    WHISPER_CPP_LARGE,
    WHISPER_FASTER_LARGE,
    WHISPER_LARGE_V3,
    WHISPER_LARGE_V3_TURBO,
)
from src.models.faster_whisper_model import run_faster_whisper
from src.models.parakeet_model import run_parakeet
from src.models.whisper_cpp_model import run_whisper_cpp
from src.models.whisper_transformers import run_whisper_transformers
from src.utils import parse_model_options


def dispatch_transcription(
    audio_file: str,
    model_label: str,
    task: str,
    language: str,
    initial_prompt: str,
    postprocess_prompt: str,
    model_options_json: str,
) -> dict:
    """Validate the request, parse the options JSON, and run one model.

    Raises gr.Error on missing audio, an unknown `model_label`, or a `task`
    other than "transcribe"/"translate".
    """
    if audio_file is None:
        raise gr.Error("No audio file submitted. Upload an audio file first.")
    if model_label not in SUPPORTED_MODELS:
        raise gr.Error(f"Model is not supported for word-level timestamps: {model_label}")
    if task not in {"transcribe", "translate"}:
        raise gr.Error("task must be one of: transcribe, translate")

    model_options = parse_model_options(model_options_json)
    return dispatch_transcription_with_options(
        audio_file=audio_file,
        model_label=model_label,
        task=task,
        language=language,
        initial_prompt=initial_prompt,
        postprocess_prompt=postprocess_prompt,
        model_options=model_options,
    )


def dispatch_transcription_with_options(
    audio_file: str,
    model_label: str,
    task: str,
    language: str,
    initial_prompt: str,
    postprocess_prompt: str,
    model_options: dict,
) -> dict:
    """Route to the backend for `model_label` and wrap its raw result.

    Adds "zerogpu_timing" (whole-call gpu_window_seconds plus the backend's
    own timing keys) and echoes the request metadata alongside "raw_output".
    """
    # gpu_window_seconds spans everything this call does — including any
    # first-call model loading in the backend — not just inference.
    gpu_start = time.perf_counter()

    if model_label in {WHISPER_LARGE_V3, WHISPER_LARGE_V3_TURBO}:
        result = run_whisper_transformers(
            model_label=model_label,
            audio_file=audio_file,
            task=task,
            language=language,
            initial_prompt=initial_prompt,
            model_options=model_options,
        )
    elif model_label == WHISPER_FASTER_LARGE:
        result = run_faster_whisper(
            audio_file=audio_file,
            task=task,
            language=language,
            initial_prompt=initial_prompt,
            model_options=model_options,
        )
    elif model_label == WHISPER_CPP_LARGE:
        result = run_whisper_cpp(
            audio_file=audio_file,
            task=task,
            language=language,
            initial_prompt=initial_prompt,
            model_options=model_options,
        )
    elif model_label == PARAKEET_V3:
        # The Parakeet backend accepts no task argument, so translate is
        # rejected explicitly rather than silently transcribing.
        if task == "translate":
            raise gr.Error("NVIDIA Parakeet v3 backend in this app currently supports task='transcribe' only.")
        result = run_parakeet(
            audio_file=audio_file,
            language=language,
            model_options=model_options,
        )
    else:
        raise gr.Error(f"Unsupported model {model_label}")

    gpu_end = time.perf_counter()

    return {
        "model": model_label,
        "task": task,
        "audio_file": str(audio_file),
        # Recorded only — no post-processing is applied anywhere in this module.
        "postprocess_prompt": postprocess_prompt or None,
        "model_options": model_options,
        "zerogpu_timing": {
            "gpu_window_seconds": round(gpu_end - gpu_start, 4),
            # Merge the backend's own timing (e.g. inference_seconds).
            **result.get("timing", {}),
        },
        "raw_output": result["raw_output"],
        "timestamp_granularity": "word",
    }


def benchmark_all_models(
    audio_file: str,
    task: str,
    language: str,
    initial_prompt: str,
    postprocess_prompt: str,
    model_options_json: str,
) -> dict:
    """Run every SUPPORTED_MODELS entry on the same audio and summarize.

    Per-model failures become status="error" items instead of aborting the
    whole benchmark. Successful models are ranked by gpu_window_seconds in
    "leaderboard_by_gpu_window_seconds".
    """
    if audio_file is None:
        raise gr.Error("No audio file submitted. Upload an audio file first.")
    model_options = parse_model_options(model_options_json)

    started_at = time.perf_counter()
    results = []

    for model_label in SUPPORTED_MODELS:
        per_model_start = time.perf_counter()
        try:
            model_result = dispatch_transcription_with_options(
                audio_file=audio_file,
                model_label=model_label,
                task=task,
                language=language,
                initial_prompt=initial_prompt,
                postprocess_prompt=postprocess_prompt,
                model_options=model_options,
            )
            per_model_end = time.perf_counter()
            results.append(
                {
                    "model": model_label,
                    "status": "ok",
                    "wall_clock_seconds": round(per_model_end - per_model_start, 4),
                    "result": model_result,
                }
            )
        except Exception as exc:
            # Capture the failure and keep benchmarking the remaining models.
            per_model_end = time.perf_counter()
            results.append(
                {
                    "model": model_label,
                    "status": "error",
                    "wall_clock_seconds": round(per_model_end - per_model_start, 4),
                    "error": str(exc),
                }
            )

    completed_at = time.perf_counter()

    # Fastest GPU window first; missing values sort last via +inf default.
    leaderboard = sorted(
        [r for r in results if r["status"] == "ok"],
        key=lambda item: item["result"]["zerogpu_timing"].get("gpu_window_seconds", float("inf")),
    )

    return {
        "task": task,
        "audio_file": str(audio_file),
        "language": language or None,
        "timestamp_granularity": "word",
        "benchmark_timing": {
            "total_wall_clock_seconds": round(completed_at - started_at, 4),
        },
        "results": results,
        "leaderboard_by_gpu_window_seconds": [
            {
                "model": item["model"],
                "gpu_window_seconds": item["result"]["zerogpu_timing"].get("gpu_window_seconds"),
                "inference_seconds": item["result"]["zerogpu_timing"].get("inference_seconds"),
            }
            for item in leaderboard
        ],
    }
src/utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ import gradio as gr
6
+
7
+
8
def serialize(value: Any) -> Any:
    """Recursively convert *value* into JSON-friendly primitives.

    Primitives and None pass through unchanged; Path becomes str; dicts are
    rebuilt with stringified keys; lists/tuples become lists. Objects with an
    ``item()`` or ``tolist()`` method (numpy/torch scalars and arrays) are
    converted via those, trying ``item()`` first; objects with ``__dict__``
    are serialized attribute-by-attribute; anything else falls back to str().
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, Path):
        return str(value)
    if isinstance(value, dict):
        return {str(key): serialize(entry) for key, entry in value.items()}
    if isinstance(value, (list, tuple)):
        return [serialize(entry) for entry in value]
    # Scalar-like first (item), then array-like (tolist); each attempt may
    # fail (e.g. item() on a multi-element array), in which case we move on.
    for attr in ("item", "tolist"):
        if hasattr(value, attr):
            try:
                return getattr(value, attr)()
            except Exception:
                continue
    if hasattr(value, "__dict__"):
        return {name: serialize(member) for name, member in vars(value).items()}
    return str(value)
30
+
31
+
32
def parse_model_options(raw: str | None) -> dict[str, Any]:
    """Parse the user-supplied model-options JSON string into a dict.

    Empty or None input yields {}. Raises gr.Error (surfaced to the UI/API
    caller) when the string is not valid JSON or does not decode to a JSON
    object.
    """
    if not raw:
        return {}
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise gr.Error(f"model_options_json must be valid JSON: {exc}") from exc
    if not isinstance(parsed, dict):
        raise gr.Error("model_options_json must decode to a JSON object")
    return parsed