Ratnesh-dev committed on
Commit
1db40b9
·
1 Parent(s): a8f3b8c

Refactor To Only Use Whisper Turbo And Parakeet

Browse files
README.md CHANGED
@@ -9,19 +9,19 @@ python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
  license: mit
12
- short_description: Multi-model ASR APIs with word-level timestamps
13
  ---
14
 
15
  This Space is optimized for API usage on ZeroGPU.
16
 
17
  Supported models (word-level timestamp capable):
18
- - Whisper Large V3
19
  - Whisper Large V3 Turbo
20
- - Whisper.cpp (large)
21
- - Whisper faster (large)
22
  - NVIDIA Parakeet v3
23
 
24
  Omitted:
 
 
 
25
  - IBM Granite Speech 3.3 8B (no stable, documented word-level timestamp output in standard inference APIs)
26
 
27
  Every transcription response returns:
@@ -29,23 +29,16 @@ Every transcription response returns:
29
  - `zerogpu_timing.gpu_window_seconds`
30
  - `zerogpu_timing.inference_seconds`
31
 
32
- Whisper.cpp notes:
33
- - Requires a whisper.cpp binary and a model file.
34
- - Configure with env vars:
35
- - `WHISPER_CPP_BIN` (default: `whisper-cli`)
36
- - `WHISPER_CPP_MODEL_LARGE` (path to ggml model)
37
-
38
  API endpoints:
39
  - `/transcribe_selected`
40
- - `/transcribe_whisper_large_v3`
41
  - `/transcribe_whisper_large_v3_turbo`
42
- - `/transcribe_whisper_cpp_large`
43
- - `/transcribe_whisper_faster_large`
44
  - `/transcribe_parakeet_v3`
45
 
46
  Local benchmark script (run in IPython):
47
  - `local_api_benchmark.py`
48
- - Calls each model-specific endpoint sequentially and returns all raw outputs + timings.
 
 
49
  - Example:
50
  ```python
51
  from local_api_benchmark import run_all_model_apis
@@ -57,7 +50,21 @@ res = run_all_model_apis(
57
  language=None,
58
  initial_prompt=None,
59
  postprocess_prompt=None,
60
- model_options={"beam_size": 5, "temperature": 0.0},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  save_outputs=True,
62
  output_dir="benchmark_outputs",
63
  )
 
9
  app_file: app.py
10
  pinned: false
11
  license: mit
12
+ short_description: Turbo + Parakeet ASR APIs with word-level timestamps
13
  ---
14
 
15
  This Space is optimized for API usage on ZeroGPU.
16
 
17
  Supported models (word-level timestamp capable):
 
18
  - Whisper Large V3 Turbo
 
 
19
  - NVIDIA Parakeet v3
20
 
21
  Omitted:
22
+ - Whisper Large V3 (removed from this benchmark-focused app)
23
+ - Whisper.cpp (large) (removed from this benchmark-focused app)
24
+ - Whisper faster (large) (removed from this benchmark-focused app)
25
  - IBM Granite Speech 3.3 8B (no stable, documented word-level timestamp output in standard inference APIs)
26
 
27
  Every transcription response returns:
 
29
  - `zerogpu_timing.gpu_window_seconds`
30
  - `zerogpu_timing.inference_seconds`
31
 
 
 
 
 
 
 
32
  API endpoints:
33
  - `/transcribe_selected`
 
34
  - `/transcribe_whisper_large_v3_turbo`
 
 
35
  - `/transcribe_parakeet_v3`
36
 
37
  Local benchmark script (run in IPython):
38
  - `local_api_benchmark.py`
39
+ - Calls only these two endpoints sequentially and returns all raw outputs + timings:
40
+ - `/transcribe_whisper_large_v3_turbo`
41
+ - `/transcribe_parakeet_v3`
42
  - Example:
43
  ```python
44
  from local_api_benchmark import run_all_model_apis
 
50
  language=None,
51
  initial_prompt=None,
52
  postprocess_prompt=None,
53
+ model_options_by_model={
54
+ "Whisper Large V3 Turbo": {
55
+ "chunk_length_s": 30,
56
+ "batch_size": 16,
57
+ "long_audio_threshold_seconds": 120,
58
+ "num_beams": 1,
59
+ },
60
+ "NVIDIA Parakeet v3": {
61
+ "batch_size": 1,
62
+ "long_audio_threshold_seconds": 480,
63
+ "local_attention_left": 256,
64
+ "local_attention_right": 256,
65
+ "subsampling_conv_chunking_factor": 1,
66
+ },
67
+ },
68
  save_outputs=True,
69
  output_dir="benchmark_outputs",
70
  )
app.py CHANGED
@@ -1,15 +1,16 @@
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import spaces
3
 
4
- from src.constants import (
5
- OMITTED_MODELS,
6
- PARAKEET_V3,
7
- SUPPORTED_MODELS,
8
- WHISPER_CPP_LARGE,
9
- WHISPER_FASTER_LARGE,
10
- WHISPER_LARGE_V3,
11
- WHISPER_LARGE_V3_TURBO,
12
- )
13
  from src.transcription_service import dispatch_transcription
14
 
15
 
@@ -34,26 +35,6 @@ def transcribe_selected_model(
34
  )
35
 
36
 
37
- @spaces.GPU
38
- def transcribe_whisper_large_v3(
39
- audio_file,
40
- task,
41
- language,
42
- initial_prompt,
43
- postprocess_prompt,
44
- model_options_json,
45
- ):
46
- return dispatch_transcription(
47
- audio_file,
48
- WHISPER_LARGE_V3,
49
- task,
50
- language,
51
- initial_prompt,
52
- postprocess_prompt,
53
- model_options_json,
54
- )
55
-
56
-
57
  @spaces.GPU
58
  def transcribe_whisper_large_v3_turbo(
59
  audio_file,
@@ -74,46 +55,6 @@ def transcribe_whisper_large_v3_turbo(
74
  )
75
 
76
 
77
- @spaces.GPU
78
- def transcribe_whisper_cpp_large(
79
- audio_file,
80
- task,
81
- language,
82
- initial_prompt,
83
- postprocess_prompt,
84
- model_options_json,
85
- ):
86
- return dispatch_transcription(
87
- audio_file,
88
- WHISPER_CPP_LARGE,
89
- task,
90
- language,
91
- initial_prompt,
92
- postprocess_prompt,
93
- model_options_json,
94
- )
95
-
96
-
97
- @spaces.GPU
98
- def transcribe_whisper_faster_large(
99
- audio_file,
100
- task,
101
- language,
102
- initial_prompt,
103
- postprocess_prompt,
104
- model_options_json,
105
- ):
106
- return dispatch_transcription(
107
- audio_file,
108
- WHISPER_FASTER_LARGE,
109
- task,
110
- language,
111
- initial_prompt,
112
- postprocess_prompt,
113
- model_options_json,
114
- )
115
-
116
-
117
  @spaces.GPU
118
  def transcribe_parakeet_v3(
119
  audio_file,
@@ -134,10 +75,10 @@ def transcribe_parakeet_v3(
134
  )
135
 
136
 
137
- with gr.Blocks(title="Multi-model ASR (ZeroGPU)") as demo:
138
  gr.Markdown(
139
- "# Multi-model transcription APIs (ZeroGPU)\n"
140
- "API-first design with one endpoint per model and full raw outputs (including word-level timestamps)."
141
  )
142
 
143
  with gr.Row():
@@ -150,7 +91,7 @@ with gr.Blocks(title="Multi-model ASR (ZeroGPU)") as demo:
150
  with gr.Row():
151
  model_label = gr.Dropdown(
152
  choices=SUPPORTED_MODELS,
153
- value=WHISPER_LARGE_V3,
154
  label="Model",
155
  )
156
  task = gr.Radio(
@@ -170,7 +111,7 @@ with gr.Blocks(title="Multi-model ASR (ZeroGPU)") as demo:
170
 
171
  model_options_json = gr.Textbox(
172
  label="Model options JSON (optional)",
173
- placeholder='{"beam_size": 5, "temperature": 0.0, "vad_filter": true}',
174
  lines=3,
175
  )
176
 
@@ -196,36 +137,15 @@ with gr.Blocks(title="Multi-model ASR (ZeroGPU)") as demo:
196
 
197
  # Hidden controls used only to expose dedicated API routes per model.
198
  with gr.Row(visible=False):
199
- api_btn_wlv3 = gr.Button("transcribe_whisper_large_v3")
200
  api_btn_wlv3t = gr.Button("transcribe_whisper_large_v3_turbo")
201
- api_btn_wcpp = gr.Button("transcribe_whisper_cpp_large")
202
- api_btn_fw = gr.Button("transcribe_whisper_faster_large")
203
  api_btn_parakeet = gr.Button("transcribe_parakeet_v3")
204
 
205
- api_btn_wlv3.click(
206
- fn=transcribe_whisper_large_v3,
207
- inputs=shared_inputs,
208
- outputs=output,
209
- api_name="transcribe_whisper_large_v3",
210
- )
211
  api_btn_wlv3t.click(
212
  fn=transcribe_whisper_large_v3_turbo,
213
  inputs=shared_inputs,
214
  outputs=output,
215
  api_name="transcribe_whisper_large_v3_turbo",
216
  )
217
- api_btn_wcpp.click(
218
- fn=transcribe_whisper_cpp_large,
219
- inputs=shared_inputs,
220
- outputs=output,
221
- api_name="transcribe_whisper_cpp_large",
222
- )
223
- api_btn_fw.click(
224
- fn=transcribe_whisper_faster_large,
225
- inputs=shared_inputs,
226
- outputs=output,
227
- api_name="transcribe_whisper_faster_large",
228
- )
229
  api_btn_parakeet.click(
230
  fn=transcribe_parakeet_v3,
231
  inputs=shared_inputs,
 
1
+ import warnings
2
+
3
+ # Suppress a known deprecation warning emitted by a transitive dependency in spaces.
4
+ warnings.filterwarnings(
5
+ "ignore",
6
+ message=r"`torch\.distributed\.reduce_op` is deprecated, please use `torch\.distributed\.ReduceOp` instead",
7
+ category=FutureWarning,
8
+ )
9
+
10
  import gradio as gr
11
  import spaces
12
 
13
+ from src.constants import OMITTED_MODELS, PARAKEET_V3, SUPPORTED_MODELS, WHISPER_LARGE_V3_TURBO
 
 
 
 
 
 
 
 
14
  from src.transcription_service import dispatch_transcription
15
 
16
 
 
35
  )
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  @spaces.GPU
39
  def transcribe_whisper_large_v3_turbo(
40
  audio_file,
 
55
  )
56
 
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  @spaces.GPU
59
  def transcribe_parakeet_v3(
60
  audio_file,
 
75
  )
76
 
77
 
78
+ with gr.Blocks(title="Dual-model ASR (ZeroGPU)") as demo:
79
  gr.Markdown(
80
+ "# Turbo + Parakeet transcription APIs (ZeroGPU)\n"
81
+ "Focused benchmark app exposing only Whisper Large V3 Turbo and NVIDIA Parakeet v3."
82
  )
83
 
84
  with gr.Row():
 
91
  with gr.Row():
92
  model_label = gr.Dropdown(
93
  choices=SUPPORTED_MODELS,
94
+ value=WHISPER_LARGE_V3_TURBO,
95
  label="Model",
96
  )
97
  task = gr.Radio(
 
111
 
112
  model_options_json = gr.Textbox(
113
  label="Model options JSON (optional)",
114
+ placeholder='{"chunk_length_s": 30, "batch_size": 16}',
115
  lines=3,
116
  )
117
 
 
137
 
138
  # Hidden controls used only to expose dedicated API routes per model.
139
  with gr.Row(visible=False):
 
140
  api_btn_wlv3t = gr.Button("transcribe_whisper_large_v3_turbo")
 
 
141
  api_btn_parakeet = gr.Button("transcribe_parakeet_v3")
142
 
 
 
 
 
 
 
143
  api_btn_wlv3t.click(
144
  fn=transcribe_whisper_large_v3_turbo,
145
  inputs=shared_inputs,
146
  outputs=output,
147
  api_name="transcribe_whisper_large_v3_turbo",
148
  )
 
 
 
 
 
 
 
 
 
 
 
 
149
  api_btn_parakeet.click(
150
  fn=transcribe_parakeet_v3,
151
  inputs=shared_inputs,
local_api_benchmark.py CHANGED
@@ -8,17 +8,11 @@ from gradio_client import Client, handle_file
8
 
9
  from src.constants import (
10
  PARAKEET_V3,
11
- WHISPER_CPP_LARGE,
12
- WHISPER_FASTER_LARGE,
13
- WHISPER_LARGE_V3,
14
  WHISPER_LARGE_V3_TURBO,
15
  )
16
 
17
  MODEL_API_BY_LABEL = {
18
- WHISPER_LARGE_V3: "/transcribe_whisper_large_v3",
19
  WHISPER_LARGE_V3_TURBO: "/transcribe_whisper_large_v3_turbo",
20
- WHISPER_CPP_LARGE: "/transcribe_whisper_cpp_large",
21
- WHISPER_FASTER_LARGE: "/transcribe_whisper_faster_large",
22
  PARAKEET_V3: "/transcribe_parakeet_v3",
23
  }
24
 
@@ -110,6 +104,7 @@ def run_all_model_apis(
110
  initial_prompt: str | None = None,
111
  postprocess_prompt: str | None = None,
112
  model_options: str | dict[str, Any] | None = None,
 
113
  models: list[str] | None = None,
114
  hf_token: str | None = None,
115
  save_outputs: bool = True,
@@ -118,6 +113,7 @@ def run_all_model_apis(
118
  """Run each model-specific API endpoint one by one and collect full outputs.
119
 
120
  Designed for use from IPython notebooks/scripts.
 
121
  """
122
  if models is None:
123
  model_sequence = list(MODEL_API_BY_LABEL.keys())
@@ -135,6 +131,9 @@ def run_all_model_apis(
135
 
136
  for model in model_sequence:
137
  api_name = MODEL_API_BY_LABEL[model]
 
 
 
138
  call_start = time.perf_counter()
139
  try:
140
  response = client.predict(
@@ -143,7 +142,7 @@ def run_all_model_apis(
143
  language=language,
144
  initial_prompt=initial_prompt,
145
  postprocess_prompt=postprocess_prompt,
146
- model_options_json=options_json,
147
  api_name=api_name,
148
  )
149
  call_end = time.perf_counter()
@@ -153,6 +152,7 @@ def run_all_model_apis(
153
  "api_name": api_name,
154
  "status": "ok",
155
  "client_wall_clock_seconds": round(call_end - call_start, 4),
 
156
  "result": response,
157
  }
158
  )
@@ -164,6 +164,7 @@ def run_all_model_apis(
164
  "api_name": api_name,
165
  "status": "error",
166
  "client_wall_clock_seconds": round(call_end - call_start, 4),
 
167
  "error": str(exc),
168
  }
169
  )
@@ -178,6 +179,7 @@ def run_all_model_apis(
178
  "initial_prompt": initial_prompt,
179
  "postprocess_prompt": postprocess_prompt,
180
  "model_options_json": options_json,
 
181
  "models": model_sequence,
182
  "benchmark_timing": {
183
  "total_client_wall_clock_seconds": round(finished_at - started_at, 4),
 
8
 
9
  from src.constants import (
10
  PARAKEET_V3,
 
 
 
11
  WHISPER_LARGE_V3_TURBO,
12
  )
13
 
14
  MODEL_API_BY_LABEL = {
 
15
  WHISPER_LARGE_V3_TURBO: "/transcribe_whisper_large_v3_turbo",
 
 
16
  PARAKEET_V3: "/transcribe_parakeet_v3",
17
  }
18
 
 
104
  initial_prompt: str | None = None,
105
  postprocess_prompt: str | None = None,
106
  model_options: str | dict[str, Any] | None = None,
107
+ model_options_by_model: dict[str, str | dict[str, Any]] | None = None,
108
  models: list[str] | None = None,
109
  hf_token: str | None = None,
110
  save_outputs: bool = True,
 
113
  """Run each model-specific API endpoint one by one and collect full outputs.
114
 
115
  Designed for use from IPython notebooks/scripts.
116
+ Use model_options_by_model for per-model tuning in a single benchmark run.
117
  """
118
  if models is None:
119
  model_sequence = list(MODEL_API_BY_LABEL.keys())
 
131
 
132
  for model in model_sequence:
133
  api_name = MODEL_API_BY_LABEL[model]
134
+ effective_options_json = options_json
135
+ if model_options_by_model and model in model_options_by_model:
136
+ effective_options_json = _to_model_options_json(model_options_by_model[model])
137
  call_start = time.perf_counter()
138
  try:
139
  response = client.predict(
 
142
  language=language,
143
  initial_prompt=initial_prompt,
144
  postprocess_prompt=postprocess_prompt,
145
+ model_options_json=effective_options_json,
146
  api_name=api_name,
147
  )
148
  call_end = time.perf_counter()
 
152
  "api_name": api_name,
153
  "status": "ok",
154
  "client_wall_clock_seconds": round(call_end - call_start, 4),
155
+ "effective_model_options_json": effective_options_json,
156
  "result": response,
157
  }
158
  )
 
164
  "api_name": api_name,
165
  "status": "error",
166
  "client_wall_clock_seconds": round(call_end - call_start, 4),
167
+ "effective_model_options_json": effective_options_json,
168
  "error": str(exc),
169
  }
170
  )
 
179
  "initial_prompt": initial_prompt,
180
  "postprocess_prompt": postprocess_prompt,
181
  "model_options_json": options_json,
182
+ "model_options_by_model": model_options_by_model,
183
  "models": model_sequence,
184
  "benchmark_timing": {
185
  "total_client_wall_clock_seconds": round(finished_at - started_at, 4),
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  torch>=2.3.0
2
- openai-whisper>=20250625
3
- faster-whisper>=1.1.0
4
  nemo_toolkit[asr]>=2.0.0
 
1
  torch>=2.3.0
2
+ transformers>=4.46.0
3
+ accelerate>=1.1.0
4
  nemo_toolkit[asr]>=2.0.0
src/constants.py CHANGED
@@ -1,20 +1,15 @@
1
- BATCH_SIZE = 8
2
-
3
- WHISPER_LARGE_V3 = "Whisper Large V3"
4
  WHISPER_LARGE_V3_TURBO = "Whisper Large V3 Turbo"
5
- WHISPER_CPP_LARGE = "Whisper.cpp (large)"
6
- WHISPER_FASTER_LARGE = "Whisper faster (large)"
7
  PARAKEET_V3 = "NVIDIA Parakeet v3"
8
 
9
  SUPPORTED_MODELS = [
10
- WHISPER_LARGE_V3,
11
  WHISPER_LARGE_V3_TURBO,
12
- WHISPER_CPP_LARGE,
13
- WHISPER_FASTER_LARGE,
14
  PARAKEET_V3,
15
  ]
16
 
17
  OMITTED_MODELS = {
 
 
 
18
  "IBM Granite Speech 3.3 8B": (
19
  "Omitted because a stable, documented word-level timestamp interface is not available "
20
  "in standard inference usage."
@@ -22,7 +17,6 @@ OMITTED_MODELS = {
22
  }
23
 
24
  MODEL_IDS = {
25
- WHISPER_LARGE_V3: "openai/whisper-large-v3",
26
  WHISPER_LARGE_V3_TURBO: "openai/whisper-large-v3-turbo",
27
  PARAKEET_V3: "nvidia/parakeet-tdt-0.6b-v3",
28
  }
 
 
 
 
1
  WHISPER_LARGE_V3_TURBO = "Whisper Large V3 Turbo"
 
 
2
  PARAKEET_V3 = "NVIDIA Parakeet v3"
3
 
4
  SUPPORTED_MODELS = [
 
5
  WHISPER_LARGE_V3_TURBO,
 
 
6
  PARAKEET_V3,
7
  ]
8
 
9
  OMITTED_MODELS = {
10
+ "Whisper Large V3": "Removed from this benchmark-focused app per configuration.",
11
+ "Whisper.cpp (large)": "Removed from this benchmark-focused app per configuration.",
12
+ "Whisper faster (large)": "Removed from this benchmark-focused app per configuration.",
13
  "IBM Granite Speech 3.3 8B": (
14
  "Omitted because a stable, documented word-level timestamp interface is not available "
15
  "in standard inference usage."
 
17
  }
18
 
19
  MODEL_IDS = {
 
20
  WHISPER_LARGE_V3_TURBO: "openai/whisper-large-v3-turbo",
21
  PARAKEET_V3: "nvidia/parakeet-tdt-0.6b-v3",
22
  }
src/models/faster_whisper_model.py DELETED
@@ -1,98 +0,0 @@
1
- import time
2
- from typing import Any
3
-
4
- import gradio as gr
5
- import torch
6
-
7
- from src.utils import serialize
8
-
9
- _FASTER_WHISPER_MODELS: dict[str, Any] = {}
10
-
11
-
12
- def _get_faster_whisper_model(model_options: dict[str, Any]):
13
- model_size = model_options.get("model_size", "large-v3")
14
- compute_type = model_options.get(
15
- "compute_type",
16
- "float16" if torch.cuda.is_available() else "int8",
17
- )
18
- cache_key = f"{model_size}:{compute_type}"
19
- if cache_key in _FASTER_WHISPER_MODELS:
20
- return _FASTER_WHISPER_MODELS[cache_key], model_size, compute_type
21
-
22
- try:
23
- from faster_whisper import WhisperModel
24
- except Exception as exc:
25
- raise gr.Error(
26
- "faster-whisper backend requested but package is missing. "
27
- "Add faster-whisper to requirements.txt"
28
- ) from exc
29
-
30
- device = "cuda" if torch.cuda.is_available() else "cpu"
31
- model = WhisperModel(model_size, device=device, compute_type=compute_type)
32
- _FASTER_WHISPER_MODELS[cache_key] = model
33
- return model, model_size, compute_type
34
-
35
-
36
- def run_faster_whisper(
37
- audio_file: str,
38
- task: str,
39
- language: str,
40
- initial_prompt: str,
41
- model_options: dict[str, Any],
42
- ) -> dict[str, Any]:
43
- model, model_size, compute_type = _get_faster_whisper_model(model_options)
44
- beam_size = int(model_options.get("beam_size", 5))
45
- temperature = float(model_options.get("temperature", 0.0))
46
- vad_filter = bool(model_options.get("vad_filter", True))
47
-
48
- infer_start = time.perf_counter()
49
- segments, info = model.transcribe(
50
- audio_file,
51
- task=task,
52
- language=language or None,
53
- initial_prompt=initial_prompt or None,
54
- word_timestamps=True,
55
- beam_size=beam_size,
56
- temperature=temperature,
57
- vad_filter=vad_filter,
58
- )
59
- segments_list = list(segments)
60
- infer_end = time.perf_counter()
61
-
62
- raw_output = {
63
- "info": serialize(info),
64
- "segments": [
65
- {
66
- "id": seg.id,
67
- "seek": seg.seek,
68
- "start": seg.start,
69
- "end": seg.end,
70
- "text": seg.text,
71
- "tokens": list(seg.tokens) if seg.tokens is not None else None,
72
- "avg_logprob": seg.avg_logprob,
73
- "compression_ratio": seg.compression_ratio,
74
- "no_speech_prob": seg.no_speech_prob,
75
- "words": [
76
- {
77
- "start": w.start,
78
- "end": w.end,
79
- "word": w.word,
80
- "probability": w.probability,
81
- }
82
- for w in (seg.words or [])
83
- ],
84
- }
85
- for seg in segments_list
86
- ],
87
- "runtime": {
88
- "model_size": model_size,
89
- "compute_type": compute_type,
90
- },
91
- }
92
-
93
- return {
94
- "raw_output": serialize(raw_output),
95
- "timing": {
96
- "inference_seconds": round(infer_end - infer_start, 4),
97
- },
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/parakeet_model.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
4
  import torch
5
 
6
  from src.constants import MODEL_IDS, PARAKEET_V3
7
- from src.utils import serialize
8
 
9
  _PARAKEET_MODEL = None
10
 
@@ -36,9 +36,44 @@ def run_parakeet(
36
  ) -> dict:
37
  model = _get_parakeet_model()
38
  batch_size = int(model_options.get("batch_size", 1))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  infer_start = time.perf_counter()
41
- outputs = model.transcribe([audio_file], batch_size=batch_size, timestamps=True)
 
 
 
 
 
 
 
 
 
 
 
 
42
  infer_end = time.perf_counter()
43
 
44
  item = outputs[0] if outputs else None
@@ -46,6 +81,18 @@ def run_parakeet(
46
  "output": serialize(item),
47
  "timestamp_hint": "word timestamps available in output.timestamp['word'] when provided by NeMo",
48
  "language_hint": language or "auto",
 
 
 
 
 
 
 
 
 
 
 
 
49
  }
50
 
51
  return {
 
4
  import torch
5
 
6
  from src.constants import MODEL_IDS, PARAKEET_V3
7
+ from src.utils import get_audio_duration_seconds, serialize
8
 
9
  _PARAKEET_MODEL = None
10
 
 
36
  ) -> dict:
37
  model = _get_parakeet_model()
38
  batch_size = int(model_options.get("batch_size", 1))
39
+ long_audio_threshold_seconds = float(model_options.get("long_audio_threshold_seconds", 480))
40
+ local_attention_left = int(model_options.get("local_attention_left", 256))
41
+ local_attention_right = int(model_options.get("local_attention_right", 256))
42
+ subsampling_conv_chunking_factor = int(model_options.get("subsampling_conv_chunking_factor", 1))
43
+ enable_long_audio_optimizations = bool(model_options.get("enable_long_audio_optimizations", True))
44
+
45
+ duration_seconds = get_audio_duration_seconds(audio_file)
46
+ is_long_audio = duration_seconds is not None and duration_seconds > long_audio_threshold_seconds
47
+ applied_local_attention = False
48
+ applied_subsampling_chunking = False
49
+ optimization_errors: list[str] = []
50
+
51
+ if enable_long_audio_optimizations and is_long_audio:
52
+ try:
53
+ model.change_attention_model("rel_pos_local_attn", [local_attention_left, local_attention_right])
54
+ applied_local_attention = True
55
+ except Exception as exc:
56
+ optimization_errors.append(f"change_attention_model failed: {exc}")
57
+ try:
58
+ model.change_subsampling_conv_chunking_factor(subsampling_conv_chunking_factor)
59
+ applied_subsampling_chunking = True
60
+ except Exception as exc:
61
+ optimization_errors.append(f"change_subsampling_conv_chunking_factor failed: {exc}")
62
 
63
  infer_start = time.perf_counter()
64
+ try:
65
+ outputs = model.transcribe([audio_file], batch_size=batch_size, timestamps=True)
66
+ finally:
67
+ if applied_local_attention:
68
+ try:
69
+ model.change_attention_model("rel_pos")
70
+ except Exception:
71
+ pass
72
+ if applied_subsampling_chunking:
73
+ try:
74
+ model.change_subsampling_conv_chunking_factor(-1)
75
+ except Exception:
76
+ pass
77
  infer_end = time.perf_counter()
78
 
79
  item = outputs[0] if outputs else None
 
81
  "output": serialize(item),
82
  "timestamp_hint": "word timestamps available in output.timestamp['word'] when provided by NeMo",
83
  "language_hint": language or "auto",
84
+ "long_audio_settings": {
85
+ "duration_seconds": duration_seconds,
86
+ "is_long_audio": is_long_audio,
87
+ "threshold_seconds": long_audio_threshold_seconds,
88
+ "enable_long_audio_optimizations": enable_long_audio_optimizations,
89
+ "applied_local_attention": applied_local_attention,
90
+ "applied_subsampling_chunking": applied_subsampling_chunking,
91
+ "local_attention_left": local_attention_left,
92
+ "local_attention_right": local_attention_right,
93
+ "subsampling_conv_chunking_factor": subsampling_conv_chunking_factor,
94
+ "optimization_errors": optimization_errors,
95
+ },
96
  }
97
 
98
  return {
src/models/whisper_cpp_model.py DELETED
@@ -1,77 +0,0 @@
1
- import json
2
- import os
3
- import subprocess
4
- import tempfile
5
- import time
6
- from pathlib import Path
7
-
8
- import gradio as gr
9
-
10
- from src.utils import serialize
11
-
12
-
13
- def run_whisper_cpp(
14
- audio_file: str,
15
- task: str,
16
- language: str,
17
- initial_prompt: str,
18
- model_options: dict,
19
- ) -> dict:
20
- whisper_cpp_bin = model_options.get("whisper_cpp_bin") or os.getenv("WHISPER_CPP_BIN", "whisper-cli")
21
- whisper_cpp_model = model_options.get("whisper_cpp_model") or os.getenv("WHISPER_CPP_MODEL_LARGE")
22
- if not whisper_cpp_model:
23
- raise gr.Error(
24
- "Whisper.cpp requires model path. Set WHISPER_CPP_MODEL_LARGE or pass "
25
- "model_options_json={\"whisper_cpp_model\":\"/path/to/ggml-large-v3.bin\"}."
26
- )
27
-
28
- with tempfile.TemporaryDirectory() as tmpdir:
29
- output_prefix = str(Path(tmpdir) / "whispercpp")
30
- cmd = [
31
- whisper_cpp_bin,
32
- "-m",
33
- whisper_cpp_model,
34
- "-f",
35
- audio_file,
36
- "-oj",
37
- "-ml",
38
- "1",
39
- "-of",
40
- output_prefix,
41
- ]
42
-
43
- if language:
44
- cmd.extend(["-l", language])
45
- if initial_prompt:
46
- cmd.extend(["--prompt", initial_prompt])
47
- if task == "translate":
48
- cmd.append("-tr")
49
-
50
- infer_start = time.perf_counter()
51
- proc = subprocess.run(cmd, capture_output=True, text=True)
52
- infer_end = time.perf_counter()
53
-
54
- if proc.returncode != 0:
55
- raise gr.Error(
56
- "whisper.cpp transcription failed. "
57
- f"exit={proc.returncode} stderr={proc.stderr[-1500:]}"
58
- )
59
-
60
- json_path = Path(f"{output_prefix}.json")
61
- if not json_path.exists():
62
- raise gr.Error(
63
- "whisper.cpp did not produce JSON output. "
64
- "Ensure your whisper.cpp binary supports -oj and word timestamps (-ml 1)."
65
- )
66
-
67
- raw_output = json.loads(json_path.read_text())
68
-
69
- return {
70
- "raw_output": {
71
- "result": serialize(raw_output),
72
- "stderr": proc.stderr,
73
- },
74
- "timing": {
75
- "inference_seconds": round(infer_end - infer_start, 4),
76
- },
77
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/whisper_openai_model.py DELETED
@@ -1,79 +0,0 @@
1
- import time
2
- from typing import Any
3
-
4
- import gradio as gr
5
- import torch
6
- import whisper
7
-
8
- from src.constants import WHISPER_LARGE_V3, WHISPER_LARGE_V3_TURBO
9
- from src.utils import serialize
10
-
11
- _WHISPER_MODELS: dict[str, Any] = {}
12
-
13
- _OPENAI_MODEL_NAMES = {
14
- WHISPER_LARGE_V3: "large-v3",
15
- WHISPER_LARGE_V3_TURBO: "turbo",
16
- }
17
-
18
-
19
- def _get_whisper_model(model_label: str):
20
- if model_label in _WHISPER_MODELS:
21
- return _WHISPER_MODELS[model_label]
22
-
23
- model_name = _OPENAI_MODEL_NAMES.get(model_label)
24
- if model_name is None:
25
- raise gr.Error(f"Unsupported Whisper model label: {model_label}")
26
-
27
- device = "cuda" if torch.cuda.is_available() else "cpu"
28
- model = whisper.load_model(model_name, device=device)
29
- _WHISPER_MODELS[model_label] = model
30
- return model
31
-
32
-
33
- def run_whisper_openai(
34
- model_label: str,
35
- audio_file: str,
36
- task: str,
37
- language: str,
38
- initial_prompt: str,
39
- model_options: dict[str, Any],
40
- ) -> dict[str, Any]:
41
- model = _get_whisper_model(model_label)
42
-
43
- decode_kwargs: dict[str, Any] = {"task": task, "word_timestamps": True}
44
- if language:
45
- decode_kwargs["language"] = language
46
-
47
- if initial_prompt:
48
- decode_kwargs["initial_prompt"] = initial_prompt
49
-
50
- if "temperature" in model_options:
51
- decode_kwargs["temperature"] = float(model_options["temperature"])
52
- if "beam_size" in model_options:
53
- decode_kwargs["beam_size"] = int(model_options["beam_size"])
54
- if "best_of" in model_options:
55
- decode_kwargs["best_of"] = int(model_options["best_of"])
56
- if "patience" in model_options:
57
- decode_kwargs["patience"] = float(model_options["patience"])
58
- if "condition_on_previous_text" in model_options:
59
- decode_kwargs["condition_on_previous_text"] = bool(model_options["condition_on_previous_text"])
60
- if "suppress_tokens" in model_options:
61
- decode_kwargs["suppress_tokens"] = model_options["suppress_tokens"]
62
-
63
- # Ensure expected precision behavior when GPU is available.
64
- decode_kwargs["fp16"] = bool(torch.cuda.is_available())
65
-
66
- infer_start = time.perf_counter()
67
- raw_output = model.transcribe(audio_file, **decode_kwargs)
68
- infer_end = time.perf_counter()
69
-
70
- return {
71
- "raw_output": {
72
- "backend": "openai-whisper",
73
- "model_name": _OPENAI_MODEL_NAMES[model_label],
74
- "result": serialize(raw_output),
75
- },
76
- "timing": {
77
- "inference_seconds": round(infer_end - infer_start, 4),
78
- },
79
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/whisper_turbo_model.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import warnings
3
+ from typing import Any
4
+
5
+ import gradio as gr
6
+ import torch
7
+
8
+ from src.constants import MODEL_IDS, WHISPER_LARGE_V3_TURBO
9
+ from src.utils import get_audio_duration_seconds, serialize
10
+
11
+ _TURBO_PIPELINES: dict[str, Any] = {}
12
+
13
+
14
+ def _get_turbo_pipeline(chunk_length_s: float):
15
+ cache_key = f"chunk:{chunk_length_s}"
16
+ if cache_key in _TURBO_PIPELINES:
17
+ return _TURBO_PIPELINES[cache_key]
18
+
19
+ try:
20
+ from transformers import pipeline
21
+ except Exception as exc:
22
+ raise gr.Error(
23
+ "transformers is required for Whisper Turbo long-audio chunked inference. "
24
+ "Add transformers and accelerate to requirements.txt."
25
+ ) from exc
26
+
27
+ dtype = torch.float16 if torch.cuda.is_available() else torch.float32
28
+ pipe = pipeline(
29
+ task="automatic-speech-recognition",
30
+ model=MODEL_IDS[WHISPER_LARGE_V3_TURBO],
31
+ device=0 if torch.cuda.is_available() else "cpu",
32
+ torch_dtype=dtype,
33
+ model_kwargs={"low_cpu_mem_usage": True, "use_safetensors": True},
34
+ )
35
+ _TURBO_PIPELINES[cache_key] = pipe
36
+ return pipe
37
+
38
+
39
+ def run_whisper_turbo(
40
+ audio_file: str,
41
+ task: str,
42
+ language: str,
43
+ initial_prompt: str,
44
+ model_options: dict[str, Any],
45
+ ) -> dict[str, Any]:
46
+ chunk_length_s = float(model_options.get("chunk_length_s", 30))
47
+ batch_size = int(model_options.get("batch_size", 16))
48
+ long_audio_threshold_seconds = float(model_options.get("long_audio_threshold_seconds", 120))
49
+ duration_seconds = get_audio_duration_seconds(audio_file)
50
+
51
+ pipe = _get_turbo_pipeline(chunk_length_s=chunk_length_s)
52
+ generate_kwargs: dict[str, Any] = {"task": task, "num_beams": int(model_options.get("num_beams", 1))}
53
+ if language:
54
+ generate_kwargs["language"] = language
55
+ if initial_prompt:
56
+ generate_kwargs["prompt"] = initial_prompt
57
+ if "temperature" in model_options:
58
+ generate_kwargs["temperature"] = float(model_options["temperature"])
59
+
60
+ is_long_audio = duration_seconds is not None and duration_seconds > long_audio_threshold_seconds
61
+
62
+ infer_start = time.perf_counter()
63
+ with warnings.catch_warnings():
64
+ warnings.filterwarnings("ignore", message=r".*chunk_length_s.*experimental.*", category=Warning)
65
+ warnings.filterwarnings("ignore", message=r".*input name `inputs` is deprecated.*", category=FutureWarning)
66
+ call_kwargs: dict[str, Any] = {
67
+ "return_timestamps": "word",
68
+ "batch_size": batch_size,
69
+ "generate_kwargs": generate_kwargs,
70
+ }
71
+ if is_long_audio:
72
+ call_kwargs["chunk_length_s"] = chunk_length_s
73
+ raw_output = pipe(audio_file, **call_kwargs)
74
+ infer_end = time.perf_counter()
75
+
76
+ return {
77
+ "raw_output": {
78
+ "backend": "transformers-whisper-turbo",
79
+ "model_name": MODEL_IDS[WHISPER_LARGE_V3_TURBO],
80
+ "long_audio_settings": {
81
+ "duration_seconds": duration_seconds,
82
+ "is_long_audio": is_long_audio,
83
+ "long_audio_threshold_seconds": long_audio_threshold_seconds,
84
+ "chunk_length_s": chunk_length_s,
85
+ "batch_size": batch_size,
86
+ },
87
+ "result": serialize(raw_output),
88
+ },
89
+ "timing": {
90
+ "inference_seconds": round(infer_end - infer_start, 4),
91
+ },
92
+ }
src/transcription_service.py CHANGED
@@ -5,15 +5,10 @@ import gradio as gr
5
  from src.constants import (
6
  PARAKEET_V3,
7
  SUPPORTED_MODELS,
8
- WHISPER_CPP_LARGE,
9
- WHISPER_FASTER_LARGE,
10
- WHISPER_LARGE_V3,
11
  WHISPER_LARGE_V3_TURBO,
12
  )
13
- from src.models.faster_whisper_model import run_faster_whisper
14
  from src.models.parakeet_model import run_parakeet
15
- from src.models.whisper_openai_model import run_whisper_openai
16
- from src.models.whisper_cpp_model import run_whisper_cpp
17
  from src.utils import parse_model_options
18
 
19
 
@@ -56,25 +51,8 @@ def dispatch_transcription_with_options(
56
  ) -> dict:
57
  gpu_start = time.perf_counter()
58
 
59
- if model_label in {WHISPER_LARGE_V3, WHISPER_LARGE_V3_TURBO}:
60
- result = run_whisper_openai(
61
- model_label=model_label,
62
- audio_file=audio_file,
63
- task=task,
64
- language=language,
65
- initial_prompt=initial_prompt,
66
- model_options=model_options,
67
- )
68
- elif model_label == WHISPER_FASTER_LARGE:
69
- result = run_faster_whisper(
70
- audio_file=audio_file,
71
- task=task,
72
- language=language,
73
- initial_prompt=initial_prompt,
74
- model_options=model_options,
75
- )
76
- elif model_label == WHISPER_CPP_LARGE:
77
- result = run_whisper_cpp(
78
  audio_file=audio_file,
79
  task=task,
80
  language=language,
@@ -107,4 +85,3 @@ def dispatch_transcription_with_options(
107
  "raw_output": result["raw_output"],
108
  "timestamp_granularity": "word",
109
  }
110
-
 
5
  from src.constants import (
6
  PARAKEET_V3,
7
  SUPPORTED_MODELS,
 
 
 
8
  WHISPER_LARGE_V3_TURBO,
9
  )
 
10
  from src.models.parakeet_model import run_parakeet
11
+ from src.models.whisper_turbo_model import run_whisper_turbo
 
12
  from src.utils import parse_model_options
13
 
14
 
 
51
  ) -> dict:
52
  gpu_start = time.perf_counter()
53
 
54
+ if model_label == WHISPER_LARGE_V3_TURBO:
55
+ result = run_whisper_turbo(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  audio_file=audio_file,
57
  task=task,
58
  language=language,
 
85
  "raw_output": result["raw_output"],
86
  "timestamp_granularity": "word",
87
  }
 
src/utils.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
  from pathlib import Path
3
  from typing import Any
4
 
@@ -39,3 +40,23 @@ def parse_model_options(raw: str | None) -> dict[str, Any]:
39
  if not isinstance(parsed, dict):
40
  raise gr.Error("model_options_json must decode to a JSON object")
41
  return parsed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
+ import subprocess
3
  from pathlib import Path
4
  from typing import Any
5
 
 
40
  if not isinstance(parsed, dict):
41
  raise gr.Error("model_options_json must decode to a JSON object")
42
  return parsed
43
+
44
+
45
+ def get_audio_duration_seconds(audio_file: str) -> float | None:
46
+ cmd = [
47
+ "ffprobe",
48
+ "-v",
49
+ "error",
50
+ "-show_entries",
51
+ "format=duration",
52
+ "-of",
53
+ "default=noprint_wrappers=1:nokey=1",
54
+ audio_file,
55
+ ]
56
+ proc = subprocess.run(cmd, capture_output=True, text=True)
57
+ if proc.returncode != 0:
58
+ return None
59
+ try:
60
+ return float(proc.stdout.strip())
61
+ except Exception:
62
+ return None