Ratnesh-dev committed on
Commit
7def15a
·
1 Parent(s): 667e520

Report measured ZeroGPU inference time

Browse files
Files changed (1) hide show
  1. app.py +14 -7
app.py CHANGED
@@ -4,6 +4,7 @@ import csv
4
  import io
5
  import subprocess
6
  import tempfile
 
7
  from pathlib import Path
8
  from typing import Any
9
 
@@ -102,9 +103,10 @@ def _run_diarization(
102
  audio_path: str,
103
  hf_token: str,
104
  prefer_exclusive: bool,
105
- ) -> tuple[list[dict[str, Any]], str, str]:
106
  pipeline = get_pipeline(hf_token)
107
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
108
 
109
  pipeline.to(device)
110
  try:
@@ -116,6 +118,8 @@ def _run_diarization(
116
  pipeline.to(torch.device("cpu"))
117
  torch.cuda.empty_cache()
118
 
 
 
119
  annotation = output.speaker_diarization
120
  annotation_label = "speaker_diarization"
121
 
@@ -140,7 +144,7 @@ def _run_diarization(
140
  rttm_buffer = io.StringIO()
141
  annotation.write_rttm(rttm_buffer)
142
 
143
- return segments, rttm_buffer.getvalue(), annotation_label
144
 
145
 
146
  def _write_artifacts(segments: list[dict[str, Any]], rttm_text: str) -> list[str]:
@@ -183,7 +187,7 @@ def diarize(
183
  # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
184
  get_pipeline(resolved_token)
185
 
186
- segments, rttm_text, annotation_label = _run_diarization(
187
  audio_path=normalized_audio_path,
188
  hf_token=resolved_token,
189
  prefer_exclusive=prefer_exclusive,
@@ -194,7 +198,8 @@ def diarize(
194
  "### No active speaker segments were detected\n"
195
  f"Inference completed with `{annotation_label}` output, but it contained no segments."
196
  )
197
- return summary, [], "", _write_artifacts(segments, rttm_text)
 
198
 
199
  unique_speakers = sorted({segment["speaker"] for segment in segments})
200
  total_speech = sum(segment["duration"] for segment in segments)
@@ -204,7 +209,8 @@ def diarize(
204
  f"- Output used: `{annotation_label}`\n"
205
  f"- Segments: **{len(segments)}**\n"
206
  f"- Speakers detected: **{len(unique_speakers)}** ({', '.join(unique_speakers)})\n"
207
- f"- Total labelled speech: **{total_speech:.2f}s**"
 
208
  )
209
 
210
  table = [
@@ -223,7 +229,7 @@ def diarize(
223
  )
224
 
225
  artifacts = _write_artifacts(segments, rttm_text)
226
- return summary, table, turns_text, artifacts
227
 
228
 
229
  def build_demo() -> gr.Blocks:
@@ -262,6 +268,7 @@ def build_demo() -> gr.Blocks:
262
 
263
  with gr.Column(scale=1):
264
  summary_output = gr.Markdown()
 
265
  segments_output = gr.Dataframe(
266
  headers=["Speaker", "Start", "End", "Duration (s)"],
267
  datatype=["str", "str", "str", "number"],
@@ -282,7 +289,7 @@ def build_demo() -> gr.Blocks:
282
  token_input,
283
  prefer_exclusive,
284
  ],
285
- outputs=[summary_output, segments_output, turns_output, files_output],
286
  )
287
 
288
  gr.Markdown(
 
4
  import io
5
  import subprocess
6
  import tempfile
7
+ import time
8
  from pathlib import Path
9
  from typing import Any
10
 
 
103
  audio_path: str,
104
  hf_token: str,
105
  prefer_exclusive: bool,
106
+ ) -> tuple[list[dict[str, Any]], str, str, float]:
107
  pipeline = get_pipeline(hf_token)
108
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
109
+ started_at = time.perf_counter()
110
 
111
  pipeline.to(device)
112
  try:
 
118
  pipeline.to(torch.device("cpu"))
119
  torch.cuda.empty_cache()
120
 
121
+ zerogpu_seconds = time.perf_counter() - started_at
122
+
123
  annotation = output.speaker_diarization
124
  annotation_label = "speaker_diarization"
125
 
 
144
  rttm_buffer = io.StringIO()
145
  annotation.write_rttm(rttm_buffer)
146
 
147
+ return segments, rttm_buffer.getvalue(), annotation_label, zerogpu_seconds
148
 
149
 
150
  def _write_artifacts(segments: list[dict[str, Any]], rttm_text: str) -> list[str]:
 
187
  # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
188
  get_pipeline(resolved_token)
189
 
190
+ segments, rttm_text, annotation_label, zerogpu_seconds = _run_diarization(
191
  audio_path=normalized_audio_path,
192
  hf_token=resolved_token,
193
  prefer_exclusive=prefer_exclusive,
 
198
  "### No active speaker segments were detected\n"
199
  f"Inference completed with `{annotation_label}` output, but it contained no segments."
200
  )
201
+ summary += f"\n- ZeroGPU time used: **{zerogpu_seconds:.2f}s**"
202
+ return summary, round(zerogpu_seconds, 3), [], "", _write_artifacts(segments, rttm_text)
203
 
204
  unique_speakers = sorted({segment["speaker"] for segment in segments})
205
  total_speech = sum(segment["duration"] for segment in segments)
 
209
  f"- Output used: `{annotation_label}`\n"
210
  f"- Segments: **{len(segments)}**\n"
211
  f"- Speakers detected: **{len(unique_speakers)}** ({', '.join(unique_speakers)})\n"
212
+ f"- Total labelled speech: **{total_speech:.2f}s**\n"
213
+ f"- ZeroGPU time used: **{zerogpu_seconds:.2f}s**"
214
  )
215
 
216
  table = [
 
229
  )
230
 
231
  artifacts = _write_artifacts(segments, rttm_text)
232
+ return summary, round(zerogpu_seconds, 3), table, turns_text, artifacts
233
 
234
 
235
  def build_demo() -> gr.Blocks:
 
268
 
269
  with gr.Column(scale=1):
270
  summary_output = gr.Markdown()
271
+ zerogpu_seconds_output = gr.Number(label="ZeroGPU seconds used", precision=3)
272
  segments_output = gr.Dataframe(
273
  headers=["Speaker", "Start", "End", "Duration (s)"],
274
  datatype=["str", "str", "str", "number"],
 
289
  token_input,
290
  prefer_exclusive,
291
  ],
292
+ outputs=[summary_output, zerogpu_seconds_output, segments_output, turns_output, files_output],
293
  )
294
 
295
  gr.Markdown(