Spaces:

Ratnesh-dev
/

diarize

Running on Zero

App Files Files Community

Ratnesh-dev committed on Apr 8

Commit

7def15a

1 Parent(s): 667e520

Report measured ZeroGPU inference time

Browse files

Files changed (1) hide show

app.py +14 -7

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import csv
 import io
 import subprocess
 import tempfile
 from pathlib import Path
 from typing import Any
@@ -102,9 +103,10 @@ def _run_diarization(
     audio_path: str,
     hf_token: str,
     prefer_exclusive: bool,
-) -> tuple[list[dict[str, Any]], str, str]:
     pipeline = get_pipeline(hf_token)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     pipeline.to(device)
     try:
@@ -116,6 +118,8 @@ def _run_diarization(
             pipeline.to(torch.device("cpu"))
             torch.cuda.empty_cache()
     annotation = output.speaker_diarization
     annotation_label = "speaker_diarization"
@@ -140,7 +144,7 @@ def _run_diarization(
     rttm_buffer = io.StringIO()
     annotation.write_rttm(rttm_buffer)
-    return segments, rttm_buffer.getvalue(), annotation_label
 def _write_artifacts(segments: list[dict[str, Any]], rttm_text: str) -> list[str]:
@@ -183,7 +187,7 @@ def diarize(
     # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
     get_pipeline(resolved_token)
-    segments, rttm_text, annotation_label = _run_diarization(
         audio_path=normalized_audio_path,
         hf_token=resolved_token,
         prefer_exclusive=prefer_exclusive,
@@ -194,7 +198,8 @@ def diarize(
             "### No active speaker segments were detected\n"
             f"Inference completed with `{annotation_label}` output, but it contained no segments."
         )
-        return summary, [], "", _write_artifacts(segments, rttm_text)
     unique_speakers = sorted({segment["speaker"] for segment in segments})
     total_speech = sum(segment["duration"] for segment in segments)
@@ -204,7 +209,8 @@ def diarize(
         f"- Output used: `{annotation_label}`\n"
         f"- Segments: **{len(segments)}**\n"
         f"- Speakers detected: **{len(unique_speakers)}** ({', '.join(unique_speakers)})\n"
-        f"- Total labelled speech: **{total_speech:.2f}s**"
     )
     table = [
@@ -223,7 +229,7 @@ def diarize(
     )
     artifacts = _write_artifacts(segments, rttm_text)
-    return summary, table, turns_text, artifacts
 def build_demo() -> gr.Blocks:
@@ -262,6 +268,7 @@ def build_demo() -> gr.Blocks:
             with gr.Column(scale=1):
                 summary_output = gr.Markdown()
                 segments_output = gr.Dataframe(
                     headers=["Speaker", "Start", "End", "Duration (s)"],
                     datatype=["str", "str", "str", "number"],
@@ -282,7 +289,7 @@ def build_demo() -> gr.Blocks:
                 token_input,
                 prefer_exclusive,
             ],
-            outputs=[summary_output, segments_output, turns_output, files_output],
         )
         gr.Markdown(

 import io
 import subprocess
 import tempfile
+import time
 from pathlib import Path
 from typing import Any
     audio_path: str,
     hf_token: str,
     prefer_exclusive: bool,
+) -> tuple[list[dict[str, Any]], str, str, float]:
     pipeline = get_pipeline(hf_token)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    started_at = time.perf_counter()
     pipeline.to(device)
     try:
             pipeline.to(torch.device("cpu"))
             torch.cuda.empty_cache()
+    zerogpu_seconds = time.perf_counter() - started_at
     annotation = output.speaker_diarization
     annotation_label = "speaker_diarization"
     rttm_buffer = io.StringIO()
     annotation.write_rttm(rttm_buffer)
+    return segments, rttm_buffer.getvalue(), annotation_label, zerogpu_seconds
 def _write_artifacts(segments: list[dict[str, Any]], rttm_text: str) -> list[str]:
     # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
     get_pipeline(resolved_token)
+    segments, rttm_text, annotation_label, zerogpu_seconds = _run_diarization(
         audio_path=normalized_audio_path,
         hf_token=resolved_token,
         prefer_exclusive=prefer_exclusive,
             "### No active speaker segments were detected\n"
             f"Inference completed with `{annotation_label}` output, but it contained no segments."
         )
+        summary += f"\n- ZeroGPU time used: **{zerogpu_seconds:.2f}s**"
+        return summary, round(zerogpu_seconds, 3), [], "", _write_artifacts(segments, rttm_text)
     unique_speakers = sorted({segment["speaker"] for segment in segments})
     total_speech = sum(segment["duration"] for segment in segments)
         f"- Output used: `{annotation_label}`\n"
         f"- Segments: **{len(segments)}**\n"
         f"- Speakers detected: **{len(unique_speakers)}** ({', '.join(unique_speakers)})\n"
+        f"- Total labelled speech: **{total_speech:.2f}s**\n"
+        f"- ZeroGPU time used: **{zerogpu_seconds:.2f}s**"
     )
     table = [
     )
     artifacts = _write_artifacts(segments, rttm_text)
+    return summary, round(zerogpu_seconds, 3), table, turns_text, artifacts
 def build_demo() -> gr.Blocks:
             with gr.Column(scale=1):
                 summary_output = gr.Markdown()
+                zerogpu_seconds_output = gr.Number(label="ZeroGPU seconds used", precision=3)
                 segments_output = gr.Dataframe(
                     headers=["Speaker", "Start", "End", "Duration (s)"],
                     datatype=["str", "str", "str", "number"],
                 token_input,
                 prefer_exclusive,
             ],
+            outputs=[summary_output, zerogpu_seconds_output, segments_output, turns_output, files_output],
         )
         gr.Markdown(