Spaces:

Ratnesh-dev
/

diarize

Paused

App Files Files Community

Ratnesh-dev committed on Apr 8

Commit

96ec82d

1 Parent(s): 1103803

Simplify API-focused diarization app

Browse files

Files changed (1) hide show

app.py +18 -79

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 from __future__ import annotations
-import csv
-import io
 import subprocess
 import tempfile
 import time
@@ -9,54 +7,21 @@ from pathlib import Path
 from typing import Any
 import gradio as gr
 import torch
 from pyannote.audio import Pipeline
-try:
-    import spaces
-except ImportError:  # local fallback when the ZeroGPU helper is unavailable
-    class _SpacesShim:
-        def GPU(self, *args, **kwargs):
-            if args and callable(args[0]) and len(args) == 1 and not kwargs:
-                return args[0]
-            def decorator(func):
-                return func
-            return decorator
-    spaces = _SpacesShim()
-MODEL_ID = "pyannote/speaker-diarization-community-1"
-GPU_DURATION_SECONDS = 30
 _PIPELINE: Pipeline | None = None
-def _resolve_token(hf_token: str | None) -> str:
-    if hf_token and hf_token.strip():
-        return hf_token.strip()
-    raise gr.Error(
-        "A Hugging Face access token is required. Accept the model conditions first, then pass `HF_TOKEN` in the UI or API call."
-    )
 def get_pipeline(hf_token: str) -> Pipeline:
     global _PIPELINE
     if _PIPELINE is not None:
         return _PIPELINE
-    try:
-        _PIPELINE = Pipeline.from_pretrained(MODEL_ID, token=hf_token)
-    except Exception as exc:  # pragma: no cover - depends on runtime/network/token state
-        raise gr.Error(
-            "Failed to load the pyannote pipeline. Make sure you accepted the model conditions "
-            f"for {MODEL_ID} and provided a valid token. Original error: {exc}"
-        ) from exc
     return _PIPELINE
@@ -98,11 +63,11 @@ def _normalize_audio(audio_path: str) -> str:
     return str(normalized_path)
-@spaces.GPU(duration=GPU_DURATION_SECONDS)
 def _run_diarization(
     audio_path: str,
     hf_token: str,
-) -> tuple[list[dict[str, Any]], str, str, float]:
     pipeline = get_pipeline(hf_token)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     started_at = time.perf_counter()
@@ -110,8 +75,6 @@ def _run_diarization(
     pipeline.to(device)
     try:
         output = pipeline(audio_path)
-    except Exception as exc:  # pragma: no cover - depends on model/runtime/audio
-        raise gr.Error(f"Diarization failed: {exc}") from exc
     finally:
         if device.type == "cuda":
             pipeline.to(torch.device("cpu"))
@@ -140,33 +103,7 @@ def _run_diarization(
             }
         )
-    rttm_buffer = io.StringIO()
-    annotation.write_rttm(rttm_buffer)
-    return segments, rttm_buffer.getvalue(), annotation_label, zerogpu_seconds
-def _write_artifacts(segments: list[dict[str, Any]], rttm_text: str) -> list[str]:
-    output_dir = Path(tempfile.mkdtemp(prefix="pyannote_diarization_"))
-    csv_path = output_dir / "segments.csv"
-    with csv_path.open("w", newline="", encoding="utf-8") as csv_file:
-        writer = csv.DictWriter(csv_file, fieldnames=["speaker", "start", "end", "duration"])
-        writer.writeheader()
-        writer.writerows(segments)
-    txt_path = output_dir / "segments.txt"
-    with txt_path.open("w", encoding="utf-8") as txt_file:
-        for segment in segments:
-            txt_file.write(
-                f"{segment['speaker']} | {_format_timestamp(segment['start'])} --> "
-                f"{_format_timestamp(segment['end'])}\n"
-            )
-    rttm_path = output_dir / "diarization.rttm"
-    rttm_path.write_text(rttm_text, encoding="utf-8")
-    return [str(csv_path), str(txt_path), str(rttm_path)]
 def diarize(
@@ -179,15 +116,20 @@ def diarize(
     if not Path(audio_path).exists():
         raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
     normalized_audio_path = _normalize_audio(audio_path)
-    resolved_token = _resolve_token(hf_token)
     # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
-    get_pipeline(resolved_token)
-    segments, rttm_text, annotation_label, zerogpu_seconds = _run_diarization(
         audio_path=normalized_audio_path,
-        hf_token=resolved_token,
     )
     if not segments:
@@ -196,7 +138,7 @@ def diarize(
             f"Inference completed with `{annotation_label}` output, but it contained no segments."
         )
         summary += f"\n- ZeroGPU time used: **{zerogpu_seconds:.2f}s**"
-        return summary, round(zerogpu_seconds, 3), [], "", _write_artifacts(segments, rttm_text)
     unique_speakers = sorted({segment["speaker"] for segment in segments})
     total_speech = sum(segment["duration"] for segment in segments)
@@ -227,8 +169,7 @@ def diarize(
         for segment in segments
     )
-    artifacts = _write_artifacts(segments, rttm_text)
-    return summary, round(zerogpu_seconds, 3), segments_json, turns_text, artifacts
 def build_demo() -> gr.Blocks:
@@ -268,7 +209,6 @@ def build_demo() -> gr.Blocks:
                     lines=14,
                     buttons=["copy"],
                 )
-                files_output = gr.File(label="Download outputs", file_count="multiple")
         run_button.click(
             fn=diarize,
@@ -276,13 +216,12 @@ def build_demo() -> gr.Blocks:
                 audio_input,
                 token_input,
             ],
-            outputs=[summary_output, zerogpu_seconds_output, segments_output, turns_output, files_output],
         )
         gr.Markdown(
             """
-            Outputs include segments as JSON, a plain-text speaker-turn list, and downloadable
-            `CSV`, `TXT`, and `RTTM` files.
             """
         )

 from __future__ import annotations
 import subprocess
 import tempfile
 import time
 from typing import Any
 import gradio as gr
+import spaces
 import torch
 from pyannote.audio import Pipeline
 _PIPELINE: Pipeline | None = None
 def get_pipeline(hf_token: str) -> Pipeline:
     global _PIPELINE
     if _PIPELINE is not None:
         return _PIPELINE
+    _PIPELINE = Pipeline.from_pretrained("pyannote/speaker-diarization-community-1", token=hf_token)
     return _PIPELINE
     return str(normalized_path)
+@spaces.GPU(duration=120)
 def _run_diarization(
     audio_path: str,
     hf_token: str,
+) -> tuple[list[dict[str, Any]], str, float]:
     pipeline = get_pipeline(hf_token)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     started_at = time.perf_counter()
     pipeline.to(device)
     try:
         output = pipeline(audio_path)
     finally:
         if device.type == "cuda":
             pipeline.to(torch.device("cpu"))
             }
         )
+    return segments, annotation_label, zerogpu_seconds
 def diarize(
     if not Path(audio_path).exists():
         raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
+    if not hf_token or not hf_token.strip():
+        raise gr.Error(
+            "A Hugging Face access token is required. Accept the model conditions first, then pass `HF_TOKEN` in the UI or API call."
+        )
     normalized_audio_path = _normalize_audio(audio_path)
+    hf_token = hf_token.strip()
     # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
+    get_pipeline(hf_token)
+    segments, annotation_label, zerogpu_seconds = _run_diarization(
         audio_path=normalized_audio_path,
+        hf_token=hf_token,
     )
     if not segments:
             f"Inference completed with `{annotation_label}` output, but it contained no segments."
         )
         summary += f"\n- ZeroGPU time used: **{zerogpu_seconds:.2f}s**"
+        return summary, round(zerogpu_seconds, 3), [], ""
     unique_speakers = sorted({segment["speaker"] for segment in segments})
     total_speech = sum(segment["duration"] for segment in segments)
         for segment in segments
     )
+    return summary, round(zerogpu_seconds, 3), segments_json, turns_text
 def build_demo() -> gr.Blocks:
                     lines=14,
                     buttons=["copy"],
                 )
         run_button.click(
             fn=diarize,
                 audio_input,
                 token_input,
             ],
+            outputs=[summary_output, zerogpu_seconds_output, segments_output, turns_output],
         )
         gr.Markdown(
             """
+            Outputs include segments as JSON and a plain-text speaker-turn list.
             """
         )