Spaces:

Ratnesh-dev
/

diarize

Sleeping

App Files Community

Ratnesh-dev committed on Apr 8

Commit

18bf750

1 Parent(s): 96ec82d

Return minimal diarization JSON schema

Browse files

Files changed (1) hide show

app.py +19 -61

app.py CHANGED Viewed

@@ -25,14 +25,6 @@ def get_pipeline(hf_token: str) -> Pipeline:
     return _PIPELINE
-def _format_timestamp(seconds: float) -> str:
-    milliseconds = int(round(seconds * 1000))
-    hours, remainder = divmod(milliseconds, 3_600_000)
-    minutes, remainder = divmod(remainder, 60_000)
-    secs, millis = divmod(remainder, 1_000)
-    return f"{hours:02}:{minutes:02}:{secs:02}.{millis:03}"
 def _normalize_audio(audio_path: str) -> str:
     normalized_dir = Path(tempfile.mkdtemp(prefix="pyannote_audio_"))
     normalized_path = normalized_dir / "normalized.wav"
@@ -111,7 +103,7 @@ def diarize(
     hf_token: str | None,
 ):
     if not audio_path:
-        raise gr.Error("Upload or record an audio file first.")
     if not Path(audio_path).exists():
         raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
@@ -127,49 +119,27 @@ def diarize(
     # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
     get_pipeline(hf_token)
-    segments, annotation_label, zerogpu_seconds = _run_diarization(
         audio_path=normalized_audio_path,
         hf_token=hf_token,
     )
-    if not segments:
-        summary = (
-            "### No active speaker segments were detected\n"
-            f"Inference completed with `{annotation_label}` output, but it contained no segments."
-        )
-        summary += f"\n- ZeroGPU time used: **{zerogpu_seconds:.2f}s**"
-        return summary, round(zerogpu_seconds, 3), [], ""
-    unique_speakers = sorted({segment["speaker"] for segment in segments})
-    total_speech = sum(segment["duration"] for segment in segments)
-    summary = (
-        f"### Diarization complete\n"
-        f"- Output used: `{annotation_label}`\n"
-        f"- Segments: **{len(segments)}**\n"
-        f"- Speakers detected: **{len(unique_speakers)}** ({', '.join(unique_speakers)})\n"
-        f"- Total labelled speech: **{total_speech:.2f}s**\n"
-        f"- ZeroGPU time used: **{zerogpu_seconds:.2f}s**"
-    )
-    segments_json = [
-        {
-            **segment,
-            "start": round(segment["start"], 3),
-            "end": round(segment["end"], 3),
-            "duration": round(segment["duration"], 3),
-            "start_timestamp": _format_timestamp(segment["start"]),
-            "end_timestamp": _format_timestamp(segment["end"]),
-        }
-        for segment in segments
-    ]
-    turns_text = "\n".join(
-        f"{segment['speaker']} | {_format_timestamp(segment['start'])} --> {_format_timestamp(segment['end'])}"
-        for segment in segments
-    )
-    return summary, round(zerogpu_seconds, 3), segments_json, turns_text
 def build_demo() -> gr.Blocks:
@@ -201,14 +171,7 @@ def build_demo() -> gr.Blocks:
                 run_button = gr.Button("Run diarization", variant="primary")
             with gr.Column(scale=1):
-                summary_output = gr.Markdown()
-                zerogpu_seconds_output = gr.Number(label="ZeroGPU seconds used", precision=3)
-                segments_output = gr.JSON(label="Segments JSON")
-                turns_output = gr.Textbox(
-                    label="Speaker turns",
-                    lines=14,
-                    buttons=["copy"],
-                )
         run_button.click(
             fn=diarize,
@@ -216,14 +179,9 @@ def build_demo() -> gr.Blocks:
                 audio_input,
                 token_input,
             ],
-            outputs=[summary_output, zerogpu_seconds_output, segments_output, turns_output],
         )
-        gr.Markdown(
-            """
-            Outputs include segments as JSON and a plain-text speaker-turn list.
-            """
-        )
     return demo

     return _PIPELINE
 def _normalize_audio(audio_path: str) -> str:
     normalized_dir = Path(tempfile.mkdtemp(prefix="pyannote_audio_"))
     normalized_path = normalized_dir / "normalized.wav"
     hf_token: str | None,
 ):
     if not audio_path:
+        raise gr.Error("Upload an audio file first.")
     if not Path(audio_path).exists():
         raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
     # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
     get_pipeline(hf_token)
+    segments, _, zerogpu_seconds = _run_diarization(
         audio_path=normalized_audio_path,
         hf_token=hf_token,
     )
+    response = {
+        "source": "pyannote/speaker-diarization-community-1",
+        "zerogpu_seconds": round(zerogpu_seconds, 3),
+        "segments": [
+            {
+                "segment_id": f"seg_{index:06d}",
+                "speaker_id": segment["speaker"],
+                "start": round(segment["start"], 3),
+                "end": round(segment["end"], 3),
+                "duration": round(segment["duration"], 3),
+            }
+            for index, segment in enumerate(segments, start=1)
+        ],
+    }
+    return response
 def build_demo() -> gr.Blocks:
                 run_button = gr.Button("Run diarization", variant="primary")
             with gr.Column(scale=1):
+                response_output = gr.JSON(label="Diarization JSON")
         run_button.click(
             fn=diarize,
                 audio_input,
                 token_input,
             ],
+            outputs=[response_output],
         )
     return demo