Ratnesh-dev committed on
Commit
18bf750
·
1 Parent(s): 96ec82d

Return minimal diarization JSON schema

Browse files
Files changed (1) hide show
  1. app.py +19 -61
app.py CHANGED
@@ -25,14 +25,6 @@ def get_pipeline(hf_token: str) -> Pipeline:
25
  return _PIPELINE
26
 
27
 
28
- def _format_timestamp(seconds: float) -> str:
29
- milliseconds = int(round(seconds * 1000))
30
- hours, remainder = divmod(milliseconds, 3_600_000)
31
- minutes, remainder = divmod(remainder, 60_000)
32
- secs, millis = divmod(remainder, 1_000)
33
- return f"{hours:02}:{minutes:02}:{secs:02}.{millis:03}"
34
-
35
-
36
  def _normalize_audio(audio_path: str) -> str:
37
  normalized_dir = Path(tempfile.mkdtemp(prefix="pyannote_audio_"))
38
  normalized_path = normalized_dir / "normalized.wav"
@@ -111,7 +103,7 @@ def diarize(
111
  hf_token: str | None,
112
  ):
113
  if not audio_path:
114
- raise gr.Error("Upload or record an audio file first.")
115
 
116
  if not Path(audio_path).exists():
117
  raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
@@ -127,49 +119,27 @@ def diarize(
127
  # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
128
  get_pipeline(hf_token)
129
 
130
- segments, annotation_label, zerogpu_seconds = _run_diarization(
131
  audio_path=normalized_audio_path,
132
  hf_token=hf_token,
133
  )
134
 
135
- if not segments:
136
- summary = (
137
- "### No active speaker segments were detected\n"
138
- f"Inference completed with `{annotation_label}` output, but it contained no segments."
139
- )
140
- summary += f"\n- ZeroGPU time used: **{zerogpu_seconds:.2f}s**"
141
- return summary, round(zerogpu_seconds, 3), [], ""
142
-
143
- unique_speakers = sorted({segment["speaker"] for segment in segments})
144
- total_speech = sum(segment["duration"] for segment in segments)
145
-
146
- summary = (
147
- f"### Diarization complete\n"
148
- f"- Output used: `{annotation_label}`\n"
149
- f"- Segments: **{len(segments)}**\n"
150
- f"- Speakers detected: **{len(unique_speakers)}** ({', '.join(unique_speakers)})\n"
151
- f"- Total labelled speech: **{total_speech:.2f}s**\n"
152
- f"- ZeroGPU time used: **{zerogpu_seconds:.2f}s**"
153
- )
154
-
155
- segments_json = [
156
- {
157
- **segment,
158
- "start": round(segment["start"], 3),
159
- "end": round(segment["end"], 3),
160
- "duration": round(segment["duration"], 3),
161
- "start_timestamp": _format_timestamp(segment["start"]),
162
- "end_timestamp": _format_timestamp(segment["end"]),
163
- }
164
- for segment in segments
165
- ]
166
-
167
- turns_text = "\n".join(
168
- f"{segment['speaker']} | {_format_timestamp(segment['start'])} --> {_format_timestamp(segment['end'])}"
169
- for segment in segments
170
- )
171
 
172
- return summary, round(zerogpu_seconds, 3), segments_json, turns_text
173
 
174
 
175
  def build_demo() -> gr.Blocks:
@@ -201,14 +171,7 @@ def build_demo() -> gr.Blocks:
201
  run_button = gr.Button("Run diarization", variant="primary")
202
 
203
  with gr.Column(scale=1):
204
- summary_output = gr.Markdown()
205
- zerogpu_seconds_output = gr.Number(label="ZeroGPU seconds used", precision=3)
206
- segments_output = gr.JSON(label="Segments JSON")
207
- turns_output = gr.Textbox(
208
- label="Speaker turns",
209
- lines=14,
210
- buttons=["copy"],
211
- )
212
 
213
  run_button.click(
214
  fn=diarize,
@@ -216,14 +179,9 @@ def build_demo() -> gr.Blocks:
216
  audio_input,
217
  token_input,
218
  ],
219
- outputs=[summary_output, zerogpu_seconds_output, segments_output, turns_output],
220
  )
221
 
222
- gr.Markdown(
223
- """
224
- Outputs include segments as JSON and a plain-text speaker-turn list.
225
- """
226
- )
227
 
228
  return demo
229
 
 
25
  return _PIPELINE
26
 
27
 
 
 
 
 
 
 
 
 
28
  def _normalize_audio(audio_path: str) -> str:
29
  normalized_dir = Path(tempfile.mkdtemp(prefix="pyannote_audio_"))
30
  normalized_path = normalized_dir / "normalized.wav"
 
103
  hf_token: str | None,
104
  ):
105
  if not audio_path:
106
+ raise gr.Error("Upload an audio file first.")
107
 
108
  if not Path(audio_path).exists():
109
  raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
 
119
  # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
120
  get_pipeline(hf_token)
121
 
122
+ segments, _, zerogpu_seconds = _run_diarization(
123
  audio_path=normalized_audio_path,
124
  hf_token=hf_token,
125
  )
126
 
127
+ response = {
128
+ "source": "pyannote/speaker-diarization-community-1",
129
+ "zerogpu_seconds": round(zerogpu_seconds, 3),
130
+ "segments": [
131
+ {
132
+ "segment_id": f"seg_{index:06d}",
133
+ "speaker_id": segment["speaker"],
134
+ "start": round(segment["start"], 3),
135
+ "end": round(segment["end"], 3),
136
+ "duration": round(segment["duration"], 3),
137
+ }
138
+ for index, segment in enumerate(segments, start=1)
139
+ ],
140
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
+ return response
143
 
144
 
145
  def build_demo() -> gr.Blocks:
 
171
  run_button = gr.Button("Run diarization", variant="primary")
172
 
173
  with gr.Column(scale=1):
174
+ response_output = gr.JSON(label="Diarization JSON")
 
 
 
 
 
 
 
175
 
176
  run_button.click(
177
  fn=diarize,
 
179
  audio_input,
180
  token_input,
181
  ],
182
+ outputs=[response_output],
183
  )
184
 
 
 
 
 
 
185
 
186
  return demo
187