Ratnesh-dev committed on
Commit
667e520
·
1 Parent(s): f2eb92a

Normalize audio before diarization

Browse files
Files changed (1) hide show
  1. app.py +33 -1
app.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
2
 
3
  import csv
4
  import io
 
5
  import tempfile
6
  from pathlib import Path
7
  from typing import Any
@@ -66,6 +67,36 @@ def _format_timestamp(seconds: float) -> str:
66
  return f"{hours:02}:{minutes:02}:{secs:02}.{millis:03}"
67
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  @spaces.GPU(duration=GPU_DURATION_SECONDS)
70
  def _run_diarization(
71
  audio_path: str,
@@ -146,13 +177,14 @@ def diarize(
146
  if not Path(audio_path).exists():
147
  raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
148
 
 
149
  resolved_token = _resolve_token(hf_token)
150
 
151
  # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
152
  get_pipeline(resolved_token)
153
 
154
  segments, rttm_text, annotation_label = _run_diarization(
155
- audio_path=audio_path,
156
  hf_token=resolved_token,
157
  prefer_exclusive=prefer_exclusive,
158
  )
 
2
 
3
  import csv
4
  import io
5
+ import subprocess
6
  import tempfile
7
  from pathlib import Path
8
  from typing import Any
 
67
  return f"{hours:02}:{minutes:02}:{secs:02}.{millis:03}"
68
 
69
 
70
def _normalize_audio(audio_path: str) -> str:
    """Convert *audio_path* to a mono, 16 kHz, 16-bit PCM WAV file via ffmpeg.

    The converted file is written as ``normalized.wav`` inside a freshly
    created temporary directory (prefix ``pyannote_audio_``). On success the
    directory is left in place and the caller owns the returned file; on
    failure the directory is removed before the error is raised.

    Args:
        audio_path: Path of the input audio file passed to ``ffmpeg -i``.

    Returns:
        The path of the normalized WAV file, as a string.

    Raises:
        gr.Error: If the ffmpeg executable cannot be launched (for example it
            is not installed) or if it exits with a non-zero status.
    """
    import shutil  # Local import: only needed to clean up on the failure path.

    normalized_dir = Path(tempfile.mkdtemp(prefix="pyannote_audio_"))
    normalized_path = normalized_dir / "normalized.wav"

    command = [
        "ffmpeg",
        "-y",  # Overwrite the output file without prompting.
        "-hide_banner",
        "-loglevel",
        "error",  # Keep stderr limited to real errors so it can be shown to the user.
        "-i",
        audio_path,
        "-ac",
        "1",  # Downmix to a single channel.
        "-ar",
        "16000",  # Resample to 16 kHz.
        "-c:a",
        "pcm_s16le",  # Signed 16-bit little-endian PCM.
        str(normalized_path),
    ]

    try:
        subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            # ffmpeg reads interactive commands from stdin by default; detach it
            # so the child never consumes or blocks on the server's stdin.
            stdin=subprocess.DEVNULL,
        )
    except subprocess.CalledProcessError as exc:
        # ffmpeg ran but failed: drop the temp directory and surface its output.
        shutil.rmtree(normalized_dir, ignore_errors=True)
        message = (exc.stderr or "").strip() or (exc.stdout or "").strip() or str(exc)
        raise gr.Error(f"Failed to normalize audio with ffmpeg: {message}") from exc
    except OSError as exc:
        # ffmpeg could not be started at all (missing binary, bad permissions, ...).
        shutil.rmtree(normalized_dir, ignore_errors=True)
        raise gr.Error(f"Failed to normalize audio with ffmpeg: {exc}") from exc

    return str(normalized_path)
98
+
99
+
100
  @spaces.GPU(duration=GPU_DURATION_SECONDS)
101
  def _run_diarization(
102
  audio_path: str,
 
177
  if not Path(audio_path).exists():
178
  raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
179
 
180
+ normalized_audio_path = _normalize_audio(audio_path)
181
  resolved_token = _resolve_token(hf_token)
182
 
183
  # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
184
  get_pipeline(resolved_token)
185
 
186
  segments, rttm_text, annotation_label = _run_diarization(
187
+ audio_path=normalized_audio_path,
188
  hf_token=resolved_token,
189
  prefer_exclusive=prefer_exclusive,
190
  )