Spaces:

Ratnesh-dev
/

diarize

Running on Zero

App Files Files Community

Ratnesh-dev committed on Apr 8

Commit

667e520

1 Parent(s): f2eb92a

Normalize audio before diarization

Browse files

Files changed (1) hide show

app.py +33 -1

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ from __future__ import annotations
 import csv
 import io
 import tempfile
 from pathlib import Path
 from typing import Any
@@ -66,6 +67,36 @@ def _format_timestamp(seconds: float) -> str:
     return f"{hours:02}:{minutes:02}:{secs:02}.{millis:03}"
 @spaces.GPU(duration=GPU_DURATION_SECONDS)
 def _run_diarization(
     audio_path: str,
@@ -146,13 +177,14 @@ def diarize(
     if not Path(audio_path).exists():
         raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
     resolved_token = _resolve_token(hf_token)
     # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
     get_pipeline(resolved_token)
     segments, rttm_text, annotation_label = _run_diarization(
-        audio_path=audio_path,
         hf_token=resolved_token,
         prefer_exclusive=prefer_exclusive,
     )

 import csv
 import io
+import subprocess
 import tempfile
 from pathlib import Path
 from typing import Any
     return f"{hours:02}:{minutes:02}:{secs:02}.{millis:03}"
+def _normalize_audio(audio_path: str) -> str:
+    # Re-encode the audio at `audio_path` with ffmpeg into a single-channel,
+    # 16 kHz, signed 16-bit little-endian PCM WAV file and return the path of
+    # that new file as a string.
+    #
+    # Raises gr.Error when ffmpeg exits with a non-zero status; the message
+    # carries ffmpeg's stderr, falling back to stdout, then to the exception text.
+    #
+    # NOTE(review): the directory created by mkdtemp below is never removed in
+    # this function (neither on success nor on failure) — confirm whether a
+    # caller is expected to clean it up.
+    # NOTE(review): only CalledProcessError is caught; if the ffmpeg executable
+    # is not installed, subprocess.run presumably raises FileNotFoundError and
+    # it would propagate unchanged — verify this is acceptable.
+    normalized_dir = Path(tempfile.mkdtemp(prefix="pyannote_audio_"))
+    normalized_path = normalized_dir / "normalized.wav"
+    # Arguments are passed as a list (no shell), so `audio_path` is not
+    # subject to shell interpretation.
+    command = [
+        "ffmpeg",
+        # Overwrite the output file without prompting.
+        "-y",
+        # Keep ffmpeg quiet: no banner, only error-level log output, so that
+        # captured stderr contains just the failure reason.
+        "-hide_banner",
+        "-loglevel",
+        "error",
+        "-i",
+        audio_path,
+        # One audio channel (mono).
+        "-ac",
+        "1",
+        # 16000 Hz sample rate.
+        "-ar",
+        "16000",
+        # Uncompressed signed 16-bit little-endian PCM audio codec.
+        "-c:a",
+        "pcm_s16le",
+        str(normalized_path),
+    ]
+    try:
+        # check=True turns a non-zero exit status into CalledProcessError;
+        # output is captured as text so it can be surfaced in the error below.
+        subprocess.run(command, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as exc:
+        message = exc.stderr.strip() or exc.stdout.strip() or str(exc)
+        raise gr.Error(f"Failed to normalize audio with ffmpeg: {message}") from exc
+    return str(normalized_path)
 @spaces.GPU(duration=GPU_DURATION_SECONDS)
 def _run_diarization(
     audio_path: str,
     if not Path(audio_path).exists():
         raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
+    normalized_audio_path = _normalize_audio(audio_path)
     resolved_token = _resolve_token(hf_token)
     # Load on CPU first so the ZeroGPU decorator only wraps actual inference.
     get_pipeline(resolved_token)
     segments, rttm_text, annotation_label = _run_diarization(
+        audio_path=normalized_audio_path,
         hf_token=resolved_token,
         prefer_exclusive=prefer_exclusive,
     )