Spaces:
Running on Zero
Running on Zero
Commit ·
667e520
1
Parent(s): f2eb92a
Normalize audio before diarization
Browse files
app.py
CHANGED
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
import csv
|
| 4 |
import io
|
|
|
|
| 5 |
import tempfile
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Any
|
|
@@ -66,6 +67,36 @@ def _format_timestamp(seconds: float) -> str:
|
|
| 66 |
return f"{hours:02}:{minutes:02}:{secs:02}.{millis:03}"
|
| 67 |
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
@spaces.GPU(duration=GPU_DURATION_SECONDS)
|
| 70 |
def _run_diarization(
|
| 71 |
audio_path: str,
|
|
@@ -146,13 +177,14 @@ def diarize(
|
|
| 146 |
if not Path(audio_path).exists():
|
| 147 |
raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
|
| 148 |
|
|
|
|
| 149 |
resolved_token = _resolve_token(hf_token)
|
| 150 |
|
| 151 |
# Load on CPU first so the ZeroGPU decorator only wraps actual inference.
|
| 152 |
get_pipeline(resolved_token)
|
| 153 |
|
| 154 |
segments, rttm_text, annotation_label = _run_diarization(
|
| 155 |
-
audio_path=
|
| 156 |
hf_token=resolved_token,
|
| 157 |
prefer_exclusive=prefer_exclusive,
|
| 158 |
)
|
|
|
|
| 2 |
|
| 3 |
import csv
|
| 4 |
import io
|
| 5 |
+
import subprocess
|
| 6 |
import tempfile
|
| 7 |
from pathlib import Path
|
| 8 |
from typing import Any
|
|
|
|
| 67 |
return f"{hours:02}:{minutes:02}:{secs:02}.{millis:03}"
|
| 68 |
|
| 69 |
|
| 70 |
+
def _normalize_audio(audio_path: str) -> str:
    """Transcode *audio_path* to a mono, 16 kHz, 16-bit PCM WAV file via ffmpeg.

    The output is written as ``normalized.wav`` inside a freshly created
    temporary directory (prefix ``pyannote_audio_``). On success that
    directory is NOT removed here, because the caller still needs the file;
    the caller is responsible for deleting it when done.

    Args:
        audio_path: Path of the input audio file handed to ffmpeg's ``-i``.

    Returns:
        The path of the normalized WAV file, as a string.

    Raises:
        gr.Error: If ffmpeg exits with a non-zero status, or if the ffmpeg
            executable cannot be found.
    """
    # Local import so this block does not depend on a file-level import of
    # shutil; it is only needed to clean up after a failed conversion.
    import shutil

    normalized_dir = Path(tempfile.mkdtemp(prefix="pyannote_audio_"))
    normalized_path = normalized_dir / "normalized.wav"

    command = [
        "ffmpeg",
        "-y",
        "-hide_banner",
        "-loglevel",
        "error",
        "-i",
        audio_path,
        "-ac",
        "1",
        "-ar",
        "16000",
        "-c:a",
        "pcm_s16le",
        str(normalized_path),
    ]

    try:
        subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            # Do not let ffmpeg inherit (and possibly read from) the server's stdin.
            stdin=subprocess.DEVNULL,
        )
    except subprocess.CalledProcessError as exc:
        # The conversion failed, so nobody will use the temp directory: remove it
        # rather than leaking one directory per failed request.
        shutil.rmtree(normalized_dir, ignore_errors=True)
        message = (exc.stderr or "").strip() or (exc.stdout or "").strip() or str(exc)
        raise gr.Error(f"Failed to normalize audio with ffmpeg: {message}") from exc
    except FileNotFoundError as exc:
        # Raised by subprocess when the ffmpeg executable itself is missing.
        shutil.rmtree(normalized_dir, ignore_errors=True)
        raise gr.Error(
            "Failed to normalize audio with ffmpeg: the ffmpeg executable was not found."
        ) from exc

    return str(normalized_path)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
@spaces.GPU(duration=GPU_DURATION_SECONDS)
|
| 101 |
def _run_diarization(
|
| 102 |
audio_path: str,
|
|
|
|
| 177 |
if not Path(audio_path).exists():
|
| 178 |
raise gr.Error("The uploaded audio file could not be found. Please re-upload it and try again.")
|
| 179 |
|
| 180 |
+
normalized_audio_path = _normalize_audio(audio_path)
|
| 181 |
resolved_token = _resolve_token(hf_token)
|
| 182 |
|
| 183 |
# Load on CPU first so the ZeroGPU decorator only wraps actual inference.
|
| 184 |
get_pipeline(resolved_token)
|
| 185 |
|
| 186 |
segments, rttm_text, annotation_label = _run_diarization(
|
| 187 |
+
audio_path=normalized_audio_path,
|
| 188 |
hf_token=resolved_token,
|
| 189 |
prefer_exclusive=prefer_exclusive,
|
| 190 |
)
|