# Page-header residue kept as a comment: Ratnesh-dev's picture | "Add Pyannote Diarization API" | 2e9f41b
import json
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Any
import gradio as gr
def serialize(value: Any) -> Any:
    """Recursively reduce ``value`` to plain Python data.

    Rules are tried in this order:

    * ``None`` and ``str``/``int``/``float``/``bool`` values pass through.
    * ``Path`` becomes its string form.
    * ``dict`` becomes a new dict with ``str`` keys and serialized values.
    * ``list``/``tuple`` become a list of serialized items.
    * An object exposing ``item()`` or ``tolist()`` (tried in that order)
      yields the result of the first call that succeeds, returned as-is.
    * An object with a ``__dict__`` becomes a dict of its serialized
      attributes.
    * Anything else falls back to ``str(value)``.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value

    if isinstance(value, Path):
        return str(value)

    if isinstance(value, dict):
        converted = {}
        for key, entry in value.items():
            name = str(key)
            converted[name] = serialize(entry)
        return converted

    if isinstance(value, (list, tuple)):
        return list(map(serialize, value))

    # Best-effort unwrapping: a failing converter is ignored on purpose so
    # the next fallback gets a chance.
    for converter in ("item", "tolist"):
        if hasattr(value, converter):
            try:
                return getattr(value, converter)()
            except Exception:
                pass

    if hasattr(value, "__dict__"):
        attributes = {}
        for attr_name, attr_value in vars(value).items():
            attributes[attr_name] = serialize(attr_value)
        return attributes

    return str(value)
def parse_model_options(raw: str | None) -> dict[str, Any]:
if not raw:
return {}
try:
parsed = json.loads(raw)
except json.JSONDecodeError as exc:
raise gr.Error(f"model_options_json must be valid JSON: {exc}") from exc
if not isinstance(parsed, dict):
raise gr.Error("model_options_json must decode to a JSON object")
return parsed
def get_audio_duration_seconds(audio_file: str) -> float | None:
cmd = [
"ffprobe",
"-v",
"error",
"-show_entries",
"format=duration",
"-of",
"default=noprint_wrappers=1:nokey=1",
audio_file,
]
proc = subprocess.run(cmd, capture_output=True, text=True)
if proc.returncode != 0:
return None
try:
return float(proc.stdout.strip())
except Exception:
return None
def extract_audio_clip(
    source_audio_file: str,
    start_seconds: float,
    duration_seconds: float,
    tmpdir: str | None = None,
) -> str:
    """Cut a segment out of a media file into a mono 16 kHz WAV using ffmpeg.

    Args:
        source_audio_file: Path of the input media file.
        start_seconds: Offset where the clip starts (passed to ``-ss``).
        duration_seconds: Length of the clip (passed to ``-t``).
        tmpdir: Directory for the output file. Defaults to the system
            temporary directory.

    Returns:
        Path of the written file, named
        ``chunk_<start_ms>_<duration_ms>.wav`` inside the chosen directory.

    Raises:
        gr.Error: If ffmpeg cannot be launched or exits with a non-zero
            status.
    """
    tmp_root = tmpdir or tempfile.gettempdir()
    # NOTE: the name depends only on the offsets, so clips with identical
    # offsets written to the same directory overwrite each other (ffmpeg is
    # run with -y). Pass a dedicated ``tmpdir`` to keep sources apart.
    out_path = os.path.join(
        tmp_root,
        f"chunk_{int(start_seconds*1000)}_{int(duration_seconds*1000)}.wav",
    )
    cmd = [
        "ffmpeg",
        "-y",
        "-ss",
        str(start_seconds),
        "-t",
        str(duration_seconds),
        "-i",
        source_audio_file,
        "-vn",
        "-ac",
        "1",
        "-ar",
        "16000",
        out_path,
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # A missing or non-executable ffmpeg raises before any return code
        # exists; surface it as the same error type used for other failures.
        raise gr.Error(
            f"ffmpeg could not be started while extracting audio chunk: {exc}"
        ) from exc
    if proc.returncode != 0:
        # Only the tail of stderr is kept so the message stays readable.
        raise gr.Error(f"ffmpeg failed while extracting audio chunk: {proc.stderr[-1200:]}")
    return out_path
def build_audio_chunk_plan(
    audio_file: str,
    chunk_duration_s: float,
    chunk_overlap_s: float,
) -> list[dict[str, float | int]]:
    """Split an audio file's timeline into overlapping fixed-length windows.

    Args:
        audio_file: Path of the audio file whose duration is probed.
        chunk_duration_s: Maximum length of each window, in seconds.
        chunk_overlap_s: Amount by which consecutive windows overlap.

    Returns:
        One dict per window with the keys ``"index"``, ``"start"``,
        ``"end"`` and ``"duration"``. The final window is clipped to the
        end of the audio. An empty list is returned when the probed
        duration is not positive.

    Raises:
        gr.Error: If the duration cannot be determined, or if the chunk
            parameters are invalid (non-positive duration, negative
            overlap, or overlap not smaller than the duration).
    """
    total = get_audio_duration_seconds(audio_file)
    if total is None:
        raise gr.Error("Could not determine audio duration (ffprobe failed).")
    if total <= 0:
        return []

    if chunk_duration_s <= 0:
        raise gr.Error("chunk_duration_s must be > 0")
    if chunk_overlap_s < 0:
        raise gr.Error("chunk_overlap_s must be >= 0")
    if chunk_overlap_s >= chunk_duration_s:
        raise gr.Error("chunk_overlap_s must be smaller than chunk_duration_s")

    # Each window starts one stride after the previous one; the stride is
    # strictly positive thanks to the checks above.
    stride = chunk_duration_s - chunk_overlap_s
    windows = []
    window_start = 0.0
    while window_start < total:
        window_end = min(total, window_start + chunk_duration_s)
        window = {
            # Windows are numbered in creation order, starting at zero.
            "index": len(windows),
            "start": window_start,
            "end": window_end,
            "duration": window_end - window_start,
        }
        windows.append(window)
        # Stop once a window reaches the end of the audio; this avoids a
        # trailing window that lies entirely inside the previous one.
        if window_end >= total:
            break
        window_start += stride
    return windows