import json
import math
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Any

import gradio as gr


def serialize(value: Any) -> Any:
    """Recursively convert *value* into plain, JSON-friendly builtins.

    Conversion rules, tried in order:

    * ``str``/``int``/``float``/``bool``/``None`` are returned unchanged.
    * ``Path`` becomes ``str``.
    * ``dict`` becomes a dict with stringified keys and serialized values.
    * ``list``/``tuple`` becomes a list of serialized items.
    * Objects exposing ``.item()`` or ``.tolist()`` (presumably NumPy/torch
      scalars and arrays -- confirm against callers) are converted by
      calling that method.
    * Objects with a ``__dict__`` become a dict of their serialized
      attributes.
    * Anything else falls back to ``str(value)``.
    """
    if isinstance(value, (str, int, float, bool)) or value is None:
        return value
    if isinstance(value, Path):
        return str(value)
    if isinstance(value, dict):
        return {str(k): serialize(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [serialize(v) for v in value]
    if hasattr(value, "item"):
        # Best effort: if ``.item()`` fails (for any reason) fall through to
        # the next strategy instead of aborting the whole serialization.
        try:
            return value.item()
        except Exception:
            pass
    if hasattr(value, "tolist"):
        # Same best-effort policy as ``.item()`` above.
        try:
            return value.tolist()
        except Exception:
            pass
    if hasattr(value, "__dict__"):
        return {k: serialize(v) for k, v in vars(value).items()}
    return str(value)


def parse_model_options(raw: str | None) -> dict[str, Any]:
    """Parse the ``model_options_json`` input into a dict.

    Args:
        raw: JSON text, or ``None``/empty/whitespace-only for "no options".

    Returns:
        The decoded JSON object, or ``{}`` when no options were supplied.

    Raises:
        gr.Error: If *raw* is not valid JSON or does not decode to an object.
    """
    # A whitespace-only field is treated like an empty one rather than
    # being reported to the user as malformed JSON.
    if not raw or not raw.strip():
        return {}
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise gr.Error(f"model_options_json must be valid JSON: {exc}") from exc
    if not isinstance(parsed, dict):
        raise gr.Error("model_options_json must decode to a JSON object")
    return parsed


def get_audio_duration_seconds(audio_file: str) -> float | None:
    """Return the duration of *audio_file* in seconds as reported by ffprobe.

    Returns ``None`` when the duration cannot be determined: ffprobe cannot
    be started, exits with a non-zero status, or prints something that is
    not a finite number.
    """
    cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        audio_file,
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
    except OSError:
        # ffprobe missing or not executable: report "unknown duration" like
        # every other failure instead of leaking a raw OSError to callers.
        return None
    if proc.returncode != 0:
        return None
    try:
        duration = float(proc.stdout.strip())
    except ValueError:
        return None
    # ``float`` accepts "inf"/"nan"; an infinite duration would make the
    # chunk-planning loop below run forever, so treat both as unknown.
    return duration if math.isfinite(duration) else None


def extract_audio_clip(
    source_audio_file: str,
    start_seconds: float,
    duration_seconds: float,
    tmpdir: str | None = None,
) -> str:
    """Cut a mono 16 kHz WAV clip out of *source_audio_file* using ffmpeg.

    Args:
        source_audio_file: Path of the input media file.
        start_seconds: Offset of the clip start, in seconds.
        duration_seconds: Length of the clip, in seconds.
        tmpdir: Directory for the output file; defaults to the system
            temporary directory.

    Returns:
        Path of the written WAV file.

    Raises:
        gr.Error: If ffmpeg cannot be started or exits with a non-zero status.
    """
    tmp_root = tmpdir or tempfile.gettempdir()
    # NOTE(review): the file name depends only on start/duration, so clips of
    # different sources written to the same directory overwrite each other
    # (ffmpeg runs with ``-y``). Callers should pass a per-request ``tmpdir``.
    out_path = os.path.join(
        tmp_root,
        f"chunk_{int(start_seconds*1000)}_{int(duration_seconds*1000)}.wav",
    )
    cmd = [
        "ffmpeg",
        "-y",
        "-ss",
        str(start_seconds),
        "-t",
        str(duration_seconds),
        "-i",
        source_audio_file,
        "-vn",
        "-ac",
        "1",
        "-ar",
        "16000",
        out_path,
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # ffmpeg missing or not executable: surface it as a UI error like the
        # non-zero-exit case instead of an unhandled OSError.
        raise gr.Error(
            f"ffmpeg could not be started while extracting audio chunk: {exc}"
        ) from exc
    if proc.returncode != 0:
        # Only the tail of stderr is shown to keep the message readable.
        raise gr.Error(
            f"ffmpeg failed while extracting audio chunk: {proc.stderr[-1200:]}"
        )
    return out_path


def build_audio_chunk_plan(
    audio_file: str,
    chunk_duration_s: float,
    chunk_overlap_s: float,
) -> list[dict[str, float | int]]:
    """Split *audio_file* into overlapping fixed-length chunks.

    Args:
        audio_file: Path of the audio file to plan for.
        chunk_duration_s: Maximum length of each chunk, in seconds (> 0).
        chunk_overlap_s: Overlap between consecutive chunks, in seconds
            (>= 0 and smaller than *chunk_duration_s*).

    Returns:
        A list of dicts with keys ``index``, ``start``, ``end`` and
        ``duration``; empty when the file has no positive duration. The last
        chunk is clipped to the end of the file.

    Raises:
        gr.Error: If the duration cannot be determined or a parameter is
            out of range.
    """
    duration = get_audio_duration_seconds(audio_file)
    if duration is None:
        raise gr.Error("Could not determine audio duration (ffprobe failed).")
    if duration <= 0:
        return []
    if chunk_duration_s <= 0:
        raise gr.Error("chunk_duration_s must be > 0")
    if chunk_overlap_s < 0:
        raise gr.Error("chunk_overlap_s must be >= 0")
    if chunk_overlap_s >= chunk_duration_s:
        raise gr.Error("chunk_overlap_s must be smaller than chunk_duration_s")

    plan: list[dict[str, float | int]] = []
    step = chunk_duration_s - chunk_overlap_s
    idx = 0
    start = 0.0
    while start < duration:
        end = min(duration, start + chunk_duration_s)
        plan.append(
            {
                "index": idx,
                "start": start,
                "end": end,
                "duration": end - start,
            }
        )
        if end >= duration:
            break
        idx += 1
        # Derive the start from the index instead of accumulating ``+= step``
        # so floating-point error does not grow with the number of chunks.
        start = idx * step
    return plan