Spaces:
Sleeping
Sleeping
liuyang
committed on
Commit
·
a4ab88e
1
Parent(s):
a095ed4
Add AudioJob integration to app.py with UI for running audio jobs and handling manifests. Updated requirements.txt to include webrtcvad and boto3.
Browse files- app.py +58 -0
- audiojob.py +1016 -0
- requirements.txt +3 -1
app.py
CHANGED
|
@@ -2,6 +2,11 @@ import gradio as gr
|
|
| 2 |
import requests
|
| 3 |
import tempfile
|
| 4 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from pydub import AudioSegment
|
| 6 |
from typing import Optional, Tuple
|
| 7 |
import logging
|
|
@@ -240,6 +245,59 @@ with gr.Blocks(title="Audio Editor", theme=gr.themes.Soft()) as demo:
|
|
| 240 |
outputs=[audio_output, status_output]
|
| 241 |
)
|
| 242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
# Launch the app
|
| 244 |
if __name__ == "__main__":
|
| 245 |
demo.launch()
|
|
|
|
| 2 |
import requests
|
| 3 |
import tempfile
|
| 4 |
import os
|
| 5 |
+
import json
|
| 6 |
+
import traceback
|
| 7 |
+
|
| 8 |
+
# AudioJob integration
|
| 9 |
+
from audiojob import AudioJobRunner, LocalStorageAdapter
|
| 10 |
from pydub import AudioSegment
|
| 11 |
from typing import Optional, Tuple
|
| 12 |
import logging
|
|
|
|
| 245 |
outputs=[audio_output, status_output]
|
| 246 |
)
|
| 247 |
|
| 248 |
+
with gr.Tab("AudioJob Runner"):
|
| 249 |
+
gr.Markdown("### AudioJob: preprocess -> split (inspect manifest)")
|
| 250 |
+
with gr.Row():
|
| 251 |
+
with gr.Column():
|
| 252 |
+
aj_source_input = gr.Textbox(
|
| 253 |
+
label="Source URI",
|
| 254 |
+
placeholder="e.g. /abs/path/to/file.wav or s3://bucket/key",
|
| 255 |
+
info="Source URI for AudioJobRunner"
|
| 256 |
+
)
|
| 257 |
+
aj_manifest_input = gr.Textbox(
|
| 258 |
+
label="Manifest JSON (optional)",
|
| 259 |
+
placeholder="Paste existing manifest JSON to resume (optional)",
|
| 260 |
+
lines=10
|
| 261 |
+
)
|
| 262 |
+
aj_s3_prefix = gr.Textbox(
|
| 263 |
+
label="S3 Prefix",
|
| 264 |
+
placeholder="Optional prefix for uploaded working copies (e.g. jobs/)",
|
| 265 |
+
info="Uploaded keys will be prefixed with this value",
|
| 266 |
+
)
|
| 267 |
+
aj_run_button = gr.Button("Run AudioJob", variant="primary")
|
| 268 |
+
with gr.Column():
|
| 269 |
+
aj_output = gr.Textbox(label="AudioJob Output (manifest)", lines=30, interactive=False)
|
| 270 |
+
|
| 271 |
+
def run_audiojob_ui(source_uri: str, manifest_json: str, s3_prefix: str) -> str:
|
| 272 |
+
try:
|
| 273 |
+
manifest = None
|
| 274 |
+
if manifest_json and manifest_json.strip():
|
| 275 |
+
manifest = json.loads(manifest_json)
|
| 276 |
+
|
| 277 |
+
work_root = tempfile.mkdtemp(prefix="audiojob_")
|
| 278 |
+
storage = LocalStorageAdapter()
|
| 279 |
+
# allow presets from top-level presets if desired; using defaults here
|
| 280 |
+
runner = AudioJobRunner(
|
| 281 |
+
manifest=manifest,
|
| 282 |
+
source_uri=None if manifest else source_uri,
|
| 283 |
+
work_root=work_root,
|
| 284 |
+
storage=storage,
|
| 285 |
+
presets={
|
| 286 |
+
# Read bucket and endpoint from environment where possible
|
| 287 |
+
"s3_bucket": os.environ.get("S3_BUCKET"),
|
| 288 |
+
"s3_region": "auto",
|
| 289 |
+
"s3_prefix": s3_prefix or "",
|
| 290 |
+
"s3_endpoint": os.environ.get("S3_ENDPOINT", "")
|
| 291 |
+
}
|
| 292 |
+
)
|
| 293 |
+
out_manifest = runner.run_until_split()
|
| 294 |
+
return json.dumps(out_manifest, ensure_ascii=False, indent=2)
|
| 295 |
+
except Exception as e:
|
| 296 |
+
tb = traceback.format_exc()
|
| 297 |
+
return f"Error: {e}\n\n{tb}"
|
| 298 |
+
|
| 299 |
+
aj_run_button.click(fn=run_audiojob_ui, inputs=[aj_source_input, aj_manifest_input], outputs=[aj_output])
|
| 300 |
+
|
| 301 |
# Launch the app
|
| 302 |
if __name__ == "__main__":
|
| 303 |
demo.launch()
|
audiojob.py
ADDED
|
@@ -0,0 +1,1016 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
Audio preprocess runner v2 (until split), checkpointable & resumable.
|
| 5 |
+
|
| 6 |
+
Key upgrades:
|
| 7 |
+
- Robust subprocess with retries/backoff/timeout
|
| 8 |
+
- Content-addressed cache for preprocess outputs
|
| 9 |
+
- Unique temp files + cleanup
|
| 10 |
+
- Explicit stream mapping (-map 0:a:0)
|
| 11 |
+
- Filter availability detection with graceful fallbacks
|
| 12 |
+
- VAD streams PCM from ffmpeg (no giant temp WAV), with progress updates
|
| 13 |
+
- Split plan exposes per-channel source_uris (for virtual slicing)
|
| 14 |
+
- Stage error breadcrumbs + atomic manifest rev bumps
|
| 15 |
+
- Pluggable StorageAdapter (LocalStorageAdapter provided)
|
| 16 |
+
|
| 17 |
+
Author: you
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
import io
|
| 23 |
+
import sys
|
| 24 |
+
import json
|
| 25 |
+
import math
|
| 26 |
+
import time
|
| 27 |
+
import shutil
|
| 28 |
+
import hashlib
|
| 29 |
+
import tempfile
|
| 30 |
+
import datetime as dt
|
| 31 |
+
import subprocess
|
| 32 |
+
import uuid
|
| 33 |
+
from typing import Optional, Dict, Any, List, Tuple, BinaryIO
|
| 34 |
+
|
| 35 |
+
# ============================================================
|
| 36 |
+
# Storage Adapters
|
| 37 |
+
# ============================================================
|
| 38 |
+
|
| 39 |
+
class StorageAdapter:
    """Abstract blob/metadata store interface.

    Concrete adapters implement exists/open_read/open_write; the JSON helpers
    and the permissive stat/presign defaults are shared by all backends.
    """

    def exists(self, uri: str) -> bool:
        raise NotImplementedError

    def open_read(self, uri: str) -> BinaryIO:
        raise NotImplementedError

    def open_write(self, uri: str) -> BinaryIO:
        raise NotImplementedError

    def save_json(self, uri: str, obj: dict) -> None:
        """Serialize *obj* as UTF-8 JSON (pretty-printed) to *uri*."""
        payload = json.dumps(obj, ensure_ascii=False, indent=2).encode("utf-8")
        with self.open_write(uri) as sink:
            sink.write(payload)

    def load_json(self, uri: str) -> dict:
        """Read and parse UTF-8 JSON from *uri*."""
        with self.open_read(uri) as source:
            return json.loads(source.read().decode("utf-8"))

    def stat(self, uri: str) -> Dict[str, Any]:
        """Return {'bytes': int|None, 'sha256': str|None, 'etag': str|None} where possible."""
        return {"bytes": None, "sha256": None, "etag": None}

    def presign(self, uri: str, method: str = "GET", ttl: int = 3600) -> str:
        """Return a URL suitable for HTTP reads/writes. Local adapter may return a file:// path."""
        return uri
|
| 56 |
+
|
| 57 |
+
class LocalStorageAdapter(StorageAdapter):
    """Filesystem-backed adapter: URIs are treated as plain local paths."""

    def exists(self, uri: str) -> bool:
        return os.path.exists(uri)

    def open_read(self, uri: str) -> BinaryIO:
        return open(uri, "rb")

    def open_write(self, uri: str) -> BinaryIO:
        # Create parent directories on demand so callers may write to fresh trees.
        os.makedirs(os.path.dirname(uri), exist_ok=True)
        return open(uri, "wb")

    def stat(self, uri: str) -> Dict[str, Any]:
        if not os.path.exists(uri):
            return {"bytes": None, "sha256": None, "etag": None}
        return {"bytes": os.stat(uri).st_size, "sha256": None, "etag": None}

    def presign(self, uri: str, method: str = "GET", ttl: int = 3600) -> str:
        # Local files need no signing; consumers should handle file paths.
        return uri
|
| 73 |
+
|
| 74 |
+
# Stub you can implement for S3/R2 later:
|
| 75 |
+
class S3LikeStorageAdapter(StorageAdapter):
    """S3-like storage adapter using boto3. Exposes simple operations for
    checking existence, reading, writing (via temp files) and presigning URLs.

    Usage:
        adapter = S3LikeStorageAdapter(bucket="my-bucket", region_name="us-east-1")
        adapter.upload_file(local_path, key)
        url = adapter.presign(key, "GET", ttl=3600)
    """
    def __init__(self, bucket: str, region_name: Optional[str] = None,
                 aws_access_key_id: Optional[str] = None,
                 aws_secret_access_key: Optional[str] = None,
                 aws_session_token: Optional[str] = None,
                 endpoint_url: Optional[str] = None):
        # boto3 is an optional dependency; fail loudly only when this adapter is used.
        try:
            import boto3
        except Exception:
            raise RuntimeError("boto3 is required for S3LikeStorageAdapter but is not installed")
        session_kwargs = {}
        # Only pass explicit credentials when both halves are supplied;
        # otherwise boto3 falls back to its default credential chain (env/instance role).
        if aws_access_key_id and aws_secret_access_key:
            session_kwargs.update({
                "aws_access_key_id": aws_access_key_id,
                "aws_secret_access_key": aws_secret_access_key,
            })
        if aws_session_token:
            session_kwargs["aws_session_token"] = aws_session_token

        session = boto3.session.Session(**session_kwargs)
        client_kwargs = {"region_name": region_name}
        if endpoint_url:
            # Custom endpoint enables S3-compatible stores (e.g. Cloudflare R2, MinIO).
            client_kwargs["endpoint_url"] = endpoint_url
        self.s3 = session.client("s3", **client_kwargs)
        self.bucket = bucket

    def exists(self, key: str) -> bool:
        # NOTE(review): any failure (including auth/network errors) reads as
        # "does not exist" -- confirm this best-effort semantic is intended.
        try:
            self.s3.head_object(Bucket=self.bucket, Key=key)
            return True
        except Exception:
            return False

    def open_read(self, key: str) -> BinaryIO:
        # Download into memory (caller should avoid very large files via this API)
        obj = self.s3.get_object(Bucket=self.bucket, Key=key)
        body = obj["Body"].read()
        return io.BytesIO(body)

    def open_write(self, key: str) -> BinaryIO:
        # Provide a temp file that will be uploaded on close
        tmp = tempfile.NamedTemporaryFile(delete=False)

        # Buffered writer over the temp file; close() performs the actual
        # S3 upload and then removes the temp file regardless of outcome.
        class _S3Writer(io.BufferedWriter):
            def __init__(self, tmp_file_path: str, outer: "S3LikeStorageAdapter", key: str):
                self._tmp_path = tmp_file_path
                self._outer = outer
                self._key = key
                # Reopen the path (the NamedTemporaryFile handle is closed below).
                f = open(tmp_file_path, "r+b")
                super().__init__(f)

            def close(self):
                try:
                    super().close()
                finally:
                    # Upload using boto3's upload_file which handles multipart for large files
                    try:
                        self._outer.s3.upload_file(self._tmp_path, self._outer.bucket, self._key)
                    except Exception as e:
                        raise
                    finally:
                        # Best-effort temp-file cleanup; never mask the upload error.
                        try:
                            os.remove(self._tmp_path)
                        except Exception:
                            pass

        tmp_path = tmp.name
        tmp.close()  # release the original handle; _S3Writer reopened the path above
        return _S3Writer(tmp_path, self, key)

    def stat(self, key: str) -> Dict[str, Any]:
        # head_object avoids downloading the body; errors degrade to "unknown".
        try:
            r = self.s3.head_object(Bucket=self.bucket, Key=key)
            return {"bytes": int(r.get("ContentLength", 0)), "sha256": None, "etag": r.get("ETag")}
        except Exception:
            return {"bytes": None, "sha256": None, "etag": None}

    def presign(self, key: str, method: str = "GET", ttl: int = 3600) -> str:
        # Return a presigned URL for the given key
        params = {"Bucket": self.bucket, "Key": key}
        http_method = method.upper()
        # Any non-GET method maps to put_object; DELETE/HEAD are not distinguished.
        return self.s3.generate_presigned_url(
            ClientMethod="get_object" if http_method == "GET" else "put_object",
            Params=params,
            ExpiresIn=int(ttl),
        )

    def upload_file(self, local_path: str, key: str) -> str:
        """Upload a local file to the S3 bucket under `key`.
        Uses boto3's managed uploader which supports multipart uploads for large files.
        Returns the uploaded key on success.
        """
        self.s3.upload_file(local_path, self.bucket, key)
        return key
|
| 177 |
+
|
| 178 |
+
# ============================================================
|
| 179 |
+
# Utilities
|
| 180 |
+
# ============================================================
|
| 181 |
+
|
| 182 |
+
def utc_now_iso() -> str:
    """Current UTC time as an ISO-8601 string with a trailing 'Z' (second precision)."""
    # datetime.utcnow() is deprecated since Python 3.12; use an aware "now"
    # and strip tzinfo so the rendered string keeps the exact legacy "...Z" shape.
    now = dt.datetime.now(dt.timezone.utc)
    return now.replace(microsecond=0, tzinfo=None).isoformat() + "Z"
|
| 184 |
+
|
| 185 |
+
def sha256_file(path: str, chunk_size: int = 1024 * 1024) -> str:
    """Hex SHA-256 digest of the file at *path*, streamed in *chunk_size*-byte reads."""
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        while True:
            block = fh.read(chunk_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
| 191 |
+
|
| 192 |
+
def clamp(v: float, lo: float, hi: float) -> float:
    """Constrain *v* to the closed interval [lo, hi] (lower bound wins)."""
    upper_bounded = min(hi, v)
    return max(lo, upper_bounded)
|
| 194 |
+
|
| 195 |
+
def sec_to_hms(seconds: float) -> str:
    """Format *seconds* as HH:MM:SS.mmm, clamping negative input to zero."""
    total = max(0.0, float(seconds))
    hours, remainder = divmod(total, 3600.0)
    minutes, secs = divmod(remainder, 60.0)
    return f"{int(hours):02d}:{int(minutes):02d}:{secs:06.3f}"
|
| 201 |
+
|
| 202 |
+
def float_or_none(x: Any) -> Optional[float]:
    """Coerce *x* to float; return None on any conversion failure."""
    try:
        return float(x)
    except Exception:
        return None
|
| 205 |
+
|
| 206 |
+
# Robust subprocess with retries/backoff/timeout
|
| 207 |
+
def run(cmd: List[str], timeout: Optional[int] = None) -> Tuple[int, str, str]:
    """Execute *cmd*, returning (exit_code, stdout, stderr) as text.

    On timeout the child is killed and exit code 124 is reported, with
    'TIMEOUT' appended to stderr (mirroring coreutils' `timeout` convention).
    """
    child = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )
    try:
        stdout_text, stderr_text = child.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        child.kill()
        stdout_text, stderr_text = child.communicate()
        return 124, stdout_text, stderr_text + "\nTIMEOUT"
    return child.returncode, stdout_text, stderr_text
|
| 218 |
+
|
| 219 |
+
def run_with_retry(cmd: List[str], retries: int = 3, timeout: Optional[int] = None, backoff: float = 1.5) -> str:
    """Run *cmd* via run(), retrying up to *retries* times with exponential backoff.

    Returns captured stdout on success; raises RuntimeError carrying the last
    (exit code, stderr) pair once all attempts fail.
    """
    last: Optional[Tuple[int, str]] = None
    for attempt in range(retries):
        code, out, err = run(cmd, timeout)
        if code == 0:
            return out
        last = (code, err)
        # BUG FIX: the original slept after the final attempt too, delaying the
        # raise by up to backoff**(retries-1) seconds for nothing.
        if attempt < retries - 1:
            time.sleep(backoff ** attempt)
    raise RuntimeError(f"Command failed after {retries} attempts: {' '.join(cmd)}\n{last}")
|
| 228 |
+
|
| 229 |
+
# ============================================================
|
| 230 |
+
# Defaults / presets
|
| 231 |
+
# ============================================================
|
| 232 |
+
|
| 233 |
+
# Default job policy. Any key may be overridden via the `presets` argument of
# AudioJobRunner; the merged result is recorded in the manifest under "policy".
DEFAULT_PRESETS = {
    "materialize_chunks": False,  # virtual slicing by default
    "sample_rate_target": 16000,
    "container_target": "flac",
    "channel_policy": "auto",  # auto | split | downmix | keep
    "normalize": "light",  # none | light | r128
    "denoise": "auto",  # none | light | auto
    "chunk_policy": "vad_fallback_fixed",
    "chunk_target_ms": 1800000,
    "overlap_ms": 300,
    "vad_aggressiveness": 2,  # 0..3 if webrtcvad available
    "highpass_hz": 60,
    "max_gain_db": 6.0,
    "min_mean_dbfs_for_gain": -30.0,
    "stereo_side_mid_threshold_db": 20.0,  # side <= mid - 20 dB => mono OK
    "ff_timeout_sec": 600,  # per ffmpeg/ffprobe call
    "ff_retries": 3,
    # Optional S3 uploader settings (for working copy uploads only)
    "s3_bucket": None,
    "s3_region": None,
    "s3_prefix": "",
    # CONSISTENCY FIX: _get_s3_uploader and the app UI both read "s3_endpoint";
    # declare it here so the key always exists with an explicit default.
    "s3_endpoint": None,
}
|
| 255 |
+
|
| 256 |
+
# ============================================================
|
| 257 |
+
# Main runner
|
| 258 |
+
# ============================================================
|
| 259 |
+
|
| 260 |
+
class AudioJobRunner:
|
| 261 |
+
"""
|
| 262 |
+
Drives the workflow until 'split' is complete (transcription not included).
|
| 263 |
+
Now with retries, content-addressed cache, streaming VAD, and StorageAdapter.
|
| 264 |
+
|
| 265 |
+
Usage:
|
| 266 |
+
storage = LocalStorageAdapter()
|
| 267 |
+
runner = AudioJobRunner(
|
| 268 |
+
manifest=None,
|
| 269 |
+
source_uri="/abs/path/to/audio.wav", # or r2://bucket/key if your adapter supports it
|
| 270 |
+
work_root="/tmp/jobwork",
|
| 271 |
+
storage=storage,
|
| 272 |
+
presets={"chunk_target_ms": 45000}
|
| 273 |
+
)
|
| 274 |
+
manifest = runner.run_until_split()
|
| 275 |
+
"""
|
| 276 |
+
|
| 277 |
+
    def __init__(
        self,
        manifest: Optional[Dict[str, Any]],
        source_uri: Optional[str],  # should always be a url, upstream should determine and convert the filekey to url with proper domain
        work_root: str,
        storage: StorageAdapter,
        presets: Optional[Dict[str, Any]] = None,
    ):
        """Create or resume a job.

        Args:
            manifest: Existing manifest dict to resume from, or None for a new job.
            source_uri: Source audio URI; required (and only used) when manifest is None.
            work_root: Local scratch directory; created if missing.
            storage: StorageAdapter used for source stat/reads.
            presets: Optional overrides merged over DEFAULT_PRESETS.

        Raises:
            ValueError: If neither a manifest nor a source_uri is provided.
        """
        self.storage = storage
        self.work_root = os.path.abspath(work_root)
        os.makedirs(self.work_root, exist_ok=True)

        # Merge caller overrides over the library defaults.
        self.presets = dict(DEFAULT_PRESETS)
        if presets:
            self.presets.update(presets)

        # Detect tools & filters
        self.tool_versions = self._detect_tool_versions()
        self.filter_caps = self._detect_filter_caps()

        # Initialize or load manifest
        if manifest is None:
            if not source_uri:
                raise ValueError("source_uri is required for a new job.")
            self.manifest = self._init_manifest(source_uri)
        else:
            # Resumed jobs keep their recorded tool versions if present.
            self.manifest = manifest
            self.manifest.setdefault("tool_versions", self.tool_versions)

        # Backfill schema fields for manifests written by older revisions.
        self.manifest.setdefault("version", "2.0")
        self.manifest.setdefault("rev", 0)
        self._touch_updated()
|
| 309 |
+
|
| 310 |
+
# --------------------------------------------------------
|
| 311 |
+
# Public API
|
| 312 |
+
# --------------------------------------------------------
|
| 313 |
+
|
| 314 |
+
def run_until_split(self) -> Dict[str, Any]:
|
| 315 |
+
try:
|
| 316 |
+
if self._stage_status("probe") != "done":
|
| 317 |
+
self._run_probe()
|
| 318 |
+
|
| 319 |
+
if self._stage_status("preprocess") != "done":
|
| 320 |
+
self._run_preprocess()
|
| 321 |
+
|
| 322 |
+
if self._stage_status("split") != "done":
|
| 323 |
+
self._run_split_plan()
|
| 324 |
+
|
| 325 |
+
return self.manifest
|
| 326 |
+
|
| 327 |
+
except Exception as e:
|
| 328 |
+
# Leave breadcrumb on the current running stage if any
|
| 329 |
+
for stage in ("split", "preprocess", "probe"):
|
| 330 |
+
if self._stage_status(stage) == "running":
|
| 331 |
+
self._set_stage(stage, "failed", 0.0, {"last_error": str(e), "ended_at": utc_now_iso()})
|
| 332 |
+
break
|
| 333 |
+
raise
|
| 334 |
+
|
| 335 |
+
# --------------------------------------------------------
|
| 336 |
+
# Manifest helpers
|
| 337 |
+
# --------------------------------------------------------
|
| 338 |
+
|
| 339 |
+
    def _init_manifest(self, source_uri: str) -> Dict[str, Any]:
        """Build a fresh v2 manifest skeleton for *source_uri*, all stages pending."""
        job_id = str(uuid.uuid4())
        m = {
            "version": "2.0",
            "rev": 0,  # bumped by _touch_updated on every mutation
            "job_id": job_id,
            "created_at": utc_now_iso(),
            "updated_at": utc_now_iso(),
            # Source facts below are filled in by the probe stage.
            "source": {
                "uri": source_uri,
                "sha256": None,
                "etag": None,
                "bytes": None,
                "container": None,
                "codec": None,
                "duration_ms": None,
                "sample_rate": None,
                "channels": None,
            },
            "tool_versions": self.tool_versions,
            # Snapshot the effective policy so resumed jobs reuse the same settings.
            "policy": dict(self.presets),
            "stages": {
                "probe": {"status": "pending", "progress": 0.0},
                "preprocess": {"status": "pending", "progress": 0.0},
                "split": {"status": "pending", "progress": 0.0},
                "transcribe": {"status": "pending", "progress": 0.0},
            },
            "stitch": {"status": "pending", "progress": 0.0},
            "outputs": {
                "transcript_uri": None,
                "srt_uri": None,
                "vtt_uri": None,
                "txt_uri": None,
                "qc": {"passed": None, "issues": []},
            },
        }
        return m
|
| 376 |
+
|
| 377 |
+
def _touch_updated(self):
|
| 378 |
+
self.manifest["updated_at"] = utc_now_iso()
|
| 379 |
+
self.manifest["rev"] = int(self.manifest.get("rev", 0)) + 1
|
| 380 |
+
|
| 381 |
+
def _stage_status(self, name: str) -> str:
|
| 382 |
+
return self.manifest.get("stages", {}).get(name, {}).get("status", "pending")
|
| 383 |
+
|
| 384 |
+
def _set_stage(self, name: str, status: str, progress: float, extra: Dict[str, Any] = None):
|
| 385 |
+
st = self.manifest["stages"].setdefault(name, {})
|
| 386 |
+
st["status"] = status
|
| 387 |
+
st["progress"] = clamp(progress, 0.0, 1.0)
|
| 388 |
+
if extra:
|
| 389 |
+
st.update(extra)
|
| 390 |
+
self._touch_updated()
|
| 391 |
+
|
| 392 |
+
# --------------------------------------------------------
|
| 393 |
+
# Tool/Filter detection
|
| 394 |
+
# --------------------------------------------------------
|
| 395 |
+
|
| 396 |
+
def _detect_tool_versions(self) -> Dict[str, str]:
|
| 397 |
+
vers = {}
|
| 398 |
+
for tool in ("ffmpeg", "ffprobe"):
|
| 399 |
+
try:
|
| 400 |
+
out = run_with_retry([tool, "-version"], retries=1, timeout=10)
|
| 401 |
+
first = out.splitlines()[0]
|
| 402 |
+
m = re.search(r"version\s+([^\s]+)", first)
|
| 403 |
+
vers[tool] = m.group(1) if m else first
|
| 404 |
+
except Exception:
|
| 405 |
+
vers[tool] = "unknown"
|
| 406 |
+
try:
|
| 407 |
+
import webrtcvad # noqa
|
| 408 |
+
vers["webrtcvad"] = "installed"
|
| 409 |
+
except Exception:
|
| 410 |
+
vers["webrtcvad"] = "missing"
|
| 411 |
+
return vers
|
| 412 |
+
|
| 413 |
+
def _detect_filter_caps(self) -> Dict[str, bool]:
|
| 414 |
+
caps = {"arnndn": False, "adeclip": False, "highpass": True}
|
| 415 |
+
try:
|
| 416 |
+
out = run_with_retry(["ffmpeg", "-hide_banner", "-filters"], retries=1, timeout=10)
|
| 417 |
+
txt = "\n".join(out.splitlines())
|
| 418 |
+
for name in list(caps.keys()):
|
| 419 |
+
if f" {name} " in txt:
|
| 420 |
+
caps[name] = True
|
| 421 |
+
except Exception:
|
| 422 |
+
pass
|
| 423 |
+
return caps
|
| 424 |
+
|
| 425 |
+
# --------------------------------------------------------
|
| 426 |
+
# Optional S3 uploader for working copies only
|
| 427 |
+
# --------------------------------------------------------
|
| 428 |
+
|
| 429 |
+
def _get_s3_uploader(self) -> Optional[S3LikeStorageAdapter]:
|
| 430 |
+
bucket = self.presets.get("s3_bucket")
|
| 431 |
+
if not bucket:
|
| 432 |
+
return None
|
| 433 |
+
region = self.presets.get("s3_region")
|
| 434 |
+
prefix = self.presets.get("s3_prefix", "") # kept in presets for key generation
|
| 435 |
+
endpoint = self.presets.get("s3_endpoint")
|
| 436 |
+
try:
|
| 437 |
+
return S3LikeStorageAdapter(bucket=bucket, region_name=region, endpoint_url=endpoint)
|
| 438 |
+
except Exception:
|
| 439 |
+
return None
|
| 440 |
+
|
| 441 |
+
def _maybe_upload_working_to_s3(self, working: Dict[str, Any], local_map: Dict[str, str]) -> None:
|
| 442 |
+
uploader = self._get_s3_uploader()
|
| 443 |
+
if not uploader:
|
| 444 |
+
return
|
| 445 |
+
prefix = str(self.presets.get("s3_prefix", "")).strip()
|
| 446 |
+
jobid = self.manifest.get("job_id", "")
|
| 447 |
+
for chan, local_path in local_map.items():
|
| 448 |
+
ext = os.path.splitext(local_path)[1].lstrip(".") or working.get("format", "flac")
|
| 449 |
+
key = f"{jobid}_{'main' if chan == 'mono' else ('ch1' if chan == 'L' else 'ch2')}.{ext}"
|
| 450 |
+
if prefix:
|
| 451 |
+
key = os.path.join(prefix, key)
|
| 452 |
+
try:
|
| 453 |
+
uploader.upload_file(local_path, key)
|
| 454 |
+
working["uris_remote"][chan] = key
|
| 455 |
+
except Exception:
|
| 456 |
+
# best-effort; continue without failing the job
|
| 457 |
+
pass
|
| 458 |
+
|
| 459 |
+
# --------------------------------------------------------
|
| 460 |
+
# Stage: Probe
|
| 461 |
+
# --------------------------------------------------------
|
| 462 |
+
|
| 463 |
+
    def _run_probe(self):
        """Probe stage: stat/hash the source, ffprobe its format, assess stereo.

        Fills manifest["source"] (bytes/etag/sha256/container/codec/duration/
        sample_rate/channels), records stereo metrics and recommended actions
        under stages.probe, and marks the stage done.
        """
        self._set_stage("probe", "running", 0.05, {"started_at": utc_now_iso()})

        src = self.manifest["source"]["uri"]

        # Try to stat source (size/etag); SHA for local files
        st = self.storage.stat(src)
        self.manifest["source"]["bytes"] = st.get("bytes")
        self.manifest["source"]["etag"] = st.get("etag")

        # Hashing is only attempted for local absolute paths; remote URIs skip it.
        if os.path.isabs(src) and os.path.isfile(src):
            try:
                self.manifest["source"]["sha256"] = sha256_file(src)
            except Exception:
                self.manifest["source"]["sha256"] = None

        info = self._ffprobe_streams(src)
        fmt = info.get("format", {})
        streams = info.get("streams", [])
        # First audio stream only; containers with no audio yield an empty dict.
        audio = next((s for s in streams if s.get("codec_type") == "audio"), {})

        self.manifest["source"].update({
            "container": fmt.get("format_name"),
            "codec": audio.get("codec_name"),
            "duration_ms": int(float(fmt.get("duration", 0)) * 1000) if fmt.get("duration") else None,
            "sample_rate": int(audio.get("sample_rate", 0)) if audio.get("sample_rate") else None,
            "channels": int(audio.get("channels", 0)) if audio.get("channels") else None,
        })
        self._set_stage("probe", "running", 0.6)

        # Stereo assessment (if 2ch)
        if self.manifest["source"]["channels"] == 2:
            stereo_metrics = self._stereo_metrics(src)
        else:
            # Mono/unknown sources get a placeholder metrics dict.
            stereo_metrics = {
                "rms_L": None, "rms_R": None,
                "mid_rms_db": None, "side_rms_db": None,
                "max_dbfs": None, "clipping_pct": None,
                "near_silent_channel": False, "corr": None,
                "recommended_mode": "mono" if self.manifest["source"]["channels"] == 1 else "as_is"
            }

        actions = self._decide_actions(stereo_metrics)
        self.manifest["stages"]["probe"].update({
            "metrics": {
                "rms_dbfs_L": stereo_metrics.get("rms_L"),
                "rms_dbfs_R": stereo_metrics.get("rms_R"),
                "max_dbfs": stereo_metrics.get("max_dbfs"),
                "clipping_pct": stereo_metrics.get("clipping_pct"),
                "stereo": stereo_metrics,
            },
            "actions": actions,
        })
        self._set_stage("probe", "done", 1.0, {"ended_at": utc_now_iso()})
|
| 517 |
+
|
| 518 |
+
def _ffprobe_streams(self, uri: str) -> Dict[str, Any]:
    """Probe the first audio stream of *uri* and return ffprobe's parsed JSON."""
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "a:0",
        "-show_streams", "-show_format",
        "-of", "json",
        uri,
    ]
    raw = run_with_retry(
        probe_cmd,
        retries=self.presets["ff_retries"],
        timeout=self.presets["ff_timeout_sec"],
    )
    return json.loads(raw)
|
| 523 |
+
|
| 524 |
+
def _stereo_metrics(self, uri: str) -> Dict[str, Any]:
    """Measure loudness/balance metrics of a (stereo) source via ffmpeg.

    Two passes are run: an astats pass over L/R and mid/side channels
    (values printed to temp text files via ametadata=print), and a
    volumedetect pass for peak level and clipping. Returns a dict of
    dBFS metrics plus a "recommended_mode" of "split" or "mono".

    NOTE(review): the filter_complex below attaches two output labels
    [mid][side] to a single `pan` filter, but `pan` produces exactly one
    output stream — a true mid/side split would need a trailing
    channelsplit. Also `-map 0:a:0` is combined with a filtergraph that
    consumes the same stream. Confirm against a real ffmpeg run; if the
    command fails, run_with_retry will raise out of this method (only
    the temp-file cleanup in `finally` runs).
    """
    # Unique temp files to avoid collisions
    base = os.path.join(tempfile.gettempdir(), f"stmetrics_{uuid.uuid4().hex}")
    L_txt, R_txt, MID_txt, SIDE_txt = [base + s for s in (".L.txt",".R.txt",".MID.txt",".SIDE.txt")]

    try:
        # astats + mid/side
        cmd = [
            "ffmpeg", "-nostdin", "-hide_banner", "-v", "error",
            "-i", uri, "-map", "0:a:0",
            "-filter_complex",
            (
                "channelsplit=channel_layout=stereo[chl][chr];"
                "[chl]astats=metadata=1:reset=1,ametadata=print:file={L};"
                "[chr]astats=metadata=1:reset=1,ametadata=print:file={R};"
                "pan=stereo|c0=0.5*c0+0.5*c1|c1=0.5*c0-0.5*c1[mid][side];"
                "[mid]astats=metadata=1:reset=1,ametadata=print:file={MID};"
                "[side]astats=metadata=1:reset=1,ametadata=print:file={SIDE}"
            ).format(L=L_txt, R=R_txt, MID=MID_txt, SIDE=SIDE_txt),
            "-f", "null", "-"
        ]
        run_with_retry(cmd, retries=self.presets["ff_retries"], timeout=self.presets["ff_timeout_sec"])

        # volumedetect (second pass; captures max_volume / clipped samples text)
        vd = run_with_retry([
            "ffmpeg", "-nostdin", "-hide_banner", "-v", "error",
            "-i", uri, "-map", "0:a:0", "-af", "volumedetect", "-f", "null", "-"
        ], retries=self.presets["ff_retries"], timeout=self.presets["ff_timeout_sec"])

        # --- parsers over the astats/volumedetect text output ---

        def parse_overall_rms(txt_path: str) -> Optional[float]:
            # Takes the LAST "Overall RMS level" occurrence (astats may print many).
            if not os.path.exists(txt_path): return None
            with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                data = f.read()
            m = re.findall(r"Overall RMS level:\s*([-\d\.]+)\s*dB", data)
            return float(m[-1]) if m else None

        def parse_max_dbfs(vol_text: str) -> Optional[float]:
            m = re.findall(r"max_volume:\s*([-\d\.]+)\s*dB", vol_text)
            return float(m[-1]) if m else None

        def parse_clipping(vol_text: str) -> Optional[float]:
            # Coarse flag, not a percentage: 100.0 if ANY clipped samples, else 0.0.
            m = re.findall(r"number of clipped samples:\s*(\d+)", vol_text)
            return 100.0 if (m and int(m[-1]) > 0) else 0.0

        rms_L = parse_overall_rms(L_txt)
        rms_R = parse_overall_rms(R_txt)
        mid_rms = parse_overall_rms(MID_txt)
        side_rms = parse_overall_rms(SIDE_txt)

        # Decide near-silent channel: one side very quiet (<-45 dBFS) AND
        # at least 15 dB quieter than the other.
        near_silent = False
        if rms_L is not None and rms_R is not None:
            if (rms_L < -45.0 and (rms_R - rms_L) > 15.0) or (rms_R < -45.0 and (rms_L - rms_R) > 15.0):
                near_silent = True

        # Recommended mode: "mono" when the side channel is far below mid
        # (little stereo information) or one channel is effectively dead.
        rec_mode = "split"
        thr = float(self.presets["stereo_side_mid_threshold_db"])
        if (mid_rms is not None and side_rms is not None and (side_rms <= (mid_rms - thr))) or near_silent:
            rec_mode = "mono"

        return {
            "rms_L": rms_L, "rms_R": rms_R,
            "mid_rms_db": mid_rms, "side_rms_db": side_rms,
            "max_dbfs": parse_max_dbfs(vd), "clipping_pct": parse_clipping(vd),
            "near_silent_channel": near_silent, "corr": None,  # corr not computed (yet)
            "recommended_mode": rec_mode
        }
    finally:
        # Best-effort temp-file cleanup; never mask the original exception.
        for p in (L_txt, R_txt, MID_txt, SIDE_txt):
            try:
                if os.path.exists(p): os.remove(p)
            except Exception:
                pass
|
| 598 |
+
|
| 599 |
+
def _decide_actions(self, stereo_metrics: Dict[str, Any]) -> Dict[str, Any]:
|
| 600 |
+
src_ch = self.manifest["source"]["channels"] or 1
|
| 601 |
+
policy = self.manifest["policy"]
|
| 602 |
+
|
| 603 |
+
# Channel policy
|
| 604 |
+
if policy.get("channel_policy") == "auto":
|
| 605 |
+
if src_ch == 1:
|
| 606 |
+
ch_pol = "downmix"
|
| 607 |
+
else:
|
| 608 |
+
ch_pol = "split" if stereo_metrics.get("recommended_mode") == "split" else "downmix"
|
| 609 |
+
else:
|
| 610 |
+
ch_pol = policy.get("channel_policy")
|
| 611 |
+
|
| 612 |
+
# Denoise: auto -> light if very quiet
|
| 613 |
+
denoise = policy.get("denoise", "auto")
|
| 614 |
+
if denoise == "auto":
|
| 615 |
+
rms_L = stereo_metrics.get("rms_L")
|
| 616 |
+
rms_R = stereo_metrics.get("rms_R")
|
| 617 |
+
denoise_flag = bool((rms_L and rms_L < -35.0) or (rms_R and rms_R < -35.0))
|
| 618 |
+
denoise = "light" if denoise_flag else "none"
|
| 619 |
+
|
| 620 |
+
# Normalize
|
| 621 |
+
normalize = policy.get("normalize", "light")
|
| 622 |
+
|
| 623 |
+
return {
|
| 624 |
+
"will_resample": True,
|
| 625 |
+
"will_split_stereo": (ch_pol == "split"),
|
| 626 |
+
"will_downmix": (ch_pol == "downmix"),
|
| 627 |
+
"will_denoise": (denoise == "light" and self.filter_caps.get("arnndn", False)),
|
| 628 |
+
"will_normalize": (normalize != "none"),
|
| 629 |
+
"channel_policy": ch_pol,
|
| 630 |
+
"normalize_mode": normalize,
|
| 631 |
+
"denoise_mode": denoise if self.filter_caps.get("arnndn", False) else "none",
|
| 632 |
+
}
|
| 633 |
+
|
| 634 |
+
# --------------------------------------------------------
|
| 635 |
+
# Stage: Preprocess (with content-addressed cache)
|
| 636 |
+
# --------------------------------------------------------
|
| 637 |
+
|
| 638 |
+
def _run_preprocess(self):
    """Encode the source into working file(s) with a content-addressed cache.

    Builds an ffmpeg filtergraph (highpass / declip / denoise / gentle gain)
    from probe actions, derives an idempotency key from the source hash and
    encode parameters, and writes either two mono files (split stereo) or
    one mono file into a key-addressed working dir. Skips encoding when the
    outputs already exist for that key. Updates manifest["stages"]["preprocess"].
    """
    self._set_stage("preprocess", "running", 0.05, {"started_at": utc_now_iso()})

    src = self.manifest["source"]["uri"]
    actions = self.manifest["stages"]["probe"]["actions"]
    policy = self.manifest["policy"]

    # Build filtergraph with soft-fallbacks: each filter is only added
    # when the local ffmpeg build advertises it (filter_caps).
    filters = []
    if self.filter_caps.get("highpass", True):
        filters.append(f"highpass=f={int(policy.get('highpass_hz', 60))}")
    if self.filter_caps.get("adeclip", False):
        filters.append("adeclip")
    if actions["will_denoise"] and self.filter_caps.get("arnndn", False):
        filters.append("arnndn")

    # Gentle gain if needed: boost quiet material up toward the minimum
    # mean level, capped at max_gain_db. Prefers mid RMS, else L/R mean.
    metrics = self.manifest["stages"]["probe"].get("metrics", {})
    mean_db = None
    if metrics.get("stereo", {}).get("mid_rms_db") is not None:
        mean_db = metrics["stereo"]["mid_rms_db"]
    elif metrics.get("rms_dbfs_L") is not None and metrics.get("rms_dbfs_R") is not None:
        mean_db = (metrics["rms_dbfs_L"] + metrics["rms_dbfs_R"]) / 2.0
    if mean_db is not None and mean_db < float(self.presets["min_mean_dbfs_for_gain"]):
        target_boost = min(float(self.presets["max_gain_db"]),
                           abs(float(self.presets["min_mean_dbfs_for_gain"]) - mean_db))
        filters.append(f"volume={target_boost:.1f}dB")

    # "anull" is a pass-through so -af always has a valid argument.
    filtergraph = ",".join(filters) if filters else "anull"
    sr = int(policy["sample_rate_target"])
    target_container = policy["container_target"].lower()
    ch_policy = actions["channel_policy"]

    # Compute idempotency key BEFORE encoding (source hash/etag + params),
    # so re-runs with identical inputs land in the same cache dir.
    idem_src = self.manifest["source"].get("sha256") or self.manifest["source"].get("etag") or self.manifest["source"]["uri"]
    idem_payload = json.dumps({
        "src": idem_src, "filter": filtergraph, "sr": sr,
        "fmt": target_container, "ch_policy": ch_policy,
        "tools": self.tool_versions
    }, sort_keys=True).encode("utf-8")
    idem_key = hashlib.sha256(idem_payload).hexdigest()

    # Content-addressed working dir
    base_dir = os.path.join(self.work_root, self.manifest["job_id"], idem_key)
    os.makedirs(base_dir, exist_ok=True)

    def out_path(name: str) -> str:
        # Output file inside the key-addressed dir, e.g. "main.flac".
        return os.path.join(base_dir, f"{name}.{target_container}")

    # Note: Do not store local paths in manifest. Only store remote keys.
    working = {"format": target_container, "sample_rate": sr, "channel_map": [], "uris_remote": {}, "filtergraph": filtergraph}
    # NOTE(review): produced_any is set but never read afterwards — dead flag?
    produced_any = False

    try:
        if ch_policy == "split" and (self.manifest["source"]["channels"] == 2):
            # L/R mono outputs
            outL, outR = out_path("ch1"), out_path("ch2")
            # NOTE(review): storage.exists()/presign() are called with local
            # output paths here — verify the adapter treats these as intended.
            if not (self.storage.exists(outL) and self.storage.exists(outR)):
                # NOTE(review): -map_channel is deprecated (removed in ffmpeg 7);
                # the channelsplit filter is the modern equivalent — confirm
                # against the target ffmpeg version.
                cmd = [
                    "ffmpeg", "-nostdin", "-hide_banner", "-y", "-v", "error",
                    "-i", self.storage.presign(src), "-map", "0:a:0",
                    "-map_channel", "0.0.0", "-ac", "1", "-ar", str(sr), "-af", filtergraph, outL,
                    "-map_channel", "0.0.1", "-ac", "1", "-ar", str(sr), "-af", filtergraph, outR
                ]
                run_with_retry(cmd, retries=self.presets["ff_retries"], timeout=self.presets["ff_timeout_sec"])
                produced_any = True
            working["channel_map"] = ["L", "R"]
            # Upload to S3/R2 if configured; keep local files but do not store local paths in manifest
            self._maybe_upload_working_to_s3(working, {"L": outL, "R": outR})

        else:
            # Single mono output
            outM = out_path("main")
            if not self.storage.exists(outM):
                cmd = [
                    "ffmpeg", "-nostdin", "-hide_banner", "-y", "-v", "error",
                    "-i", self.storage.presign(src), "-map", "0:a:0",
                    "-ac", "1", "-ar", str(sr), "-af", filtergraph, outM
                ]
                run_with_retry(cmd, retries=self.presets["ff_retries"], timeout=self.presets["ff_timeout_sec"])
                produced_any = True
            working["channel_map"] = ["mono"]
            self._maybe_upload_working_to_s3(working, {"mono": outM})

        self.manifest["stages"]["preprocess"].update({
            "idempotency_key": idem_key, "working": working, "ended_at": utc_now_iso()
        })
        self._set_stage("preprocess", "done", 1.0)

    except Exception as e:
        # Record the failure in the manifest, then re-raise for the caller.
        self._set_stage("preprocess", "failed", 0.0, {"last_error": str(e), "ended_at": utc_now_iso()})
        raise
|
| 730 |
+
|
| 731 |
+
# --------------------------------------------------------
|
| 732 |
+
# Stage: Split plan (virtual by default, VAD streaming)
|
| 733 |
+
# --------------------------------------------------------
|
| 734 |
+
|
| 735 |
+
def _run_split_plan(self):
    """Build the chunking plan for every working channel (and optionally cut files).

    For each channel from the preprocess stage, picks a processing source
    (local file preferred, else presigned remote), probes its duration,
    computes chunk ranges (VAD or fixed), and stores the resulting plan in
    manifest["stages"]["split"]["plan"]. When policy.materialize_chunks is
    set, each range is also cut to a FLAC file via ffmpeg.
    """
    self._set_stage("split", "running", 0.05, {"started_at": utc_now_iso()})

    policy = self.manifest["policy"]
    chunk_target = int(policy["chunk_target_ms"])
    overlap = int(policy["overlap_ms"])
    materialize = bool(policy["materialize_chunks"])

    work = self.manifest["stages"]["preprocess"]["working"]
    channels = work["channel_map"]

    chunks: List[Dict[str, Any]] = []
    total_chunks = 0
    plan_source_uris = {}
    # Per-channel input actually fed to ffmpeg (cached for materialization).
    proc_source: Dict[str, str] = {}

    try:
        for chan in channels:
            # Decide source for processing: local file if exists, else remote if available
            local_candidate = None
            # Build the expected local path based on idempotent working dir
            base_dir = os.path.join(self.work_root, self.manifest["job_id"], self.manifest["stages"]["preprocess"]["idempotency_key"])
            fname = "main" if chan == "mono" else ("ch1" if chan == "L" else "ch2")
            local_candidate = os.path.join(base_dir, f"{fname}.{work['format']}")
            remote_key = work.get("uris_remote", {}).get(chan)

            # Expose preferred source to downstream: presigned remote if available, else local path
            if remote_key:
                try:
                    uploader = self._get_s3_uploader()
                    plan_source_uris[chan] = uploader.presign(remote_key, "GET") if uploader else local_candidate
                except Exception:
                    # Presigning is best-effort: fall back to the local path.
                    plan_source_uris[chan] = local_candidate
            else:
                plan_source_uris[chan] = local_candidate

            # Choose ffmpeg input: local if present, else presigned remote, else raw key (ffmpeg may support s3/http)
            if os.path.exists(local_candidate):
                ffmpeg_src = local_candidate
            elif remote_key:
                try:
                    uploader = self._get_s3_uploader()
                    ffmpeg_src = uploader.presign(remote_key, "GET") if uploader else remote_key
                except Exception:
                    ffmpeg_src = remote_key
            else:
                ffmpeg_src = local_candidate  # may not exist; will fail predictably

            # cache processing source for materialization stage
            proc_source[chan] = ffmpeg_src

            info = self._ffprobe_streams(ffmpeg_src)
            dur_ms = int(float(info.get("format", {}).get("duration", 0.0)) * 1000)

            # Build ranges via streaming VAD if requested/available
            ranges = self._build_chunks_vad_or_fixed_streaming(ffmpeg_src, dur_ms, chunk_target, overlap)
            # NOTE: "idx" is an int for mono but a string like "0L"/"3R" when
            # stereo is split — downstream consumers must accept both.
            for idx, (start_ms, dur) in enumerate(ranges):
                chunks.append({"idx": idx if len(channels) == 1 else f"{idx}{chan}",
                               "chan": chan, "start_ms": int(start_ms), "dur_ms": int(dur),
                               "status": "queued"})
            total_chunks += len(ranges)
            # Progress rough update per channel
            # NOTE(review): total_chunks / max(1, total_chunks) is 1.0 whenever
            # total_chunks > 0, so this always reports 0.45 — likely intended
            # to be (channel index / len(channels)).
            self._set_stage("split", "running", clamp(0.05 + 0.4 * (total_chunks / max(1, total_chunks)), 0.05, 0.9))

        plan = {
            "mode": "materialized" if materialize else "virtual",
            "channels": channels,
            "source_uris": plan_source_uris,  # <--- expose per-channel sources
            "chunk_policy": policy["chunk_policy"],
            "chunk_target_ms": chunk_target,
            "overlap_ms": overlap,
            "total_chunks": total_chunks,
            "chunks": chunks[:2000] if total_chunks <= 2000 else [],  # avoid bloating manifest
        }

        if materialize:
            # Accurate-seek pattern for better boundaries:
            # - Fast seek before -i
            # - Fine seek after -i (optional) + atrim
            out_dir = os.path.join(self.work_root, self.manifest["job_id"], "chunks")
            os.makedirs(out_dir, exist_ok=True)
            # The dicts in `chunks` are the SAME objects referenced by
            # plan["chunks"], so the "uri"/"status" set below also appear
            # in the saved plan.
            for c in chunks:
                st_s = c["start_ms"] / 1000.0
                du_s = c["dur_ms"] / 1000.0
                chan = c["chan"]
                outp = os.path.join(out_dir, f"chunk_{c['idx']}.flac")
                inp = proc_source.get(chan, None) or plan_source_uris.get(chan)
                # fast seek near the start, then fine trim with atrim to be exact
                cmd = [
                    "ffmpeg", "-nostdin", "-hide_banner", "-y", "-v", "error",
                    "-ss", sec_to_hms(max(0.0, st_s - 0.05)),  # fast seek slightly earlier
                    "-i", inp, "-map", "0:a:0",
                    "-ss", "0.05", "-t", f"{du_s:.3f}",
                    "-af", f"atrim=start=0:end={du_s:.3f}",
                    "-ac", "1", "-ar", str(self.presets["sample_rate_target"]),
                    outp
                ]
                run_with_retry(cmd, retries=self.presets["ff_retries"], timeout=self.presets["ff_timeout_sec"])
                c["uri"] = outp
                c["status"] = "ready"

        # Save plan
        self.manifest["stages"]["split"]["plan"] = plan
        self._set_stage("split", "done", 1.0, {"ended_at": utc_now_iso()})

        # Initialize downstream counters
        self.manifest["stages"]["transcribe"].update({
            "status": "pending",
            "progress": 0.0,
            "chunks": {"total": total_chunks, "done": 0, "running": 0, "failed": 0, "queued": total_chunks},
            "per_chunk": []
        })

    except Exception as e:
        # Record the failure in the manifest, then re-raise for the caller.
        self._set_stage("split", "failed", 0.0, {"last_error": str(e), "ended_at": utc_now_iso()})
        raise
|
| 851 |
+
|
| 852 |
+
# --- VAD (streaming from ffmpeg to avoid big temp files) ---
|
| 853 |
+
|
| 854 |
+
def _build_chunks_vad_or_fixed_streaming(self, src_uri: str, dur_ms: int, target_ms: int, overlap_ms: int) -> List[Tuple[int, int]]:
    """Return (start_ms, dur_ms) chunk ranges for one channel.

    When the chunk policy asks for VAD and webrtcvad is installed, ffmpeg
    streams s16le PCM @16 kHz mono to stdout and WebRTC VAD classifies
    30 ms frames; detected speech islands are then packed into roughly
    target_ms chunks. On any failure — or when VAD is unavailable — we
    fall back to fixed-size windows.

    Fix vs. previous revision: the ffmpeg child is now always closed and
    reaped (no fd leak / zombie process), and its stderr goes to DEVNULL
    instead of an unread PIPE that could fill up and stall the decoder.
    """
    use_vad = (self.tool_versions.get("webrtcvad") == "installed") and \
              (self.manifest["policy"].get("chunk_policy", "").startswith("vad"))
    if not use_vad:
        return self._fixed_chunks(dur_ms, target_ms, overlap_ms)

    proc = None
    try:
        import webrtcvad
        vad = webrtcvad.Vad(int(self.presets["vad_aggressiveness"]))
        frame_ms = 30  # webrtcvad accepts only 10/20/30 ms frames
        bytes_per_frame = int(16000 * 2 * frame_ms / 1000)  # 16 kHz, 16-bit mono

        # Decode via ffmpeg to raw PCM on stdout (streaming; no temp files).
        cmd = [
            "ffmpeg", "-nostdin", "-hide_banner", "-v", "error",
            "-i", src_uri, "-map", "0:a:0", "-ac", "1", "-ar", "16000",
            "-f", "s16le", "-"  # raw PCM to stdout
        ]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
        speech_regions: List[Tuple[int, int]] = []
        in_speech = False
        seg_start = 0
        frames_read = 0
        last_progress_emit = time.time()

        while True:
            chunk = proc.stdout.read(bytes_per_frame)
            if not chunk:
                break
            if len(chunk) < bytes_per_frame:
                break  # tail frame too short for the VAD

            t_ms = frames_read * frame_ms
            frames_read += 1
            is_speech = vad.is_speech(chunk, 16000)
            if is_speech and not in_speech:
                in_speech = True
                seg_start = t_ms
            elif not is_speech and in_speech:
                in_speech = False
                speech_regions.append((seg_start, t_ms - seg_start))

            # Emit progress occasionally (decode-based approximation).
            if time.time() - last_progress_emit > 0.5 and dur_ms:
                prog = clamp(0.05 + 0.8 * (t_ms / dur_ms), 0.05, 0.95)
                self._set_stage("split", "running", prog)
                last_progress_emit = time.time()

        # Finalize region if the stream ended while still in speech.
        if in_speech:
            speech_regions.append((seg_start, max(0, dur_ms - seg_start)))

        if not speech_regions:
            return self._fixed_chunks(dur_ms, target_ms, overlap_ms)

        # Pack complete speech islands into chunks. We never split an
        # individual speech island; max_len is only used to decide when
        # to merge adjacent islands. If a single island exceeds max_len
        # it will remain intact in its own chunk.
        chunks: List[Tuple[int, int]] = []
        cur_start = None
        cur_end = None
        max_len = target_ms + 500
        gap_allow = 300
        for s, d in speech_regions:
            e = s + d
            if cur_start is None:
                cur_start, cur_end = s, e
                continue
            # Merge only if small gap and combined length stays within max_len.
            if (s - cur_end) <= gap_allow and (e - cur_start) <= max_len:
                cur_end = e
            else:
                # Finalize current chunk as the full span of merged islands.
                chunks.append((cur_start, cur_end - cur_start))
                cur_start = s
                cur_end = e

        # Finalize the last chunk (drop sub-250 ms crumbs).
        if cur_start is not None and (cur_end - cur_start) > 250:
            chunks.append((cur_start, cur_end - cur_start))

        # Shift non-first chunk starts earlier by overlap_ms while keeping
        # full island coverage inside the duration.
        normalized: List[Tuple[int, int]] = []
        for i, (s, d) in enumerate(chunks):
            if i == 0:
                normalized.append((max(0, s), d))
            else:
                s2 = max(0, s - overlap_ms)
                normalized.append((s2, (s + d) - s2))

        return self._cap_chunks(normalized, dur_ms)

    except Exception:
        # Any VAD-pipeline failure degrades gracefully to fixed windows.
        return self._fixed_chunks(dur_ms, target_ms, overlap_ms)
    finally:
        # Always close the pipe and reap the child so we never leak an
        # ffmpeg zombie, whichever path returned above.
        if proc is not None:
            try:
                if proc.stdout:
                    proc.stdout.close()
            except Exception:
                pass
            try:
                proc.kill()
            except Exception:
                pass
            try:
                proc.wait(timeout=5)
            except Exception:
                pass
|
| 954 |
+
|
| 955 |
+
def _fixed_chunks(self, dur_ms: int, target_ms: int, overlap_ms: int) -> List[Tuple[int, int]]:
|
| 956 |
+
chunks: List[Tuple[int, int]] = []
|
| 957 |
+
if dur_ms <= 0: return chunks
|
| 958 |
+
step = max(1, target_ms - overlap_ms)
|
| 959 |
+
start = 0
|
| 960 |
+
while start < dur_ms:
|
| 961 |
+
length = min(target_ms, dur_ms - start)
|
| 962 |
+
chunks.append((start, length))
|
| 963 |
+
if length < target_ms: break
|
| 964 |
+
start += step
|
| 965 |
+
return chunks
|
| 966 |
+
|
| 967 |
+
def _cap_chunks(self, chunks: List[Tuple[int, int]], dur_ms: int) -> List[Tuple[int, int]]:
    """Clamp every (start, dur) pair so it stays inside [0, dur_ms]."""
    max_start = max(0, dur_ms - 1)
    bounded: List[Tuple[int, int]] = []
    for start, dur in chunks:
        start2 = clamp(start, 0, max_start)
        dur2 = clamp(dur, 1, dur_ms - start2)
        bounded.append((int(start2), int(dur2)))
    return bounded
|
| 974 |
+
|
| 975 |
+
|
| 976 |
+
# ------------------------------------------------------
|
| 977 |
+
# Example CLI (optional)
|
| 978 |
+
# ------------------------------------------------------
|
| 979 |
+
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Audio preprocess runner v2 (until split).")
    parser.add_argument("source", help="Local path or URL/storage URI to audio")
    parser.add_argument("work_root", help="Working root directory")
    parser.add_argument("--manifest", help="Path to existing manifest.json to resume", default=None)
    parser.add_argument("--chunk_ms", type=int, default=60000)
    parser.add_argument("--overlap_ms", type=int, default=300)
    parser.add_argument("--materialize", action="store_true")
    cli = parser.parse_args()

    # Resume from an existing manifest when one is supplied and readable.
    resumed_manifest = None
    if cli.manifest and os.path.exists(cli.manifest):
        with open(cli.manifest, "r", encoding="utf-8") as f:
            resumed_manifest = json.load(f)

    runner = AudioJobRunner(
        manifest=resumed_manifest,
        source_uri=None if resumed_manifest else cli.source,
        work_root=cli.work_root,
        storage=LocalStorageAdapter(),
        presets={
            "chunk_target_ms": cli.chunk_ms,
            "overlap_ms": cli.overlap_ms,
            "materialize_chunks": cli.materialize,
        },
    )
    result = runner.run_until_split()

    # Persist the resulting manifest under the working root.
    os.makedirs(cli.work_root, exist_ok=True)
    out_path = os.path.join(cli.work_root, "manifest.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"Saved manifest -> {out_path}")
|
requirements.txt
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
gradio>=5.39.0
|
| 2 |
requests>=2.31.0
|
| 3 |
pydub>=0.25.1
|
| 4 |
-
ffmpeg-python>=0.2.0
|
|
|
|
|
|
|
|
|
| 1 |
gradio>=5.39.0
|
| 2 |
requests>=2.31.0
|
| 3 |
pydub>=0.25.1
|
| 4 |
+
ffmpeg-python>=0.2.0
|
| 5 |
+
webrtcvad
|
| 6 |
+
boto3>=1.34.0
|