Spaces:

nsfwalex
/

audio-edit

Running

App Files Community

liuyang committed on Nov 2, 2025

Commit

a7c5fd6

1 Parent(s): 46f053a

revert

Browse files

Files changed (1) hide show

audiojob.py +7 -450

audiojob.py CHANGED Viewed

@@ -71,20 +71,6 @@ DEFAULT_PRESETS = {
     "dual_mono_corr": 0.90,          # was 0.995; still gated by Side/Mid & RMS check
     "corr_probe_ms": 30000,          # cap correlation probe at 30s
     "stereo_probe_win_s": 12,        # each sample window length (sec)
-    # VAD (global + decision) defaults
-    "vad_aggressiveness": 3,            # 0..3 for WebRTC VAD (more non-speech in pauses)
-    "vad_similarity_thr": 0.95,         # stereo L/R VAD similarity threshold for dual-mono
-    "vad_max_lag_frames": 1,            # allow ±1 frame lag when matching
-    "vad_probe_win_s": 10.0,            # legacy quick probe window length
-    "use_full_vad_for_decision": False, # prefer quick-window VAD; avoid full-file decode
-    # Split alignment using VAD
-    "split_use_vad": True,              # align fixed windows to nearest silence
-    "split_vad_seek_ms": 3000,          # how far to seek around target boundary (default ±3s)
-    "split_vad_min_silence_ms": 250,    # minimum silence run to consider
-    "split_vad_frame_ms": 30,           # VAD frame size (ms)
-    "split_silence_noise_db": -40.0,    # fallback silencedetect noise threshold
 }
 # --------------------------- Runner --------------------------
@@ -120,8 +106,6 @@ class AudioJobRunner:
             self.manifest = manifest
         self.manifest.setdefault("version", "2.3")
         self.manifest.setdefault("rev", 0)
-        # Ephemeral cache for VAD results to avoid re-decoding across stages
-        self._vad_cache: Optional[Dict[str, Any]] = None
         self._touch()
     # -------- Public API --------
@@ -217,73 +201,6 @@ class AudioJobRunner:
         durms = int(self.manifest["source"].get("duration_ms") or 0)
         dur_s = max(1, durms // 1000)
-        # Fast path: if configured, compute or reuse full-file stereo VAD once and
-        # derive similarity from it (single pass reused later by split stage).
-        try:
-            ch = int(self.manifest["source"].get("channels") or 1)
-        except Exception:
-            ch = 1
-        use_full_vad = bool(pol.get("use_full_vad_for_decision", True))
-        if use_full_vad and ch == 2:
-            # Ensure cached VAD has stereo masks; compute if absent
-            if not self._vad_cache or not self._vad_cache.get("has_stereo"):
-                self._vad_cache = self._compute_vad_timeline(uri, want_stereo_masks=True)
-            vad_obj = self._vad_cache or {}
-            L_mask: Optional[List[bool]] = vad_obj.get("L_mask")
-            R_mask: Optional[List[bool]] = vad_obj.get("R_mask")
-            if L_mask and R_mask:
-                vad_frame_ms    = int(pol.get("split_vad_frame_ms", 30))
-                vad_sim_thr     = float(pol.get("vad_similarity_thr", 0.95))
-                vad_max_lag     = int(pol.get("vad_max_lag_frames", 1))
-                def best_similarity(a: List[bool], b: List[bool], max_lag: int) -> float:
-                    if not a or not b: return 0.0
-                    n = min(len(a), len(b))
-                    a = a[:n]; b = b[:n]
-                    best = 0.0
-                    for lag in range(-max_lag, max_lag + 1):
-                        if lag > 0:
-                            a2 = a[lag:]; b2 = b[:len(a2)]
-                        elif lag < 0:
-                            b2 = b[-lag:]; a2 = a[:len(b2)]
-                        else:
-                            a2, b2 = a, b
-                        if not a2 or not b2:
-                            continue
-                        matches = sum(1 for x, y in zip(a2, b2) if x == y)
-                        best = max(best, matches / float(len(a2)))
-                    return best
-                sim = best_similarity(L_mask, R_mask, vad_max_lag)
-                dual_mono = (sim >= vad_sim_thr)
-                rec = "downmix" if dual_mono else "split"
-                metrics = {
-                    "mid_db": None,
-                    "side_db": None,
-                    "L_db": None,
-                    "R_db": None,
-                    "near_silent": False,
-                    "corr": None,
-                    "dual_mono": dual_mono,
-                    "side_mid_gap_db": None,
-                    "side_mid_thr_db": float(pol.get("dual_mono_side_mid_db", 18.0)),
-                    "rms_delta_thr_db": float(pol.get("dual_mono_rms_delta_db", 1.0)),
-                    "corr_thr": float(pol.get("dual_mono_corr", 0.93)),
-                    "windows_used": 1,
-                    "vad_similarities": [sim],
-                    "vad_similarity_median": sim,
-                    "vad_params": {
-                        "aggressiveness": int(pol.get("vad_aggressiveness", 2)),
-                        "frame_ms": vad_frame_ms,
-                        "sim_thr": vad_sim_thr,
-                        "max_lag_frames": vad_max_lag,
-                        "probe_win_s": None
-                    }
-                }
-                return rec, metrics
         # VAD params (defaults if not present in policy)
         vad_aggr        = int(pol.get("vad_aggressiveness", 2))   # 0..3
         vad_frame_ms    = 30                                       # keep 30ms (supported by webrtcvad)
@@ -514,126 +431,6 @@ class AudioJobRunner:
             return rec, metrics
-    def _compute_vad_timeline(self, uri: str, want_stereo_masks: bool = False) -> Dict[str, Any]:
-        """
-        Build a global VAD timeline across the entire file at 16 kHz using WebRTC VAD.
-        - If want_stereo_masks and source has 2 channels, produce L/R boolean masks per frame.
-        - Always produce a mono_mask (L OR R if stereo) and derived silence_spans (>= min_silence_ms).
-        Returns an object cached in-memory (not embedded in manifest) to avoid repeated decodes.
-        """
-        pol = self.manifest["policy"]
-        try:
-            ch = int(self.manifest["source"].get("channels") or 1)
-        except Exception:
-            ch = 1
-        min_sil_ms = int(pol.get("split_vad_min_silence_ms", 300))
-        frame_ms   = int(pol.get("split_vad_frame_ms", 30))
-        vad_aggr   = int(pol.get("vad_aggressiveness", 2))
-        try:
-            import webrtcvad, subprocess, array
-        except Exception as e:
-            raise RuntimeError(f"WebRTC VAD not available: {e}")
-        vad = webrtcvad.Vad(vad_aggr)
-        sr = 16000
-        frame_samples = int(sr * frame_ms / 1000)
-        bytes_per_sample = 2
-        ac = 2 if (want_stereo_masks and ch == 2) else 1
-        cmd = [
-            "ffmpeg","-nostdin","-hide_banner","-v","error",
-            "-i", uri, "-map","0:a:0",
-            "-ac", str(ac), "-ar", str(sr), "-f", "s16le", "-"
-        ]
-        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        mono_mask: List[bool] = []
-        L_mask: Optional[List[bool]] = [] if (ac == 2) else None
-        R_mask: Optional[List[bool]] = [] if (ac == 2) else None
-        if ac == 1:
-            frame_bytes = frame_samples * bytes_per_sample
-            leftover = b""
-            while True:
-                chunk = proc.stdout.read(65536)
-                if not chunk: break
-                data = leftover + chunk
-                n_frames = len(data) // frame_bytes
-                for i in range(n_frames):
-                    start = i * frame_bytes
-                    end = start + frame_bytes
-                    mono_frame = data[start:end]
-                    mono_mask.append(vad.is_speech(mono_frame, sr))
-                leftover = data[n_frames * frame_bytes:]
-        else:
-            # stereo: deinterleave per-frame
-            ints_leftover = array.array("h")
-            ints_per_frame = 2 * frame_samples  # L+R int16 values per frame
-            while True:
-                chunk = proc.stdout.read(65536)
-                if not chunk: break
-                arr = array.array("h")
-                arr.frombytes(chunk)
-                if len(ints_leftover):
-                    ints_leftover.extend(arr)
-                else:
-                    ints_leftover = arr
-                total_frames = len(ints_leftover) // ints_per_frame
-                if total_frames <= 0:
-                    continue
-                # Process frames
-                for fidx in range(total_frames):
-                    base = fidx * ints_per_frame
-                    # Gather L and R samples for this frame
-                    L_frame = array.array("h", ints_leftover[base:base + ints_per_frame:2])
-                    R_frame = array.array("h", ints_leftover[base + 1:base + ints_per_frame:2])
-                    Lb = L_frame.tobytes(); Rb = R_frame.tobytes()
-                    sL = vad.is_speech(Lb, sr); sR = vad.is_speech(Rb, sr)
-                    L_mask.append(sL); R_mask.append(sR)
-                    mono_mask.append(bool(sL or sR))
-                # Keep leftovers
-                used = total_frames * ints_per_frame
-                if used:
-                    ints_leftover = array.array("h", ints_leftover[used:])
-        try:
-            proc.kill()
-        except Exception:
-            pass
-        # Build silence spans from mono mask
-        silence_spans: List[Tuple[int,int]] = []
-        min_run = max(1, (min_sil_ms + frame_ms - 1) // frame_ms)
-        run_len = 0
-        run_start_idx = 0
-        for idx, is_speech in enumerate(mono_mask):
-            if not is_speech:
-                if run_len == 0:
-                    run_start_idx = idx
-                run_len += 1
-            else:
-                if run_len >= min_run:
-                    st = run_start_idx * frame_ms
-                    en = (run_start_idx + run_len) * frame_ms
-                    silence_spans.append((st, en))
-                run_len = 0
-        # tail
-        if run_len >= min_run:
-            st = run_start_idx * frame_ms
-            en = (run_start_idx + run_len) * frame_ms
-            silence_spans.append((st, en))
-        return {
-            "frame_ms": frame_ms,
-            "num_frames": len(mono_mask),
-            "silence_spans": silence_spans,
-            "mono_mask": mono_mask,  # retained in-memory only
-            "has_stereo": bool(L_mask is not None and R_mask is not None and len(L_mask) > 0 and len(R_mask) > 0),
-            "L_mask": L_mask if L_mask is not None else None,
-            "R_mask": R_mask if R_mask is not None else None,
-        }
     # -------- Preprocess (plan-only) --------
     def _build_ingest_plan(self):
         self._set_stage("preprocess","running",0.1,{"started_at":utc_now_iso()})
@@ -665,181 +462,6 @@ class AudioJobRunner:
         })
         self._set_stage("preprocess","done",1.0)
-    def _find_nearest_silence_local(self, uri: str, center_ms: int, seek_ms: int) -> Optional[int]:
-        """
-        Decode a small mono window around center_ms (±seek_ms), run WebRTC VAD in frames,
-        and return the nearest silence center (midpoint of a silence run) to center_ms.
-        Returns None if VAD unavailable or no silence found in the window.
-        """
-        pol = self.manifest["policy"]
-        dur_ms = int(self.manifest["source"].get("duration_ms") or 0)
-        if dur_ms <= 0:
-            return None
-        frame_ms = int(pol.get("split_vad_frame_ms", 30))
-        min_sil_ms = int(pol.get("split_vad_min_silence_ms", 300))
-        vad_aggr = int(pol.get("vad_aggressiveness", 2))
-        try:
-            import webrtcvad, subprocess, array
-        except Exception:
-            logger.warning("local_vad: webrtcvad not available; skipping alignment around %dms", center_ms)
-            return None
-        sr = 16000
-        frame_samples = int(sr * frame_ms / 1000)
-        frame_bytes = frame_samples * 2
-        vad = webrtcvad.Vad(vad_aggr)
-        def attempt(this_seek_ms: int) -> Optional[int]:
-            win_lo = max(0, center_ms - this_seek_ms)
-            win_hi = min(dur_ms, center_ms + this_seek_ms)
-            if win_hi <= win_lo:
-                return None
-            ss = win_lo / 1000.0
-            t = (win_hi - win_lo) / 1000.0
-            logger.info(
-                "local_vad: center=%dms window=[%d,%d]ms frame_ms=%d min_silence_ms=%d",
-                center_ms, win_lo, win_hi, frame_ms, min_sil_ms
-            )
-            # If source stereo, decode stereo and require both channels non-speech per frame
-            try:
-                ch = int(self.manifest["source"].get("channels") or 1)
-            except Exception:
-                ch = 1
-            ac = 2 if ch == 2 else 1
-            cmd = [
-                "ffmpeg","-nostdin","-hide_banner","-v","error",
-                "-ss", f"{ss:.3f}", "-t", f"{t:.3f}",
-                "-i", uri, "-map","0:a:0",
-                "-ac",str(ac),"-ar",str(sr),"-f","s16le","-"
-            ]
-            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            buf = b""
-            while True:
-                chunk = proc.stdout.read(65536)
-                if not chunk:
-                    break
-                buf += chunk
-            try:
-                proc.kill()
-            except Exception:
-                pass
-            if not buf or len(buf) < frame_bytes * (2 if ac == 2 else 1):
-                logger.info("local_vad: insufficient audio decoded for window around %dms (len=%d)", center_ms, len(buf) if buf else 0)
-                return None
-            silence_spans: List[Tuple[int,int]] = []
-            min_run_local = max(1, (min_sil_ms + frame_ms - 1) // frame_ms)
-            if ac == 1:
-                n_frames = len(buf) // frame_bytes
-                non_speech_mask: List[bool] = []
-                for i in range(n_frames):
-                    start = i * frame_bytes
-                    end = start + frame_bytes
-                    non_speech_mask.append(not vad.is_speech(buf[start:end], sr))
-            else:
-                # stereo: deinterleave, require both channels non-speech
-                import array as _array
-                a = _array.array("h")
-                a.frombytes(buf)
-                ints_per_frame = 2 * frame_samples
-                total_frames = len(a) // ints_per_frame
-                non_speech_mask = []
-                for fidx in range(total_frames):
-                    base = fidx * ints_per_frame
-                    L_frame = _array.array("h", a[base:base + ints_per_frame:2])
-                    R_frame = _array.array("h", a[base + 1:base + ints_per_frame:2])
-                    Lb = L_frame.tobytes(); Rb = R_frame.tobytes()
-                    sL = vad.is_speech(Lb, sr); sR = vad.is_speech(Rb, sr)
-                    non_speech_mask.append((not sL) and (not sR))
-            run_len = 0
-            run_start_idx = 0
-            for idx, is_sil in enumerate(non_speech_mask):
-                if is_sil:
-                    if run_len == 0:
-                        run_start_idx = idx
-                    run_len += 1
-                else:
-                    if run_len >= min_run_local:
-                        st = win_lo + run_start_idx * frame_ms
-                        en = win_lo + (run_start_idx + run_len) * frame_ms
-                        silence_spans.append((st, en))
-                    run_len = 0
-            if run_len >= min_run_local:
-                st = win_lo + run_start_idx * frame_ms
-                en = win_lo + (run_start_idx + run_len) * frame_ms
-                silence_spans.append((st, en))
-            if not silence_spans:
-                logger.info("local_vad: no silence spans found in window around %dms (±%dms) via VAD; trying silencedetect fallback", center_ms, this_seek_ms)
-                # Fallback using ffmpeg silencedetect (amplitude-based)
-                noise_db = float(pol.get("split_silence_noise_db", -38.0))
-                min_dur_s = max(0.05, min_sil_ms / 1000.0)
-                try:
-                    txt = run_with_retry_collect(
-                        [
-                            "ffmpeg","-nostdin","-hide_banner","-v","error",
-                            "-ss", f"{ss:.3f}", "-t", f"{t:.3f}",
-                            "-i", uri, "-map","0:a:0",
-                            "-af", f"silencedetect=noise={noise_db}dB:d={min_dur_s:.3f}",
-                            "-f","null","-"
-                        ],
-                        retries=self.manifest["policy"]["ff_retries"],
-                        timeout=self.manifest["policy"]["ff_timeout_sec"]
-                    )
-                    # parse silencedetect output
-                    spans: List[Tuple[float,float]] = []
-                    cur_start: Optional[float] = None
-                    for line in txt.splitlines():
-                        m1 = re.search(r"silence_start:\s*([0-9.]+)", line)
-                        if m1:
-                            try:
-                                cur_start = float(m1.group(1))
-                            except Exception:
-                                cur_start = None
-                            continue
-                        m2 = re.search(r"silence_end:\s*([0-9.]+)", line)
-                        if m2 and cur_start is not None:
-                            try:
-                                end_s = float(m2.group(1))
-                                spans.append((cur_start, end_s))
-                            except Exception:
-                                pass
-                            cur_start = None
-                    if spans:
-                        # choose nearest center
-                        best_local = None
-                        best_dist = None
-                        for (st_s, en_s) in spans:
-                            center_abs = win_lo + int(((st_s + en_s) * 500.0))  # seconds to ms, averaged
-                            d = abs(center_abs - center_ms)
-                            if best_dist is None or d < best_dist:
-                                best_local = center_abs
-                                best_dist = d
-                        logger.info("silencedetect: found %d spans; nearest_center=%s (dist=%s)", len(spans), str(best_local), str(best_dist))
-                        return best_local
-                    else:
-                        logger.info("silencedetect: no spans produced in window around %dms (±%dms)", center_ms, this_seek_ms)
-                        return None
-                except Exception as se:
-                    logger.warning("silencedetect fallback failed: %s", se)
-                    return None
-            best_local = None
-            best_dist = None
-            for (st, en) in silence_spans:
-                center = (st + en) // 2
-                d = abs(center - center_ms)
-                if best_dist is None or d < best_dist:
-                    best_local = center
-                    best_dist = d
-            logger.info(
-                "local_vad: found %d silence spans; nearest_center=%s (dist=%s)",
-                len(silence_spans), str(best_local), str(best_dist)
-            )
-            return best_local
-        # Try once within the configured seek window; keep fixed boundary if none
-        return attempt(seek_ms)
     # -------- Split (fixed windows with overlap) --------
     def _run_split_plan(self):
         self._set_stage("split","running",0.1,{"started_at":utc_now_iso()})
@@ -852,59 +474,14 @@ class AudioJobRunner:
             overlap = max(0, target - 1)
         step = target - overlap
-        # Optionally align chunk starts to nearest silence using local VAD around boundaries
-        pol = self.manifest["policy"]
-        use_vad_align = bool(pol.get("split_use_vad", True))
-        seek_ms = int(pol.get("split_vad_seek_ms", 1500))
-        src_uri = self.manifest["source"]["uri"]
-        if use_vad_align:
-            logger.info(
-                "split: VAD alignment enabled seek_ms=%d frame_ms=%d min_silence_ms=%d",
-                seek_ms, int(pol.get("split_vad_frame_ms", 30)), int(pol.get("split_vad_min_silence_ms", 300))
-            )
-        else:
-            logger.info("split: VAD alignment disabled; using fixed_overlap stepping")
         ranges: List[Tuple[int,int]] = []
-        if dur_ms > 0:
-            s = 0
-            aligned_count = 0
-            total_boundaries = 0
-            chunk_idx = 0
             while s < dur_ms:
-                base_next = s + step
-                if base_next >= dur_ms:
-                    # last chunk from s to end
-                    l = max(0, dur_ms - s)
-                    if l > 0:
-                        ranges.append((s, l))
-                        logger.info(
-                            "split[%d]: last chunk start=%dms dur=%dms (target=%d overlap=%d)",
-                            chunk_idx, s, l, target, overlap
-                        )
-                    break
-                if use_vad_align:
-                    cand = self._find_nearest_silence_local(src_uri, base_next, seek_ms)
-                    next_start = cand if cand is not None else base_next
-                    aligned_count += 1 if cand is not None else 0
-                    total_boundaries += 1
-                else:
-                    next_start = base_next
-                # ensure progress and bounds
-                next_start = max(s + 1, min(dur_ms, int(next_start)))
-                # choose duration so that chunk spills overlap into next chunk's start by `overlap`
-                l = min(dur_ms - s, (next_start - s) + overlap)
-                if l <= 0:
-                    # safety fallback
-                    l = min(target, dur_ms - s)
-                ranges.append((s, l))
-                logger.info(
-                    "split[%d]: start=%dms base_next=%dms chosen_next=%dms dur=%dms (target=%d overlap=%d aligned=%s)",
-                    chunk_idx, s, base_next, next_start, l, target, overlap, str(use_vad_align)
-                )
-                s = next_start
-                chunk_idx += 1
         channels = self.manifest["stages"]["preprocess"]["working"]["channel_map"]
         src = self.manifest["source"]["uri"]
@@ -922,34 +499,14 @@ class AudioJobRunner:
             "mode": "virtual",
             "channels": channels,
             "source_uris": plan_source_uris,
-            "chunk_policy": ("vad_aligned_overlap" if use_vad_align else "fixed_overlap"),
             "chunk_target_ms": target,
             "overlap_ms": overlap,
             "total_chunks": len(chunks),
             "execution": "transcriber",
             "chunks": chunks[:MAX_EMBED],
         }
-        if use_vad_align:
-            plan["alignment"] = {
-                "method": "local_vad",
-                "seek_ms": seek_ms,
-                "frame_ms": int(pol.get("split_vad_frame_ms", 30)),
-                "min_silence_ms": int(pol.get("split_vad_min_silence_ms", 300)),
-            }
         self.manifest["stages"]["split"]["plan"]=plan
-        try:
-            if use_vad_align:
-                logger.info(
-                    "split: policy=%s chunks=%d target=%d overlap=%d",
-                    plan.get("chunk_policy"), len(chunks), target, overlap
-                )
-            else:
-                logger.info(
-                    "split: policy=%s chunks=%d target=%d overlap=%d (fixed)",
-                    plan.get("chunk_policy"), len(chunks), target, overlap
-                )
-        except Exception:
-            pass
         self._set_stage("split","done",1.0,{"ended_at":utc_now_iso()})
         # Keep transcribe stage for downstream processing

     "dual_mono_corr": 0.90,          # was 0.995; still gated by Side/Mid & RMS check
     "corr_probe_ms": 30000,          # cap correlation probe at 30s
     "stereo_probe_win_s": 12,        # each sample window length (sec)
 }
 # --------------------------- Runner --------------------------
             self.manifest = manifest
         self.manifest.setdefault("version", "2.3")
         self.manifest.setdefault("rev", 0)
         self._touch()
     # -------- Public API --------
         durms = int(self.manifest["source"].get("duration_ms") or 0)
         dur_s = max(1, durms // 1000)
         # VAD params (defaults if not present in policy)
         vad_aggr        = int(pol.get("vad_aggressiveness", 2))   # 0..3
         vad_frame_ms    = 30                                       # keep 30ms (supported by webrtcvad)
             return rec, metrics
     # -------- Preprocess (plan-only) --------
     def _build_ingest_plan(self):
         self._set_stage("preprocess","running",0.1,{"started_at":utc_now_iso()})
         })
         self._set_stage("preprocess","done",1.0)
     # -------- Split (fixed windows with overlap) --------
     def _run_split_plan(self):
         self._set_stage("split","running",0.1,{"started_at":utc_now_iso()})
             overlap = max(0, target - 1)
         step = target - overlap
         ranges: List[Tuple[int,int]] = []
+        if dur_ms>0:
+            s=0
             while s < dur_ms:
+                l = min(target, dur_ms - s)
+                ranges.append((s,l))
+                if l < target: break
+                s += step
         channels = self.manifest["stages"]["preprocess"]["working"]["channel_map"]
         src = self.manifest["source"]["uri"]
             "mode": "virtual",
             "channels": channels,
             "source_uris": plan_source_uris,
+            "chunk_policy": "fixed_overlap",
             "chunk_target_ms": target,
             "overlap_ms": overlap,
             "total_chunks": len(chunks),
             "execution": "transcriber",
             "chunks": chunks[:MAX_EMBED],
         }
         self.manifest["stages"]["split"]["plan"]=plan
         self._set_stage("split","done",1.0,{"ended_at":utc_now_iso()})
         # Keep transcribe stage for downstream processing