liuyang commited on
Commit
a7c5fd6
·
1 Parent(s): 46f053a
Files changed (1) hide show
  1. audiojob.py +7 -450
audiojob.py CHANGED
@@ -71,20 +71,6 @@ DEFAULT_PRESETS = {
71
  "dual_mono_corr": 0.90, # was 0.995; still gated by Side/Mid & RMS check
72
  "corr_probe_ms": 30000, # cap correlation probe at 30s
73
  "stereo_probe_win_s": 12, # each sample window length (sec)
74
-
75
- # VAD (global + decision) defaults
76
- "vad_aggressiveness": 3, # 0..3 for WebRTC VAD (more non-speech in pauses)
77
- "vad_similarity_thr": 0.95, # stereo L/R VAD similarity threshold for dual-mono
78
- "vad_max_lag_frames": 1, # allow ±1 frame lag when matching
79
- "vad_probe_win_s": 10.0, # legacy quick probe window length
80
- "use_full_vad_for_decision": False, # prefer quick-window VAD; avoid full-file decode
81
-
82
- # Split alignment using VAD
83
- "split_use_vad": True, # align fixed windows to nearest silence
84
- "split_vad_seek_ms": 3000, # how far to seek around target boundary (default ±3s)
85
- "split_vad_min_silence_ms": 250, # minimum silence run to consider
86
- "split_vad_frame_ms": 30, # VAD frame size (ms)
87
- "split_silence_noise_db": -40.0, # fallback silencedetect noise threshold
88
  }
89
 
90
  # --------------------------- Runner --------------------------
@@ -120,8 +106,6 @@ class AudioJobRunner:
120
  self.manifest = manifest
121
  self.manifest.setdefault("version", "2.3")
122
  self.manifest.setdefault("rev", 0)
123
- # Ephemeral cache for VAD results to avoid re-decoding across stages
124
- self._vad_cache: Optional[Dict[str, Any]] = None
125
  self._touch()
126
 
127
  # -------- Public API --------
@@ -217,73 +201,6 @@ class AudioJobRunner:
217
  durms = int(self.manifest["source"].get("duration_ms") or 0)
218
  dur_s = max(1, durms // 1000)
219
 
220
- # Fast path: if configured, compute or reuse full-file stereo VAD once and
221
- # derive similarity from it (single pass reused later by split stage).
222
- try:
223
- ch = int(self.manifest["source"].get("channels") or 1)
224
- except Exception:
225
- ch = 1
226
- use_full_vad = bool(pol.get("use_full_vad_for_decision", True))
227
- if use_full_vad and ch == 2:
228
- # Ensure cached VAD has stereo masks; compute if absent
229
- if not self._vad_cache or not self._vad_cache.get("has_stereo"):
230
- self._vad_cache = self._compute_vad_timeline(uri, want_stereo_masks=True)
231
-
232
- vad_obj = self._vad_cache or {}
233
- L_mask: Optional[List[bool]] = vad_obj.get("L_mask")
234
- R_mask: Optional[List[bool]] = vad_obj.get("R_mask")
235
- if L_mask and R_mask:
236
- vad_frame_ms = int(pol.get("split_vad_frame_ms", 30))
237
- vad_sim_thr = float(pol.get("vad_similarity_thr", 0.95))
238
- vad_max_lag = int(pol.get("vad_max_lag_frames", 1))
239
-
240
- def best_similarity(a: List[bool], b: List[bool], max_lag: int) -> float:
241
- if not a or not b: return 0.0
242
- n = min(len(a), len(b))
243
- a = a[:n]; b = b[:n]
244
- best = 0.0
245
- for lag in range(-max_lag, max_lag + 1):
246
- if lag > 0:
247
- a2 = a[lag:]; b2 = b[:len(a2)]
248
- elif lag < 0:
249
- b2 = b[-lag:]; a2 = a[:len(b2)]
250
- else:
251
- a2, b2 = a, b
252
- if not a2 or not b2:
253
- continue
254
- matches = sum(1 for x, y in zip(a2, b2) if x == y)
255
- best = max(best, matches / float(len(a2)))
256
- return best
257
-
258
- sim = best_similarity(L_mask, R_mask, vad_max_lag)
259
- dual_mono = (sim >= vad_sim_thr)
260
- rec = "downmix" if dual_mono else "split"
261
-
262
- metrics = {
263
- "mid_db": None,
264
- "side_db": None,
265
- "L_db": None,
266
- "R_db": None,
267
- "near_silent": False,
268
- "corr": None,
269
- "dual_mono": dual_mono,
270
- "side_mid_gap_db": None,
271
- "side_mid_thr_db": float(pol.get("dual_mono_side_mid_db", 18.0)),
272
- "rms_delta_thr_db": float(pol.get("dual_mono_rms_delta_db", 1.0)),
273
- "corr_thr": float(pol.get("dual_mono_corr", 0.93)),
274
- "windows_used": 1,
275
- "vad_similarities": [sim],
276
- "vad_similarity_median": sim,
277
- "vad_params": {
278
- "aggressiveness": int(pol.get("vad_aggressiveness", 2)),
279
- "frame_ms": vad_frame_ms,
280
- "sim_thr": vad_sim_thr,
281
- "max_lag_frames": vad_max_lag,
282
- "probe_win_s": None
283
- }
284
- }
285
- return rec, metrics
286
-
287
  # VAD params (defaults if not present in policy)
288
  vad_aggr = int(pol.get("vad_aggressiveness", 2)) # 0..3
289
  vad_frame_ms = 30 # keep 30ms (supported by webrtcvad)
@@ -514,126 +431,6 @@ class AudioJobRunner:
514
  return rec, metrics
515
 
516
 
517
- def _compute_vad_timeline(self, uri: str, want_stereo_masks: bool = False) -> Dict[str, Any]:
518
- """
519
- Build a global VAD timeline across the entire file at 16 kHz using WebRTC VAD.
520
- - If want_stereo_masks and source has 2 channels, produce L/R boolean masks per frame.
521
- - Always produce a mono_mask (L OR R if stereo) and derived silence_spans (>= min_silence_ms).
522
- Returns an object cached in-memory (not embedded in manifest) to avoid repeated decodes.
523
- """
524
- pol = self.manifest["policy"]
525
- try:
526
- ch = int(self.manifest["source"].get("channels") or 1)
527
- except Exception:
528
- ch = 1
529
- min_sil_ms = int(pol.get("split_vad_min_silence_ms", 300))
530
- frame_ms = int(pol.get("split_vad_frame_ms", 30))
531
- vad_aggr = int(pol.get("vad_aggressiveness", 2))
532
-
533
- try:
534
- import webrtcvad, subprocess, array
535
- except Exception as e:
536
- raise RuntimeError(f"WebRTC VAD not available: {e}")
537
-
538
- vad = webrtcvad.Vad(vad_aggr)
539
- sr = 16000
540
- frame_samples = int(sr * frame_ms / 1000)
541
- bytes_per_sample = 2
542
- ac = 2 if (want_stereo_masks and ch == 2) else 1
543
-
544
- cmd = [
545
- "ffmpeg","-nostdin","-hide_banner","-v","error",
546
- "-i", uri, "-map","0:a:0",
547
- "-ac", str(ac), "-ar", str(sr), "-f", "s16le", "-"
548
- ]
549
- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
550
-
551
- mono_mask: List[bool] = []
552
- L_mask: Optional[List[bool]] = [] if (ac == 2) else None
553
- R_mask: Optional[List[bool]] = [] if (ac == 2) else None
554
-
555
- if ac == 1:
556
- frame_bytes = frame_samples * bytes_per_sample
557
- leftover = b""
558
- while True:
559
- chunk = proc.stdout.read(65536)
560
- if not chunk: break
561
- data = leftover + chunk
562
- n_frames = len(data) // frame_bytes
563
- for i in range(n_frames):
564
- start = i * frame_bytes
565
- end = start + frame_bytes
566
- mono_frame = data[start:end]
567
- mono_mask.append(vad.is_speech(mono_frame, sr))
568
- leftover = data[n_frames * frame_bytes:]
569
- else:
570
- # stereo: deinterleave per-frame
571
- ints_leftover = array.array("h")
572
- ints_per_frame = 2 * frame_samples # L+R int16 values per frame
573
- while True:
574
- chunk = proc.stdout.read(65536)
575
- if not chunk: break
576
- arr = array.array("h")
577
- arr.frombytes(chunk)
578
- if len(ints_leftover):
579
- ints_leftover.extend(arr)
580
- else:
581
- ints_leftover = arr
582
- total_frames = len(ints_leftover) // ints_per_frame
583
- if total_frames <= 0:
584
- continue
585
- # Process frames
586
- for fidx in range(total_frames):
587
- base = fidx * ints_per_frame
588
- # Gather L and R samples for this frame
589
- L_frame = array.array("h", ints_leftover[base:base + ints_per_frame:2])
590
- R_frame = array.array("h", ints_leftover[base + 1:base + ints_per_frame:2])
591
- Lb = L_frame.tobytes(); Rb = R_frame.tobytes()
592
- sL = vad.is_speech(Lb, sr); sR = vad.is_speech(Rb, sr)
593
- L_mask.append(sL); R_mask.append(sR)
594
- mono_mask.append(bool(sL or sR))
595
- # Keep leftovers
596
- used = total_frames * ints_per_frame
597
- if used:
598
- ints_leftover = array.array("h", ints_leftover[used:])
599
-
600
- try:
601
- proc.kill()
602
- except Exception:
603
- pass
604
-
605
- # Build silence spans from mono mask
606
- silence_spans: List[Tuple[int,int]] = []
607
- min_run = max(1, (min_sil_ms + frame_ms - 1) // frame_ms)
608
- run_len = 0
609
- run_start_idx = 0
610
- for idx, is_speech in enumerate(mono_mask):
611
- if not is_speech:
612
- if run_len == 0:
613
- run_start_idx = idx
614
- run_len += 1
615
- else:
616
- if run_len >= min_run:
617
- st = run_start_idx * frame_ms
618
- en = (run_start_idx + run_len) * frame_ms
619
- silence_spans.append((st, en))
620
- run_len = 0
621
- # tail
622
- if run_len >= min_run:
623
- st = run_start_idx * frame_ms
624
- en = (run_start_idx + run_len) * frame_ms
625
- silence_spans.append((st, en))
626
-
627
- return {
628
- "frame_ms": frame_ms,
629
- "num_frames": len(mono_mask),
630
- "silence_spans": silence_spans,
631
- "mono_mask": mono_mask, # retained in-memory only
632
- "has_stereo": bool(L_mask is not None and R_mask is not None and len(L_mask) > 0 and len(R_mask) > 0),
633
- "L_mask": L_mask if L_mask is not None else None,
634
- "R_mask": R_mask if R_mask is not None else None,
635
- }
636
-
637
  # -------- Preprocess (plan-only) --------
638
  def _build_ingest_plan(self):
639
  self._set_stage("preprocess","running",0.1,{"started_at":utc_now_iso()})
@@ -665,181 +462,6 @@ class AudioJobRunner:
665
  })
666
  self._set_stage("preprocess","done",1.0)
667
 
668
- def _find_nearest_silence_local(self, uri: str, center_ms: int, seek_ms: int) -> Optional[int]:
669
- """
670
- Decode a small mono window around center_ms (±seek_ms), run WebRTC VAD in frames,
671
- and return the nearest silence center (midpoint of a silence run) to center_ms.
672
- Returns None if VAD unavailable or no silence found in the window.
673
- """
674
- pol = self.manifest["policy"]
675
- dur_ms = int(self.manifest["source"].get("duration_ms") or 0)
676
- if dur_ms <= 0:
677
- return None
678
- frame_ms = int(pol.get("split_vad_frame_ms", 30))
679
- min_sil_ms = int(pol.get("split_vad_min_silence_ms", 300))
680
- vad_aggr = int(pol.get("vad_aggressiveness", 2))
681
- try:
682
- import webrtcvad, subprocess, array
683
- except Exception:
684
- logger.warning("local_vad: webrtcvad not available; skipping alignment around %dms", center_ms)
685
- return None
686
- sr = 16000
687
- frame_samples = int(sr * frame_ms / 1000)
688
- frame_bytes = frame_samples * 2
689
- vad = webrtcvad.Vad(vad_aggr)
690
-
691
- def attempt(this_seek_ms: int) -> Optional[int]:
692
- win_lo = max(0, center_ms - this_seek_ms)
693
- win_hi = min(dur_ms, center_ms + this_seek_ms)
694
- if win_hi <= win_lo:
695
- return None
696
- ss = win_lo / 1000.0
697
- t = (win_hi - win_lo) / 1000.0
698
- logger.info(
699
- "local_vad: center=%dms window=[%d,%d]ms frame_ms=%d min_silence_ms=%d",
700
- center_ms, win_lo, win_hi, frame_ms, min_sil_ms
701
- )
702
- # If source stereo, decode stereo and require both channels non-speech per frame
703
- try:
704
- ch = int(self.manifest["source"].get("channels") or 1)
705
- except Exception:
706
- ch = 1
707
- ac = 2 if ch == 2 else 1
708
- cmd = [
709
- "ffmpeg","-nostdin","-hide_banner","-v","error",
710
- "-ss", f"{ss:.3f}", "-t", f"{t:.3f}",
711
- "-i", uri, "-map","0:a:0",
712
- "-ac",str(ac),"-ar",str(sr),"-f","s16le","-"
713
- ]
714
- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
715
- buf = b""
716
- while True:
717
- chunk = proc.stdout.read(65536)
718
- if not chunk:
719
- break
720
- buf += chunk
721
- try:
722
- proc.kill()
723
- except Exception:
724
- pass
725
- if not buf or len(buf) < frame_bytes * (2 if ac == 2 else 1):
726
- logger.info("local_vad: insufficient audio decoded for window around %dms (len=%d)", center_ms, len(buf) if buf else 0)
727
- return None
728
- silence_spans: List[Tuple[int,int]] = []
729
- min_run_local = max(1, (min_sil_ms + frame_ms - 1) // frame_ms)
730
-
731
- if ac == 1:
732
- n_frames = len(buf) // frame_bytes
733
- non_speech_mask: List[bool] = []
734
- for i in range(n_frames):
735
- start = i * frame_bytes
736
- end = start + frame_bytes
737
- non_speech_mask.append(not vad.is_speech(buf[start:end], sr))
738
- else:
739
- # stereo: deinterleave, require both channels non-speech
740
- import array as _array
741
- a = _array.array("h")
742
- a.frombytes(buf)
743
- ints_per_frame = 2 * frame_samples
744
- total_frames = len(a) // ints_per_frame
745
- non_speech_mask = []
746
- for fidx in range(total_frames):
747
- base = fidx * ints_per_frame
748
- L_frame = _array.array("h", a[base:base + ints_per_frame:2])
749
- R_frame = _array.array("h", a[base + 1:base + ints_per_frame:2])
750
- Lb = L_frame.tobytes(); Rb = R_frame.tobytes()
751
- sL = vad.is_speech(Lb, sr); sR = vad.is_speech(Rb, sr)
752
- non_speech_mask.append((not sL) and (not sR))
753
-
754
- run_len = 0
755
- run_start_idx = 0
756
- for idx, is_sil in enumerate(non_speech_mask):
757
- if is_sil:
758
- if run_len == 0:
759
- run_start_idx = idx
760
- run_len += 1
761
- else:
762
- if run_len >= min_run_local:
763
- st = win_lo + run_start_idx * frame_ms
764
- en = win_lo + (run_start_idx + run_len) * frame_ms
765
- silence_spans.append((st, en))
766
- run_len = 0
767
- if run_len >= min_run_local:
768
- st = win_lo + run_start_idx * frame_ms
769
- en = win_lo + (run_start_idx + run_len) * frame_ms
770
- silence_spans.append((st, en))
771
-
772
- if not silence_spans:
773
- logger.info("local_vad: no silence spans found in window around %dms (±%dms) via VAD; trying silencedetect fallback", center_ms, this_seek_ms)
774
- # Fallback using ffmpeg silencedetect (amplitude-based)
775
- noise_db = float(pol.get("split_silence_noise_db", -38.0))
776
- min_dur_s = max(0.05, min_sil_ms / 1000.0)
777
- try:
778
- txt = run_with_retry_collect(
779
- [
780
- "ffmpeg","-nostdin","-hide_banner","-v","error",
781
- "-ss", f"{ss:.3f}", "-t", f"{t:.3f}",
782
- "-i", uri, "-map","0:a:0",
783
- "-af", f"silencedetect=noise={noise_db}dB:d={min_dur_s:.3f}",
784
- "-f","null","-"
785
- ],
786
- retries=self.manifest["policy"]["ff_retries"],
787
- timeout=self.manifest["policy"]["ff_timeout_sec"]
788
- )
789
- # parse silencedetect output
790
- spans: List[Tuple[float,float]] = []
791
- cur_start: Optional[float] = None
792
- for line in txt.splitlines():
793
- m1 = re.search(r"silence_start:\s*([0-9.]+)", line)
794
- if m1:
795
- try:
796
- cur_start = float(m1.group(1))
797
- except Exception:
798
- cur_start = None
799
- continue
800
- m2 = re.search(r"silence_end:\s*([0-9.]+)", line)
801
- if m2 and cur_start is not None:
802
- try:
803
- end_s = float(m2.group(1))
804
- spans.append((cur_start, end_s))
805
- except Exception:
806
- pass
807
- cur_start = None
808
- if spans:
809
- # choose nearest center
810
- best_local = None
811
- best_dist = None
812
- for (st_s, en_s) in spans:
813
- center_abs = win_lo + int(((st_s + en_s) * 500.0)) # seconds to ms, averaged
814
- d = abs(center_abs - center_ms)
815
- if best_dist is None or d < best_dist:
816
- best_local = center_abs
817
- best_dist = d
818
- logger.info("silencedetect: found %d spans; nearest_center=%s (dist=%s)", len(spans), str(best_local), str(best_dist))
819
- return best_local
820
- else:
821
- logger.info("silencedetect: no spans produced in window around %dms (±%dms)", center_ms, this_seek_ms)
822
- return None
823
- except Exception as se:
824
- logger.warning("silencedetect fallback failed: %s", se)
825
- return None
826
- best_local = None
827
- best_dist = None
828
- for (st, en) in silence_spans:
829
- center = (st + en) // 2
830
- d = abs(center - center_ms)
831
- if best_dist is None or d < best_dist:
832
- best_local = center
833
- best_dist = d
834
- logger.info(
835
- "local_vad: found %d silence spans; nearest_center=%s (dist=%s)",
836
- len(silence_spans), str(best_local), str(best_dist)
837
- )
838
- return best_local
839
-
840
- # Try once within the configured seek window; keep fixed boundary if none
841
- return attempt(seek_ms)
842
-
843
  # -------- Split (fixed windows with overlap) --------
844
  def _run_split_plan(self):
845
  self._set_stage("split","running",0.1,{"started_at":utc_now_iso()})
@@ -852,59 +474,14 @@ class AudioJobRunner:
852
  overlap = max(0, target - 1)
853
  step = target - overlap
854
 
855
- # Optionally align chunk starts to nearest silence using local VAD around boundaries
856
- pol = self.manifest["policy"]
857
- use_vad_align = bool(pol.get("split_use_vad", True))
858
- seek_ms = int(pol.get("split_vad_seek_ms", 1500))
859
- src_uri = self.manifest["source"]["uri"]
860
-
861
- if use_vad_align:
862
- logger.info(
863
- "split: VAD alignment enabled seek_ms=%d frame_ms=%d min_silence_ms=%d",
864
- seek_ms, int(pol.get("split_vad_frame_ms", 30)), int(pol.get("split_vad_min_silence_ms", 300))
865
- )
866
- else:
867
- logger.info("split: VAD alignment disabled; using fixed_overlap stepping")
868
-
869
  ranges: List[Tuple[int,int]] = []
870
- if dur_ms > 0:
871
- s = 0
872
- aligned_count = 0
873
- total_boundaries = 0
874
- chunk_idx = 0
875
  while s < dur_ms:
876
- base_next = s + step
877
- if base_next >= dur_ms:
878
- # last chunk from s to end
879
- l = max(0, dur_ms - s)
880
- if l > 0:
881
- ranges.append((s, l))
882
- logger.info(
883
- "split[%d]: last chunk start=%dms dur=%dms (target=%d overlap=%d)",
884
- chunk_idx, s, l, target, overlap
885
- )
886
- break
887
- if use_vad_align:
888
- cand = self._find_nearest_silence_local(src_uri, base_next, seek_ms)
889
- next_start = cand if cand is not None else base_next
890
- aligned_count += 1 if cand is not None else 0
891
- total_boundaries += 1
892
- else:
893
- next_start = base_next
894
- # ensure progress and bounds
895
- next_start = max(s + 1, min(dur_ms, int(next_start)))
896
- # choose duration so that chunk spills overlap into next chunk's start by `overlap`
897
- l = min(dur_ms - s, (next_start - s) + overlap)
898
- if l <= 0:
899
- # safety fallback
900
- l = min(target, dur_ms - s)
901
- ranges.append((s, l))
902
- logger.info(
903
- "split[%d]: start=%dms base_next=%dms chosen_next=%dms dur=%dms (target=%d overlap=%d aligned=%s)",
904
- chunk_idx, s, base_next, next_start, l, target, overlap, str(use_vad_align)
905
- )
906
- s = next_start
907
- chunk_idx += 1
908
 
909
  channels = self.manifest["stages"]["preprocess"]["working"]["channel_map"]
910
  src = self.manifest["source"]["uri"]
@@ -922,34 +499,14 @@ class AudioJobRunner:
922
  "mode": "virtual",
923
  "channels": channels,
924
  "source_uris": plan_source_uris,
925
- "chunk_policy": ("vad_aligned_overlap" if use_vad_align else "fixed_overlap"),
926
  "chunk_target_ms": target,
927
  "overlap_ms": overlap,
928
  "total_chunks": len(chunks),
929
  "execution": "transcriber",
930
  "chunks": chunks[:MAX_EMBED],
931
  }
932
- if use_vad_align:
933
- plan["alignment"] = {
934
- "method": "local_vad",
935
- "seek_ms": seek_ms,
936
- "frame_ms": int(pol.get("split_vad_frame_ms", 30)),
937
- "min_silence_ms": int(pol.get("split_vad_min_silence_ms", 300)),
938
- }
939
  self.manifest["stages"]["split"]["plan"]=plan
940
- try:
941
- if use_vad_align:
942
- logger.info(
943
- "split: policy=%s chunks=%d target=%d overlap=%d",
944
- plan.get("chunk_policy"), len(chunks), target, overlap
945
- )
946
- else:
947
- logger.info(
948
- "split: policy=%s chunks=%d target=%d overlap=%d (fixed)",
949
- plan.get("chunk_policy"), len(chunks), target, overlap
950
- )
951
- except Exception:
952
- pass
953
  self._set_stage("split","done",1.0,{"ended_at":utc_now_iso()})
954
 
955
  # Keep transcribe stage for downstream processing
 
71
  "dual_mono_corr": 0.90, # was 0.995; still gated by Side/Mid & RMS check
72
  "corr_probe_ms": 30000, # cap correlation probe at 30s
73
  "stereo_probe_win_s": 12, # each sample window length (sec)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  }
75
 
76
  # --------------------------- Runner --------------------------
 
106
  self.manifest = manifest
107
  self.manifest.setdefault("version", "2.3")
108
  self.manifest.setdefault("rev", 0)
 
 
109
  self._touch()
110
 
111
  # -------- Public API --------
 
201
  durms = int(self.manifest["source"].get("duration_ms") or 0)
202
  dur_s = max(1, durms // 1000)
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  # VAD params (defaults if not present in policy)
205
  vad_aggr = int(pol.get("vad_aggressiveness", 2)) # 0..3
206
  vad_frame_ms = 30 # keep 30ms (supported by webrtcvad)
 
431
  return rec, metrics
432
 
433
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
  # -------- Preprocess (plan-only) --------
435
  def _build_ingest_plan(self):
436
  self._set_stage("preprocess","running",0.1,{"started_at":utc_now_iso()})
 
462
  })
463
  self._set_stage("preprocess","done",1.0)
464
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
  # -------- Split (fixed windows with overlap) --------
466
  def _run_split_plan(self):
467
  self._set_stage("split","running",0.1,{"started_at":utc_now_iso()})
 
474
  overlap = max(0, target - 1)
475
  step = target - overlap
476
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
  ranges: List[Tuple[int,int]] = []
478
+ if dur_ms>0:
479
+ s=0
 
 
 
480
  while s < dur_ms:
481
+ l = min(target, dur_ms - s)
482
+ ranges.append((s,l))
483
+ if l < target: break
484
+ s += step
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
 
486
  channels = self.manifest["stages"]["preprocess"]["working"]["channel_map"]
487
  src = self.manifest["source"]["uri"]
 
499
  "mode": "virtual",
500
  "channels": channels,
501
  "source_uris": plan_source_uris,
502
+ "chunk_policy": "fixed_overlap",
503
  "chunk_target_ms": target,
504
  "overlap_ms": overlap,
505
  "total_chunks": len(chunks),
506
  "execution": "transcriber",
507
  "chunks": chunks[:MAX_EMBED],
508
  }
 
 
 
 
 
 
 
509
  self.manifest["stages"]["split"]["plan"]=plan
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  self._set_stage("split","done",1.0,{"ended_at":utc_now_iso()})
511
 
512
  # Keep transcribe stage for downstream processing