Spaces:
Running
on
Zero
Running
on
Zero
File size: 7,089 Bytes
20e9692 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
"""Data types for the segmentation pipeline."""
from dataclasses import dataclass
from typing import Optional
@dataclass
class VadSegment:
    """A single segment produced by voice-activity detection.

    Carries timing information only; no transcription or matching data
    is attached at this stage.

    Attributes:
        start_time: Position where the segment begins (presumably
            seconds -- confirm against the VAD caller).
        end_time: Position where the segment ends, in the same unit as
            ``start_time``.
        segment_idx: Index identifying this segment.
    """

    # Timing bounds of the detected region.
    start_time: float
    end_time: float

    # Identifier of the segment within the VAD output.
    segment_idx: int
@dataclass
class SegmentInfo:
    """A segment after processing: timing, transcription and match outcome.

    Attributes:
        start_time: Position where the segment begins.
        end_time: Position where the segment ends.
        transcribed_text: Text produced by transcription for this segment.
        matched_text: Text the segment was matched to.
        matched_ref: Reference string for the match, for example
            ``"2:255:1-2:255:5"``.
        match_score: Score assigned to the match.
        error: Optional error message; ``None`` by default.
        has_missing_words: Flag, ``False`` by default.
        potentially_undersegmented: Flag, ``False`` by default.
    """

    # --- Timing ---
    start_time: float
    end_time: float

    # --- Transcription and matching ---
    transcribed_text: str
    matched_text: str
    matched_ref: str  # e.g. "2:255:1-2:255:5"
    match_score: float

    # --- Optional diagnostics (all have defaults) ---
    error: Optional[str] = None
    has_missing_words: bool = False
    potentially_undersegmented: bool = False
@dataclass
class ProfilingData:
    """Profiling metrics for the processing pipeline.

    Every timing field is a float number of seconds and defaults to 0.0;
    counters default to 0. The three list-valued fields default to ``None``
    (not an empty list), and ``summary()`` treats ``None`` as "nothing
    recorded".
    """

    # Preprocessing
    resample_time: float = 0.0  # Audio resampling time
    # VAD profiling
    vad_model_load_time: float = 0.0
    vad_model_move_time: float = 0.0
    vad_inference_time: float = 0.0
    vad_gpu_time: float = 0.0  # Actual GPU lease execution time
    vad_wall_time: float = 0.0  # Wall-clock time (includes queue wait)
    # Phoneme ASR profiling
    asr_time: float = 0.0  # Wav2vec wall-clock time (includes queue wait)
    asr_gpu_time: float = 0.0  # Actual GPU lease execution time
    asr_model_move_time: float = 0.0  # ASR model GPU move time
    asr_sorting_time: float = 0.0  # Duration-sorting time
    asr_batch_build_time: float = 0.0  # Dynamic batch construction time
    # Per-batch timing details. summary() reads these keys from each entry:
    # batch_num, size, time, min_dur, max_dur, avg_dur, total_seconds,
    # pad_waste.
    asr_batch_profiling: Optional[list] = None
    # Global anchor profiling
    anchor_time: float = 0.0  # N-gram voting anchor detection
    # Phoneme alignment profiling
    phoneme_total_time: float = 0.0  # Overall phoneme matching time
    phoneme_ref_build_time: float = 0.0  # Time to build chapter reference
    phoneme_dp_total_time: float = 0.0  # Total DP time across all segments
    phoneme_dp_min_time: float = 0.0  # Min DP time per segment
    phoneme_dp_max_time: float = 0.0  # Max DP time per segment
    phoneme_window_setup_time: float = 0.0  # Total window slicing time
    phoneme_result_build_time: float = 0.0  # Total result construction time
    phoneme_num_segments: int = 0  # Number of segments aligned
    match_wall_time: float = 0.0  # Total matching wall-clock time
    # Retry / reanchor counters
    tier1_attempts: int = 0
    tier1_passed: int = 0
    tier1_segments: Optional[list] = None
    tier2_attempts: int = 0
    tier2_passed: int = 0
    tier2_segments: Optional[list] = None
    consec_reanchors: int = 0
    segments_attempted: int = 0
    segments_passed: int = 0
    special_merges: int = 0
    # Result building profiling
    result_build_time: float = 0.0  # Total result building time
    result_audio_encode_time: float = 0.0  # Audio-to-data-URL encoding
    # Total pipeline time
    total_time: float = 0.0  # End-to-end pipeline time

    @property
    def phoneme_dp_avg_time(self) -> float:
        """Average DP time per segment, or 0.0 when no segment was aligned."""
        if self.phoneme_num_segments == 0:
            return 0.0
        return self.phoneme_dp_total_time / self.phoneme_num_segments

    @staticmethod
    def _fmt(seconds: float) -> str:
        """Format seconds as m:ss.fff when >= 60s, else as s.fffs.

        The value is rounded to milliseconds *before* choosing the format
        and splitting into minutes and seconds. Splitting first lets the
        seconds part round up to 60, e.g. 119.9996 would print as
        "1:60.000" instead of "2:00.000".
        """
        rounded = round(seconds, 3)
        if rounded >= 60:
            minutes, rem = divmod(rounded, 60)
            return f"{int(minutes)}:{rem:06.3f}"
        return f"{seconds:.3f}s"

    def summary(self) -> str:
        """Return a formatted, multi-line profiling summary.

        Returns:
            The report as a single string, lines joined with newlines. The
            report starts with a blank line followed by a 60-character rule.
        """
        _fmt = self._fmt
        # None means "no per-batch data recorded"; treat it like an empty list.
        batches = self.asr_batch_profiling or []
        lines = [
            "\n" + "=" * 60,
            "PROFILING SUMMARY",
            "=" * 60,
            " Preprocessing:",
            f" Resample: {self.resample_time:.3f}s",
            f" VAD: wall {_fmt(self.vad_wall_time)}",
            # Queue wait is reported as wall-clock time minus GPU time.
            f" GPU Time: {self.vad_gpu_time:.3f}s (queue {self.vad_wall_time - self.vad_gpu_time:.3f}s)",
            f" Model Load: {self.vad_model_load_time:.3f}s",
            f" Model Move: {self.vad_model_move_time:.3f}s",
            f" Inference: {self.vad_inference_time:.3f}s",
            f" Phoneme ASR: wall {_fmt(self.asr_time)}",
            f" GPU Time: {self.asr_gpu_time:.3f}s (queue {self.asr_time - self.asr_gpu_time:.3f}s)",
            f" Model Move: {self.asr_model_move_time:.3f}s",
            f" Sorting: {self.asr_sorting_time:.3f}s",
            f" Batch Build: {self.asr_batch_build_time:.3f}s",
            f" Batches: {len(batches)}",
        ]
        lines.extend(
            f" Batch {b['batch_num']:>2}: {b['size']:>3} segs | "
            f"{b['time']:.3f}s | "
            f"{b['min_dur']:.2f}-{b['max_dur']:.2f}s "
            f"(A {b['avg_dur']:.2f}s, T {b['total_seconds']:.1f}s, W {b['pad_waste']:.0%})"
            for b in batches
        )
        lines += [
            " Global Anchor:",
            f" N-gram Voting: {self.anchor_time:.3f}s",
            f" Phoneme Alignment: wall {_fmt(self.match_wall_time)}",
            f" Ref Build: {self.phoneme_ref_build_time:.3f}s",
            f" Window Setup: {self.phoneme_window_setup_time:.3f}s",
            f" DP Total: {self.phoneme_dp_total_time:.3f}s",
            f" Segments: {self.phoneme_num_segments}",
            # Per-segment DP figures are shown in milliseconds.
            f" DP Avg/segment: {1000*self.phoneme_dp_avg_time:.3f}ms",
            f" DP Min: {1000*self.phoneme_dp_min_time:.3f}ms",
            f" DP Max: {1000*self.phoneme_dp_max_time:.3f}ms",
        ]
        # Guard against division by zero when nothing was attempted.
        pct = 100 * self.segments_passed / self.segments_attempted if self.segments_attempted else 0
        t1_segs = self.tier1_segments or []
        t2_segs = self.tier2_segments or []
        lines += [
            " Alignment Stats:",
            f" Attempted: {self.segments_attempted}",
            f" Passed: {self.segments_passed} ({pct:.1f}%)",
            f" Tier 1 Retries: {self.tier1_passed}/{self.tier1_attempts} passed segments: {t1_segs}",
            f" Tier 2 Retries: {self.tier2_passed}/{self.tier2_attempts} passed segments: {t2_segs}",
            f" Reanchors (consec failures): {self.consec_reanchors}",
            f" Special Merges: {self.special_merges}",
            "-" * 60,
        ]
        # NOTE(review): result_build_time counts toward the profiled sum but
        # has no line of its own in the report above.
        profiled_sum = (self.resample_time + self.vad_wall_time + self.asr_time
                        + self.anchor_time + self.match_wall_time + self.result_build_time)
        unaccounted = self.total_time - profiled_sum
        lines += [
            f" PROFILED SUM: {_fmt(profiled_sum)}",
            f" TOTAL (wall): {_fmt(self.total_time)} (unaccounted: {_fmt(unaccounted)})",
            "=" * 60,
        ]
        return "\n".join(lines)
|