Quran-multi-aligner / src /segment_types.py
hetchyy's picture
Initial commit
20e9692
"""Data types for the segmentation pipeline."""
from dataclasses import dataclass
from typing import Optional
@dataclass
class VadSegment:
    """One raw segment produced by the VAD stage, carrying only timing data.

    Attributes:
        start_time: Where the segment begins (presumably seconds into the
            audio -- confirm against the producer).
        end_time: Where the segment ends, in the same unit as ``start_time``.
        segment_idx: Integer index identifying this segment.
    """

    # Segment boundaries.
    start_time: float
    end_time: float

    # Index of the segment.
    segment_idx: int
@dataclass
class SegmentInfo:
    """A processed segment: timing plus transcription and matching outcome.

    The first six fields are required; the remaining three are optional
    and default to "no error" / ``False``.
    """

    # --- Timing ---
    start_time: float
    end_time: float

    # --- Transcription and match result ---
    transcribed_text: str
    matched_text: str
    matched_ref: str  # reference range, e.g. "2:255:1-2:255:5"
    match_score: float

    # --- Optional diagnostics ---
    error: Optional[str] = None  # None when no error was recorded
    has_missing_words: bool = False
    potentially_undersegmented: bool = False
@dataclass
class ProfilingData:
    """Profiling metrics for the processing pipeline.

    Every field has a default, so an instance can be created with no
    arguments. The three list-valued fields default to ``None``;
    ``summary()`` treats ``None`` and an empty list the same way.
    """

    # Preprocessing
    resample_time: float = 0.0  # Audio resampling time

    # VAD profiling
    vad_model_load_time: float = 0.0
    vad_model_move_time: float = 0.0
    vad_inference_time: float = 0.0
    vad_gpu_time: float = 0.0  # Actual GPU lease execution time
    vad_wall_time: float = 0.0  # Wall-clock time (includes queue wait)

    # Phoneme ASR profiling
    asr_time: float = 0.0  # Wav2vec wall-clock time (includes queue wait)
    asr_gpu_time: float = 0.0  # Actual GPU lease execution time
    asr_model_move_time: float = 0.0  # ASR model GPU move time
    asr_sorting_time: float = 0.0  # Duration-sorting time
    asr_batch_build_time: float = 0.0  # Dynamic batch construction time
    # Per-batch timing details. summary() reads the keys 'batch_num',
    # 'size', 'time', 'min_dur', 'max_dur', 'avg_dur', 'total_seconds'
    # and 'pad_waste' from each entry.
    asr_batch_profiling: Optional[list] = None

    # Global anchor profiling
    anchor_time: float = 0.0  # N-gram voting anchor detection

    # Phoneme alignment profiling
    phoneme_total_time: float = 0.0  # Overall phoneme matching time
    phoneme_ref_build_time: float = 0.0  # Time to build chapter reference
    phoneme_dp_total_time: float = 0.0  # Total DP time across all segments
    phoneme_dp_min_time: float = 0.0  # Min DP time per segment
    phoneme_dp_max_time: float = 0.0  # Max DP time per segment
    phoneme_window_setup_time: float = 0.0  # Total window slicing time
    phoneme_result_build_time: float = 0.0  # Total result construction time
    phoneme_num_segments: int = 0  # Number of segments aligned
    match_wall_time: float = 0.0  # Total matching wall-clock time

    # Retry / reanchor counters
    tier1_attempts: int = 0
    tier1_passed: int = 0
    tier1_segments: Optional[list] = None
    tier2_attempts: int = 0
    tier2_passed: int = 0
    tier2_segments: Optional[list] = None
    consec_reanchors: int = 0
    segments_attempted: int = 0
    segments_passed: int = 0
    special_merges: int = 0

    # Result building profiling
    result_build_time: float = 0.0  # Total result building time
    result_audio_encode_time: float = 0.0  # Audio-to-data-URL encoding

    # Total pipeline time
    total_time: float = 0.0  # End-to-end pipeline time

    @property
    def phoneme_dp_avg_time(self) -> float:
        """Average DP time per segment, or 0.0 when no segment was aligned."""
        if self.phoneme_num_segments == 0:
            return 0.0
        return self.phoneme_dp_total_time / self.phoneme_num_segments

    @staticmethod
    def _fmt(seconds):
        """Format seconds as m:ss.fff when >= 60s, else as s.fffs.

        The value is rounded to milliseconds *before* it is split into
        minutes and seconds. Splitting first lets the seconds part round
        up to 60 on its own (119.9996 would print as "1:60.000" instead
        of "2:00.000").
        """
        rounded = round(seconds, 3)
        if rounded >= 60:
            minutes, secs = divmod(rounded, 60)
            return f"{int(minutes)}:{secs:06.3f}"
        return f"{seconds:.3f}s"

    def summary(self) -> str:
        """Return a formatted profiling summary.

        The report is a newline-joined string with one section per
        stage, followed by alignment statistics and a comparison of the
        summed stage times against ``total_time``.

        Raises:
            KeyError: if an entry of ``asr_batch_profiling`` lacks one of
                the keys read when printing the per-batch lines.
        """
        _fmt = self._fmt
        batches = self.asr_batch_profiling or []

        lines = [
            "\n" + "=" * 60,
            "PROFILING SUMMARY",
            "=" * 60,
            " Preprocessing:",
            f" Resample: {self.resample_time:.3f}s",
            f" VAD: wall {_fmt(self.vad_wall_time)}",
            # Queue wait is reported as wall time minus GPU time.
            f" GPU Time: {self.vad_gpu_time:.3f}s (queue {self.vad_wall_time - self.vad_gpu_time:.3f}s)",
            f" Model Load: {self.vad_model_load_time:.3f}s",
            f" Model Move: {self.vad_model_move_time:.3f}s",
            f" Inference: {self.vad_inference_time:.3f}s",
            f" Phoneme ASR: wall {_fmt(self.asr_time)}",
            f" GPU Time: {self.asr_gpu_time:.3f}s (queue {self.asr_time - self.asr_gpu_time:.3f}s)",
            f" Model Move: {self.asr_model_move_time:.3f}s",
            f" Sorting: {self.asr_sorting_time:.3f}s",
            f" Batch Build: {self.asr_batch_build_time:.3f}s",
            f" Batches: {len(batches)}",
        ]

        # One detail line per ASR batch (none when the list is None/empty).
        for b in batches:
            lines.append(
                f" Batch {b['batch_num']:>2}: {b['size']:>3} segs | "
                f"{b['time']:.3f}s | "
                f"{b['min_dur']:.2f}-{b['max_dur']:.2f}s "
                f"(A {b['avg_dur']:.2f}s, T {b['total_seconds']:.1f}s, W {b['pad_waste']:.0%})"
            )

        lines += [
            " Global Anchor:",
            f" N-gram Voting: {self.anchor_time:.3f}s",
            f" Phoneme Alignment: wall {_fmt(self.match_wall_time)}",
            f" Ref Build: {self.phoneme_ref_build_time:.3f}s",
            f" Window Setup: {self.phoneme_window_setup_time:.3f}s",
            f" DP Total: {self.phoneme_dp_total_time:.3f}s",
            f" Segments: {self.phoneme_num_segments}",
            # DP figures are stored in seconds and shown in milliseconds.
            f" DP Avg/segment: {1000*self.phoneme_dp_avg_time:.3f}ms",
            f" DP Min: {1000*self.phoneme_dp_min_time:.3f}ms",
            f" DP Max: {1000*self.phoneme_dp_max_time:.3f}ms",
        ]

        # Guard the pass rate against division by zero.
        pct = 100 * self.segments_passed / self.segments_attempted if self.segments_attempted else 0
        t1_segs = self.tier1_segments or []
        t2_segs = self.tier2_segments or []
        lines += [
            " Alignment Stats:",
            f" Attempted: {self.segments_attempted}",
            f" Passed: {self.segments_passed} ({pct:.1f}%)",
            f" Tier 1 Retries: {self.tier1_passed}/{self.tier1_attempts} passed segments: {t1_segs}",
            f" Tier 2 Retries: {self.tier2_passed}/{self.tier2_attempts} passed segments: {t2_segs}",
            f" Reanchors (consec failures): {self.consec_reanchors}",
            f" Special Merges: {self.special_merges}",
            "-" * 60,
        ]

        # Whatever total_time holds beyond the summed stages is reported
        # as "unaccounted".
        profiled_sum = (self.resample_time + self.vad_wall_time + self.asr_time
                        + self.anchor_time + self.match_wall_time + self.result_build_time)
        unaccounted = self.total_time - profiled_sum
        lines += [
            f" PROFILED SUM: {_fmt(profiled_sum)}",
            f" TOTAL (wall): {_fmt(self.total_time)} (unaccounted: {_fmt(unaccounted)})",
            "=" * 60,
        ]
        return "\n".join(lines)