Spaces:

hetchyy
/

Quran-multi-aligner

Running on Zero

File size: 7,089 Bytes

20e9692

"""Data types for the segmentation pipeline."""

from dataclasses import dataclass
from typing import Optional


@dataclass
class VadSegment:
    """One raw segment emitted by VAD, described only by its timing.

    Attributes:
        start_time: Where the segment begins (units are not fixed here;
            presumably seconds -- confirm against the producer).
        end_time: Where the segment ends, in the same units as ``start_time``.
        segment_idx: Integer index identifying the segment (presumably its
            position in the VAD output -- confirm against the producer).
    """

    start_time: float
    end_time: float
    segment_idx: int


@dataclass
class SegmentInfo:
    """A segment after processing: timing, transcription and match outcome.

    The first six fields are required; the remaining three default to
    "no error / nothing flagged".
    """

    # --- timing (same caveat as elsewhere: presumably seconds -- confirm) ---
    start_time: float
    end_time: float

    # --- transcription and matching ---
    transcribed_text: str
    matched_text: str
    # Reference string for the match, e.g. "2:255:1-2:255:5".
    matched_ref: str
    # Score attached to the match (range/scale not defined in this file).
    match_score: float

    # --- optional diagnostics ---
    # Error message for this segment, or None when no error was recorded.
    error: Optional[str] = None
    # Flag: the match is missing words.
    has_missing_words: bool = False
    # Flag: the segment may cover more than it should (under-segmentation).
    potentially_undersegmented: bool = False


@dataclass
class ProfilingData:
    """Profiling metrics for the processing pipeline.

    Every field has a default (``0``/``0.0`` for numbers, ``None`` for the
    list-valued fields), so an instance can be created with no arguments.
    ``summary()`` renders the collected values as a text report.
    """
    # Preprocessing
    resample_time: float = 0.0               # Audio resampling time
    # VAD profiling
    vad_model_load_time: float = 0.0
    vad_model_move_time: float = 0.0
    vad_inference_time: float = 0.0
    vad_gpu_time: float = 0.0               # Actual GPU lease execution time
    vad_wall_time: float = 0.0              # Wall-clock time (includes queue wait)
    # Phoneme ASR profiling
    asr_time: float = 0.0                    # Wav2vec wall-clock time (includes queue wait)
    asr_gpu_time: float = 0.0               # Actual GPU lease execution time
    asr_model_move_time: float = 0.0         # ASR model GPU move time
    asr_sorting_time: float = 0.0            # Duration-sorting time
    asr_batch_build_time: float = 0.0        # Dynamic batch construction time
    # Per-batch timing details; None until set.  summary() reads the keys
    # batch_num, size, time, min_dur, max_dur, avg_dur, total_seconds and
    # pad_waste from each entry.
    asr_batch_profiling: Optional[list] = None
    # Global anchor profiling
    anchor_time: float = 0.0                 # N-gram voting anchor detection
    # Phoneme alignment profiling
    phoneme_total_time: float = 0.0          # Overall phoneme matching time
    phoneme_ref_build_time: float = 0.0      # Time to build chapter reference
    phoneme_dp_total_time: float = 0.0       # Total DP time across all segments
    phoneme_dp_min_time: float = 0.0         # Min DP time per segment
    phoneme_dp_max_time: float = 0.0         # Max DP time per segment
    phoneme_window_setup_time: float = 0.0   # Total window slicing time
    phoneme_result_build_time: float = 0.0   # Total result construction time
    phoneme_num_segments: int = 0            # Number of segments aligned
    match_wall_time: float = 0.0             # Total matching wall-clock time
    # Retry / reanchor counters
    tier1_attempts: int = 0
    tier1_passed: int = 0
    tier1_segments: Optional[list] = None    # None is reported as [] by summary()
    tier2_attempts: int = 0
    tier2_passed: int = 0
    tier2_segments: Optional[list] = None    # None is reported as [] by summary()
    consec_reanchors: int = 0
    segments_attempted: int = 0
    segments_passed: int = 0
    special_merges: int = 0
    # Result building profiling
    result_build_time: float = 0.0           # Total result building time
    result_audio_encode_time: float = 0.0    # Audio-to-data-URL encoding
    # Total pipeline time
    total_time: float = 0.0                  # End-to-end pipeline time

    @property
    def phoneme_dp_avg_time(self) -> float:
        """Average DP time per segment (0.0 when no segments were aligned)."""
        if self.phoneme_num_segments == 0:
            return 0.0
        return self.phoneme_dp_total_time / self.phoneme_num_segments

    @staticmethod
    def _fmt(seconds):
        """Format seconds as m:ss.fff when >= 60s, else as s.fffs.

        The value is rounded to milliseconds *before* it is split into
        minutes and seconds.  Splitting first and rounding afterwards lets
        the seconds field round up to ``60.000`` without carrying into the
        minutes (119.9996 would render as ``1:60.000`` instead of
        ``2:00.000``).
        """
        seconds = round(seconds, 3)
        if seconds >= 60:
            m, s = divmod(seconds, 60)
            return f"{int(m)}:{s:06.3f}"
        return f"{seconds:.3f}s"

    def summary(self) -> str:
        """Return a formatted profiling summary.

        The report is a newline-joined string with one section per pipeline
        stage.  "queue" figures are the stage's wall-clock time minus its GPU
        time, and "unaccounted" is ``total_time`` minus the sum of the
        per-stage wall-clock times.
        """
        _fmt = self._fmt
        batches = self.asr_batch_profiling or []
        lines = [
            "\n" + "=" * 60,
            "PROFILING SUMMARY",
            "=" * 60,
            "  Preprocessing:",
            f"    Resample:        {self.resample_time:.3f}s",
            f"  VAD:                                 wall {_fmt(self.vad_wall_time)}",
            f"    GPU Time:        {self.vad_gpu_time:.3f}s   (queue {self.vad_wall_time - self.vad_gpu_time:.3f}s)",
            f"    Model Load:      {self.vad_model_load_time:.3f}s",
            f"    Model Move:      {self.vad_model_move_time:.3f}s",
            f"    Inference:       {self.vad_inference_time:.3f}s",
            f"  Phoneme ASR:                         wall {_fmt(self.asr_time)}",
            f"    GPU Time:        {self.asr_gpu_time:.3f}s   (queue {self.asr_time - self.asr_gpu_time:.3f}s)",
            f"    Model Move:      {self.asr_model_move_time:.3f}s",
            f"    Sorting:         {self.asr_sorting_time:.3f}s",
            f"    Batch Build:     {self.asr_batch_build_time:.3f}s",
            f"    Batches:         {len(batches)}",
        ]
        # One detail line per ASR batch (A = average duration, T = total
        # seconds, W = padding waste shown as a percentage).
        for b in batches:
            lines.append(
                f"    Batch {b['batch_num']:>2}: {b['size']:>3} segs | "
                f"{b['time']:.3f}s | "
                f"{b['min_dur']:.2f}-{b['max_dur']:.2f}s "
                f"(A {b['avg_dur']:.2f}s, T {b['total_seconds']:.1f}s, W {b['pad_waste']:.0%})"
            )
        lines += [
            "  Global Anchor:",
            f"    N-gram Voting:   {self.anchor_time:.3f}s",
            f"  Phoneme Alignment:                   wall {_fmt(self.match_wall_time)}",
            f"    Ref Build:       {self.phoneme_ref_build_time:.3f}s",
            f"    Window Setup:    {self.phoneme_window_setup_time:.3f}s",
            f"    DP Total:        {self.phoneme_dp_total_time:.3f}s",
            f"    Segments:        {self.phoneme_num_segments}",
            f"    DP Avg/segment:  {1000*self.phoneme_dp_avg_time:.3f}ms",
            f"    DP Min:          {1000*self.phoneme_dp_min_time:.3f}ms",
            f"    DP Max:          {1000*self.phoneme_dp_max_time:.3f}ms",
        ]
        # Guard the pass percentage against division by zero when nothing
        # was attempted.
        pct = 100 * self.segments_passed / self.segments_attempted if self.segments_attempted else 0
        t1_segs = self.tier1_segments or []
        t2_segs = self.tier2_segments or []
        lines += [
            "  Alignment Stats:",
            f"    Attempted:       {self.segments_attempted}",
            f"    Passed:          {self.segments_passed}  ({pct:.1f}%)",
            f"    Tier 1 Retries:  {self.tier1_passed}/{self.tier1_attempts} passed   segments: {t1_segs}",
            f"    Tier 2 Retries:  {self.tier2_passed}/{self.tier2_attempts} passed   segments: {t2_segs}",
            f"    Reanchors (consec failures): {self.consec_reanchors}",
            f"    Special Merges:  {self.special_merges}",
            "-" * 60,
        ]
        # Sum of the per-stage wall-clock figures; whatever remains of
        # total_time is reported as unaccounted (it can be negative).
        profiled_sum = (self.resample_time + self.vad_wall_time + self.asr_time
                        + self.anchor_time + self.match_wall_time + self.result_build_time)
        unaccounted = self.total_time - profiled_sum
        lines += [
            f"  PROFILED SUM:      {_fmt(profiled_sum)}",
            f"  TOTAL (wall):      {_fmt(self.total_time)}   (unaccounted: {_fmt(unaccounted)})",
            "=" * 60,
        ]
        return "\n".join(lines)