rikhoffbauer2
/

lyric-sync

ml-intern

Model card Files Files and versions

xet

Community

rikhoffbauer2 commited on 18 days ago

Commit

1be2d0f

verified ·

1 Parent(s): 63c7ca6

Upload lyric_sync/refine.py

Browse files

Files changed (1) hide show

lyric_sync/refine.py +340 -0

lyric_sync/refine.py ADDED Viewed

	@@ -0,0 +1,340 @@

+"""
+Audio-based timing refinement using onset/offset detection.
+Refines coarse word timestamps (from ASR alignment) to sub-10ms precision
+using signal-domain analysis of the vocals waveform:
+1. Onset detection (spectral flux + librosa ODF) → snap word starts
+2. RMS energy envelope → find word ends (energy decay)
+3. Silence gap detection → refine inter-word boundaries
+4. Sanity constraints (minimum duration, no overlaps)
+Reference: Standard MIR onset detection (librosa) combined with
+forced-alignment-specific refinement heuristics.
+"""
+import logging
+from typing import Optional
+import numpy as np
+from lyric_sync.transcribe import TimedWord
+logger = logging.getLogger(__name__)
+class TimingRefiner:
+    """
+    Refine word-level timestamps using audio signal analysis.
+    Operates on the isolated vocals waveform (post-Demucs separation).
+    Expects mono float32 audio at 44100 Hz for maximum temporal precision.
+    """
+    def __init__(
+        self,
+        sr: int = 44100,
+        hop_length: int = 256,
+        onset_search_window_sec: float = 0.08,
+        offset_search_window_sec: float = 0.05,
+        silence_threshold_db: float = -45.0,
+        min_word_duration_sec: float = 0.03,
+        fmin: float = 80.0,
+        fmax: float = 4000.0,
+    ):
+        """
+        Args:
+            sr: Sample rate of input audio (44100 recommended for precision)
+            hop_length: STFT hop length. 256 at 44100Hz → 5.8ms frame resolution.
+            onset_search_window_sec: Search window for onset snapping (±this around ASR time)
+            offset_search_window_sec: Search window for end-of-word refinement
+            silence_threshold_db: dB below peak RMS to consider "silence"
+            min_word_duration_sec: Minimum allowed word duration
+            fmin: Lowest frequency for vocal onset detection (Hz)
+            fmax: Highest frequency for vocal onset detection (Hz)
+        """
+        self.sr = sr
+        self.hop_length = hop_length
+        self.onset_search_window_sec = onset_search_window_sec
+        self.offset_search_window_sec = offset_search_window_sec
+        self.silence_threshold_db = silence_threshold_db
+        self.min_word_duration_sec = min_word_duration_sec
+        self.fmin = fmin
+        self.fmax = fmax
+    def refine(
+        self,
+        vocals: np.ndarray,
+        words: list[TimedWord],
+    ) -> list[TimedWord]:
+        """
+        Refine all word timestamps using audio analysis.
+        Args:
+            vocals: Mono float32 numpy array at self.sr Hz
+            words: Words with coarse timestamps from alignment
+        Returns:
+            Words with refined timestamps
+        """
+        import librosa
+        if len(vocals) == 0 or not words:
+            return words
+        # Pre-compute analysis signals
+        odf = self._compute_onset_envelope(vocals)
+        rms = self._compute_rms_envelope(vocals)
+        rms_smooth = self._smooth(rms, window_size=7)
+        silence_gaps = self._detect_silence_gaps(rms)
+        onset_frames = self._detect_onsets(odf)
+        logger.info(
+            f"Timing refinement: {len(onset_frames)} onsets, "
+            f"{len(silence_gaps)} silence gaps detected"
+        )
+        refined = []
+        for word in words:
+            w = TimedWord(
+                word=word.word,
+                start=word.start,
+                end=word.end,
+                confidence=word.confidence,
+            )
+            # Refine start → snap to nearest onset
+            w.start = self._snap_to_onset(
+                w.start, onset_frames, odf
+            )
+            # Refine end → find energy drop-off
+            w.end = self._refine_end(w.end, rms_smooth)
+            # Sanity: end must be after start with minimum duration
+            if w.end <= w.start + self.min_word_duration_sec:
+                w.end = w.start + self.min_word_duration_sec
+            refined.append(w)
+        # Silence gap snapping (final pass)
+        refined = self._snap_to_silence_gaps(refined, silence_gaps)
+        # Ensure no overlaps
+        refined = self._resolve_overlaps(refined)
+        return refined
+    def _compute_onset_envelope(self, y: np.ndarray) -> np.ndarray:
+        """Compute onset strength envelope tuned for vocals."""
+        import librosa
+        odf = librosa.onset.onset_strength(
+            y=y,
+            sr=self.sr,
+            hop_length=self.hop_length,
+            n_fft=1024,
+            fmin=self.fmin,
+            fmax=self.fmax,
+            aggregate=np.median,
+            detrend=True,
+            center=True,
+        )
+        return odf
+    def _compute_rms_envelope(self, y: np.ndarray) -> np.ndarray:
+        """Compute RMS energy per frame."""
+        import librosa
+        rms = librosa.feature.rms(
+            y=y,
+            frame_length=1024,
+            hop_length=self.hop_length,
+            center=True,
+        )[0]
+        return rms
+    def _detect_onsets(self, odf: np.ndarray) -> np.ndarray:
+        """Detect all onsets in the onset envelope."""
+        import librosa
+        onsets = librosa.onset.onset_detect(
+            onset_envelope=odf,
+            sr=self.sr,
+            hop_length=self.hop_length,
+            backtrack=True,
+            units='frames',
+            pre_max=2,
+            post_max=2,
+            pre_avg=2,
+            post_avg=4,
+            delta=0.05,
+            wait=8,
+        )
+        return onsets
+    def _detect_silence_gaps(
+        self,
+        rms: np.ndarray,
+        min_gap_frames: int = 3,
+    ) -> list[tuple[float, float]]:
+        """
+        Detect silence regions in the RMS envelope.
+        Returns list of (gap_start_sec, gap_end_sec).
+        """
+        import librosa
+        rms_db = librosa.amplitude_to_db(rms + 1e-10, ref=rms.max() + 1e-10)
+        is_silent = rms_db < self.silence_threshold_db
+        gaps = []
+        in_gap = False
+        gap_start = 0
+        for i, silent in enumerate(is_silent):
+            if silent and not in_gap:
+                in_gap = True
+                gap_start = i
+            elif not silent and in_gap:
+                if i - gap_start >= min_gap_frames:
+                    t_start = librosa.frames_to_time(gap_start, sr=self.sr, hop_length=self.hop_length)
+                    t_end = librosa.frames_to_time(i, sr=self.sr, hop_length=self.hop_length)
+                    gaps.append((t_start, t_end))
+                in_gap = False
+        return gaps
+    def _snap_to_onset(
+        self,
+        approx_time: float,
+        onset_frames: np.ndarray,
+        odf: np.ndarray,
+    ) -> float:
+        """Snap an approximate word-start to the nearest detected onset."""
+        import librosa
+        if len(onset_frames) == 0:
+            return approx_time
+        approx_frame = librosa.time_to_frames(
+            approx_time, sr=self.sr, hop_length=self.hop_length
+        )
+        window_frames = int(self.onset_search_window_sec * self.sr / self.hop_length)
+        # Find onsets within search window
+        lo = approx_frame - window_frames
+        hi = approx_frame + window_frames
+        candidates = onset_frames[(onset_frames >= lo) & (onset_frames <= hi)]
+        if len(candidates) == 0:
+            return approx_time
+        # Pick the onset nearest to the ASR timestamp
+        nearest_frame = candidates[np.argmin(np.abs(candidates - approx_frame))]
+        return librosa.frames_to_time(nearest_frame, sr=self.sr, hop_length=self.hop_length)
+    def _refine_end(self, approx_end: float, rms_smooth: np.ndarray) -> float:
+        """Refine word end by finding energy drop-off."""
+        import librosa
+        rms_db = librosa.amplitude_to_db(rms_smooth + 1e-10, ref=rms_smooth.max() + 1e-10)
+        end_frame = librosa.time_to_frames(
+            approx_end, sr=self.sr, hop_length=self.hop_length
+        )
+        search_frames = int(self.offset_search_window_sec * self.sr / self.hop_length)
+        lo = max(0, end_frame - search_frames)
+        hi = min(len(rms_db) - 1, end_frame + search_frames)
+        if lo >= hi:
+            return approx_end
+        # Find first frame where energy drops significantly
+        window_db = rms_db[lo:hi + 1]
+        threshold = self.silence_threshold_db + 5  # slightly above full silence
+        silent_frames = np.where(window_db < threshold)[0]
+        if len(silent_frames) > 0:
+            # First energy drop in the window
+            drop_frame = lo + silent_frames[0]
+            return librosa.frames_to_time(drop_frame, sr=self.sr, hop_length=self.hop_length)
+        # No clear drop: use energy minimum in window
+        min_frame = lo + np.argmin(rms_smooth[lo:hi + 1])
+        return librosa.frames_to_time(min_frame, sr=self.sr, hop_length=self.hop_length)
+    def _snap_to_silence_gaps(
+        self,
+        words: list[TimedWord],
+        gaps: list[tuple[float, float]],
+        snap_tolerance: float = 0.04,
+    ) -> list[TimedWord]:
+        """Snap word boundaries to nearby silence gaps."""
+        refined = []
+        for word in words:
+            w = TimedWord(
+                word=word.word,
+                start=word.start,
+                end=word.end,
+                confidence=word.confidence,
+            )
+            for gap_start, gap_end in gaps:
+                # Snap word start to end of gap (sound resumes)
+                if abs(gap_end - w.start) < snap_tolerance:
+                    w.start = gap_end
+                # Snap word end to start of gap (sound stops)
+                if abs(gap_start - w.end) < snap_tolerance:
+                    w.end = gap_start
+            refined.append(w)
+        return refined
+    def _resolve_overlaps(self, words: list[TimedWord]) -> list[TimedWord]:
+        """Ensure no word overlaps with the next, maintaining monotonic order."""
+        for i in range(len(words) - 1):
+            if words[i].end > words[i + 1].start:
+                # Split the overlap at the midpoint
+                mid = (words[i].end + words[i + 1].start) / 2
+                words[i] = TimedWord(
+                    word=words[i].word,
+                    start=words[i].start,
+                    end=mid,
+                    confidence=words[i].confidence,
+                )
+                words[i + 1] = TimedWord(
+                    word=words[i + 1].word,
+                    start=mid,
+                    end=words[i + 1].end,
+                    confidence=words[i + 1].confidence,
+                )
+        return words
+    @staticmethod
+    def _smooth(arr: np.ndarray, window_size: int = 5) -> np.ndarray:
+        """Simple uniform smoothing."""
+        if window_size <= 1:
+            return arr
+        kernel = np.ones(window_size) / window_size
+        return np.convolve(arr, kernel, mode='same')
+def refine_timings(
+    vocals: np.ndarray,
+    sr: int,
+    words: list[TimedWord],
+    **kwargs,
+) -> list[TimedWord]:
+    """
+    Convenience function: refine word timestamps using audio analysis.
+    Args:
+        vocals: Mono float32 numpy array (ideally at 44100 Hz)
+        sr: Sample rate
+        words: Words with coarse timestamps
+        **kwargs: Additional args for TimingRefiner
+    Returns:
+        Words with refined timestamps
+    """
+    refiner = TimingRefiner(sr=sr, **kwargs)
+    return refiner.refine(vocals, words)