mazesmazes
/

tiny-audio

@@ -4,9 +4,9 @@ import numpy as np
 import torch
 # Offset compensation for Wav2Vec2-BASE systematic bias (in seconds)
-# Calibrated on librispeech-alignments dataset
-START_OFFSET = 0.06  # Subtract from start times (shift earlier)
-END_OFFSET = -0.03  # Add to end times (shift later)
 def _get_device() -> str:
@@ -254,7 +254,7 @@ class ForcedAligner:
         word_idx = 0
         separator_id = dictionary.get("|", dictionary.get(" ", 0))
-        for token_id, start_frame, end_frame, peak_frame in alignment_path:
             if token_id == separator_id:  # Word separator
                 if (
                     first_char_peak is not None

 import torch
 # Offset compensation for Wav2Vec2-BASE systematic bias (in seconds)
+# Calibrated on librispeech-alignments dataset (n=25, Median AE=20ms)
+START_OFFSET = 0.03  # Subtract from start times (shift earlier)
+END_OFFSET = -0.03  # Subtract from end times (shift earlier)
 def _get_device() -> str:
         word_idx = 0
         separator_id = dictionary.get("|", dictionary.get(" ", 0))
+        for token_id, _start_frame, _end_frame, peak_frame in alignment_path:
             if token_id == separator_id:  # Word separator
                 if (
                     first_char_peak is not None