Spaces:

danielrosehill
/

STT-Comparison

Running

File size: 3,452 Bytes

0aa8adc

#!/usr/bin/env python3
"""
Adjust SRT file timestamps to align with ground truth.

This script removes timing offsets so that every transcript starts at 00:00:00,000.
"""

import re
from pathlib import Path


def parse_timestamp(timestamp_str):
    """Return the number of milliseconds encoded by an SRT timestamp.

    Args:
        timestamp_str: Timestamp text in ``HH:MM:SS,mmm`` form.

    Returns:
        The timestamp expressed as a whole number of milliseconds.

    Raises:
        ValueError: If the text does not split into exactly one comma-separated
            pair and three colon-separated integer fields.
    """
    # The comma separates the clock part from the millisecond part.
    clock, millis = timestamp_str.split(',')
    hours, minutes, seconds = (int(field) for field in clock.split(':'))
    total_seconds = hours * 3600 + minutes * 60 + seconds
    return total_seconds * 1000 + int(millis)


def format_timestamp(ms):
    """Render a millisecond count as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        ms: Time in milliseconds, as an int.

    Returns:
        The formatted timestamp string; hours are zero-padded to at least
        two digits and grow wider when needed.
    """
    # Peel units off from smallest to largest; each divmod keeps the
    # remainder for the current field and carries the quotient upward.
    total_seconds, milliseconds = divmod(ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def adjust_srt_timing(input_path, output_path, offset_ms):
    """
    Adjust all timestamps in an SRT file by subtracting offset_ms.

    Every ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` span is rewritten (with the
    separator normalised to a single space on each side of ``-->``); all
    other text is copied through unchanged. A leading byte-order mark is
    dropped, shifted times are clamped at zero, and the file's existing line
    endings are preserved.

    Args:
        input_path: Path to input SRT file (str or path-like)
        output_path: Path to output SRT file (str or path-like); may be the
            same file as input_path, since the input is fully read first
        offset_ms: Offset in milliseconds to subtract from all timestamps
    """
    # Normalise so plain-string paths work with the ``.name`` lookups in the
    # summary line below instead of failing after the file was written.
    src = Path(input_path)
    dst = Path(output_path)

    # newline='' turns off newline translation, so CRLF files stay CRLF
    # rather than being silently converted to the platform default.
    with open(src, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    # Remove BOM if present
    content = content.lstrip('\ufeff')

    # Pattern to match timestamp lines: HH:MM:SS,mmm --> HH:MM:SS,mmm
    timestamp_pattern = re.compile(
        r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})'
    )

    def adjust_match(match):
        """Return the matched timestamp pair shifted back by offset_ms."""
        start_ms = parse_timestamp(match.group(1))
        end_ms = parse_timestamp(match.group(2))

        # Clamp at zero: a cue that begins before the offset must not
        # produce a negative (and therefore malformed) timestamp.
        new_start = format_timestamp(max(0, start_ms - offset_ms))
        new_end = format_timestamp(max(0, end_ms - offset_ms))

        return f"{new_start} --> {new_end}"

    adjusted_content = timestamp_pattern.sub(adjust_match, content)

    with open(dst, 'w', encoding='utf-8', newline='') as f:
        f.write(adjusted_content)

    print(f"✓ Adjusted {src.name}: offset={offset_ms}ms → {dst.name}")


def find_first_timestamp(srt_path):
    """Return the first start timestamp of an SRT file, in milliseconds.

    Args:
        srt_path: Path to the SRT file to inspect.

    Returns:
        The value of the first ``HH:MM:SS,mmm`` that is followed by ``-->``,
        converted to milliseconds, or 0 when the file contains no such
        timestamp.
    """
    with open(srt_path, 'r', encoding='utf-8') as srt_file:
        text = srt_file.read()

    # Only the start time (left of the arrow) is captured.
    first_hit = re.search(r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->', text)
    if first_hit is None:
        return 0
    return parse_timestamp(first_hit.group(1))


def main():
    """Shift each known SRT file in ``srt-out`` so it starts at zero.

    For every expected transcript file beside this script, the first start
    timestamp is used as the offset and the file is rewritten in place.
    Missing files and files whose first timestamp is already zero are
    reported and left alone.
    """
    srt_dir = Path(__file__).parent / "srt-out"

    # Transcripts expected in the output directory.
    srt_files = (
        "assembly.srt",
        "gladia.srt",
        "nova3.srt",
        "speechmatics.srt",
    )

    print("Analyzing SRT files for timing offset...\n")

    for filename in srt_files:
        target = srt_dir / filename

        if target.exists():
            # The first start time is exactly the amount to shift by.
            lead_in_ms = find_first_timestamp(target)
            if lead_in_ms != 0:
                # Same path for input and output: rewrite in place.
                adjust_srt_timing(target, target, lead_in_ms)
            else:
                print(f"✓ {filename} already starts at 00:00:00,000 (no adjustment needed)")
        else:
            print(f"⚠ Skipping {filename} (not found)")

    print("\n✅ All SRT files have been adjusted to start at 00:00:00,000")


# Run the adjustment only when this file is executed as a script, so that
# importing it for its helper functions has no side effects.
if __name__ == "__main__":
    main()