Spaces:
Running
Running
File size: 3,452 Bytes
0aa8adc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
#!/usr/bin/env python3
"""
Adjust SRT file timestamps to align with ground truth.
This script subtracts each file's first cue start time from all of its timestamps so that every transcript starts at 00:00:00,000.
"""
import re
from pathlib import Path
def parse_timestamp(timestamp_str):
    """Convert an SRT timestamp to milliseconds.

    Args:
        timestamp_str: Timestamp in ``HH:MM:SS,mmm`` form. A ``.`` is also
            accepted as the millisecond separator, and surrounding
            whitespace is ignored.

    Returns:
        Total number of milliseconds as an int.

    Raises:
        ValueError: If the string does not have the expected
            hours:minutes:seconds,milliseconds structure.
    """
    # Normalise the '.' separator variant to the canonical SRT comma.
    normalized = timestamp_str.strip().replace('.', ',')
    try:
        time_part, ms_part = normalized.split(',')
        h, m, s = map(int, time_part.split(':'))
        ms = int(ms_part)
    except ValueError as err:
        # Unpacking and int() failures are re-raised with the offending input
        # so the caller can see which timestamp was malformed.
        raise ValueError(f"Invalid SRT timestamp: {timestamp_str!r}") from err
    return (h * 3600 + m * 60 + s) * 1000 + ms
def format_timestamp(ms):
    """Convert milliseconds to SRT timestamp format (HH:MM:SS,mmm).

    Args:
        ms: Non-negative integer number of milliseconds.

    Returns:
        The formatted timestamp string. The hours field is zero-padded to
        two digits and grows wider for values of 100 hours or more.

    Raises:
        ValueError: If ``ms`` is negative. Floor division on a negative
            value would otherwise produce a malformed string such as
            ``-1:59:59,999``.
    """
    if ms < 0:
        raise ValueError(f"Cannot format negative time: {ms} ms")

    # Peel off each unit from the smallest upwards.
    total_seconds, milliseconds = divmod(ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
def adjust_srt_timing(input_path, output_path, offset_ms):
    """
    Adjust all timestamps in an SRT file by subtracting offset_ms.

    Every ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timing line is rewritten; all
    other text is copied through unchanged, including the original line
    endings. A leading byte-order mark is dropped from the output.

    Args:
        input_path: Path to input SRT file (str or pathlib.Path)
        output_path: Path to output SRT file (str or pathlib.Path). May be
            the same as input_path: the input is read completely before the
            output is opened for writing.
        offset_ms: Offset in milliseconds to subtract from all timestamps.
            Start or end times that would become negative are clamped to 0.
    """
    # Accept plain strings too: the summary line below relies on Path.name.
    input_path = Path(input_path)
    output_path = Path(output_path)

    # newline='' disables newline translation so CRLF/LF endings survive
    # the round trip instead of being rewritten to the platform default.
    with open(input_path, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    # Remove BOM if present
    content = content.lstrip('\ufeff')

    # Pattern to match timestamp lines: HH:MM:SS,mmm --> HH:MM:SS,mmm
    timestamp_pattern = re.compile(
        r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})'
    )

    def adjust_match(match):
        """Return the timing line for one match, shifted by offset_ms."""
        start_ms = parse_timestamp(match.group(1))
        end_ms = parse_timestamp(match.group(2))

        # Subtract offset, never going below the start of the file.
        new_start = format_timestamp(max(0, start_ms - offset_ms))
        new_end = format_timestamp(max(0, end_ms - offset_ms))
        return f"{new_start} --> {new_end}"

    adjusted_content = timestamp_pattern.sub(adjust_match, content)

    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        f.write(adjusted_content)

    print(f"✓ Adjusted {input_path.name}: offset={offset_ms}ms → {output_path.name}")
def find_first_timestamp(srt_path):
    """Return the start time, in milliseconds, of the first timing line.

    The file is read as UTF-8 and searched for the first
    ``HH:MM:SS,mmm -->`` occurrence. Returns 0 when no such line exists.
    """
    with open(srt_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    first_cue = re.search(r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->', text)
    if first_cue is None:
        # No timing line at all: report zero.
        return 0
    return parse_timestamp(first_cue.group(1))
def main():
    """Shift each known SRT file in ``srt-out`` so it starts at 00:00:00,000.

    The ``srt-out`` directory is resolved next to this script. For every
    listed file that exists, the start time of its first cue is used as the
    offset and the file is rewritten in place. Files that are missing are
    skipped and reported in the final summary.
    """
    srt_dir = Path(__file__).parent / "srt-out"

    # Files to adjust
    srt_files = [
        "assembly.srt",
        "gladia.srt",
        "nova3.srt",
        "speechmatics.srt",
    ]

    print("Analyzing SRT files for timing offset...\n")

    missing = []
    for filename in srt_files:
        input_path = srt_dir / filename
        if not input_path.exists():
            print(f"⚠ Skipping {filename} (not found)")
            missing.append(filename)
            continue

        # The first cue's start time is the offset to remove.
        # NOTE: find_first_timestamp also returns 0 for a file with no
        # timing lines, so such a file is reported as needing no adjustment.
        offset_ms = find_first_timestamp(input_path)
        if offset_ms == 0:
            print(f"✓ {filename} already starts at 00:00:00,000 (no adjustment needed)")
            continue

        # Adjust the file in place
        adjust_srt_timing(input_path, input_path, offset_ms)

    # Only claim full success when every listed file was actually present.
    if missing:
        print(
            f"\n⚠ Finished, but {len(missing)} of {len(srt_files)} file(s) "
            f"were not found: {', '.join(missing)}"
        )
    else:
        print("\n✅ All SRT files have been adjusted to start at 00:00:00,000")
# Run the adjustment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|