#!/usr/bin/env python3
"""Unit tests for caption_parser module.

The tests are plain ``assert``-based functions, so they can be collected by a
test runner or executed directly by running this file as a script.
"""

import json
import math
import sys
import tempfile
from pathlib import Path

# Put the directory above this file's directory on the import path so that
# ``caption_parser`` can be imported when the tests are run from here.
sys.path.insert(0, str(Path(__file__).parent.parent))

from caption_parser import normalize_sinhala_text, parse_youtube_json3  # noqa: E402


def _parse_payload(payload):
    """Write *payload* as JSON to a temporary file and parse it.

    The payload is serialized to ``sample.json3`` (UTF-8, non-ASCII characters
    kept verbatim) inside a temporary directory, and the file path is handed
    to ``parse_youtube_json3``. The temporary directory is removed on return.

    Args:
        payload: JSON-serializable object to write to the caption file.

    Returns:
        Whatever ``parse_youtube_json3`` returns for the written file.
    """
    with tempfile.TemporaryDirectory() as tmp:
        path = Path(tmp) / "sample.json3"
        path.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")
        return parse_youtube_json3(path)


def test_normalize_basic():
    """Leading and trailing whitespace is stripped."""
    text = "  hello world  "
    result = normalize_sinhala_text(text)
    assert result == "hello world"


def test_normalize_music_markers():
    """Music note symbols are removed from the text."""
    text = "♪ සංගීතය ♫"
    result = normalize_sinhala_text(text)
    assert "♪" not in result
    assert "♫" not in result


def test_normalize_parenthetical():
    """A parenthesized annotation is removed from the text."""
    text = "මෙය (සංගීතය) වාක්‍යයකි"
    result = normalize_sinhala_text(text)
    assert "(සංගීතය)" not in result


def test_normalize_punctuation_spacing():
    """A space is inserted after a full stop that is glued to the next word."""
    text = "මෙය.වාක්‍යයකි"
    result = normalize_sinhala_text(text)
    # Checking for ". " also covers the exact expected string
    # "මෙය. වාක්‍යයකි", so no separate equality check is needed.
    assert ". " in result


def test_normalize_nfc():
    """A conjunct sequence containing ZWJ survives normalization."""
    # Test that composed characters are normalized
    text = "ක\u0dca\u200d\u0dc2"  # k + halant + ZWJ + sha (should normalize)
    result = normalize_sinhala_text(text)
    # NOTE(review): this only checks that the output is non-empty; it does not
    # verify the normalization form of the result.
    assert len(result) > 0


def test_parse_uses_segment_offsets_for_timing():
    """Cue start comes from segment offsets and cue end avoids overlap."""
    payload = {
        "events": [
            {
                "tStartMs": 1000,
                "dDurationMs": 5000,
                "segs": [
                    {"utf8": "හෙලෝ", "tOffsetMs": 300},
                    {"utf8": " වර්ල්ඩ්", "tOffsetMs": 900},
                ],
            },
            {
                "tStartMs": 2300,
                "dDurationMs": 1000,
                "segs": [{"utf8": "ඊළඟ", "tOffsetMs": 0}],
            },
        ],
    }
    rows = _parse_payload(payload)
    assert len(rows) == 2
    # Start uses earliest segment offset, not raw tStartMs.
    # Compared with a tolerance so the test does not depend on how the parser
    # converts milliseconds to float seconds.
    assert math.isclose(rows[0]["start_sec"], 1.3, abs_tol=1e-9)
    # End is capped by next cue start to avoid overlap.
    assert rows[0]["end_sec"] <= 2.299


def test_parse_skips_pure_newline_events():
    """An event whose only segment is a newline produces no row."""
    payload = {
        "events": [
            {"tStartMs": 1000, "dDurationMs": 1000, "segs": [{"utf8": "\n"}]},
            {"tStartMs": 1500, "dDurationMs": 1000, "segs": [{"utf8": "පෙළ"}]},
        ],
    }
    rows = _parse_payload(payload)
    assert len(rows) == 1
    assert rows[0]["normalized_text"] == "පෙළ"


if __name__ == "__main__":
    test_normalize_basic()
    test_normalize_music_markers()
    test_normalize_parenthetical()
    test_normalize_punctuation_spacing()
    test_normalize_nfc()
    test_parse_uses_segment_offsets_for_timing()
    test_parse_skips_pure_newline_events()
    print("All caption_parser tests passed!")