File size: 3,101 Bytes
657e7ee
 
 
 
18e3c61
 
657e7ee
 
 
 
 
18e3c61
657e7ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18e3c61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
657e7ee
 
 
 
 
 
18e3c61
 
657e7ee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3
"""Unit tests for caption_parser module."""

import json
import math
import sys
import tempfile
import unicodedata
from pathlib import Path

# Put the directory above this file's folder on sys.path so caption_parser can be imported.
sys.path.insert(0, str(Path(__file__).parent.parent))

from caption_parser import normalize_sinhala_text, parse_youtube_json3


def test_normalize_basic():
    """Outer whitespace is trimmed and inner runs collapse to one space."""
    assert normalize_sinhala_text("  hello   world  ") == "hello world"


def test_normalize_music_markers():
    """Neither music-note glyph may appear in the normalized output."""
    cleaned = normalize_sinhala_text("♪ සංගීතය ♫")
    for glyph in ("♪", "♫"):
        assert glyph not in cleaned


def test_normalize_parenthetical():
    """The bracketed aside must not appear in the normalized output."""
    cleaned = normalize_sinhala_text("මෙය (සංගීතය) වාක්‍යයකි")
    assert "(සංගීතය)" not in cleaned


def test_normalize_punctuation_spacing():
    """A full stop glued to the next word gains a following space."""
    spaced = normalize_sinhala_text("මෙය.වාක්‍යයකි")
    gap_after_period = ". " in spaced
    # The exact-match alternative already contains ". ", so the first
    # condition decides the outcome; it is kept to show the intended output.
    assert gap_after_period or spaced == "මෙය. වාක්‍යයකි"


def test_normalize_nfc():
    """A conjunct sequence normalizes to non-empty text in Unicode NFC form."""
    # k + halant (U+0DCA) + ZWJ (U+200D) + sha (U+0DC2)
    text = "ක\u0dca\u200d\u0dc2"
    result = normalize_sinhala_text(text)
    assert len(result) > 0
    # NFC is idempotent: re-normalizing text that is already NFC is a no-op.
    # NOTE(review): assumes normalize_sinhala_text emits NFC, as the test
    # name implies -- confirm against the implementation.
    assert unicodedata.normalize("NFC", result) == result


def test_parse_uses_segment_offsets_for_timing():
    """Row timing comes from segment offsets and does not overlap the next cue.

    The first event has tStartMs=1000 and its earliest segment offset is
    300 ms, so the expected start is 1.3 s. The second event starts at
    2300 ms, so the first row's end must not exceed 2.299 s.
    """
    payload = {
        "events": [
            {
                "tStartMs": 1000,
                "dDurationMs": 5000,
                "segs": [
                    {"utf8": "හෙලෝ", "tOffsetMs": 300},
                    {"utf8": " වර්ල්ඩ්", "tOffsetMs": 900},
                ],
            },
            {
                "tStartMs": 2300,
                "dDurationMs": 1000,
                "segs": [{"utf8": "ඊළඟ", "tOffsetMs": 0}],
            },
        ],
    }
    with tempfile.TemporaryDirectory() as tmp:
        p = Path(tmp) / "sample.json3"
        p.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")
        rows = parse_youtube_json3(p)

    assert len(rows) == 2
    # Start uses earliest segment offset, not raw tStartMs.
    # Compared with a tolerance: the value is a computed float, so exact
    # equality would depend on how the parser converts ms to seconds.
    assert math.isclose(rows[0]["start_sec"], 1.3, abs_tol=1e-9)
    # End is capped by next cue start to avoid overlap.
    assert rows[0]["end_sec"] <= 2.299


def test_parse_skips_pure_newline_events():
    """An event whose only segment is a newline produces no row."""
    newline_only = {
        "tStartMs": 1000,
        "dDurationMs": 1000,
        "segs": [{"utf8": "\n"}],
    }
    spoken = {
        "tStartMs": 1500,
        "dDurationMs": 1000,
        "segs": [{"utf8": "පෙළ"}],
    }
    document = json.dumps({"events": [newline_only, spoken]}, ensure_ascii=False)

    with tempfile.TemporaryDirectory() as workdir:
        caption_file = Path(workdir) / "sample.json3"
        caption_file.write_text(document, encoding="utf-8")
        parsed = parse_youtube_json3(caption_file)

    # Only the event carrying real text should remain.
    assert len(parsed) == 1
    assert parsed[0]["normalized_text"] == "පෙළ"


if __name__ == "__main__":
    # Run every test in declaration order when the file is executed directly.
    for check in (
        test_normalize_basic,
        test_normalize_music_markers,
        test_normalize_parenthetical,
        test_normalize_punctuation_spacing,
        test_normalize_nfc,
        test_parse_uses_segment_offsets_for_timing,
        test_parse_skips_pure_newline_events,
    ):
        check()
    print("All caption_parser tests passed!")