| |
| """Unit tests for caption_parser module.""" |
|
|
| import sys |
| import json |
| import tempfile |
| from pathlib import Path |
|
|
| |
| sys.path.insert(0, str(Path(__file__).parent.parent)) |
|
|
| from caption_parser import normalize_sinhala_text, parse_youtube_json3 |
|
|
|
|
def test_normalize_basic():
    """Leading and trailing whitespace is removed from plain ASCII text."""
    assert normalize_sinhala_text(" hello world ") == "hello world"
|
|
|
|
def test_normalize_music_markers():
    """Neither music-note symbol may remain in the normalized output."""
    cleaned = normalize_sinhala_text("♪ සංගීතය ♫")
    for marker in ("♪", "♫"):
        assert marker not in cleaned
|
|
|
|
def test_normalize_parenthetical():
    """A parenthesized aside inside a sentence must not survive normalization."""
    sentence = "මෙය (සංගීතය) වාක්යයකි"
    assert "(සංගීතය)" not in normalize_sinhala_text(sentence)
|
|
|
|
def test_normalize_punctuation_spacing():
    """A full stop glued to the following word gets a space after it."""
    normalized = normalize_sinhala_text("මෙය.වාක්යයකි")
    fully_spaced = "මෙය. වාක්යයකි"
    # The exact-match case already contains ". ", so the substring check
    # alone decides the outcome; both conditions are kept as written.
    assert normalized == fully_spaced or ". " in normalized
|
|
|
|
def test_normalize_nfc():
    """A conjunct sequence containing a zero-width joiner yields non-empty text.

    Only non-emptiness is asserted; the exact normalized form is not pinned.
    """
    # ka, al-lakuna (U+0DCA), zero-width joiner (U+200D), ssa (U+0DC2)
    conjunct = "ක\u0dca\u200d\u0dc2"
    normalized = normalize_sinhala_text(conjunct)
    assert len(normalized) > 0
|
|
|
|
def test_parse_uses_segment_offsets_for_timing():
    """Row timing comes from segment offsets and stops before the next event.

    The first event starts at 1000 ms, its first segment is offset by 300 ms,
    and its declared duration (5000 ms) overlaps the second event, which
    starts at 2300 ms. The test expects the first row to start at 1.3 s and
    to end no later than 1 ms before the second event begins.
    """
    # Local import: the file-level import block does not provide ``math``.
    import math

    payload = {
        "events": [
            {
                "tStartMs": 1000,
                "dDurationMs": 5000,
                "segs": [
                    {"utf8": "හෙලෝ", "tOffsetMs": 300},
                    {"utf8": " වර්ල්ඩ්", "tOffsetMs": 900},
                ],
            },
            {
                "tStartMs": 2300,
                "dDurationMs": 1000,
                "segs": [{"utf8": "ඊළඟ", "tOffsetMs": 0}],
            },
        ],
    }
    with tempfile.TemporaryDirectory() as tmp:
        p = Path(tmp) / "sample.json3"
        p.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")
        rows = parse_youtube_json3(p)

    # Millisecond-to-second conversion is float arithmetic, so compare with a
    # tolerance far below the 1 ms resolution of the input instead of ``==``.
    tolerance = 1e-9

    assert len(rows) == 2
    # 1000 ms event start + 300 ms offset of the first segment.
    assert math.isclose(rows[0]["start_sec"], 1.3, abs_tol=tolerance)
    # Must finish at least 1 ms before the next event's 2300 ms start.
    assert rows[0]["end_sec"] <= 2.299 + tolerance
|
|
|
|
def test_parse_skips_pure_newline_events():
    """An event whose only segment is a newline produces no output row."""
    newline_only = {"tStartMs": 1000, "dDurationMs": 1000, "segs": [{"utf8": "\n"}]}
    with_text = {"tStartMs": 1500, "dDurationMs": 1000, "segs": [{"utf8": "පෙළ"}]}
    serialized = json.dumps(
        {"events": [newline_only, with_text]}, ensure_ascii=False
    )

    with tempfile.TemporaryDirectory() as workdir:
        caption_file = Path(workdir) / "sample.json3"
        caption_file.write_text(serialized, encoding="utf-8")
        parsed = parse_youtube_json3(caption_file)

    # Only the text-bearing event should remain.
    assert len(parsed) == 1
    assert parsed[0]["normalized_text"] == "පෙළ"
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run every test in definition order; the first
    # failing assertion aborts before the success message is printed.
    for run_test in (
        test_normalize_basic,
        test_normalize_music_markers,
        test_normalize_parenthetical,
        test_normalize_punctuation_spacing,
        test_normalize_nfc,
        test_parse_uses_segment_offsets_for_timing,
        test_parse_skips_pure_newline_events,
    ):
        run_test()
    print("All caption_parser tests passed!")
|
|