# sinhala-tts / process / raw-extract / tests / test_caption_parser.py
# outlawmold's picture
# Add pilot_002 outputs and pending pipeline hardening changes
# 18e3c61
#!/usr/bin/env python3
"""Unit tests for caption_parser module."""
import sys
import json
import tempfile
from pathlib import Path
# Add parent to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from caption_parser import normalize_sinhala_text, parse_youtube_json3
def test_normalize_basic():
    """Leading and trailing spaces are dropped from plain text."""
    padded = " hello world "
    assert normalize_sinhala_text(padded) == "hello world"
def test_normalize_music_markers():
    """Neither music-note glyph may remain in the normalized output."""
    cleaned = normalize_sinhala_text("♪ සංගීතය ♫")
    for glyph in ("♪", "♫"):
        assert glyph not in cleaned
def test_normalize_parenthetical():
    """The parenthesised aside must be absent from the normalized sentence."""
    sentence = "මෙය (සංගීතය) වාක්‍යයකි"
    aside = "(සංගීතය)"
    assert aside not in normalize_sinhala_text(sentence)
def test_normalize_punctuation_spacing():
    """A full stop glued to the next word is followed by a space afterwards."""
    glued = "මෙය.වාක්‍යයකි"
    spaced = "මෙය. වාක්‍යයකි"
    normalized = normalize_sinhala_text(glued)
    # Accept the exact expected sentence, or any output containing ". ".
    assert normalized == spaced or ". " in normalized
def test_normalize_nfc():
    """Smoke-test normalization of a conjunct sequence that contains a ZWJ.

    NOTE(review): only non-emptiness of the output is asserted here; the
    normalization form of the result is not verified by this test.
    """
    # ka + halant (U+0DCA) + ZWJ (U+200D) + sha (U+0DC2)
    conjunct = "ක\u0dca\u200d\u0dc2"
    assert len(normalize_sinhala_text(conjunct)) > 0
def test_parse_uses_segment_offsets_for_timing():
    """Cue start follows the segment offsets and the cue ends before the next one."""
    first_event = {
        "tStartMs": 1000,
        "dDurationMs": 5000,
        "segs": [
            {"utf8": "හෙලෝ", "tOffsetMs": 300},
            {"utf8": " වර්ල්ඩ්", "tOffsetMs": 900},
        ],
    }
    second_event = {
        "tStartMs": 2300,
        "dDurationMs": 1000,
        "segs": [{"utf8": "ඊළඟ", "tOffsetMs": 0}],
    }
    serialized = json.dumps(
        {"events": [first_event, second_event]},
        ensure_ascii=False,
    )

    with tempfile.TemporaryDirectory() as workdir:
        caption_file = Path(workdir, "sample.json3")
        caption_file.write_text(serialized, encoding="utf-8")
        rows = parse_youtube_json3(caption_file)

    assert len(rows) == 2
    head = rows[0]
    # Start is the 1000 ms event start plus the earliest 300 ms segment
    # offset, not the raw tStartMs.
    assert head["start_sec"] == 1.3
    # The next event starts at 2300 ms; the first cue must end before it
    # so the two cues do not overlap.
    assert head["end_sec"] <= 2.299
def test_parse_skips_pure_newline_events():
    """An event whose only segment is a newline produces no row."""
    newline_only = {"tStartMs": 1000, "dDurationMs": 1000, "segs": [{"utf8": "\n"}]}
    real_text = {"tStartMs": 1500, "dDurationMs": 1000, "segs": [{"utf8": "පෙළ"}]}
    serialized = json.dumps({"events": [newline_only, real_text]}, ensure_ascii=False)

    with tempfile.TemporaryDirectory() as workdir:
        caption_file = Path(workdir, "sample.json3")
        caption_file.write_text(serialized, encoding="utf-8")
        rows = parse_youtube_json3(caption_file)

    # Only the event carrying actual text should come back.
    assert len(rows) == 1
    assert rows[0]["normalized_text"] == "පෙළ"
if __name__ == "__main__":
    # Allow running this file directly: execute every test in declaration
    # order; the first failing assert aborts the run.
    for check in (
        test_normalize_basic,
        test_normalize_music_markers,
        test_normalize_parenthetical,
        test_normalize_punctuation_spacing,
        test_normalize_nfc,
        test_parse_uses_segment_offsets_for_timing,
        test_parse_skips_pure_newline_events,
    ):
        check()
    print("All caption_parser tests passed!")