sinhala-tts / process / raw-extract / tests / test_caption_parser.py

Add pilot_002 outputs and pending pipeline hardening changes

18e3c61 about 1 month ago

3.1 kB

	#!/usr/bin/env python3
	"""Unit tests for caption_parser module."""

	import sys
	import json
	import tempfile
	from pathlib import Path

	# Put the directory above this file's folder on sys.path so the caption_parser import below can resolve.
	sys.path.insert(0, str(Path(__file__).parent.parent))

	from caption_parser import normalize_sinhala_text, parse_youtube_json3


	def test_normalize_basic():
	    """Leading and trailing spaces are stripped from plain ASCII text."""
	    assert normalize_sinhala_text(" hello world ") == "hello world"


	def test_normalize_music_markers():
	    """Neither musical-note glyph may survive normalization."""
	    cleaned = normalize_sinhala_text("♪ සංගීතය ♫")
	    for glyph in ("♪", "♫"):
	        assert glyph not in cleaned


	def test_normalize_parenthetical():
	    """A parenthesised annotation embedded in a sentence is removed."""
	    annotation = "(සංගීතය)"
	    sentence = "මෙය (සංගීතය) වාක්‍යයකි"
	    assert annotation not in normalize_sinhala_text(sentence)


	def test_normalize_punctuation_spacing():
	    """A full stop glued to the following word must be followed by a space.

	    For the input used here, a result such as "මෙය. වාක්‍යයකි" satisfies the
	    check; only the presence of the ". " separator is asserted.
	    """
	    result = normalize_sinhala_text("මෙය.වාක්‍යයකි")
	    # A former `or result == "<expected sentence>"` alternative was dropped:
	    # that literal itself contains ". ", so it could never change the outcome.
	    assert ". " in result


	def test_normalize_nfc():
	    """Smoke-test a Sinhala sequence containing a ZWJ: output must be non-empty."""
	    # NOTE(review): despite the test name, only non-emptiness is asserted;
	    # the NFC form of the result is not checked here.
	    # Sequence: ka + U+0DCA + U+200D (ZWJ) + U+0DC2.
	    conjunct = "ක\u0dca\u200d\u0dc2"
	    assert len(normalize_sinhala_text(conjunct)) > 0


	def test_parse_uses_segment_offsets_for_timing():
	    """Cue start follows the earliest segment offset; cue end stays before the next cue.

	    A two-event json3 payload is written to a temporary file and parsed.
	    Timing values are compared with a small tolerance instead of exact float
	    equality, so the test does not depend on how the parser converts
	    milliseconds to seconds.
	    """
	    tolerance = 1e-6  # far below the 1 ms resolution of the input timestamps
	    payload = {
	        "events": [
	            {
	                "tStartMs": 1000,
	                "dDurationMs": 5000,
	                "segs": [
	                    {"utf8": "හෙලෝ", "tOffsetMs": 300},
	                    {"utf8": " වර්ල්ඩ්", "tOffsetMs": 900},
	                ],
	            },
	            {
	                "tStartMs": 2300,
	                "dDurationMs": 1000,
	                "segs": [{"utf8": "ඊළඟ", "tOffsetMs": 0}],
	            },
	        ],
	    }
	    with tempfile.TemporaryDirectory() as tmp:
	        p = Path(tmp) / "sample.json3"
	        p.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")
	        rows = parse_youtube_json3(p)

	    assert len(rows) == 2
	    # Start uses earliest segment offset (1000 ms + 300 ms), not raw tStartMs.
	    assert abs(rows[0]["start_sec"] - 1.3) < tolerance
	    # End is capped by next cue start (2300 ms) to avoid overlap.
	    assert rows[0]["end_sec"] <= 2.299 + tolerance


	def test_parse_skips_pure_newline_events():
	    """An event whose only segment is a newline yields no row; the text event does."""
	    newline_only = {"tStartMs": 1000, "dDurationMs": 1000, "segs": [{"utf8": "\n"}]}
	    spoken = {"tStartMs": 1500, "dDurationMs": 1000, "segs": [{"utf8": "පෙළ"}]}
	    document = json.dumps({"events": [newline_only, spoken]}, ensure_ascii=False)

	    with tempfile.TemporaryDirectory() as workdir:
	        caption_file = Path(workdir) / "sample.json3"
	        caption_file.write_text(document, encoding="utf-8")
	        parsed = parse_youtube_json3(caption_file)

	    # Exactly one row remains, and it carries the text of the second event.
	    assert len(parsed) == 1
	    assert parsed[0]["normalized_text"] == "පෙළ"


	if __name__ == "__main__":
	    # Direct execution: run every test in declaration order. The first
	    # failing assertion aborts the run before the success message.
	    for case in (
	        test_normalize_basic,
	        test_normalize_music_markers,
	        test_normalize_parenthetical,
	        test_normalize_punctuation_spacing,
	        test_normalize_nfc,
	        test_parse_uses_segment_offsets_for_timing,
	        test_parse_skips_pure_newline_events,
	    ):
	        case()
	    print("All caption_parser tests passed!")