Pukhto_Pashto / scripts /validate_normalization.py

musaw

feat(data): add normalization starter dataset and validator

379266c 3 days ago

2.98 kB

	"""Validate normalization seed data for the Pashto project.

	Usage:
	python scripts/validate_normalization.py data/processed/normalization_seed_v0.1.tsv
	"""

	from __future__ import annotations

	import csv
	import sys
	from pathlib import Path


	REQUIRED_COLUMNS = ("id", "raw_text", "normalized_text", "note")


	def detect_delimiter(first_line: str) -> str \| None:
	if "\t" in first_line:
	return "\t"
	if "," in first_line:
	return ","
	return None


	def validate_file(path: Path) -> list[str]:
	errors: list[str] = []
	seen_ids: dict[str, int] = {}

	if not path.exists():
	return [f"File not found: {path}"]

	with path.open("r", encoding="utf-8-sig", newline="") as handle:
	first_line = handle.readline()
	if not first_line:
	return [f"Empty file: {path}"]

	delimiter = detect_delimiter(first_line)
	if delimiter is None:
	return [
	"Could not detect delimiter. Use TSV (preferred) or CSV with headers: "
	+ ", ".join(REQUIRED_COLUMNS)
	]

	handle.seek(0)
	reader = csv.DictReader(handle, delimiter=delimiter)

	if reader.fieldnames is None:
	return [f"Missing header row in: {path}"]

	missing = [col for col in REQUIRED_COLUMNS if col not in reader.fieldnames]
	if missing:
	errors.append(f"Missing required columns: {', '.join(missing)}")
	return errors

	row_count = 0
	for line_number, row in enumerate(reader, start=2):
	row_count += 1

	row_id = (row.get("id") or "").strip()
	raw_text = (row.get("raw_text") or "").strip()
	normalized_text = (row.get("normalized_text") or "").strip()

	if not row_id:
	errors.append(f"Line {line_number}: empty 'id'")
	elif row_id in seen_ids:
	errors.append(
	f"Line {line_number}: duplicate id '{row_id}' "
	f"(first seen at line {seen_ids[row_id]})"
	)
	else:
	seen_ids[row_id] = line_number

	if not raw_text:
	errors.append(f"Line {line_number}: empty 'raw_text'")
	if not normalized_text:
	errors.append(f"Line {line_number}: empty 'normalized_text'")

	if row_count == 0:
	errors.append("No data rows found.")

	return errors


	def main() -> int:
	if len(sys.argv) != 2:
	print(
	"Usage: python scripts/validate_normalization.py "
	"data/processed/normalization_seed_v0.1.tsv"
	)
	return 2

	input_path = Path(sys.argv[1])
	errors = validate_file(input_path)

	if errors:
	print("Validation failed:")
	for error in errors:
	print(f"- {error}")
	return 1

	print(f"Validation passed: {input_path}")
	return 0


	if __name__ == "__main__":
	raise SystemExit(main())