Pukhto_Pashto / scripts /validate_normalization.py
musaw
feat(data): add normalization starter dataset and validator
379266c
"""Validate normalization seed data for the Pashto project.
Usage:
python scripts/validate_normalization.py data/processed/normalization_seed_v0.1.tsv
"""
from __future__ import annotations
import csv
import sys
from pathlib import Path
REQUIRED_COLUMNS = ("id", "raw_text", "normalized_text", "note")
def detect_delimiter(first_line: str) -> str | None:
if "\t" in first_line:
return "\t"
if "," in first_line:
return ","
return None
def validate_file(path: Path) -> list[str]:
errors: list[str] = []
seen_ids: dict[str, int] = {}
if not path.exists():
return [f"File not found: {path}"]
with path.open("r", encoding="utf-8-sig", newline="") as handle:
first_line = handle.readline()
if not first_line:
return [f"Empty file: {path}"]
delimiter = detect_delimiter(first_line)
if delimiter is None:
return [
"Could not detect delimiter. Use TSV (preferred) or CSV with headers: "
+ ", ".join(REQUIRED_COLUMNS)
]
handle.seek(0)
reader = csv.DictReader(handle, delimiter=delimiter)
if reader.fieldnames is None:
return [f"Missing header row in: {path}"]
missing = [col for col in REQUIRED_COLUMNS if col not in reader.fieldnames]
if missing:
errors.append(f"Missing required columns: {', '.join(missing)}")
return errors
row_count = 0
for line_number, row in enumerate(reader, start=2):
row_count += 1
row_id = (row.get("id") or "").strip()
raw_text = (row.get("raw_text") or "").strip()
normalized_text = (row.get("normalized_text") or "").strip()
if not row_id:
errors.append(f"Line {line_number}: empty 'id'")
elif row_id in seen_ids:
errors.append(
f"Line {line_number}: duplicate id '{row_id}' "
f"(first seen at line {seen_ids[row_id]})"
)
else:
seen_ids[row_id] = line_number
if not raw_text:
errors.append(f"Line {line_number}: empty 'raw_text'")
if not normalized_text:
errors.append(f"Line {line_number}: empty 'normalized_text'")
if row_count == 0:
errors.append("No data rows found.")
return errors
def main() -> int:
if len(sys.argv) != 2:
print(
"Usage: python scripts/validate_normalization.py "
"data/processed/normalization_seed_v0.1.tsv"
)
return 2
input_path = Path(sys.argv[1])
errors = validate_file(input_path)
if errors:
print("Validation failed:")
for error in errors:
print(f"- {error}")
return 1
print(f"Validation passed: {input_path}")
return 0
if __name__ == "__main__":
raise SystemExit(main())