| """Validate normalization seed data for the Pashto project. | |
| Usage: | |
| python scripts/validate_normalization.py data/processed/normalization_seed_v0.1.tsv | |
| """ | |
| from __future__ import annotations | |
| import csv | |
| import sys | |
| from pathlib import Path | |
# Header columns that every seed file must contain.
REQUIRED_COLUMNS = (
    "id",
    "raw_text",
    "normalized_text",
    "note",
)
def detect_delimiter(first_line: str) -> str | None:
    """Guess the field delimiter from the header line of a file.

    A tab takes precedence over a comma, so a header that contains both
    characters is treated as tab-separated.

    Args:
        first_line: The first line of the file.

    Returns:
        The tab or comma character, or ``None`` when the line contains
        neither.
    """
    # Candidates are listed in order of precedence.
    candidates = ("\t", ",")
    return next((sep for sep in candidates if sep in first_line), None)
def validate_file(path: Path) -> list[str]:
    """Validate a normalization seed file and collect every problem found.

    The file is read as UTF-8 (a leading BOM is tolerated) and must be tab- or
    comma-delimited with a header row containing all of ``REQUIRED_COLUMNS``.
    Every data row needs a unique, non-empty ``id`` and non-empty ``raw_text``
    and ``normalized_text`` values. Values are stripped of surrounding
    whitespace before being checked.

    Args:
        path: Location of the seed file to check.

    Returns:
        Human-readable error messages. An empty list means the file is valid.
    """
    if not path.exists():
        return [f"File not found: {path}"]
    if not path.is_file():
        # A directory "exists" but cannot be opened as a text file; report it
        # as a validation error instead of letting open() raise.
        return [f"Not a file: {path}"]

    errors: list[str] = []
    seen_ids: dict[str, int] = {}

    try:
        with path.open("r", encoding="utf-8-sig", newline="") as handle:
            first_line = handle.readline()
            if not first_line:
                return [f"Empty file: {path}"]

            delimiter = detect_delimiter(first_line)
            if delimiter is None:
                return [
                    "Could not detect delimiter. Use TSV (preferred) or CSV with headers: "
                    + ", ".join(REQUIRED_COLUMNS)
                ]

            # The header line was consumed for sniffing; rewind so the CSV
            # reader sees it again.
            handle.seek(0)
            reader = csv.DictReader(handle, delimiter=delimiter)
            if reader.fieldnames is None:
                return [f"Missing header row in: {path}"]

            missing = [col for col in REQUIRED_COLUMNS if col not in reader.fieldnames]
            if missing:
                errors.append(f"Missing required columns: {', '.join(missing)}")
                return errors

            row_count = 0
            for row in reader:
                row_count += 1
                # Use the reader's own physical line counter: DictReader
                # silently skips blank lines and a quoted field may span
                # several lines, so counting records would drift away from
                # the real position in the file. For a multi-line record this
                # is the last line of the record.
                line_number = reader.line_num

                # Short rows yield None for absent fields, hence the `or ""`.
                row_id = (row.get("id") or "").strip()
                raw_text = (row.get("raw_text") or "").strip()
                normalized_text = (row.get("normalized_text") or "").strip()

                if not row_id:
                    errors.append(f"Line {line_number}: empty 'id'")
                elif row_id in seen_ids:
                    errors.append(
                        f"Line {line_number}: duplicate id '{row_id}' "
                        f"(first seen at line {seen_ids[row_id]})"
                    )
                else:
                    seen_ids[row_id] = line_number

                if not raw_text:
                    errors.append(f"Line {line_number}: empty 'raw_text'")
                if not normalized_text:
                    errors.append(f"Line {line_number}: empty 'normalized_text'")

            if row_count == 0:
                errors.append("No data rows found.")
    except UnicodeDecodeError as exc:
        # Keep whatever was collected so far and report the encoding problem
        # as a validation error rather than a traceback.
        errors.append(f"File is not valid UTF-8: {path} ({exc})")
    except csv.Error as exc:
        errors.append(f"Could not parse {path}: {exc}")

    return errors
def main() -> int:
    """Run the validator from the command line.

    Expects exactly one command-line argument: the path of the file to
    validate. Results are printed to standard output.

    Returns:
        ``0`` when validation passes, ``1`` when validation errors were
        reported, and ``2`` when the argument count is wrong.
    """
    args = sys.argv[1:]
    if len(args) != 1:
        print(
            "Usage: python scripts/validate_normalization.py "
            "data/processed/normalization_seed_v0.1.tsv"
        )
        return 2

    input_path = Path(args[0])
    problems = validate_file(input_path)

    if not problems:
        print(f"Validation passed: {input_path}")
        return 0

    print("Validation failed:")
    for problem in problems:
        print(f"- {problem}")
    return 1
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())