File size: 2,978 Bytes
379266c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""Validate normalization seed data for the Pashto project.

Usage:
    python scripts/validate_normalization.py data/processed/normalization_seed_v0.1.tsv
"""

from __future__ import annotations

import csv
import sys
from pathlib import Path


# Column names that must all be present in the header row of a validated file;
# also listed in the hint shown when the delimiter cannot be detected.
REQUIRED_COLUMNS = ("id", "raw_text", "normalized_text", "note")


def detect_delimiter(first_line: str) -> str | None:
    """Guess the field separator from the first line of a file.

    A tab takes priority over a comma, so a tab-separated header whose cells
    happen to contain commas is still treated as TSV.

    Args:
        first_line: The first line of the file (normally the header row).

    Returns:
        ``"\\t"`` or ``","`` depending on which is found first in priority
        order, or ``None`` when the line contains neither character.
    """
    candidates = ("\t", ",")  # ordered by preference
    return next((sep for sep in candidates if sep in first_line), None)


def _check_rows(reader, fieldnames: list[str], errors: list[str]) -> None:
    """Append a message to *errors* for every invalid data row of *reader*.

    Args:
        reader: A ``csv.reader`` positioned just after the header row.
        fieldnames: Header cells, used to label the cells of each row.
        errors: List collecting the messages; mutated in place.
    """
    seen_ids: dict[str, int] = {}
    row_count = 0

    for cells in reader:
        if not cells:
            # Blank physical line: not a data row (csv.DictReader skips
            # these as well), but it still advances reader.line_num.
            continue
        row_count += 1

        # Physical line on which this record ended. Unlike a running record
        # counter, this stays correct when blank lines or multi-line quoted
        # fields appear earlier in the file.
        line_number = reader.line_num

        # Short rows simply lack the trailing keys; extra cells are ignored.
        row = dict(zip(fieldnames, cells))
        row_id = (row.get("id") or "").strip()
        raw_text = (row.get("raw_text") or "").strip()
        normalized_text = (row.get("normalized_text") or "").strip()

        if not row_id:
            errors.append(f"Line {line_number}: empty 'id'")
        elif row_id in seen_ids:
            errors.append(
                f"Line {line_number}: duplicate id '{row_id}' "
                f"(first seen at line {seen_ids[row_id]})"
            )
        else:
            seen_ids[row_id] = line_number

        if not raw_text:
            errors.append(f"Line {line_number}: empty 'raw_text'")
        if not normalized_text:
            errors.append(f"Line {line_number}: empty 'normalized_text'")

    if row_count == 0:
        errors.append("No data rows found.")


def validate_file(path: Path) -> list[str]:
    """Validate a delimited seed file and return the problems found.

    The file is read as UTF-8 (an optional BOM is skipped). The delimiter is
    detected from the first line, the header must contain every name in
    ``REQUIRED_COLUMNS``, and each data row must have a non-empty, unique
    ``id`` plus non-empty ``raw_text`` and ``normalized_text`` (all compared
    after stripping surrounding whitespace).

    Args:
        path: Location of the TSV/CSV file to check.

    Returns:
        A list of human-readable error messages; empty when the file is
        valid. Structural problems (missing file, not a regular file, empty
        file, unknown delimiter, missing columns) are returned alone.
        Decoding and CSV parsing failures are reported as a message appended
        after any row errors collected so far, rather than being raised.
    """
    errors: list[str] = []

    if not path.exists():
        return [f"File not found: {path}"]
    if not path.is_file():
        # exists() is also true for directories, which open() cannot read.
        return [f"Not a regular file: {path}"]

    # Decoding and parsing happen lazily while rows are consumed, so the
    # whole read has to sit inside the try block.
    try:
        with path.open("r", encoding="utf-8-sig", newline="") as handle:
            first_line = handle.readline()
            if not first_line:
                return [f"Empty file: {path}"]

            delimiter = detect_delimiter(first_line)
            if delimiter is None:
                return [
                    "Could not detect delimiter. Use TSV (preferred) or CSV with headers: "
                    + ", ".join(REQUIRED_COLUMNS)
                ]

            # Rewind so the csv reader consumes the header line itself.
            handle.seek(0)
            reader = csv.reader(handle, delimiter=delimiter)

            fieldnames = next(reader, None)
            if fieldnames is None:
                return [f"Missing header row in: {path}"]

            missing = [col for col in REQUIRED_COLUMNS if col not in fieldnames]
            if missing:
                errors.append(f"Missing required columns: {', '.join(missing)}")
                return errors

            _check_rows(reader, fieldnames, errors)
    except UnicodeDecodeError as exc:
        errors.append(f"Could not decode {path} as UTF-8: {exc}")
    except csv.Error as exc:
        errors.append(f"Could not parse {path}: {exc}")

    return errors


def main() -> int:
    """Command-line entry point: validate the single file named in ``sys.argv``.

    Returns:
        ``0`` when the file passes validation, ``1`` when validation reports
        errors (each printed on its own line), and ``2`` when the script was
        not given exactly one argument (a usage line is printed).
    """
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        usage = (
            "Usage: python scripts/validate_normalization.py "
            "data/processed/normalization_seed_v0.1.tsv"
        )
        print(usage)
        return 2

    target = Path(arguments[0])
    problems = validate_file(target)

    # Success first, so the failure report is the fall-through path.
    if not problems:
        print(f"Validation passed: {target}")
        return 0

    print("Validation failed:")
    for problem in problems:
        print(f"- {problem}")
    return 1


# When run as a script, use main()'s integer return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())