File size: 2,702 Bytes
170ad43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from __future__ import annotations

import json
from collections import Counter
from pathlib import Path


# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Folder whose *.jsonl files are validated by main().
EVAL_DIR = ROOT.joinpath("resources", "evaluation")


def _iter_jsonl(path: Path):
    """Yield ``(line_number, parsed_value)`` for every non-blank line of *path*.

    The file is read as UTF-8. Line numbers are 1-based and include blank
    lines in the count; lines that are empty after stripping whitespace are
    skipped without being parsed.

    Raises:
        ValueError: when a non-blank line is not valid JSON. The message
            contains the path and line number, and the underlying
            ``json.JSONDecodeError`` is chained as the cause.
    """
    with path.open(encoding="utf-8") as handle:
        for lineno, raw in enumerate(handle, start=1):
            payload = raw.strip()
            if payload:
                try:
                    record = json.loads(payload)
                except json.JSONDecodeError as exc:
                    raise ValueError(f"{path}:{lineno}: JSON tidak valid: {exc}") from exc
                yield lineno, record


def _labels(row: dict) -> list[str]:
    """Extract the label strings carried by one dataset row.

    The first key present wins, in this order:

    * ``label``  -- a single value, returned as a one-element list.
    * ``labels`` -- a collection of values; a bare string is treated as one
      label rather than being split into characters.
    * ``spans``  -- a list of span objects; the non-empty ``label`` of each
      span object is collected.

    Malformed containers (``null``, numbers, span entries that are not
    objects) contribute no labels instead of raising, so the caller can
    report the row as invalid rather than crash.

    Returns:
        The labels as strings, or an empty list when the row has none.
    """
    if "label" in row:
        return [str(row["label"])]

    if "labels" in row:
        raw = row["labels"]
        if isinstance(raw, str):
            # Iterating a string would yield one bogus label per character.
            return [raw]
        try:
            items = iter(raw)
        except TypeError:
            # null / number / bool: not a collection of labels.
            return []
        return [str(item) for item in items]

    if "spans" in row:
        try:
            spans = iter(row["spans"])
        except TypeError:
            return []
        return [
            str(span["label"])
            for span in spans
            # Only span objects can carry a label; skip anything else.
            if isinstance(span, dict) and span.get("label")
        ]

    return []


def _span_is_valid(span: object, text_len: int) -> bool:
    """Return True when *span* is an object with ``0 <= start < end <= text_len``."""
    if not isinstance(span, dict):
        return False
    try:
        start = int(span.get("start", -1))
        end = int(span.get("end", -1))
    except (TypeError, ValueError, OverflowError):
        # Offsets that cannot be read as integers (null, text, NaN, inf).
        return False
    return 0 <= start < end <= text_len


def validate_file(path: Path) -> dict[str, object]:
    """Validate one JSONL dataset file and summarise what was found.

    Each non-blank line must be a JSON object. For every row the function
    checks that the required fields exist, that ``id`` values are unique
    within the file, that the row carries at least one label (not enforced
    for ``pii_validation.seed.jsonl``), and that every span has offsets
    inside the row's ``text``. Problems are collected as messages rather
    than raised, so one bad row does not hide the others.

    Args:
        path: The JSONL file to check.

    Returns:
        A dict with ``file`` (file name), ``rows`` (number of non-blank
        lines), ``labels`` (label -> count, sorted by label; rows without
        labels count under ``"<none>"``) and ``errors`` (list of messages).

    Raises:
        ValueError: propagated from ``_iter_jsonl`` when a line is not valid
            JSON.
    """
    ids: set[str] = set()
    label_counts: Counter[str] = Counter()
    errors: list[str] = []
    rows = 0

    for lineno, row in _iter_jsonl(path):
        rows += 1
        where = f"{path.name}:{lineno}"

        if not isinstance(row, dict):
            # Valid JSON but not an object (list, string, number, ...):
            # none of the field checks below can apply.
            errors.append(f"{where}: baris harus berupa objek JSON")
            continue

        errors.extend(
            f"{where}: field '{key}' wajib ada"
            for key in ("id", "text", "risk", "compute_note")
            if key not in row
        )

        # Only rows that actually have an id take part in the duplicate
        # check; a missing id is already reported above and must not also be
        # flagged as a duplicate of ''.
        if "id" in row:
            row_id = str(row["id"])
            if row_id in ids:
                errors.append(f"{where}: id duplikat '{row_id}'")
            ids.add(row_id)

        try:
            labels = _labels(row)
        except (TypeError, AttributeError):
            # A malformed labels/spans container must not abort the run; the
            # row is treated as unlabelled and reported below.
            labels = []
        if not labels and path.name != "pii_validation.seed.jsonl":
            errors.append(f"{where}: label/labels/spans wajib ada")
        label_counts.update(labels or ["<none>"])

        if "spans" in row:
            spans = row["spans"]
            if not isinstance(spans, list):
                errors.append(f"{where}: 'spans' harus berupa list")
                continue
            text_len = len(str(row.get("text", "")))
            errors.extend(
                f"{where}: span invalid {span}"
                for span in spans
                if not _span_is_valid(span, text_len)
            )

    return {
        "file": path.name,
        "rows": rows,
        "labels": dict(sorted(label_counts.items())),
        "errors": errors,
    }


def main() -> int:
    """Validate every ``*.jsonl`` file in ``EVAL_DIR`` and print a report.

    Files are processed in sorted order and reported one at a time: a
    summary line per file followed by one ``ERROR`` line per problem. A file
    that cannot be parsed at all is reported as a single error and the run
    continues with the next file, so one broken file does not suppress the
    report for the others.

    Returns:
        ``1`` when at least one error was reported, otherwise ``0``.
    """
    paths = sorted(EVAL_DIR.glob("*.jsonl"))
    total_errors = 0

    for path in paths:
        try:
            summary = validate_file(path)
        except ValueError as exc:
            # _iter_jsonl raises ValueError for a line that is not valid
            # JSON (UnicodeDecodeError is a ValueError subclass as well).
            total_errors += 1
            print(f"{path.name}: validasi dibatalkan")
            print(f"  ERROR {exc}")
            continue

        print(f"{summary['file']}: rows={summary['rows']} labels={summary['labels']}")
        for error in summary["errors"]:
            total_errors += 1
            print(f"  ERROR {error}")

    print(f"\nSUMMARY files={len(paths)} errors={total_errors}")
    return 1 if total_errors else 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status, so the script
    # exits non-zero when validation reported errors.
    raise SystemExit(main())