"""Validate the JSONL evaluation files stored under ``resources/evaluation``.

Each non-blank line of every ``*.jsonl`` file is parsed as JSON and checked
for required fields, unique ids, the presence of labels, and well-formed
span offsets.  Running the module prints a per-file summary and exits with
status 1 when any problem was found.
"""

from __future__ import annotations

import json
from collections import Counter
from collections.abc import Iterator
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
EVAL_DIR = ROOT / "resources" / "evaluation"

# Fields every row must carry.
_REQUIRED_FIELDS = ("id", "text", "risk", "compute_note")

# The only file whose rows are allowed to have no label at all.
_UNLABELED_FILE = "pii_validation.seed.jsonl"


def _iter_jsonl(path: Path) -> Iterator[tuple[int, object]]:
    """Yield ``(line_number, parsed_value)`` for each non-blank line of *path*.

    Line numbers are 1-based and count blank lines too, so they match what an
    editor shows.

    Raises:
        ValueError: if a non-blank line is not valid JSON; the message carries
            the path and the line number.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # json.loads would otherwise reject on the first line.
    with path.open(encoding="utf-8-sig") as f:
        for lineno, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue
            # Keep the try body limited to the call that can actually fail.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}:{lineno}: JSON tidak valid: {exc}") from exc
            yield lineno, record


def _labels(row: dict) -> list[str]:
    """Return the labels attached to *row* as strings.

    The first key present wins, in this order: ``label`` (single value),
    ``labels`` (list), ``spans`` (list of dicts, each with an optional
    ``label``).  A ``labels`` or ``spans`` value that is not a list yields no
    labels instead of being iterated blindly (a string would otherwise be
    split into characters, and ``None`` would raise).
    """
    if "label" in row:
        return [str(row["label"])]
    if "labels" in row:
        labels = row["labels"]
        if not isinstance(labels, list):
            return []
        return [str(item) for item in labels]
    if "spans" in row:
        spans = row["spans"]
        if not isinstance(spans, list):
            return []
        return [
            str(span["label"])
            for span in spans
            if isinstance(span, dict) and span.get("label")
        ]
    return []


def _span_is_valid(span: object, text_len: int) -> bool:
    """Return True when *span* is a dict with ``0 <= start < end <= text_len``.

    Offsets are converted with ``int`` so numeric strings are still accepted;
    anything that cannot be converted makes the span invalid rather than
    raising.
    """
    if not isinstance(span, dict):
        return False
    try:
        start = int(span.get("start", -1))
        end = int(span.get("end", -1))
    except (TypeError, ValueError, OverflowError):
        return False
    return 0 <= start < end <= text_len


def validate_file(path: Path) -> dict[str, object]:
    """Validate one JSONL file and return a summary of what was found.

    Malformed rows are reported in the ``errors`` list instead of aborting
    the run; only a line that is not valid JSON still raises.

    Returns:
        A dict with the keys ``file`` (file name), ``rows`` (number of
        non-blank lines), ``labels`` (label -> count, sorted by label; rows
        without labels are counted under the empty string) and ``errors``
        (list of human-readable messages).

    Raises:
        ValueError: if a line of the file is not valid JSON.
    """
    ids: set[str] = set()
    label_counts: Counter[str] = Counter()
    errors: list[str] = []
    rows = 0

    for lineno, row in _iter_jsonl(path):
        rows += 1
        where = f"{path.name}:{lineno}"

        # Valid JSON that is not an object cannot be checked field by field.
        if not isinstance(row, dict):
            errors.append(f"{where}: baris harus berupa objek JSON")
            continue

        for key in _REQUIRED_FIELDS:
            if key not in row:
                errors.append(f"{where}: field '{key}' wajib ada")

        # Only rows that actually have an id take part in the duplicate check;
        # a missing id is already reported above and must not collide with
        # other missing ids.
        if "id" in row:
            row_id = str(row["id"])
            if row_id in ids:
                errors.append(f"{where}: id duplikat '{row_id}'")
            ids.add(row_id)

        labels = _labels(row)
        if not labels and path.name != _UNLABELED_FILE:
            errors.append(f"{where}: label/labels/spans wajib ada")
        label_counts.update(labels or [""])

        if "spans" in row:
            spans = row["spans"]
            if not isinstance(spans, list):
                errors.append(f"{where}: span invalid {spans}")
            else:
                text_len = len(str(row.get("text", "")))
                for span in spans:
                    if not _span_is_valid(span, text_len):
                        errors.append(f"{where}: span invalid {span}")

    return {
        "file": path.name,
        "rows": rows,
        "labels": dict(sorted(label_counts.items())),
        "errors": errors,
    }


def main() -> int:
    """Validate every ``*.jsonl`` file in ``EVAL_DIR`` and print a report.

    Returns:
        1 if at least one error was reported, otherwise 0.
    """
    summaries = [validate_file(path) for path in sorted(EVAL_DIR.glob("*.jsonl"))]
    total_errors = 0
    for summary in summaries:
        print(f"{summary['file']}: rows={summary['rows']} labels={summary['labels']}")
        for error in summary["errors"]:
            total_errors += 1
            print(f" ERROR {error}")
    print(f"\nSUMMARY files={len(summaries)} errors={total_errors}")
    return 1 if total_errors else 0


if __name__ == "__main__":
    raise SystemExit(main())