Spaces:
Running
Running
| from __future__ import annotations | |
| import json | |
| from collections import Counter | |
| from pathlib import Path | |
# ROOT is the parent of the directory that contains this (resolved) file.
ROOT = Path(__file__).resolve().parents[1]
# Directory whose *.jsonl files are scanned by main().
EVAL_DIR = ROOT.joinpath("resources", "evaluation")
| def _iter_jsonl(path: Path): | |
| with path.open(encoding="utf-8") as f: | |
| for lineno, line in enumerate(f, start=1): | |
| line = line.strip() | |
| if not line: | |
| continue | |
| try: | |
| yield lineno, json.loads(line) | |
| except json.JSONDecodeError as exc: | |
| raise ValueError(f"{path}:{lineno}: JSON tidak valid: {exc}") from exc | |
| def _labels(row: dict) -> list[str]: | |
| if "label" in row: | |
| return [str(row["label"])] | |
| if "labels" in row: | |
| return [str(x) for x in row["labels"]] | |
| if "spans" in row: | |
| return [str(span.get("label", "")) for span in row["spans"] if span.get("label")] | |
| return [] | |
def _span_offsets(span: object) -> tuple[int, int] | None:
    """Return ``(start, end)`` of *span* as ints, or ``None`` if unreadable."""
    if not isinstance(span, dict):
        return None
    try:
        # Missing keys default to -1 so they fail the range check in the caller.
        return int(span.get("start", -1)), int(span.get("end", -1))
    except (TypeError, ValueError, OverflowError):
        # null, non-numeric strings, nested values, NaN/Infinity, ...
        return None


def validate_file(path: Path) -> dict[str, object]:
    """Validate one JSONL file and summarise what was found.

    For every row yielded by ``_iter_jsonl`` this checks that:

    * the row is a JSON object;
    * the fields ``id``, ``text``, ``risk`` and ``compute_note`` are present;
    * the ``id`` (compared as a string) has not been seen earlier in the file;
    * ``_labels`` finds at least one label, except in the file named
      ``pii_validation.seed.jsonl`` where unlabeled rows are accepted;
    * each entry of ``spans`` has integer-convertible offsets with
      ``0 <= start < end <= len(text)``.

    Problems found by these checks are appended to the ``errors`` list rather
    than raised, so one bad row does not hide the problems in later rows.

    Returns:
        A dict with ``file`` (file name), ``rows`` (number of rows read),
        ``labels`` (label -> count, sorted by label; rows without labels are
        counted under ``"<none>"``) and ``errors`` (list of messages).
    """
    ids: set[str] = set()
    label_counts: Counter[str] = Counter()
    errors: list[str] = []
    rows = 0

    for lineno, row in _iter_jsonl(path):
        rows += 1
        where = f"{path.name}:{lineno}"

        # A JSONL line may hold any JSON value; only objects can be checked.
        if not isinstance(row, dict):
            errors.append(f"{where}: baris harus berupa objek JSON")
            continue

        for key in ("id", "text", "risk", "compute_note"):
            if key not in row:
                errors.append(f"{where}: field '{key}' wajib ada")

        # Only rows that actually carry an id take part in the duplicate
        # check; a missing id is already reported above and must not also
        # show up as a duplicate of ''.
        if "id" in row:
            row_id = str(row["id"])
            if row_id in ids:
                errors.append(f"{where}: id duplikat '{row_id}'")
            ids.add(row_id)

        labels = _labels(row)
        if not labels and path.name != "pii_validation.seed.jsonl":
            errors.append(f"{where}: label/labels/spans wajib ada")
        label_counts.update(labels or ["<none>"])

        if "spans" in row:
            text = str(row.get("text", ""))
            spans = row["spans"]
            if not isinstance(spans, (list, tuple)):
                errors.append(f"{where}: span invalid {spans}")
                spans = ()
            for span in spans:
                offsets = _span_offsets(span)
                if offsets is None:
                    errors.append(f"{where}: span invalid {span}")
                    continue
                start, end = offsets
                if start < 0 or end <= start or end > len(text):
                    errors.append(f"{where}: span invalid {span}")

    return {
        "file": path.name,
        "rows": rows,
        "labels": dict(sorted(label_counts.items())),
        "errors": errors,
    }
def main() -> int:
    """Validate every ``*.jsonl`` file in ``EVAL_DIR`` and print a report.

    Files are processed in sorted path order. For each file one summary line
    is printed, followed by one line per error, and finally a ``SUMMARY``
    line with the file and error totals.

    Returns:
        ``1`` if at least one error was reported, otherwise ``0``.
    """
    # All files are validated before anything is printed.
    reports = list(map(validate_file, sorted(EVAL_DIR.glob("*.jsonl"))))

    error_total = 0
    for report in reports:
        print(f"{report['file']}: rows={report['rows']} labels={report['labels']}")
        for message in report["errors"]:
            print(f" ERROR {message}")
            error_total += 1

    print(f"\nSUMMARY files={len(reports)} errors={error_total}")
    if error_total:
        return 1
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)