Spaces:

ArielJoe
/

Prompt-Builder

Running

App Files Files Community

Prompt-Builder / scripts /validate_evaluation_sets.py

ArielJoe

feat: strengthen detectors, slim API schema, and improve UX

170ad43 19 days ago

Raw

History Blame Contribute Delete

2.7 kB

	from __future__ import annotations

	import json
	from collections import Counter
	from pathlib import Path


	# Project root: the parent of the directory that contains this script
	# (the script itself lives in the ``scripts/`` folder).
	ROOT = Path(__file__).resolve().parents[1]
	# Directory whose ``*.jsonl`` files are validated by ``main``.
	EVAL_DIR = ROOT / "resources" / "evaluation"


	def _iter_jsonl(path: Path):
	    """Yield ``(lineno, value)`` for every non-blank line of a JSONL file.

	    The file is read as UTF-8. Lines that are empty after stripping
	    whitespace are skipped, but they still advance the 1-based line counter,
	    so reported line numbers match the file on disk.

	    Args:
	        path: Location of the JSONL file to read.

	    Yields:
	        Tuples of the 1-based line number and the decoded JSON value.

	    Raises:
	        ValueError: If a line is not valid JSON. The message carries the path
	            and line number, and the original ``json.JSONDecodeError`` is
	            chained as the cause.
	    """
	    with path.open(encoding="utf-8") as handle:
	        for lineno, raw in enumerate(handle, start=1):
	            payload = raw.strip()
	            if not payload:
	                continue
	            # Guard only the decode step: yielding inside the ``try`` would
	            # also trap a JSONDecodeError thrown into the generator by its
	            # consumer and mislabel it as a bad line of this file.
	            try:
	                record = json.loads(payload)
	            except json.JSONDecodeError as exc:
	                raise ValueError(f"{path}:{lineno}: JSON tidak valid: {exc}") from exc
	            yield lineno, record


	def _labels(row: dict) -> list[str]:
	    """Collect the label strings attached to one evaluation row.

	    The first key present in *row* wins, checked in this order:

	    * ``label``  - a single value, returned as a one-element list;
	    * ``labels`` - an iterable of values, each converted with ``str``;
	    * ``spans``  - an iterable of mappings; the ``label`` of every span
	      whose label is truthy is kept.

	    Args:
	        row: Decoded JSON object for one line of an evaluation set.

	    Returns:
	        The labels as strings, or an empty list when none of the three keys
	        is present.
	    """
	    if "label" in row:
	        single = row["label"]
	        return [str(single)]

	    if "labels" in row:
	        return list(map(str, row["labels"]))

	    if "spans" not in row:
	        return []

	    collected = []
	    for entry in row["spans"]:
	        # Spans without a label (missing, empty, None, 0) contribute nothing.
	        if entry.get("label"):
	            collected.append(str(entry.get("label", "")))
	    return collected


	def _span_in_bounds(span: object, text_len: int) -> bool:
	    """Return True when *span* is a mapping whose offsets lie inside the text."""
	    if not isinstance(span, dict):
	        return False
	    try:
	        start = int(span.get("start", -1))
	        end = int(span.get("end", -1))
	    except (TypeError, ValueError, OverflowError):
	        # Offsets that ``int`` cannot convert (null, non-numeric text,
	        # NaN/Infinity) make the span invalid instead of aborting the run.
	        return False
	    return 0 <= start < end <= text_len


	def validate_file(path: Path) -> dict[str, object]:
	    """Validate one JSONL evaluation set and summarise what was found.

	    Every non-blank line is checked for:

	    * being a JSON object;
	    * the required fields ``id``, ``text``, ``risk`` and ``compute_note``;
	    * a unique ``id`` (compared after conversion to ``str``);
	    * at least one label, except in ``pii_validation.seed.jsonl``;
	    * spans satisfying ``0 <= start < end <= len(text)``.

	    Malformed rows are recorded as errors rather than raised, so a single
	    bad row does not hide problems further down the file.

	    Args:
	        path: JSONL file to validate.

	    Returns:
	        A dict with the keys ``file`` (file name), ``rows`` (number of
	        non-blank lines), ``labels`` (label -> count, sorted by label, with
	        ``"<none>"`` counting unlabeled rows) and ``errors`` (list of
	        messages prefixed with ``<file name>:<line number>``).

	    Raises:
	        ValueError: Propagated from ``_iter_jsonl`` when a line is not valid
	            JSON.
	    """
	    ids: set[str] = set()
	    label_counts: Counter[str] = Counter()
	    errors: list[str] = []
	    rows = 0

	    for lineno, row in _iter_jsonl(path):
	        rows += 1
	        if not isinstance(row, dict):
	            # Valid JSON that is not an object (list, number, string, ...)
	            # has no fields to inspect; report it and move on.
	            errors.append(f"{path.name}:{lineno}: baris harus berupa objek JSON")
	            continue

	        for key in ("id", "text", "risk", "compute_note"):
	            if key not in row:
	                errors.append(f"{path.name}:{lineno}: field '{key}' wajib ada")

	        # A missing id is already reported above; registering it as '' would
	        # add a misleading "duplicate id" error for every later row that
	        # also lacks one.
	        if "id" in row:
	            row_id = str(row["id"])
	            if row_id in ids:
	                errors.append(f"{path.name}:{lineno}: id duplikat '{row_id}'")
	            ids.add(row_id)

	        try:
	            labels = _labels(row)
	        except (AttributeError, TypeError):
	            # A malformed ``labels``/``spans`` container: treat the row as
	            # unlabeled here; broken spans are reported by the check below.
	            labels = []
	        if not labels and path.name != "pii_validation.seed.jsonl":
	            errors.append(f"{path.name}:{lineno}: label/labels/spans wajib ada")
	        label_counts.update(labels or ["<none>"])

	        if "spans" in row:
	            text = str(row.get("text", ""))
	            spans = row["spans"]
	            if isinstance(spans, list):
	                for span in spans:
	                    if not _span_in_bounds(span, len(text)):
	                        errors.append(f"{path.name}:{lineno}: span invalid {span}")
	            else:
	                errors.append(f"{path.name}:{lineno}: span invalid {spans}")

	    return {
	        "file": path.name,
	        "rows": rows,
	        "labels": dict(sorted(label_counts.items())),
	        "errors": errors,
	    }


	def main() -> int:
	    """Validate every ``*.jsonl`` file in ``EVAL_DIR`` and print a report.

	    Files are processed in sorted path order. For each file one line with
	    its row count and label histogram is printed, followed by one line per
	    error, and finally a ``SUMMARY`` line with the totals.

	    Returns:
	        ``1`` when at least one error was reported, otherwise ``0``.
	    """
	    # All files are validated before anything is printed, so an exception
	    # raised during validation aborts the run without a partial report.
	    reports = []
	    for jsonl_path in sorted(EVAL_DIR.glob("*.jsonl")):
	        reports.append(validate_file(jsonl_path))

	    error_total = 0
	    for report in reports:
	        name, row_count, labels = report["file"], report["rows"], report["labels"]
	        print(f"{name}: rows={row_count} labels={labels}")
	        for message in report["errors"]:
	            error_total += 1
	            print(f" ERROR {message}")

	    print(f"\nSUMMARY files={len(reports)} errors={error_total}")
	    if error_total:
	        return 1
	    return 0


	# Script entry point: run the validation and use its result as the
	# process exit status.
	if __name__ == "__main__":
	    exit_status = main()
	    raise SystemExit(exit_status)