# Source: my-env/scripts/validate_dataset.py
# Uploaded by Maheswar01 -- commit 080a295 "Deploy OpenEnv to Space" (2.21 kB)
# (Hosting-page chrome from extraction: Raw / History / Blame / Contribute / Delete)
"""Validate scam_dataset.json: unique ids, required fields, label consistency."""
from __future__ import annotations
import json
import sys
from pathlib import Path
# Keys that every scenario row must contain.
REQUIRED = ("id", "difficulty", "true_label", "sender_type", "channel", "messages", "urgency_score")
# Accepted values for row["difficulty"] and row["true_label"].
ALLOW_DIFF = frozenset({"easy", "medium", "hard"})
ALLOW_LABEL = frozenset({"scam", "legitimate"})


def _is_member(value: object, allowed: frozenset) -> bool:
    """Return True if *value* is in *allowed*; unhashable values count as absent."""
    try:
        return value in allowed
    except TypeError:
        # A JSON list/object is unhashable and can never be an allowed value.
        return False


def assert_dataset_ok(rows: list[dict]) -> None:
    """Validate scenario rows, raising ``ValueError`` on the first problem found.

    Checks performed per row, in order:

    * the row is a dict and contains every key in ``REQUIRED``;
    * ``difficulty`` is in ``ALLOW_DIFF`` and ``true_label`` is in ``ALLOW_LABEL``;
    * ``messages`` is a non-empty list of non-blank strings;
    * ``urgency_score`` converts to a float within ``[0, 1]``;
    * ``id`` has not been seen in an earlier row;
    * ``otp_message_index``, when present and not ``None``, is an integral
      value that indexes into ``messages``;
    * a row that is both ``"hard"`` and ``"scam"`` has at least two messages.

    Args:
        rows: Decoded dataset, an iterable of row dicts.

    Raises:
        ValueError: For any violation above. Malformed values (wrong type,
            unhashable, non-numeric) are also reported as ``ValueError`` so a
            caller that catches ``ValueError`` sees every validation failure.
    """
    # A decoded JSON document may have any root type; only an array of rows is valid.
    if rows is None or isinstance(rows, (dict, str, int, float)):
        raise ValueError(f"Dataset root must be a list of rows, got {type(rows).__name__}")

    ids: set = set()
    for i, row in enumerate(rows):
        if not isinstance(row, dict):
            raise ValueError(f"Row {i} must be an object, got {type(row).__name__}")
        for k in REQUIRED:
            if k not in row:
                raise ValueError(f"Row {i} missing key {k!r} (id={row.get('id')})")
        if not _is_member(row["difficulty"], ALLOW_DIFF):
            raise ValueError(f"Row {i} bad difficulty: {row.get('id')}")
        if not _is_member(row["true_label"], ALLOW_LABEL):
            raise ValueError(f"Row {i} bad true_label: {row.get('id')}")

        msgs = row["messages"]
        if not isinstance(msgs, list) or not msgs or not all(isinstance(m, str) and m.strip() for m in msgs):
            raise ValueError(f"Row {i} messages must be non-empty list of strings: {row.get('id')}")

        try:
            u = float(row["urgency_score"])
        except (TypeError, ValueError, OverflowError):
            raise ValueError(f"Row {i} urgency_score must be a number: {row.get('id')}") from None
        # NaN fails this chained comparison as well, so it is rejected here.
        if not 0.0 <= u <= 1.0:
            raise ValueError(f"Row {i} urgency_score out of [0,1]: {row.get('id')}")

        oid = row["id"]
        try:
            duplicate = oid in ids
        except TypeError:
            # Unhashable id (list/object) cannot be tracked for uniqueness.
            raise ValueError(f"Row {i} id must be a hashable scalar: {oid!r}") from None
        if duplicate:
            raise ValueError(f"Duplicate id: {oid}")
        ids.add(oid)

        otp_idx = row.get("otp_message_index")
        if otp_idx is not None:
            # int() would silently truncate 0.5 to 0; reject fractional/NaN/inf floats.
            if isinstance(otp_idx, float) and not otp_idx.is_integer():
                raise ValueError(f"otp_message_index must be an integer for {oid}")
            try:
                oi = int(otp_idx)
            except (TypeError, ValueError, OverflowError):
                raise ValueError(f"otp_message_index must be an integer for {oid}") from None
            if oi < 0 or oi >= len(msgs):
                raise ValueError(f"otp_message_index out of range for {oid}")

        if row["difficulty"] == "hard" and row["true_label"] == "scam" and len(msgs) < 2:
            raise ValueError(f"Hard scam should have multi-turn messages: {oid}")
def main() -> None:
    """Load ``data/scam_dataset.json`` and validate it.

    The dataset is located relative to this script: one directory above the
    script's own directory, under ``data/``. On success a one-line summary is
    printed; validation problems propagate as ``ValueError`` from
    ``assert_dataset_ok``.
    """
    script_dir = Path(__file__).resolve().parent
    dataset_path = script_dir.parent / "data" / "scam_dataset.json"

    with dataset_path.open(encoding="utf-8") as fh:
        scenarios = json.load(fh)

    assert_dataset_ok(scenarios)
    print(f"OK: {len(scenarios)} scenarios validated at {dataset_path}")
if __name__ == "__main__":
    # Script entry point: report validation failures on stderr with exit status 1.
    try:
        main()
    except ValueError as err:
        # sys.exit() with a string writes it to stderr and exits with status 1.
        sys.exit(f"VALIDATION FAILED: {err}")