Spaces:
Sleeping
Sleeping
| """Validate scam_dataset.json: unique ids, required fields, label consistency.""" | |
| from __future__ import annotations | |
| import json | |
| import sys | |
| from pathlib import Path | |
# Keys every scenario row must carry; the validator checks them in this order.
REQUIRED = (
    "id",
    "difficulty",
    "true_label",
    "sender_type",
    "channel",
    "messages",
    "urgency_score",
)

# Accepted values for a row's "difficulty" field.
ALLOW_DIFF = frozenset(("easy", "medium", "hard"))

# Accepted values for a row's "true_label" field.
ALLOW_LABEL = frozenset(("scam", "legitimate"))
def _is_allowed(value: object, allowed: frozenset) -> bool:
    """Return True when *value* is in *allowed*; unhashable values never match."""
    try:
        return value in allowed
    except TypeError:
        # A list/dict from malformed JSON cannot be hashed for the lookup;
        # treat it as "not allowed" instead of crashing the validator.
        return False


def assert_dataset_ok(rows: list[dict]) -> None:
    """Validate scenario rows, raising ``ValueError`` on the first problem found.

    Checks performed per row, in order:

    1. the row is a mapping and contains every key in ``REQUIRED``;
    2. ``difficulty`` is in ``ALLOW_DIFF`` and ``true_label`` is in ``ALLOW_LABEL``;
    3. ``messages`` is a non-empty list of non-blank strings;
    4. ``urgency_score`` converts to a float within ``[0, 1]``;
    5. ``id`` is hashable and not used by an earlier row;
    6. ``otp_message_index``, when present and not ``None``, converts to an
       int that indexes into ``messages``;
    7. a row that is both ``"hard"`` and ``"scam"`` has at least two messages.

    Args:
        rows: Iterable of row mappings (typically the decoded JSON array).

    Raises:
        ValueError: On the first row that fails any check. Malformed field
            types (e.g. ``null`` scores, list-valued ids) are reported as
            ``ValueError`` too, so a caller that handles ``ValueError`` sees
            every data problem rather than a stray ``TypeError``.
    """
    from collections.abc import Mapping  # local import keeps the block self-contained

    seen_ids: set = set()
    for i, row in enumerate(rows):
        # Everything below uses ``in``, ``[]`` and ``.get`` on the row.
        if not isinstance(row, Mapping):
            raise ValueError(f"Row {i} must be an object, got {type(row).__name__}")

        for k in REQUIRED:
            if k not in row:
                raise ValueError(f"Row {i} missing key {k!r} (id={row.get('id')})")

        if not _is_allowed(row["difficulty"], ALLOW_DIFF):
            raise ValueError(f"Row {i} bad difficulty: {row.get('id')}")
        if not _is_allowed(row["true_label"], ALLOW_LABEL):
            raise ValueError(f"Row {i} bad true_label: {row.get('id')}")

        msgs = row["messages"]
        if (
            not isinstance(msgs, list)
            or not msgs
            or not all(isinstance(m, str) and m.strip() for m in msgs)
        ):
            raise ValueError(f"Row {i} messages must be non-empty list of strings: {row.get('id')}")

        try:
            u = float(row["urgency_score"])
        except (TypeError, ValueError, OverflowError) as err:
            # None, containers, non-numeric strings, or ints too large for a float.
            raise ValueError(f"Row {i} urgency_score must be a number: {row.get('id')}") from err
        # NaN fails this chained comparison and is therefore rejected as well.
        if not 0.0 <= u <= 1.0:
            raise ValueError(f"Row {i} urgency_score out of [0,1]: {row.get('id')}")

        oid = row["id"]
        try:
            hash(oid)
        except TypeError as err:
            # The duplicate check stores ids in a set, so they must be hashable.
            raise ValueError(f"Row {i} id must be hashable: {oid!r}") from err
        if oid in seen_ids:
            raise ValueError(f"Duplicate id: {oid}")
        seen_ids.add(oid)

        otp_idx = row.get("otp_message_index")
        if otp_idx is not None:
            try:
                oi = int(otp_idx)
            except (TypeError, ValueError, OverflowError) as err:
                raise ValueError(f"otp_message_index must be an integer for {oid}") from err
            if not 0 <= oi < len(msgs):
                raise ValueError(f"otp_message_index out of range for {oid}")

        if row["difficulty"] == "hard" and row["true_label"] == "scam" and len(msgs) < 2:
            raise ValueError(f"Hard scam should have multi-turn messages: {oid}")
def main() -> None:
    """Load ``data/scam_dataset.json`` and run the dataset validator on it.

    The file is looked up relative to the directory one level above the
    directory containing this script. On success a one-line summary with the
    row count and file path is printed to stdout. Exceptions raised while
    reading, decoding, or validating the file propagate to the caller.
    """
    script_dir = Path(__file__).resolve().parent
    path = script_dir.parent / "data" / "scam_dataset.json"
    with path.open(encoding="utf-8") as fh:
        rows = json.load(fh)
    assert_dataset_ok(rows)
    print(f"OK: {len(rows)} scenarios validated at {path}")
# Script entry point: run the validation and translate a ValueError into a
# short message on stderr plus exit status 1 instead of a traceback.
if __name__ == "__main__":
    try:
        main()
    except ValueError as err:
        print(f"VALIDATION FAILED: {err}", file=sys.stderr)
        raise SystemExit(1)