Spaces:
Sleeping
Sleeping
| import requests | |
# Root URL that post() prefixes to every request path.
BASE = "http://localhost:7860"
def post(path, payload):
    """POST ``payload`` as a JSON body to ``BASE`` + ``path`` and return the parsed JSON reply.

    The request uses a 20-second timeout. ``raise_for_status()`` is called
    before the body is decoded, so an HTTP error status surfaces as an
    exception instead of being parsed as a normal reply.
    """
    url = f"{BASE}{path}"
    response = requests.post(url, json=payload, timeout=20)
    response.raise_for_status()
    return response.json()
def score(task_id, report):
    """Submit ``report`` for ``task_id`` and return ``(reward value, breakdown)``.

    First calls ``/reset`` with the task id and a fixed seed of 42, then sends
    one ``submit_report`` action to ``/step``. The breakdown falls back to an
    empty dict when the reward in the reply carries no ``"breakdown"`` key.
    """
    reset_payload = {"task_id": task_id, "seed": 42}
    post("/reset", reset_payload)

    submit_action = {"action_type": "submit_report", "report": report}
    reward = post("/step", {"action": submit_action})["reward"]
    return reward["value"], reward.get("breakdown", {})
# Report with every field left empty or zero; main() scores it first for each
# task and compares the task-specific report against that score.
EMPTY = {
    "null_issues": {},
    "duplicate_row_count": 0,
    "schema_violations": [],
    "drifted_columns": [],
    "drift_details": {},
    "recommended_fixes": [],
}
# Report that main() submits for task 1; it must not score below EMPTY.
# Only the null counts, duplicate count and fixes are filled in.
BETTER_T1 = {
    "null_issues": {
        "email": 10,
        "customer_id": 4,
    },
    "duplicate_row_count": 15,
    "schema_violations": [],
    "drifted_columns": [],
    "drift_details": {},
    "recommended_fixes": [
        "dedupe rows",
        "fill nulls",
    ],
}
# Report that main() submits for task 2; it must not score below EMPTY.
# Each schema violation entry names a column, an issue type and one example value.
BETTER_T2 = {
    "null_issues": {"negative_quantity_rows": 7},
    "duplicate_row_count": 0,
    "schema_violations": [
        {
            "column": "amount",
            "issue_type": "type_violation",
            "example": "$12.50",
        },
        {
            "column": "order_date",
            "issue_type": "date_format_violation",
            "example": "Jan 5 2024",
        },
        {
            "column": "amount",
            "issue_type": "unparseable",
            "example": "N/A",
        },
        {
            "column": "quantity",
            "issue_type": "negative_value",
            "example": "-3",
        },
    ],
    "drifted_columns": [],
    "drift_details": {},
    "recommended_fixes": [
        "parse amount",
        "normalize date",
        "clamp quantity",
    ],
}
# Report that main() submits for task 3; it must not score below EMPTY.
# Note that "drift_details" has a "user_id" entry although "drifted_columns"
# lists only "amount"; this is kept exactly as written.
BETTER_T3 = {
    "null_issues": {},
    "duplicate_row_count": 0,
    "schema_violations": [
        {
            "column": "category",
            "issue_type": "new_values",
            "example": "crypto, NFT",
        },
    ],
    "drifted_columns": ["amount"],
    "drift_details": {
        "amount": "mean shifted from ~50 to ~78",
        "user_id": "new users around 15%",
    },
    "recommended_fixes": [
        "monitor drift",
        "update reference sets",
    ],
}
def main():
    """Check that each task's fuller report scores at least as well as EMPTY.

    For tasks 1-3 the empty report is scored first, then the matching
    ``BETTER_T*`` report. Both scores and the fuller report's breakdown are
    printed. The run stops with ``SystemExit`` at the first task whose fuller
    report scores strictly lower than the empty one.
    """
    cases = (
        (1, BETTER_T1),
        (2, BETTER_T2),
        (3, BETTER_T3),
    )
    for task_id, better in cases:
        baseline, _ = score(task_id, EMPTY)
        improved, breakdown = score(task_id, better)
        print(f"task {task_id}: empty={baseline:.3f} better={improved:.3f} breakdown={breakdown}")
        # Equal scores are accepted; only a strictly lower score is a failure.
        if improved < baseline:
            raise SystemExit(f"Unexpected scoring regression on task {task_id}")

    print("grader dynamics check passed")
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()