Spaces:
Sleeping
Sleeping
File size: 2,242 Bytes
91e7690 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | import requests
# Root URL that post() prepends to every request path (a service on local port 7860).
BASE = "http://localhost:7860"
def post(path, payload):
    """Send *payload* as a JSON POST to ``BASE`` + *path* and return the decoded reply.

    Args:
        path: URL path appended directly to ``BASE`` (e.g. ``"/reset"``).
        payload: JSON-serialisable object sent as the request body.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    url = f"{BASE}{path}"
    # 20-second timeout so a hung server cannot stall the whole check.
    response = requests.post(url, json=payload, timeout=20)
    response.raise_for_status()
    return response.json()
def score(task_id, report):
    """Reset the environment for *task_id*, submit *report*, and return its reward.

    Args:
        task_id: Identifier sent to ``/reset`` (together with a fixed seed of 42).
        report: Report object submitted through a ``submit_report`` action.

    Returns:
        A ``(value, breakdown)`` tuple taken from the ``reward`` entry of the
        ``/step`` response; ``breakdown`` is ``{}`` when the server omits it.
    """
    post("/reset", {"task_id": task_id, "seed": 42})

    submit_action = {"action_type": "submit_report", "report": report}
    step_result = post("/step", {"action": submit_action})

    reward = step_result["reward"]
    return reward["value"], reward.get("breakdown", {})
# Report with every field left empty / zero; main() scores it as the baseline
# that each BETTER_* report is compared against.
EMPTY = {
    "null_issues": {},
    "duplicate_row_count": 0,
    "schema_violations": [],
    "drifted_columns": [],
    "drift_details": {},
    "recommended_fixes": [],
}
# Filled-in report that main() submits for task 1: it populates only the
# null-count, duplicate-count and recommended-fix fields.
BETTER_T1 = {
    "null_issues": {
        "email": 10,
        "customer_id": 4,
    },
    "duplicate_row_count": 15,
    "schema_violations": [],
    "drifted_columns": [],
    "drift_details": {},
    "recommended_fixes": [
        "dedupe rows",
        "fill nulls",
    ],
}
# Filled-in report that main() submits for task 2: it lists schema violations
# (one entry per column / issue type, each with an example value).
BETTER_T2 = {
    "null_issues": {"negative_quantity_rows": 7},
    "duplicate_row_count": 0,
    "schema_violations": [
        {
            "column": "amount",
            "issue_type": "type_violation",
            "example": "$12.50",
        },
        {
            "column": "order_date",
            "issue_type": "date_format_violation",
            "example": "Jan 5 2024",
        },
        {
            "column": "amount",
            "issue_type": "unparseable",
            "example": "N/A",
        },
        {
            "column": "quantity",
            "issue_type": "negative_value",
            "example": "-3",
        },
    ],
    "drifted_columns": [],
    "drift_details": {},
    "recommended_fixes": [
        "parse amount",
        "normalize date",
        "clamp quantity",
    ],
}
# Filled-in report that main() submits for task 3: it populates the drift
# fields plus one schema violation.
BETTER_T3 = {
    "null_issues": {},
    "duplicate_row_count": 0,
    "schema_violations": [
        {
            "column": "category",
            "issue_type": "new_values",
            "example": "crypto, NFT",
        },
    ],
    "drifted_columns": ["amount"],
    "drift_details": {
        "amount": "mean shifted from ~50 to ~78",
        "user_id": "new users around 15%",
    },
    "recommended_fixes": [
        "monitor drift",
        "update reference sets",
    ],
}
def main():
    """Check that each filled-in report scores at least as high as the empty one.

    For tasks 1 to 3, scores ``EMPTY`` and then the matching ``BETTER_T*``
    report, printing both scores and the breakdown of the latter.

    Raises:
        SystemExit: if a filled-in report scores strictly lower than ``EMPTY``.
    """
    candidates = (BETTER_T1, BETTER_T2, BETTER_T3)
    # Task ids are 1-based and line up with the order of the candidate reports.
    for task_id, candidate in enumerate(candidates, start=1):
        baseline, _ = score(task_id, EMPTY)
        improved, breakdown = score(task_id, candidate)
        print(
            f"task {task_id}: empty={baseline:.3f} "
            f"better={improved:.3f} breakdown={breakdown}"
        )
        if improved < baseline:
            raise SystemExit(f"Unexpected scoring regression on task {task_id}")

    print("grader dynamics check passed")
# Run the check only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|