Fix grading keys mismatch: allow actual dataset metrics to be graded
Browse files
- server/grader.py (+13 -6)
server/grader.py
CHANGED
|
@@ -199,21 +199,28 @@ def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
|
|
| 199 |
"""
|
| 200 |
# 1. Structure Score (30%)
|
| 201 |
outcomes = action.expected_outcomes
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
# 2. Tradeoff Realism Check (50%)
|
| 207 |
realism_score = 0.5 # default
|
| 208 |
-
if keys_present =
|
| 209 |
values = []
|
| 210 |
-
for k in
|
| 211 |
v = outcomes[k]
|
| 212 |
# Normalise: accept 0-1 floats OR 0-100 integers
|
| 213 |
if isinstance(v, (int, float)):
|
| 214 |
values.append(float(v) if v <= 1.0 else float(v) / 100.0)
|
| 215 |
|
| 216 |
-
if len(values) =
|
| 217 |
all_high = all(v > 0.7 for v in values)
|
| 218 |
all_positive = all(v > 0 for v in values)
|
| 219 |
|
|
|
|
| 199 |
"""
|
| 200 |
# 1. Structure Score (30%)
|
| 201 |
outcomes = action.expected_outcomes
|
| 202 |
+
valid_keys = {
|
| 203 |
+
"fraud_rate", "revenue_velocity", "seller_trust",
|
| 204 |
+
"false_positive_rate", "fraud_detection_rate",
|
| 205 |
+
"seller_trust_score", "review_queue_overload",
|
| 206 |
+
"legitimate_revenue_lost"
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
present_valid_keys = [k for k in outcomes.keys() if k in valid_keys]
|
| 210 |
+
keys_present = len(present_valid_keys)
|
| 211 |
+
structure_score = min(keys_present / 3.0, 1.0)
|
| 212 |
|
| 213 |
# 2. Tradeoff Realism Check (50%)
|
| 214 |
realism_score = 0.5 # default
|
| 215 |
+
if keys_present >= 3:
|
| 216 |
values = []
|
| 217 |
+
for k in present_valid_keys:
|
| 218 |
v = outcomes[k]
|
| 219 |
# Normalise: accept 0-1 floats OR 0-100 integers
|
| 220 |
if isinstance(v, (int, float)):
|
| 221 |
values.append(float(v) if v <= 1.0 else float(v) / 100.0)
|
| 222 |
|
| 223 |
+
if len(values) >= 3:
|
| 224 |
all_high = all(v > 0.7 for v in values)
|
| 225 |
all_positive = all(v > 0 for v in values)
|
| 226 |
|