Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from env.models import FlakySleuthAction | |
def grade(action: FlakySleuthAction, task: dict) -> float:
    """Score a binary flaky/stable classification by exact match.

    Comparison is case-insensitive and ignores surrounding whitespace on
    both the prediction and the ground-truth label.

    Args:
        action: The submitted action. Only ``action_type`` and ``argument``
            are read.
        task: Task record whose ``"label"`` entry holds the ground truth.
            A missing, ``None`` or blank label is treated as ``"flaky"``.

    Returns:
        ``0.999`` when the action is a ``classify_flakiness`` action whose
        argument equals the ground-truth label, otherwise ``0.001``
        (wrong action type, non-string or unrecognised argument, or a
        mismatching prediction).
    """
    if action.action_type != "classify_flakiness":
        return 0.001

    # A malformed (None / non-string) argument is graded as incorrect
    # instead of crashing the grader with an AttributeError.
    argument = action.argument
    if not isinstance(argument, str):
        return 0.001

    predicted = argument.strip().lower()
    if predicted not in ("flaky", "stable"):
        return 0.001

    # An explicit None label must take the same "flaky" fallback as a
    # missing or blank one; str(None) would otherwise become "none" and
    # never match any valid prediction.
    label = task.get("label")
    ground_truth = ("" if label is None else str(label)).strip().lower() or "flaky"
    return 0.999 if predicted == ground_truth else 0.001