from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Needed only for the annotation on ``grade``; annotations are never
    # evaluated at runtime because of the ``__future__`` import above.
    from env.models import FlakySleuthAction


def grade(action: FlakySleuthAction, task: dict) -> float:
    """Score a binary flaky/stable classification by exact match.

    Args:
        action: The submitted action. Only ``action_type`` and ``argument``
            are read.
        task: Task record. ``task["label"]`` is the ground truth; a missing,
            ``None`` or blank label is treated as ``"flaky"``.

    Returns:
        ``0.999`` when the action is a ``classify_flakiness`` action whose
        argument (trimmed, case-insensitive) equals the ground-truth label,
        otherwise ``0.001``.
    """
    if action.action_type != "classify_flakiness":
        return 0.001

    # A missing or non-string argument is an invalid prediction, not a crash.
    argument = action.argument
    if not isinstance(argument, str):
        return 0.001

    predicted = argument.strip().lower()
    if predicted not in ("flaky", "stable"):
        return 0.001

    # An explicit None label must take the same "flaky" fallback as a missing
    # or empty one; str(None) would otherwise yield "none" and never match.
    label = task.get("label")
    ground_truth = "" if label is None else str(label).strip().lower()
    ground_truth = ground_truth or "flaky"

    return 0.999 if predicted == ground_truth else 0.001