Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- graders.py +33 -13
graders.py
CHANGED
|
@@ -6,20 +6,40 @@ from tasks import TaskConfig
|
|
| 6 |
|
| 7 |
|
| 8 |
SCORE_EPS = 1e-3
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def _clamp(value: float, low: float = SCORE_EPS, high: float = 1.0 - SCORE_EPS) -> float:
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
def _strict_score(value: float) -> float:
    """Clamp ``value`` using ``SCORE_EPS`` and ``1.0 - SCORE_EPS`` as bounds.

    This is a thin wrapper that forwards to ``_clamp`` with both bounds
    passed explicitly.
    """
    lower_bound = SCORE_EPS
    upper_bound = 1.0 - SCORE_EPS
    return _clamp(value, lower_bound, upper_bound)
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
def grade_episode(task: TaskConfig, metrics: dict[str, Any]) -> float:
|
| 20 |
-
weights = task.score_weights
|
| 21 |
-
score = sum(
|
| 22 |
-
return
|
| 23 |
|
| 24 |
|
| 25 |
def summarize_episode(total_reward: float, state_history: list[dict[str, Any]], terminal_outcome: str) -> dict[str, Any]:
|
|
@@ -51,16 +71,16 @@ def summarize_episode(total_reward: float, state_history: list[dict[str, Any]],
|
|
| 51 |
timeliness = _clamp(1.0 - (first_meaningful_step / step_count))
|
| 52 |
stability = _clamp(sum(item.get("stability_score", 0.0) for item in state_history) / step_count)
|
| 53 |
safety = _clamp(1.0 - (safety_violations / step_count))
|
| 54 |
-
outcome = 1.0
|
| 55 |
return {
|
| 56 |
"steps": step_count,
|
| 57 |
-
"avg_reward":
|
| 58 |
-
"detection":
|
| 59 |
-
"lab_workup":
|
| 60 |
-
"treatment":
|
| 61 |
-
"timeliness":
|
| 62 |
-
"stability":
|
| 63 |
-
"safety":
|
| 64 |
-
"safety_violation_rate":
|
| 65 |
"outcome": outcome,
|
| 66 |
}
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
SCORE_EPS = 1e-3
# Inset applied on top of a bound when a value is pinned to it.
# NOTE(review): presumably sized to survive the six-decimal rounding done by
# the formatting helpers in this module -- confirm.
SCORE_MARGIN = 1e-6


def _clamp(value: float, low: float = SCORE_EPS, high: float = 1.0 - SCORE_EPS) -> float:
    """Coerce ``value`` to a float within ``[low + SCORE_MARGIN, high - SCORE_MARGIN]``.

    Args:
        value: Candidate number. Anything ``float()`` rejects with
            ``TypeError``/``ValueError`` (e.g. ``None`` or a non-numeric
            string) is treated as the lowest allowed value, as is NaN.
        low: Lower bound; the smallest value returned is ``low + SCORE_MARGIN``.
        high: Upper bound; the largest value returned is ``high - SCORE_MARGIN``.

    Returns:
        ``value`` as a float when it lies strictly between the inset bounds,
        otherwise the nearer inset bound.
    """
    floor = low + SCORE_MARGIN
    ceiling = high - SCORE_MARGIN
    try:
        numeric_value = float(value)
    except (TypeError, ValueError):
        return floor
    # NaN is the only float that is not equal to itself; every ordering
    # comparison against it is False, so without this check it would slip
    # past both bounds and be returned unchanged.
    if numeric_value != numeric_value:
        return floor
    # Compare against the inset bounds (not the raw ones) so the mapping is
    # monotonic and a value just inside a raw bound cannot round onto it.
    if numeric_value <= floor:
        return floor
    if numeric_value >= ceiling:
        return ceiling
    return numeric_value
|
| 22 |
|
| 23 |
|
| 24 |
def _strict_score(value: float) -> float:
    """Clamp ``value`` using ``SCORE_EPS`` and ``1.0 - SCORE_EPS`` as bounds.

    This is a thin wrapper that forwards to ``_clamp`` with both bounds
    passed explicitly.
    """
    lower_bound = SCORE_EPS
    upper_bound = 1.0 - SCORE_EPS
    return _clamp(value, lower_bound, upper_bound)
|
| 26 |
|
| 27 |
|
| 28 |
+
def _normalize_weights(weights: dict[str, float]) -> dict[str, float]:
    """Rescale ``weights`` so that the values sum to 1.

    Each weight is converted with ``float()`` before use. When the converted
    values sum to zero or less, the input mapping itself is returned
    unchanged (the same object, not a copy).
    """
    as_floats = {metric_name: float(raw) for metric_name, raw in weights.items()}
    weight_sum = sum(as_floats.values())
    if weight_sum <= 0:
        # Nothing sensible to divide by; hand the caller's mapping back as-is.
        return weights
    return {
        metric_name: share / weight_sum
        for metric_name, share in as_floats.items()
    }
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _format_metric(value: float) -> float:
    """Clamp ``value`` with ``_clamp``'s default bounds and round it to six decimals.

    Rounding is done by formatting with ``.6f`` and parsing the text back
    into a float.
    """
    bounded = _clamp(value)
    six_decimal_text = format(bounded, ".6f")
    return float(six_decimal_text)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
def grade_episode(task: TaskConfig, metrics: dict[str, Any]) -> float:
    """Compute the weighted score of an episode from its metrics.

    The weights come from ``task.score_weights`` after ``_normalize_weights``.
    Each metric named by a weight is looked up in ``metrics`` (missing entries
    count as ``0.0``), clamped with ``_clamp``, and multiplied by its weight.
    The sum is passed through ``_strict_score`` and rounded to six decimals.
    """
    normalized = _normalize_weights(task.score_weights)
    contributions = [
        share * _clamp(metrics.get(metric_name, 0.0))
        for metric_name, share in normalized.items()
    ]
    bounded_total = _strict_score(sum(contributions))
    return float(format(bounded_total, ".6f"))
|
| 43 |
|
| 44 |
|
| 45 |
def summarize_episode(total_reward: float, state_history: list[dict[str, Any]], terminal_outcome: str) -> dict[str, Any]:
|
|
|
|
| 71 |
timeliness = _clamp(1.0 - (first_meaningful_step / step_count))
|
| 72 |
stability = _clamp(sum(item.get("stability_score", 0.0) for item in state_history) / step_count)
|
| 73 |
safety = _clamp(1.0 - (safety_violations / step_count))
|
| 74 |
+
outcome = _format_metric(1.0 if terminal_outcome == "survived" else 0.0)
|
| 75 |
return {
|
| 76 |
"steps": step_count,
|
| 77 |
+
"avg_reward": _format_metric(total_reward / step_count),
|
| 78 |
+
"detection": _format_metric(detection),
|
| 79 |
+
"lab_workup": _format_metric(lab_workup),
|
| 80 |
+
"treatment": _format_metric(treatment),
|
| 81 |
+
"timeliness": _format_metric(timeliness),
|
| 82 |
+
"stability": _format_metric(stability),
|
| 83 |
+
"safety": _format_metric(safety),
|
| 84 |
+
"safety_violation_rate": _format_metric(safety_violations / step_count),
|
| 85 |
"outcome": outcome,
|
| 86 |
}
|