Commit · d1583fb
1 Parent(s): 0e20f4b
fix: clamp scores to (0.001, 0.999) for Phase 2 strict range validation
Browse files
- environment.py +3 -3
- inference.py +4 -4
- server/environment.py +3 -3
environment.py
CHANGED
|
@@ -73,7 +73,7 @@ class Action(BaseModel):
|
|
| 73 |
|
| 74 |
class Reward(BaseModel):
|
| 75 |
"""Deterministic grading result returned after each step."""
|
| 76 |
-
total_score: float = Field(...,
|
| 77 |
breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
|
| 78 |
|
| 79 |
|
|
@@ -228,7 +228,7 @@ class CloudSREEnv:
|
|
| 228 |
self._health = max(0.0, self._health - 5.0)
|
| 229 |
|
| 230 |
reward = Reward(
|
| 231 |
-
total_score=0.
|
| 232 |
breakdown={
|
| 233 |
"status": 0.0,
|
| 234 |
"message_investigating": 0.0,
|
|
@@ -335,7 +335,7 @@ class CloudSREEnv:
|
|
| 335 |
breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
|
| 336 |
|
| 337 |
raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
|
| 338 |
-
total = max(0.
|
| 339 |
|
| 340 |
breakdown["budget_spent"] = self._budget
|
| 341 |
breakdown["final_health"] = self._health
|
|
|
|
| 73 |
|
| 74 |
class Reward(BaseModel):
|
| 75 |
"""Deterministic grading result returned after each step."""
|
| 76 |
+
total_score: float = Field(..., gt=0.0, lt=1.0, description="Final score in (0, 1) exclusive")
|
| 77 |
breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
|
| 78 |
|
| 79 |
|
|
|
|
| 228 |
self._health = max(0.0, self._health - 5.0)
|
| 229 |
|
| 230 |
reward = Reward(
|
| 231 |
+
total_score=0.001,
|
| 232 |
breakdown={
|
| 233 |
"status": 0.0,
|
| 234 |
"message_investigating": 0.0,
|
|
|
|
| 335 |
breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
|
| 336 |
|
| 337 |
raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
|
| 338 |
+
total = max(0.001, min(0.999, round(raw - penalty, 4)))
|
| 339 |
|
| 340 |
breakdown["budget_spent"] = self._budget
|
| 341 |
breakdown["final_health"] = self._health
|
inference.py
CHANGED
|
@@ -269,7 +269,7 @@ def run_evaluation():
|
|
| 269 |
|
| 270 |
if done:
|
| 271 |
reward = step_data["reward"]
|
| 272 |
-
score = reward["total_score"]
|
| 273 |
results[task] = score
|
| 274 |
print(f"[STEP] step={obs['turn_number']} reward={score}", flush=True)
|
| 275 |
print(f"[END] task={task} score={score} steps={obs['turn_number']}", flush=True)
|
|
@@ -282,10 +282,10 @@ def run_evaluation():
|
|
| 282 |
obs = step_data["observation"]
|
| 283 |
|
| 284 |
except Exception as exc:
|
| 285 |
-
print(f"[STEP] step={obs['turn_number']} reward=0.
|
| 286 |
-
print(f"[END] task={task} score=0.
|
| 287 |
print(f" ERROR: Environment step failed: {exc}")
|
| 288 |
-
results[task] = 0.
|
| 289 |
done = True
|
| 290 |
|
| 291 |
time.sleep(0.5)
|
|
|
|
| 269 |
|
| 270 |
if done:
|
| 271 |
reward = step_data["reward"]
|
| 272 |
+
score = max(0.001, min(0.999, float(reward["total_score"])))
|
| 273 |
results[task] = score
|
| 274 |
print(f"[STEP] step={obs['turn_number']} reward={score}", flush=True)
|
| 275 |
print(f"[END] task={task} score={score} steps={obs['turn_number']}", flush=True)
|
|
|
|
| 282 |
obs = step_data["observation"]
|
| 283 |
|
| 284 |
except Exception as exc:
|
| 285 |
+
print(f"[STEP] step={obs['turn_number']} reward=0.001", flush=True)
|
| 286 |
+
print(f"[END] task={task} score=0.001 steps={obs['turn_number']}", flush=True)
|
| 287 |
print(f" ERROR: Environment step failed: {exc}")
|
| 288 |
+
results[task] = 0.001
|
| 289 |
done = True
|
| 290 |
|
| 291 |
time.sleep(0.5)
|
server/environment.py
CHANGED
|
@@ -73,7 +73,7 @@ class Action(BaseModel):
|
|
| 73 |
|
| 74 |
class Reward(BaseModel):
|
| 75 |
"""Deterministic grading result returned after each step."""
|
| 76 |
-
total_score: float = Field(...,
|
| 77 |
breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
|
| 78 |
|
| 79 |
|
|
@@ -228,7 +228,7 @@ class CloudSREEnv:
|
|
| 228 |
self._health = max(0.0, self._health - 5.0)
|
| 229 |
|
| 230 |
reward = Reward(
|
| 231 |
-
total_score=0.
|
| 232 |
breakdown={
|
| 233 |
"status": 0.0,
|
| 234 |
"message_investigating": 0.0,
|
|
@@ -335,7 +335,7 @@ class CloudSREEnv:
|
|
| 335 |
breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
|
| 336 |
|
| 337 |
raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
|
| 338 |
-
total = max(0.
|
| 339 |
|
| 340 |
breakdown["budget_spent"] = self._budget
|
| 341 |
breakdown["final_health"] = self._health
|
|
|
|
| 73 |
|
| 74 |
class Reward(BaseModel):
|
| 75 |
"""Deterministic grading result returned after each step."""
|
| 76 |
+
total_score: float = Field(..., gt=0.0, lt=1.0, description="Final score in (0, 1) exclusive")
|
| 77 |
breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
|
| 78 |
|
| 79 |
|
|
|
|
| 228 |
self._health = max(0.0, self._health - 5.0)
|
| 229 |
|
| 230 |
reward = Reward(
|
| 231 |
+
total_score=0.001,
|
| 232 |
breakdown={
|
| 233 |
"status": 0.0,
|
| 234 |
"message_investigating": 0.0,
|
|
|
|
| 335 |
breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
|
| 336 |
|
| 337 |
raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
|
| 338 |
+
total = max(0.001, min(0.999, round(raw - penalty, 4)))
|
| 339 |
|
| 340 |
breakdown["budget_spent"] = self._budget
|
| 341 |
breakdown["final_health"] = self._health
|