Escanor925 committed on
Commit
d1583fb
·
1 Parent(s): 0e20f4b

fix: clamp scores to (0.001, 0.999) for Phase 2 strict range validation

Browse files
Files changed (3) hide show
  1. environment.py +3 -3
  2. inference.py +4 -4
  3. server/environment.py +3 -3
environment.py CHANGED
@@ -73,7 +73,7 @@ class Action(BaseModel):
73
 
74
  class Reward(BaseModel):
75
  """Deterministic grading result returned after each step."""
76
- total_score: float = Field(..., ge=0.0, le=1.0, description="Final score 0.0–1.0")
77
  breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
78
 
79
 
@@ -228,7 +228,7 @@ class CloudSREEnv:
228
  self._health = max(0.0, self._health - 5.0)
229
 
230
  reward = Reward(
231
- total_score=0.0,
232
  breakdown={
233
  "status": 0.0,
234
  "message_investigating": 0.0,
@@ -335,7 +335,7 @@ class CloudSREEnv:
335
  breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
336
 
337
  raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
338
- total = max(0.0, min(1.0, round(raw - penalty, 4)))
339
 
340
  breakdown["budget_spent"] = self._budget
341
  breakdown["final_health"] = self._health
 
73
 
74
  class Reward(BaseModel):
75
  """Deterministic grading result returned after each step."""
76
+ total_score: float = Field(..., gt=0.0, lt=1.0, description="Final score in (0, 1) exclusive")
77
  breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
78
 
79
 
 
228
  self._health = max(0.0, self._health - 5.0)
229
 
230
  reward = Reward(
231
+ total_score=0.001,
232
  breakdown={
233
  "status": 0.0,
234
  "message_investigating": 0.0,
 
335
  breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
336
 
337
  raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
338
+ total = max(0.001, min(0.999, round(raw - penalty, 4)))
339
 
340
  breakdown["budget_spent"] = self._budget
341
  breakdown["final_health"] = self._health
inference.py CHANGED
@@ -269,7 +269,7 @@ def run_evaluation():
269
 
270
  if done:
271
  reward = step_data["reward"]
272
- score = reward["total_score"]
273
  results[task] = score
274
  print(f"[STEP] step={obs['turn_number']} reward={score}", flush=True)
275
  print(f"[END] task={task} score={score} steps={obs['turn_number']}", flush=True)
@@ -282,10 +282,10 @@ def run_evaluation():
282
  obs = step_data["observation"]
283
 
284
  except Exception as exc:
285
- print(f"[STEP] step={obs['turn_number']} reward=0.0", flush=True)
286
- print(f"[END] task={task} score=0.0 steps={obs['turn_number']}", flush=True)
287
  print(f" ERROR: Environment step failed: {exc}")
288
- results[task] = 0.0
289
  done = True
290
 
291
  time.sleep(0.5)
 
269
 
270
  if done:
271
  reward = step_data["reward"]
272
+ score = max(0.001, min(0.999, float(reward["total_score"])))
273
  results[task] = score
274
  print(f"[STEP] step={obs['turn_number']} reward={score}", flush=True)
275
  print(f"[END] task={task} score={score} steps={obs['turn_number']}", flush=True)
 
282
  obs = step_data["observation"]
283
 
284
  except Exception as exc:
285
+ print(f"[STEP] step={obs['turn_number']} reward=0.001", flush=True)
286
+ print(f"[END] task={task} score=0.001 steps={obs['turn_number']}", flush=True)
287
  print(f" ERROR: Environment step failed: {exc}")
288
+ results[task] = 0.001
289
  done = True
290
 
291
  time.sleep(0.5)
server/environment.py CHANGED
@@ -73,7 +73,7 @@ class Action(BaseModel):
73
 
74
  class Reward(BaseModel):
75
  """Deterministic grading result returned after each step."""
76
- total_score: float = Field(..., ge=0.0, le=1.0, description="Final score 0.0–1.0")
77
  breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
78
 
79
 
@@ -228,7 +228,7 @@ class CloudSREEnv:
228
  self._health = max(0.0, self._health - 5.0)
229
 
230
  reward = Reward(
231
- total_score=0.0,
232
  breakdown={
233
  "status": 0.0,
234
  "message_investigating": 0.0,
@@ -335,7 +335,7 @@ class CloudSREEnv:
335
  breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
336
 
337
  raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
338
- total = max(0.0, min(1.0, round(raw - penalty, 4)))
339
 
340
  breakdown["budget_spent"] = self._budget
341
  breakdown["final_health"] = self._health
 
73
 
74
  class Reward(BaseModel):
75
  """Deterministic grading result returned after each step."""
76
+ total_score: float = Field(..., gt=0.0, lt=1.0, description="Final score in (0, 1) exclusive")
77
  breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
78
 
79
 
 
228
  self._health = max(0.0, self._health - 5.0)
229
 
230
  reward = Reward(
231
+ total_score=0.001,
232
  breakdown={
233
  "status": 0.0,
234
  "message_investigating": 0.0,
 
335
  breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
336
 
337
  raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
338
+ total = max(0.001, min(0.999, round(raw - penalty, 4)))
339
 
340
  breakdown["budget_spent"] = self._budget
341
  breakdown["final_health"] = self._health