Escanor925 committed on
Commit
d1583fb
·
1 Parent(s): 0e20f4b

fix: clamp scores to (0.001, 0.999) for Phase 2 strict range validation

Browse files
Files changed (3) hide show
  1. environment.py +3 -3
  2. inference.py +4 -4
  3. server/environment.py +3 -3
environment.py CHANGED
@@ -73,7 +73,7 @@ class Action(BaseModel):
73
 
74
  class Reward(BaseModel):
75
  """Deterministic grading result returned after each step."""
76
- total_score: float = Field(..., ge=0.0, le=1.0, description="Final score 0.0–1.0")
77
  breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
78
 
79
 
@@ -228,7 +228,7 @@ class CloudSREEnv:
228
  self._health = max(0.0, self._health - 5.0)
229
 
230
  reward = Reward(
231
- total_score=0.0,
232
  breakdown={
233
  "status": 0.0,
234
  "message_investigating": 0.0,
@@ -335,7 +335,7 @@ class CloudSREEnv:
335
  breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
336
 
337
  raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
338
- total = max(0.0, min(1.0, round(raw - penalty, 4)))
339
 
340
  breakdown["budget_spent"] = self._budget
341
  breakdown["final_health"] = self._health
 
73
 
74
  class Reward(BaseModel):
75
  """Deterministic grading result returned after each step."""
76
+ total_score: float = Field(..., gt=0.0, lt=1.0, description="Final score in (0, 1) exclusive")
77
  breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
78
 
79
 
 
228
  self._health = max(0.0, self._health - 5.0)
229
 
230
  reward = Reward(
231
+ total_score=0.001,
232
  breakdown={
233
  "status": 0.0,
234
  "message_investigating": 0.0,
 
335
  breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
336
 
337
  raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
338
+ total = max(0.001, min(0.999, round(raw - penalty, 4)))
339
 
340
  breakdown["budget_spent"] = self._budget
341
  breakdown["final_health"] = self._health
inference.py CHANGED
@@ -269,7 +269,7 @@ def run_evaluation():
269
 
270
  if done:
271
  reward = step_data["reward"]
272
- score = reward["total_score"]
273
  results[task] = score
274
  print(f"[STEP] step={obs['turn_number']} reward={score}", flush=True)
275
  print(f"[END] task={task} score={score} steps={obs['turn_number']}", flush=True)
@@ -282,10 +282,10 @@ def run_evaluation():
282
  obs = step_data["observation"]
283
 
284
  except Exception as exc:
285
- print(f"[STEP] step={obs['turn_number']} reward=0.0", flush=True)
286
- print(f"[END] task={task} score=0.0 steps={obs['turn_number']}", flush=True)
287
  print(f" ERROR: Environment step failed: {exc}")
288
- results[task] = 0.0
289
  done = True
290
 
291
  time.sleep(0.5)
 
269
 
270
  if done:
271
  reward = step_data["reward"]
272
+ score = max(0.001, min(0.999, float(reward["total_score"])))
273
  results[task] = score
274
  print(f"[STEP] step={obs['turn_number']} reward={score}", flush=True)
275
  print(f"[END] task={task} score={score} steps={obs['turn_number']}", flush=True)
 
282
  obs = step_data["observation"]
283
 
284
  except Exception as exc:
285
+ print(f"[STEP] step={obs['turn_number']} reward=0.001", flush=True)
286
+ print(f"[END] task={task} score=0.001 steps={obs['turn_number']}", flush=True)
287
  print(f" ERROR: Environment step failed: {exc}")
288
+ results[task] = 0.001
289
  done = True
290
 
291
  time.sleep(0.5)
server/environment.py CHANGED
@@ -73,7 +73,7 @@ class Action(BaseModel):
73
 
74
  class Reward(BaseModel):
75
  """Deterministic grading result returned after each step."""
76
- total_score: float = Field(..., ge=0.0, le=1.0, description="Final score 0.0–1.0")
77
  breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
78
 
79
 
@@ -228,7 +228,7 @@ class CloudSREEnv:
228
  self._health = max(0.0, self._health - 5.0)
229
 
230
  reward = Reward(
231
- total_score=0.0,
232
  breakdown={
233
  "status": 0.0,
234
  "message_investigating": 0.0,
@@ -335,7 +335,7 @@ class CloudSREEnv:
335
  breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
336
 
337
  raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
338
- total = max(0.0, min(1.0, round(raw - penalty, 4)))
339
 
340
  breakdown["budget_spent"] = self._budget
341
  breakdown["final_health"] = self._health
 
73
 
74
  class Reward(BaseModel):
75
  """Deterministic grading result returned after each step."""
76
+ total_score: float = Field(..., gt=0.0, lt=1.0, description="Final score in (0, 1) exclusive")
77
  breakdown: Dict[str, float] = Field(..., description="Score breakdown by category")
78
 
79
 
 
228
  self._health = max(0.0, self._health - 5.0)
229
 
230
  reward = Reward(
231
+ total_score=0.001,
232
  breakdown={
233
  "status": 0.0,
234
  "message_investigating": 0.0,
 
335
  breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY
336
 
337
  raw = sum(v for k, v in breakdown.items() if not k.startswith("penalty_"))
338
+ total = max(0.001, min(0.999, round(raw - penalty, 4)))
339
 
340
  breakdown["budget_spent"] = self._budget
341
  breakdown["final_health"] = self._health