Spaces:
Sleeping
Sleeping
Fix Phase 2: grader scores in (0,1), task IDs, and smoke tests
Browse files- app.py +2 -1
- data/tasks.json +3 -0
- inference.py +3 -3
- openenv.yaml +3 -3
- smoke_test.py +16 -4
- tasks.py +6 -4
app.py
CHANGED
|
@@ -95,7 +95,7 @@ def grader(req: GraderRequest):
|
|
| 95 |
max_steps=env.task_data.get("max_steps", 20),
|
| 96 |
)
|
| 97 |
return {
|
| 98 |
-
"score": score,
|
| 99 |
"feedback": feedback,
|
| 100 |
"breakdown": breakdown,
|
| 101 |
"task_id": env.task_data.get("id", f"{env.difficulty}-0"),
|
|
@@ -120,6 +120,7 @@ def tasks():
|
|
| 120 |
"required_evidence": task.get("required_evidence", []),
|
| 121 |
"supports_actions": ["investigate", "correlate", "query_metrics", "check_logs", "conclude"],
|
| 122 |
"has_grader": True,
|
|
|
|
| 123 |
})
|
| 124 |
return {"tasks": task_list, "action_schema": action_schema}
|
| 125 |
|
|
|
|
| 95 |
max_steps=env.task_data.get("max_steps", 20),
|
| 96 |
)
|
| 97 |
return {
|
| 98 |
+
"score": float(score),
|
| 99 |
"feedback": feedback,
|
| 100 |
"breakdown": breakdown,
|
| 101 |
"task_id": env.task_data.get("id", f"{env.difficulty}-0"),
|
|
|
|
| 120 |
"required_evidence": task.get("required_evidence", []),
|
| 121 |
"supports_actions": ["investigate", "correlate", "query_metrics", "check_logs", "conclude"],
|
| 122 |
"has_grader": True,
|
| 123 |
+
"grader": {"enabled": True},
|
| 124 |
})
|
| 125 |
return {"tasks": task_list, "action_schema": action_schema}
|
| 126 |
|
data/tasks.json
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"easy": [
|
| 3 |
{
|
|
|
|
| 4 |
"alarms": [
|
| 5 |
{
|
| 6 |
"id": "INC-HMIS-Shahdara-J1.1K-PE-T4-NR-269506_1770963126.613",
|
|
@@ -152,6 +153,7 @@
|
|
| 152 |
],
|
| 153 |
"medium": [
|
| 154 |
{
|
|
|
|
| 155 |
"alarms": [
|
| 156 |
{
|
| 157 |
"id": "INC-Baroda_House_MX-204-cfabf4bc_1771263950.9",
|
|
@@ -496,6 +498,7 @@
|
|
| 496 |
],
|
| 497 |
"hard": [
|
| 498 |
{
|
|
|
|
| 499 |
"alarms": [
|
| 500 |
{
|
| 501 |
"id": "INC-GTL-C8201-T1-R2-P-3db6a007_1772517515.197",
|
|
|
|
| 1 |
{
|
| 2 |
"easy": [
|
| 3 |
{
|
| 4 |
+
"id": "easy-0",
|
| 5 |
"alarms": [
|
| 6 |
{
|
| 7 |
"id": "INC-HMIS-Shahdara-J1.1K-PE-T4-NR-269506_1770963126.613",
|
|
|
|
| 153 |
],
|
| 154 |
"medium": [
|
| 155 |
{
|
| 156 |
+
"id": "medium-0",
|
| 157 |
"alarms": [
|
| 158 |
{
|
| 159 |
"id": "INC-Baroda_House_MX-204-cfabf4bc_1771263950.9",
|
|
|
|
| 498 |
],
|
| 499 |
"hard": [
|
| 500 |
{
|
| 501 |
+
"id": "hard-0",
|
| 502 |
"alarms": [
|
| 503 |
{
|
| 504 |
"id": "INC-GTL-C8201-T1-R2-P-3db6a007_1772517515.197",
|
inference.py
CHANGED
|
@@ -88,17 +88,17 @@ def run_episode(task_name: str = "network-rca", benchmark: str = "openenv") -> N
|
|
| 88 |
|
| 89 |
step_idx = idx
|
| 90 |
reward_value = float(score)
|
| 91 |
-
rewards.append(f"{reward_value:.
|
| 92 |
error_value = _safe_error(info.get("last_action_error"))
|
| 93 |
print(
|
| 94 |
f"[STEP] step={step_idx} action={action.action_type} "
|
| 95 |
-
f"reward={reward_value:.
|
| 96 |
)
|
| 97 |
success = True
|
| 98 |
except Exception as exc:
|
| 99 |
print(
|
| 100 |
f"[STEP] step={step_idx + 1} action=error "
|
| 101 |
-
f"reward=0.
|
| 102 |
)
|
| 103 |
finally:
|
| 104 |
if hasattr(env, "close"):
|
|
|
|
| 88 |
|
| 89 |
step_idx = idx
|
| 90 |
reward_value = float(score)
|
| 91 |
+
rewards.append(f"{reward_value:.4f}")
|
| 92 |
error_value = _safe_error(info.get("last_action_error"))
|
| 93 |
print(
|
| 94 |
f"[STEP] step={step_idx} action={action.action_type} "
|
| 95 |
+
f"reward={reward_value:.4f} done={_bool_str(step_idx == len(task_ids))} error={error_value}"
|
| 96 |
)
|
| 97 |
success = True
|
| 98 |
except Exception as exc:
|
| 99 |
print(
|
| 100 |
f"[STEP] step={step_idx + 1} action=error "
|
| 101 |
+
f"reward=0.0100 done=false error={_safe_error(exc)}"
|
| 102 |
)
|
| 103 |
finally:
|
| 104 |
if hasattr(env, "close"):
|
openenv.yaml
CHANGED
|
@@ -15,21 +15,21 @@ tasks:
|
|
| 15 |
description: Diagnose a single node-down incident from alarms, topology, metrics, and logs.
|
| 16 |
difficulty: easy
|
| 17 |
max_steps: 10
|
| 18 |
-
reward_range: [0.
|
| 19 |
has_grader: true
|
| 20 |
- id: medium-0
|
| 21 |
name: OSPF neighbor cascade (medium)
|
| 22 |
description: Correlate multiple OSPF neighbor alarms and identify the root cause with evidence.
|
| 23 |
difficulty: medium
|
| 24 |
max_steps: 10
|
| 25 |
-
reward_range: [0.
|
| 26 |
has_grader: true
|
| 27 |
- id: hard-0
|
| 28 |
name: Multi-link hard RCA (hard)
|
| 29 |
description: Resolve a complex link-down scenario requiring metrics and logs on multiple devices.
|
| 30 |
difficulty: hard
|
| 31 |
max_steps: 10
|
| 32 |
-
reward_range: [0.
|
| 33 |
has_grader: true
|
| 34 |
|
| 35 |
endpoints:
|
|
|
|
| 15 |
description: Diagnose a single node-down incident from alarms, topology, metrics, and logs.
|
| 16 |
difficulty: easy
|
| 17 |
max_steps: 10
|
| 18 |
+
reward_range: [0.01, 0.99]
|
| 19 |
has_grader: true
|
| 20 |
- id: medium-0
|
| 21 |
name: OSPF neighbor cascade (medium)
|
| 22 |
description: Correlate multiple OSPF neighbor alarms and identify the root cause with evidence.
|
| 23 |
difficulty: medium
|
| 24 |
max_steps: 10
|
| 25 |
+
reward_range: [0.01, 0.99]
|
| 26 |
has_grader: true
|
| 27 |
- id: hard-0
|
| 28 |
name: Multi-link hard RCA (hard)
|
| 29 |
description: Resolve a complex link-down scenario requiring metrics and logs on multiple devices.
|
| 30 |
difficulty: hard
|
| 31 |
max_steps: 10
|
| 32 |
+
reward_range: [0.01, 0.99]
|
| 33 |
has_grader: true
|
| 34 |
|
| 35 |
endpoints:
|
smoke_test.py
CHANGED
|
@@ -7,14 +7,16 @@ import urllib.request
|
|
| 7 |
BASE_URL = "http://127.0.0.1:7860"
|
| 8 |
|
| 9 |
|
| 10 |
-
def _request(
|
|
|
|
|
|
|
| 11 |
data = None
|
| 12 |
headers = {}
|
| 13 |
if body is not None:
|
| 14 |
data = json.dumps(body).encode("utf-8")
|
| 15 |
headers["Content-Type"] = "application/json"
|
| 16 |
req = urllib.request.Request(f"{BASE_URL}{path}", data=data, headers=headers, method=method)
|
| 17 |
-
with urllib.request.urlopen(req, timeout=
|
| 18 |
payload = resp.read().decode("utf-8")
|
| 19 |
return json.loads(payload) if payload else {}
|
| 20 |
|
|
@@ -39,7 +41,10 @@ def main() -> int:
|
|
| 39 |
checks.append(("/state", isinstance(state.get("step_count"), int), "step_count missing"))
|
| 40 |
|
| 41 |
tasks = _request("GET", "/tasks")
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
grader = _request("POST", "/grader", {"conclusion": "power outage"})
|
| 45 |
score = grader.get("score")
|
|
@@ -47,7 +52,14 @@ def main() -> int:
|
|
| 47 |
score_ok = isinstance(score, (int, float)) and 0.0 < float(score) < 1.0
|
| 48 |
checks.append(("/grader", score_ok, "grader score must be strictly between 0 and 1"))
|
| 49 |
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
has_results = isinstance(baseline.get("results"), dict) and len(baseline["results"]) > 0
|
| 52 |
checks.append(("/baseline", has_results, "baseline results missing"))
|
| 53 |
except urllib.error.URLError as e:
|
|
|
|
| 7 |
BASE_URL = "http://127.0.0.1:7860"
|
| 8 |
|
| 9 |
|
| 10 |
+
def _request(
|
| 11 |
+
method: str, path: str, body: dict | None = None, *, timeout_s: float = 20.0
|
| 12 |
+
) -> dict:
|
| 13 |
data = None
|
| 14 |
headers = {}
|
| 15 |
if body is not None:
|
| 16 |
data = json.dumps(body).encode("utf-8")
|
| 17 |
headers["Content-Type"] = "application/json"
|
| 18 |
req = urllib.request.Request(f"{BASE_URL}{path}", data=data, headers=headers, method=method)
|
| 19 |
+
with urllib.request.urlopen(req, timeout=timeout_s) as resp:
|
| 20 |
payload = resp.read().decode("utf-8")
|
| 21 |
return json.loads(payload) if payload else {}
|
| 22 |
|
|
|
|
| 41 |
checks.append(("/state", isinstance(state.get("step_count"), int), "step_count missing"))
|
| 42 |
|
| 43 |
tasks = _request("GET", "/tasks")
|
| 44 |
+
task_rows = tasks.get("tasks", [])
|
| 45 |
+
checks.append(("/tasks", len(task_rows) >= 3, "tasks < 3"))
|
| 46 |
+
graded = sum(1 for t in task_rows if t.get("has_grader") or (t.get("grader") or {}).get("enabled"))
|
| 47 |
+
checks.append(("/tasks", graded >= 3, f"tasks with graders < 3 (got {graded})"))
|
| 48 |
|
| 49 |
grader = _request("POST", "/grader", {"conclusion": "power outage"})
|
| 50 |
score = grader.get("score")
|
|
|
|
| 52 |
score_ok = isinstance(score, (int, float)) and 0.0 < float(score) < 1.0
|
| 53 |
checks.append(("/grader", score_ok, "grader score must be strictly between 0 and 1"))
|
| 54 |
|
| 55 |
+
for tid in ("easy-0", "medium-0", "hard-0"):
|
| 56 |
+
_request("POST", "/reset", {"difficulty": "easy", "task_id": tid})
|
| 57 |
+
g = _request("POST", "/grader", {"conclusion": "probe"})
|
| 58 |
+
sc = g.get("score")
|
| 59 |
+
ok = isinstance(sc, (int, float)) and 0.0 < float(sc) < 1.0
|
| 60 |
+
checks.append((f"/grader[{tid}]", ok, f"score out of (0,1): {sc!r}"))
|
| 61 |
+
|
| 62 |
+
baseline = _request("GET", "/baseline", timeout_s=180.0)
|
| 63 |
has_results = isinstance(baseline.get("results"), dict) and len(baseline["results"]) > 0
|
| 64 |
checks.append(("/baseline", has_results, "baseline results missing"))
|
| 65 |
except urllib.error.URLError as e:
|
tasks.py
CHANGED
|
@@ -12,7 +12,8 @@ with open(_TASKS_FILE, 'r') as f:
|
|
| 12 |
_TASKS = json.load(f)
|
| 13 |
|
| 14 |
# Phase-2 validators require grader scores strictly inside (0, 1), not 0.0 or 1.0.
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
def _clamp_grader_score_open_interval(score: float) -> float:
|
|
@@ -109,10 +110,11 @@ def grade_episode(
|
|
| 109 |
final_score = max(0.0, min(1.0, final_score))
|
| 110 |
final_score = _clamp_grader_score_open_interval(final_score)
|
| 111 |
|
|
|
|
| 112 |
breakdown = {
|
| 113 |
-
"root_cause_score": round(root_score, 4),
|
| 114 |
-
"evidence_score": round(evidence_score, 4),
|
| 115 |
-
"efficiency_score": round(efficiency_score, 4),
|
| 116 |
"weights": weights,
|
| 117 |
"missing_evidence": missing_evidence,
|
| 118 |
}
|
|
|
|
| 12 |
_TASKS = json.load(f)
|
| 13 |
|
| 14 |
# Phase-2 validators require grader scores strictly inside (0, 1), not 0.0 or 1.0.
|
| 15 |
+
# Use a margin large enough to survive float/json round-trips and strict boundary checks.
|
| 16 |
+
_GRADER_EPS = 0.01
|
| 17 |
|
| 18 |
|
| 19 |
def _clamp_grader_score_open_interval(score: float) -> float:
|
|
|
|
| 110 |
final_score = max(0.0, min(1.0, final_score))
|
| 111 |
final_score = _clamp_grader_score_open_interval(final_score)
|
| 112 |
|
| 113 |
+
# Some validators scan all floats in the grader payload; keep components off 0.0/1.0 too.
|
| 114 |
breakdown = {
|
| 115 |
+
"root_cause_score": round(_clamp_grader_score_open_interval(root_score), 4),
|
| 116 |
+
"evidence_score": round(_clamp_grader_score_open_interval(evidence_score), 4),
|
| 117 |
+
"efficiency_score": round(_clamp_grader_score_open_interval(efficiency_score), 4),
|
| 118 |
"weights": weights,
|
| 119 |
"missing_evidence": missing_evidence,
|
| 120 |
}
|