Spaces:
Sleeping
Sleeping
Commit ·
7cd2458
1
Parent(s): 7ee6ca2
error handling
Browse files
- openenv.yaml +5 -3
- pytest_output.txt +0 -0
- src/adaptive_alert_triage/server.py +4 -2
- tasks/hard.py +6 -9
- tasks/medium.py +3 -4
- tests/test_tasks.py +17 -12
openenv.yaml
CHANGED
|
@@ -140,7 +140,7 @@ tasks:
|
|
| 140 |
correlation_probability: 0.10
|
| 141 |
success_threshold: 0.70 # correct_actions / total_actions >= 0.70
|
| 142 |
grader: "tasks.easy.EasyTaskGrader"
|
| 143 |
-
grading_formula: "score = correct_actions / total_actions"
|
| 144 |
|
| 145 |
- id: "medium"
|
| 146 |
name: "Resource-Constrained Triage"
|
|
@@ -158,7 +158,8 @@ tasks:
|
|
| 158 |
grader: "tasks.medium.MediumTaskGrader"
|
| 159 |
grading_formula: |
|
| 160 |
raw = resolved_score / max_possible_score
|
| 161 |
-
|
|
|
|
| 162 |
|
| 163 |
- id: "hard"
|
| 164 |
name: "Cascading Failure Prevention"
|
|
@@ -178,7 +179,8 @@ tasks:
|
|
| 178 |
grading_formula: |
|
| 179 |
chain_score = Σ stop_reward(position) × severity_weight
|
| 180 |
stability = {0 failures: 1.0, 1: 0.80, 2: 0.60, 3: 0.30, 4+: 0.00}
|
| 181 |
-
|
|
|
|
| 182 |
|
| 183 |
# ── Evaluation metrics (produced by graders) ──────────────────────────────────
|
| 184 |
metrics:
|
|
|
|
| 140 |
correlation_probability: 0.10
|
| 141 |
success_threshold: 0.70 # correct_actions / total_actions >= 0.70
|
| 142 |
grader: "tasks.easy.EasyTaskGrader"
|
| 143 |
+
grading_formula: "score = max(0.01, min(0.99, (correct_actions / total_actions) * 0.98 + 0.01))"
|
| 144 |
|
| 145 |
- id: "medium"
|
| 146 |
name: "Resource-Constrained Triage"
|
|
|
|
| 158 |
grader: "tasks.medium.MediumTaskGrader"
|
| 159 |
grading_formula: |
|
| 160 |
raw = resolved_score / max_possible_score
|
| 161 |
+
base_score = max(0, raw - fp_penalty(0.30) - critical_miss_penalty(0.20))
|
| 162 |
+
score = max(0.01, min(0.99, base_score * 0.98 + 0.01))
|
| 163 |
|
| 164 |
- id: "hard"
|
| 165 |
name: "Cascading Failure Prevention"
|
|
|
|
| 179 |
grading_formula: |
|
| 180 |
chain_score = Σ stop_reward(position) × severity_weight
|
| 181 |
stability = {0 failures: 1.0, 1: 0.80, 2: 0.60, 3: 0.30, 4+: 0.00}
|
| 182 |
+
base_score = min(chain_score / max_possible * stability, 1.0)
|
| 183 |
+
score = max(0.01, min(0.99, base_score * 0.98 + 0.01))
|
| 184 |
|
| 185 |
# ── Evaluation metrics (produced by graders) ──────────────────────────────────
|
| 186 |
metrics:
|
pytest_output.txt
ADDED
|
Binary file (25.1 kB). View file
|
|
|
src/adaptive_alert_triage/server.py
CHANGED
|
@@ -130,7 +130,8 @@ def _tick(info: Dict) -> None:
|
|
| 130 |
|
| 131 |
|
| 132 |
def _score() -> float:
|
| 133 |
-
|
|
|
|
| 134 |
|
| 135 |
|
| 136 |
# ── PPO helpers ───────────────────────────────────────────────────────────────
|
|
@@ -602,7 +603,8 @@ async def ws_train(websocket: WebSocket):
|
|
| 602 |
obs, reward, done, info = env.step(act)
|
| 603 |
lt += 1
|
| 604 |
if info.get("action_correct", False): lc += 1
|
| 605 |
-
|
|
|
|
| 606 |
if done: episode_scores.append(s)
|
| 607 |
info["task_score"] = s
|
| 608 |
await websocket.send_json({
|
|
|
|
| 130 |
|
| 131 |
|
| 132 |
def _score() -> float:
|
| 133 |
+
raw = _step_correct / _step_total if _step_total else 0.0
|
| 134 |
+
return max(0.01, min(round(0.01 + 0.98 * raw, 2), 0.99))
|
| 135 |
|
| 136 |
|
| 137 |
# ── PPO helpers ───────────────────────────────────────────────────────────────
|
|
|
|
| 603 |
obs, reward, done, info = env.step(act)
|
| 604 |
lt += 1
|
| 605 |
if info.get("action_correct", False): lc += 1
|
| 606 |
+
raw_s = lc / lt if lt else 0.0
|
| 607 |
+
s = max(0.01, min(round(0.01 + 0.98 * raw_s, 2), 0.99))
|
| 608 |
if done: episode_scores.append(s)
|
| 609 |
info["task_score"] = s
|
| 610 |
await websocket.send_json({
|
tasks/hard.py
CHANGED
|
@@ -411,17 +411,16 @@ class HardTaskGrader:
|
|
| 411 |
"""
|
| 412 |
Fraction of chains that were successfully stopped (any position).
|
| 413 |
|
| 414 |
-
Returns
|
| 415 |
"""
|
| 416 |
if not self._chains:
|
| 417 |
-
return
|
| 418 |
stopped = sum(
|
| 419 |
1 for c in self._chains.values()
|
| 420 |
if c.completed and not c.hit_failure
|
| 421 |
)
|
| 422 |
raw = stopped / len(self._chains)
|
| 423 |
-
|
| 424 |
-
return max(0.01, min(raw, 0.99))
|
| 425 |
|
| 426 |
def calculate_stability_score(self) -> float:
|
| 427 |
"""Return the stability multiplier for the current failure count."""
|
|
@@ -570,13 +569,11 @@ class HardTaskGrader:
|
|
| 570 |
|
| 571 |
@staticmethod
|
| 572 |
def _stability_score(failures: int) -> float:
|
| 573 |
-
"""Step-function stability multiplier
|
| 574 |
for threshold, score in _STABILITY_BY_FAILURES:
|
| 575 |
if failures <= threshold:
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
# Return floor clamped to (0, 1)
|
| 579 |
-
return max(0.01, min(_STABILITY_FLOOR, 0.99))
|
| 580 |
|
| 581 |
|
| 582 |
# ---------------------------------------------------------------------------
|
|
|
|
| 411 |
"""
|
| 412 |
Fraction of chains that were successfully stopped (any position).
|
| 413 |
|
| 414 |
+
Returns 1.0 when no chains exist (nothing to detect).
|
| 415 |
"""
|
| 416 |
if not self._chains:
|
| 417 |
+
return 1.0
|
| 418 |
stopped = sum(
|
| 419 |
1 for c in self._chains.values()
|
| 420 |
if c.completed and not c.hit_failure
|
| 421 |
)
|
| 422 |
raw = stopped / len(self._chains)
|
| 423 |
+
return raw
|
|
|
|
| 424 |
|
| 425 |
def calculate_stability_score(self) -> float:
|
| 426 |
"""Return the stability multiplier for the current failure count."""
|
|
|
|
| 569 |
|
| 570 |
@staticmethod
|
| 571 |
def _stability_score(failures: int) -> float:
|
| 572 |
+
"""Step-function stability multiplier."""
|
| 573 |
for threshold, score in _STABILITY_BY_FAILURES:
|
| 574 |
if failures <= threshold:
|
| 575 |
+
return score
|
| 576 |
+
return _STABILITY_FLOOR
|
|
|
|
|
|
|
| 577 |
|
| 578 |
|
| 579 |
# ---------------------------------------------------------------------------
|
tasks/medium.py
CHANGED
|
@@ -236,16 +236,15 @@ class MediumTaskGrader:
|
|
| 236 |
Fraction of INVESTIGATE + ESCALATE actions that were productive.
|
| 237 |
|
| 238 |
Productive = action on an alert with true_severity ≥ 0.50.
|
| 239 |
-
Returns
|
| 240 |
"""
|
| 241 |
costly = [h for h in self._action_history
|
| 242 |
if h["action"] in ("INVESTIGATE", "ESCALATE")]
|
| 243 |
if not costly:
|
| 244 |
-
return
|
| 245 |
productive = sum(1 for h in costly if h["true_severity"] >= _MEDIUM_LOWER)
|
| 246 |
raw = productive / len(costly)
|
| 247 |
-
|
| 248 |
-
return max(0.01, min(raw, 0.99))
|
| 249 |
|
| 250 |
# ------------------------------------------------------------------
|
| 251 |
# Metrics
|
|
|
|
| 236 |
Fraction of INVESTIGATE + ESCALATE actions that were productive.
|
| 237 |
|
| 238 |
Productive = action on an alert with true_severity ≥ 0.50.
|
| 239 |
+
Returns 1.0 when no costly actions were taken (or 1.0 for perfect efficiency).
|
| 240 |
"""
|
| 241 |
costly = [h for h in self._action_history
|
| 242 |
if h["action"] in ("INVESTIGATE", "ESCALATE")]
|
| 243 |
if not costly:
|
| 244 |
+
return 1.0
|
| 245 |
productive = sum(1 for h in costly if h["true_severity"] >= _MEDIUM_LOWER)
|
| 246 |
raw = productive / len(costly)
|
| 247 |
+
return raw
|
|
|
|
| 248 |
|
| 249 |
# ------------------------------------------------------------------
|
| 250 |
# Metrics
|
tests/test_tasks.py
CHANGED
|
@@ -226,7 +226,8 @@ class TestHardTaskGrader:
|
|
| 226 |
def test_correlation_detection(self):
|
| 227 |
"""Test bonus for handling correlated alerts."""
|
| 228 |
correlation_chains = [["alert_001", "alert_002", "alert_003"]]
|
| 229 |
-
grader = HardTaskGrader(
|
|
|
|
| 230 |
|
| 231 |
alert = Alert(
|
| 232 |
id="alert_001",
|
|
@@ -249,7 +250,8 @@ class TestHardTaskGrader:
|
|
| 249 |
def test_failure_prevention_bonus(self):
|
| 250 |
"""Test bonus for preventing cascading failures."""
|
| 251 |
correlation_chains = [["alert_001", "alert_002", "alert_003"]]
|
| 252 |
-
grader = HardTaskGrader(
|
|
|
|
| 253 |
|
| 254 |
# Handle first alert in chain (early detection)
|
| 255 |
alert = Alert(
|
|
@@ -266,17 +268,17 @@ class TestHardTaskGrader:
|
|
| 266 |
|
| 267 |
grader.grade_action(action, alert, reward)
|
| 268 |
|
| 269 |
-
|
|
|
|
| 270 |
|
| 271 |
def test_system_failure_penalty(self):
|
| 272 |
"""Test heavy penalty for system failures."""
|
| 273 |
grader = HardTaskGrader()
|
| 274 |
|
| 275 |
# Record a failure
|
| 276 |
-
grader.
|
| 277 |
|
| 278 |
-
assert grader.
|
| 279 |
-
assert grader.stability_penalty > 0.0
|
| 280 |
|
| 281 |
# Stability score should be reduced
|
| 282 |
stability = grader.calculate_stability_score()
|
|
@@ -285,7 +287,8 @@ class TestHardTaskGrader:
|
|
| 285 |
def test_missed_correlated_alert_penalty(self):
|
| 286 |
"""Test extra penalty for missing correlated alerts."""
|
| 287 |
correlation_chains = [["alert_001", "alert_002"]]
|
| 288 |
-
grader = HardTaskGrader(
|
|
|
|
| 289 |
|
| 290 |
alert = Alert(
|
| 291 |
id="alert_001",
|
|
@@ -301,8 +304,8 @@ class TestHardTaskGrader:
|
|
| 301 |
|
| 302 |
contribution = grader.grade_action(action, alert, reward)
|
| 303 |
|
| 304 |
-
# Should have
|
| 305 |
-
assert contribution < -
|
| 306 |
|
| 307 |
def test_correlation_detection_rate(self):
|
| 308 |
"""Test calculation of correlation detection rate."""
|
|
@@ -310,10 +313,12 @@ class TestHardTaskGrader:
|
|
| 310 |
["alert_001", "alert_002"],
|
| 311 |
["alert_003", "alert_004"],
|
| 312 |
]
|
| 313 |
-
grader = HardTaskGrader(
|
|
|
|
| 314 |
|
| 315 |
# Handle one chain
|
| 316 |
-
|
|
|
|
| 317 |
|
| 318 |
rate = grader.calculate_correlation_detection_rate()
|
| 319 |
assert abs(rate - 0.5) < 0.01, "Should detect 50% of chains"
|
|
@@ -331,7 +336,7 @@ class TestHardTaskGrader:
|
|
| 331 |
|
| 332 |
# Multiple failures
|
| 333 |
for _ in range(3):
|
| 334 |
-
grader.
|
| 335 |
|
| 336 |
stability = grader.calculate_stability_score()
|
| 337 |
assert stability < 1.0, "Failures should reduce stability"
|
|
|
|
| 226 |
def test_correlation_detection(self):
|
| 227 |
"""Test bonus for handling correlated alerts."""
|
| 228 |
correlation_chains = [["alert_001", "alert_002", "alert_003"]]
|
| 229 |
+
grader = HardTaskGrader()
|
| 230 |
+
grader.update_correlation_state(correlation_chains)
|
| 231 |
|
| 232 |
alert = Alert(
|
| 233 |
id="alert_001",
|
|
|
|
| 250 |
def test_failure_prevention_bonus(self):
|
| 251 |
"""Test bonus for preventing cascading failures."""
|
| 252 |
correlation_chains = [["alert_001", "alert_002", "alert_003"]]
|
| 253 |
+
grader = HardTaskGrader()
|
| 254 |
+
grader.update_correlation_state(correlation_chains)
|
| 255 |
|
| 256 |
# Handle first alert in chain (early detection)
|
| 257 |
alert = Alert(
|
|
|
|
| 268 |
|
| 269 |
grader.grade_action(action, alert, reward)
|
| 270 |
|
| 271 |
+
m = grader.get_metrics()
|
| 272 |
+
assert m["chains_stopped"] >= 1, "Should register failure prevention"
|
| 273 |
|
| 274 |
def test_system_failure_penalty(self):
|
| 275 |
"""Test heavy penalty for system failures."""
|
| 276 |
grader = HardTaskGrader()
|
| 277 |
|
| 278 |
# Record a failure
|
| 279 |
+
grader.record_failures(1)
|
| 280 |
|
| 281 |
+
assert grader._system_failures == 1
|
|
|
|
| 282 |
|
| 283 |
# Stability score should be reduced
|
| 284 |
stability = grader.calculate_stability_score()
|
|
|
|
| 287 |
def test_missed_correlated_alert_penalty(self):
|
| 288 |
"""Test extra penalty for missing correlated alerts."""
|
| 289 |
correlation_chains = [["alert_001", "alert_002"]]
|
| 290 |
+
grader = HardTaskGrader()
|
| 291 |
+
grader.update_correlation_state(correlation_chains)
|
| 292 |
|
| 293 |
alert = Alert(
|
| 294 |
id="alert_001",
|
|
|
|
| 304 |
|
| 305 |
contribution = grader.grade_action(action, alert, reward)
|
| 306 |
|
| 307 |
+
# Should have negative contribution for missing correlated critical
|
| 308 |
+
assert contribution < -0.2, f"Should have extra penalty for correlated miss, got {contribution}"
|
| 309 |
|
| 310 |
def test_correlation_detection_rate(self):
|
| 311 |
"""Test calculation of correlation detection rate."""
|
|
|
|
| 313 |
["alert_001", "alert_002"],
|
| 314 |
["alert_003", "alert_004"],
|
| 315 |
]
|
| 316 |
+
grader = HardTaskGrader()
|
| 317 |
+
grader.update_correlation_state(correlation_chains)
|
| 318 |
|
| 319 |
# Handle one chain
|
| 320 |
+
alert = Alert(id="alert_001", visible_severity=0.8, confidence=0.85, alert_type="CPU", age=1, true_severity=0.85, is_correlated=True)
|
| 321 |
+
grader.grade_action(Action(alert_id="alert_001", action_type="INVESTIGATE"), alert, Reward(value=0))
|
| 322 |
|
| 323 |
rate = grader.calculate_correlation_detection_rate()
|
| 324 |
assert abs(rate - 0.5) < 0.01, "Should detect 50% of chains"
|
|
|
|
| 336 |
|
| 337 |
# Multiple failures
|
| 338 |
for _ in range(3):
|
| 339 |
+
grader.record_failures(1)
|
| 340 |
|
| 341 |
stability = grader.calculate_stability_score()
|
| 342 |
assert stability < 1.0, "Failures should reduce stability"
|