Hacktrix-121 commited on
Commit
7cd2458
·
1 Parent(s): 7ee6ca2

error handling

Browse files
openenv.yaml CHANGED
@@ -140,7 +140,7 @@ tasks:
140
  correlation_probability: 0.10
141
  success_threshold: 0.70 # correct_actions / total_actions >= 0.70
142
  grader: "tasks.easy.EasyTaskGrader"
143
- grading_formula: "score = correct_actions / total_actions"
144
 
145
  - id: "medium"
146
  name: "Resource-Constrained Triage"
@@ -158,7 +158,8 @@ tasks:
158
  grader: "tasks.medium.MediumTaskGrader"
159
  grading_formula: |
160
  raw = resolved_score / max_possible_score
161
- score = max(0, raw - fp_penalty(0.30) - critical_miss_penalty(0.20))
 
162
 
163
  - id: "hard"
164
  name: "Cascading Failure Prevention"
@@ -178,7 +179,8 @@ tasks:
178
  grading_formula: |
179
  chain_score = Σ stop_reward(position) × severity_weight
180
  stability = {0 failures: 1.0, 1: 0.80, 2: 0.60, 3: 0.30, 4+: 0.00}
181
- score = min(chain_score / max_possible * stability, 1.0)
 
182
 
183
  # ── Evaluation metrics (produced by graders) ──────────────────────────────────
184
  metrics:
 
140
  correlation_probability: 0.10
141
  success_threshold: 0.70 # correct_actions / total_actions >= 0.70
142
  grader: "tasks.easy.EasyTaskGrader"
143
+ grading_formula: "score = max(0.01, min(0.99, (correct_actions / total_actions) * 0.98 + 0.01))"
144
 
145
  - id: "medium"
146
  name: "Resource-Constrained Triage"
 
158
  grader: "tasks.medium.MediumTaskGrader"
159
  grading_formula: |
160
  raw = resolved_score / max_possible_score
161
+ base_score = max(0, raw - fp_penalty(0.30) - critical_miss_penalty(0.20))
162
+ score = max(0.01, min(0.99, base_score * 0.98 + 0.01))
163
 
164
  - id: "hard"
165
  name: "Cascading Failure Prevention"
 
179
  grading_formula: |
180
  chain_score = Σ stop_reward(position) × severity_weight
181
  stability = {0 failures: 1.0, 1: 0.80, 2: 0.60, 3: 0.30, 4+: 0.00}
182
+ base_score = min(chain_score / max_possible * stability, 1.0)
183
+ score = max(0.01, min(0.99, base_score * 0.98 + 0.01))
184
 
185
  # ── Evaluation metrics (produced by graders) ──────────────────────────────────
186
  metrics:
pytest_output.txt ADDED
Binary file (25.1 kB). View file
 
src/adaptive_alert_triage/server.py CHANGED
@@ -130,7 +130,8 @@ def _tick(info: Dict) -> None:
130
 
131
 
132
  def _score() -> float:
133
- return _step_correct / _step_total if _step_total else 0.0
 
134
 
135
 
136
  # ── PPO helpers ───────────────────────────────────────────────────────────────
@@ -602,7 +603,8 @@ async def ws_train(websocket: WebSocket):
602
  obs, reward, done, info = env.step(act)
603
  lt += 1
604
  if info.get("action_correct", False): lc += 1
605
- s = lc / lt if lt else 0.0
 
606
  if done: episode_scores.append(s)
607
  info["task_score"] = s
608
  await websocket.send_json({
 
130
 
131
 
132
  def _score() -> float:
133
+ raw = _step_correct / _step_total if _step_total else 0.0
134
+ return max(0.01, min(round(0.01 + 0.98 * raw, 2), 0.99))
135
 
136
 
137
  # ── PPO helpers ───────────────────────────────────────────────────────────────
 
603
  obs, reward, done, info = env.step(act)
604
  lt += 1
605
  if info.get("action_correct", False): lc += 1
606
+ raw_s = lc / lt if lt else 0.0
607
+ s = max(0.01, min(round(0.01 + 0.98 * raw_s, 2), 0.99))
608
  if done: episode_scores.append(s)
609
  info["task_score"] = s
610
  await websocket.send_json({
tasks/hard.py CHANGED
@@ -411,17 +411,16 @@ class HardTaskGrader:
411
  """
412
  Fraction of chains that were successfully stopped (any position).
413
 
414
- Returns 0.99 when no chains exist (nothing to detect).
415
  """
416
  if not self._chains:
417
- return 0.99
418
  stopped = sum(
419
  1 for c in self._chains.values()
420
  if c.completed and not c.hit_failure
421
  )
422
  raw = stopped / len(self._chains)
423
- # Clamp to (0, 1)
424
- return max(0.01, min(raw, 0.99))
425
 
426
  def calculate_stability_score(self) -> float:
427
  """Return the stability multiplier for the current failure count."""
@@ -570,13 +569,11 @@ class HardTaskGrader:
570
 
571
  @staticmethod
572
  def _stability_score(failures: int) -> float:
573
- """Step-function stability multiplier clamped to (0, 1)."""
574
  for threshold, score in _STABILITY_BY_FAILURES:
575
  if failures <= threshold:
576
- # Clamp stability scores to strict (0, 1)
577
- return max(0.01, min(score, 0.99))
578
- # Return floor clamped to (0, 1)
579
- return max(0.01, min(_STABILITY_FLOOR, 0.99))
580
 
581
 
582
  # ---------------------------------------------------------------------------
 
411
  """
412
  Fraction of chains that were successfully stopped (any position).
413
 
414
+ Returns 1.0 when no chains exist (nothing to detect).
415
  """
416
  if not self._chains:
417
+ return 1.0
418
  stopped = sum(
419
  1 for c in self._chains.values()
420
  if c.completed and not c.hit_failure
421
  )
422
  raw = stopped / len(self._chains)
423
+ return raw
 
424
 
425
  def calculate_stability_score(self) -> float:
426
  """Return the stability multiplier for the current failure count."""
 
569
 
570
  @staticmethod
571
  def _stability_score(failures: int) -> float:
572
+ """Step-function stability multiplier."""
573
  for threshold, score in _STABILITY_BY_FAILURES:
574
  if failures <= threshold:
575
+ return score
576
+ return _STABILITY_FLOOR
 
 
577
 
578
 
579
  # ---------------------------------------------------------------------------
tasks/medium.py CHANGED
@@ -236,16 +236,15 @@ class MediumTaskGrader:
236
  Fraction of INVESTIGATE + ESCALATE actions that were productive.
237
 
238
  Productive = action on an alert with true_severity ≥ 0.50.
239
- Returns 0.99 when no costly actions were taken (or 0.99 for perfect efficiency).
240
  """
241
  costly = [h for h in self._action_history
242
  if h["action"] in ("INVESTIGATE", "ESCALATE")]
243
  if not costly:
244
- return 0.99
245
  productive = sum(1 for h in costly if h["true_severity"] >= _MEDIUM_LOWER)
246
  raw = productive / len(costly)
247
- # Clamp to (0, 1)
248
- return max(0.01, min(raw, 0.99))
249
 
250
  # ------------------------------------------------------------------
251
  # Metrics
 
236
  Fraction of INVESTIGATE + ESCALATE actions that were productive.
237
 
238
  Productive = action on an alert with true_severity ≥ 0.50.
239
+ Returns 1.0 when no costly actions were taken (or 1.0 for perfect efficiency).
240
  """
241
  costly = [h for h in self._action_history
242
  if h["action"] in ("INVESTIGATE", "ESCALATE")]
243
  if not costly:
244
+ return 1.0
245
  productive = sum(1 for h in costly if h["true_severity"] >= _MEDIUM_LOWER)
246
  raw = productive / len(costly)
247
+ return raw
 
248
 
249
  # ------------------------------------------------------------------
250
  # Metrics
tests/test_tasks.py CHANGED
@@ -226,7 +226,8 @@ class TestHardTaskGrader:
226
  def test_correlation_detection(self):
227
  """Test bonus for handling correlated alerts."""
228
  correlation_chains = [["alert_001", "alert_002", "alert_003"]]
229
- grader = HardTaskGrader(correlation_chains=correlation_chains)
 
230
 
231
  alert = Alert(
232
  id="alert_001",
@@ -249,7 +250,8 @@ class TestHardTaskGrader:
249
  def test_failure_prevention_bonus(self):
250
  """Test bonus for preventing cascading failures."""
251
  correlation_chains = [["alert_001", "alert_002", "alert_003"]]
252
- grader = HardTaskGrader(correlation_chains=correlation_chains)
 
253
 
254
  # Handle first alert in chain (early detection)
255
  alert = Alert(
@@ -266,17 +268,17 @@ class TestHardTaskGrader:
266
 
267
  grader.grade_action(action, alert, reward)
268
 
269
- assert grader.failures_prevented >= 1, "Should register failure prevention"
 
270
 
271
  def test_system_failure_penalty(self):
272
  """Test heavy penalty for system failures."""
273
  grader = HardTaskGrader()
274
 
275
  # Record a failure
276
- grader.record_system_failure("alert_001")
277
 
278
- assert grader.system_failures == 1
279
- assert grader.stability_penalty > 0.0
280
 
281
  # Stability score should be reduced
282
  stability = grader.calculate_stability_score()
@@ -285,7 +287,8 @@ class TestHardTaskGrader:
285
  def test_missed_correlated_alert_penalty(self):
286
  """Test extra penalty for missing correlated alerts."""
287
  correlation_chains = [["alert_001", "alert_002"]]
288
- grader = HardTaskGrader(correlation_chains=correlation_chains)
 
289
 
290
  alert = Alert(
291
  id="alert_001",
@@ -301,8 +304,8 @@ class TestHardTaskGrader:
301
 
302
  contribution = grader.grade_action(action, alert, reward)
303
 
304
- # Should have heavy penalty for missing correlated critical
305
- assert contribution < -2.0, "Should have extra penalty for correlated miss"
306
 
307
  def test_correlation_detection_rate(self):
308
  """Test calculation of correlation detection rate."""
@@ -310,10 +313,12 @@ class TestHardTaskGrader:
310
  ["alert_001", "alert_002"],
311
  ["alert_003", "alert_004"],
312
  ]
313
- grader = HardTaskGrader(correlation_chains=correlation_chains)
 
314
 
315
  # Handle one chain
316
- grader.chains_handled.add(0)
 
317
 
318
  rate = grader.calculate_correlation_detection_rate()
319
  assert abs(rate - 0.5) < 0.01, "Should detect 50% of chains"
@@ -331,7 +336,7 @@ class TestHardTaskGrader:
331
 
332
  # Multiple failures
333
  for _ in range(3):
334
- grader.record_system_failure()
335
 
336
  stability = grader.calculate_stability_score()
337
  assert stability < 1.0, "Failures should reduce stability"
 
226
  def test_correlation_detection(self):
227
  """Test bonus for handling correlated alerts."""
228
  correlation_chains = [["alert_001", "alert_002", "alert_003"]]
229
+ grader = HardTaskGrader()
230
+ grader.update_correlation_state(correlation_chains)
231
 
232
  alert = Alert(
233
  id="alert_001",
 
250
  def test_failure_prevention_bonus(self):
251
  """Test bonus for preventing cascading failures."""
252
  correlation_chains = [["alert_001", "alert_002", "alert_003"]]
253
+ grader = HardTaskGrader()
254
+ grader.update_correlation_state(correlation_chains)
255
 
256
  # Handle first alert in chain (early detection)
257
  alert = Alert(
 
268
 
269
  grader.grade_action(action, alert, reward)
270
 
271
+ m = grader.get_metrics()
272
+ assert m["chains_stopped"] >= 1, "Should register failure prevention"
273
 
274
  def test_system_failure_penalty(self):
275
  """Test heavy penalty for system failures."""
276
  grader = HardTaskGrader()
277
 
278
  # Record a failure
279
+ grader.record_failures(1)
280
 
281
+ assert grader._system_failures == 1
 
282
 
283
  # Stability score should be reduced
284
  stability = grader.calculate_stability_score()
 
287
  def test_missed_correlated_alert_penalty(self):
288
  """Test extra penalty for missing correlated alerts."""
289
  correlation_chains = [["alert_001", "alert_002"]]
290
+ grader = HardTaskGrader()
291
+ grader.update_correlation_state(correlation_chains)
292
 
293
  alert = Alert(
294
  id="alert_001",
 
304
 
305
  contribution = grader.grade_action(action, alert, reward)
306
 
307
+ # Should have negative contribution for missing correlated critical
308
+ assert contribution < -0.2, f"Should have extra penalty for correlated miss, got {contribution}"
309
 
310
  def test_correlation_detection_rate(self):
311
  """Test calculation of correlation detection rate."""
 
313
  ["alert_001", "alert_002"],
314
  ["alert_003", "alert_004"],
315
  ]
316
+ grader = HardTaskGrader()
317
+ grader.update_correlation_state(correlation_chains)
318
 
319
  # Handle one chain
320
+ alert = Alert(id="alert_001", visible_severity=0.8, confidence=0.85, alert_type="CPU", age=1, true_severity=0.85, is_correlated=True)
321
+ grader.grade_action(Action(alert_id="alert_001", action_type="INVESTIGATE"), alert, Reward(value=0))
322
 
323
  rate = grader.calculate_correlation_detection_rate()
324
  assert abs(rate - 0.5) < 0.01, "Should detect 50% of chains"
 
336
 
337
  # Multiple failures
338
  for _ in range(3):
339
+ grader.record_failures(1)
340
 
341
  stability = grader.calculate_stability_score()
342
  assert stability < 1.0, "Failures should reduce stability"