# Spaces: Sleeping Sleeping  (appears to be web-page status residue, not part of the spec)
# openenv.yaml — OpenEnv Specification for Adaptive Alert Triage
# Matches the actual implementation in src/adaptive_alert_triage/
# Validated against: env.py, models.py, tasks/easy.py, tasks/medium.py, tasks/hard.py
# Project identity and descriptive metadata.
name: "AdaptiveAlertTriage"
version: "0.1.0"  # kept quoted so it is always read as a string
description: |
  A partially-observable RL environment that simulates real-time IT alert triage
  and incident response. An agent receives a continuous stream of system alerts
  and must decide — for each one — whether to INVESTIGATE, IGNORE, ESCALATE, or
  DELAY, under time pressure, resource constraints, and the risk of cascading
  failures from unhandled correlated alerts.
  This environment models a task performed daily by DevOps and SOC engineers:
  triaging noisy monitoring signals while preventing real incidents from
  escalating into outages.
authors:
  - name: "Scalar Hackathon Team"
    email: "team@scalar.com"
license: "MIT"
tags:
  - reinforcement-learning
  - openenv
  - alert-triage
  - incident-response
  - partial-observability
  - resource-constraints
  - cascading-failures
# ── Environment class ─────────────────────────────────────────────────────────
# Import location of the environment implementation.
environment:
  module: "adaptive_alert_triage.env"
  class: "AdaptiveAlertTriageEnv"
  # Constructor accepts: task_id ("easy"|"medium"|"hard"), seed (int, optional)
# ── OpenEnv interface ─────────────────────────────────────────────────────────
# All three methods are implemented in AdaptiveAlertTriageEnv
interface:
  reset:
    signature: "reset(seed=None, options=None) -> Observation"
    description: |
      Resets the episode. Generates an initial batch of synthetic alerts
      using the task-specific correlation_probability. Returns an Observation
      with alerts stripped of hidden fields (true_severity, is_correlated).
  step:
    signature: "step(action: Action) -> (Observation, Reward, done: bool, info: dict)"
    description: |
      Processes one Action, updates alert queue, checks for failures, generates
      new alerts, and returns the next observation. The info dict always
      contains: processed_alerts, correlation_groups, failures_this_step,
      system_failure, action_correct, cumulative_reward, step, failures_count.
  state:
    signature: "state() -> EpisodeState"
    description: |
      Returns the full internal EpisodeState including hidden ground-truth
      (true_severities, correlation_groups, false_positives, pending_failures).
      For evaluation and replay only — never exposed to the agent during training.
# ── Configuration ─────────────────────────────────────────────────────────────
# The four action names available to the agent.
config:
  actions:
    - "INVESTIGATE"
    - "IGNORE"
    - "ESCALATE"
    - "DELAY"
# ── Observation space ─────────────────────────────────────────────────────────
# Fields of the Observation model returned to the agent.
observation:
  type: "Pydantic BaseModel (Observation)"
  fields:
    alerts:
      type: "List[Alert]"
      description: "Active alerts awaiting triage. Each Alert has id, visible_severity, confidence, alert_type, age."
      hidden_fields: "true_severity, is_correlated — stripped before returned to agent"
    system_load:
      type: "float [0.0, 1.0]"
      description: "Current infrastructure utilisation"
    queue_length:
      type: "int >= 0"
      description: "Number of active alerts in queue"
    time_remaining:
      type: "int >= 0"
      description: "Steps left before episode ends"
    episode_step:
      type: "int >= 0"
      description: "Current step index (0-based)"
    resource_budget:
      type: "Optional[int]"
      description: "Remaining INVESTIGATE actions this step. None = unconstrained (easy task)."
# ── Action space ──────────────────────────────────────────────────────────────
# Fields of the Action model submitted by the agent.
action:
  type: "Pydantic BaseModel (Action)"
  fields:
    alert_id:
      type: "str"
      description: "ID of the target alert — must match an ID in current observation.alerts"
    action_type:
      type: "Literal['INVESTIGATE','IGNORE','ESCALATE','DELAY']"
      description: |
        INVESTIGATE — allocates resources to diagnose; counts against resource_budget
        IGNORE — dismisses alert as noise (best for false positives)
        ESCALATE — routes to specialist team (no budget cost)
        DELAY — keeps alert in queue for re-evaluation next step
    metadata:
      type: "Dict[str, Any]"
      description: "Optional context bag (e.g. reasoning from LLM agents)"
# ── Reward ────────────────────────────────────────────────────────────────────
# Per-step reward components and their documented values.
reward:
  type: "Pydantic BaseModel (Reward)"
  description: "Dense, shaped reward decomposed into named components"
  schedule:
    critical_handled: "+10.0 — INVESTIGATE or ESCALATE on critical alert (true_severity >= 0.75)"
    failure_prevented: "+5.0 — correlated alert handled (prevents cascade)"
    false_positive_ignored: "+3.0 — IGNORE on a false positive"
    medium_handled: "+2.0 * true_severity — INVESTIGATE on medium alert"
    unnecessary_invest: "-2.0 — INVESTIGATE on a false positive"
    missed_critical: "-8.0 — IGNORE on a critical alert"
    risky_delay: "-2.4 — DELAY on a critical alert"
  # NOTE(review): nesting was reconstructed; task_multipliers and range are
  # assumed to sit beside schedule rather than inside it — confirm.
  task_multipliers: "easy=1.0, medium=1.1, hard=1.2"
  range: [-8.0, 15.0]  # per step before task multiplier; cascade bonus included in max
# ── Tasks ─────────────────────────────────────────────────────────────────────
# Three difficulty tiers; each entry names its grader and scoring formula.
tasks:
  - id: "easy"
    name: "Basic Alert Prioritisation"
    description: |
      Classify and respond to independent alerts with no resource constraint.
      The agent must learn to INVESTIGATE/ESCALATE critical alerts
      (true_severity >= 0.75) and IGNORE false positives (< 0.30).
      DELAY is always wrong in this task.
    difficulty: 1
    max_steps: 30
    failure_threshold: 5
    max_investigations_per_step: null  # unconstrained
    correlation_probability: 0.10
    # NOTE(review): grading_formula rescales the ratio (* 0.98 + 0.01), so a
    # score of 0.70 is not exactly a 0.70 ratio — confirm which one is meant.
    success_threshold: 0.70  # correct_actions / total_actions >= 0.70
    grader: "tasks.easy.EasyTaskGrader"
    grading_formula: "score = (correct_actions / total_actions) * 0.98 + 0.01"
  - id: "medium"
    name: "Resource-Constrained Triage"
    description: |
      Triage under a hard per-step investigation budget of K=3.
      Agent must prioritise high-value investigations over false positives
      and use ESCALATE when budget is exhausted. Grader penalises wasting
      budget on FPs and missing critical alerts.
    difficulty: 2
    max_steps: 40
    failure_threshold: 5
    max_investigations_per_step: 3
    correlation_probability: 0.20
    success_threshold: 0.55
    grader: "tasks.medium.MediumTaskGrader"
    grading_formula: |
      raw = resolved_score / max_possible_score
      fp_penalty = 0.30 * (unnecessary_investigations / total_investigations)
      miss_penalty = 0.20 * (critical_missed / max(critical_total, 1))
      penalised = raw - fp_penalty - miss_penalty
      score = (penalised * 0.6) + 0.35
  - id: "hard"
    name: "Cascading Failure Prevention"
    description: |
      Detect and stop correlated alert chains before they cascade into
      system failures. Chains arrive sequentially: trigger at step N,
      child at step N+k if trigger was missed. Agent cannot observe
      is_correlated — must infer from visible patterns. Stability
      multiplier drops sharply with each system failure.
    difficulty: 3
    max_steps: 50
    failure_threshold: 3  # stricter than easy/medium
    max_investigations_per_step: 3
    correlation_probability: 0.40
    success_threshold: 0.50
    grader: "tasks.hard.HardTaskGrader"
    grading_formula: |
      chain_score = Σ stop_reward(position) × severity_weight
      stability = {0 failures: 1.0, 1: 0.80, 2: 0.60, 3: 0.30, 4+: 0.00}
      raw = (chain_score / max_possible) * stability
      score = (raw * 0.98) + 0.01
# ── Evaluation metrics (produced by graders) ──────────────────────────────────
# Each metric lists its value range and the task ids it applies to.
metrics:
  - name: "correct_action_rate"
    description: "Fraction of actions matching the optimal ground-truth policy"
    range: [0.0, 1.0]
    tasks: ["easy"]
  - name: "resolved_score"
    description: "Weighted resolution quality normalised by max possible"
    range: [0.0, 1.0]
    tasks: ["medium"]
  - name: "resource_efficiency"
    description: "Ratio of productive investigations to total INVESTIGATE actions"
    range: [0.0, 1.0]
    tasks: ["medium"]
  - name: "chain_detection_rate"
    description: "Fraction of correlated chains stopped before system failure"
    range: [0.0, 1.0]
    tasks: ["hard"]
  - name: "system_failures"
    description: "Number of system failures triggered (lower is better)"
    range: [0, 10]
    tasks: ["hard"]
  - name: "stability_score"
    description: "Stability multiplier based on failure count"
    range: [0.0, 1.0]
    tasks: ["hard"]
# ── Baseline agents ───────────────────────────────────────────────────────────
# Reference agents with their recorded per-task scores.
baselines:
  - name: "rule_based"
    module: "agents.baseline"
    class: "RuleBasedAgent"
    type: "threshold"
    description: "Simple severity/confidence thresholding policy"
    scores:
      easy: 0.539
      medium: 0.618
      hard: 0.355
  - name: "improved_rule_based"
    module: "agents.baseline"
    class: "ImprovedRuleBasedAgent"
    type: "threshold"
    description: "Rule-based with age-urgency, system-load awareness, resource budget guard"
    # NOTE(review): these scores are lower than rule_based on every task —
    # confirm the figures are current.
    scores:
      easy: 0.250
      medium: 0.355
      hard: 0.068
  - name: "ppo_lstm"
    module: "rl_agent"
    class: "PPOTrainer"
    type: "rl"
    description: "PPO with LSTM memory — pure numpy, trained 300+ episodes per task"
    scores:
      easy: 0.665
      medium: 0.931
      hard: 0.325
  - name: "llm_openai"
    module: "inference"
    class: "LLMTriageAgent"
    type: "llm"
    description: "OpenAI-compatible LLM agent via API_BASE_URL / MODEL_NAME / HF_TOKEN"
    # No scores are recorded for this agent.
# ── Infra / Docker ────────────────────────────────────────────────────────────
# Container image name plus the build, run and server start commands.
docker:
  image: "adaptive-alert-triage:latest"
  build: "docker build -t adaptive-alert-triage ."
  run: "docker run -p 8000:8000 adaptive-alert-triage"
  # NOTE(review): this import path starts at "src.", while environment.module
  # and setup.pythonpath treat "src" as the import root — confirm both resolve.
  entrypoint: "uvicorn src.adaptive_alert_triage.server:app --host 0.0.0.0 --port 8000"
# ── Setup and validation ──────────────────────────────────────────────────────
# Commands and requirements for installing, testing and validating the package.
setup:
  python: ">=3.9"
  install: "pip install -e ."
  pythonpath: "src"
  test: "pytest tests/"
  validate: "openenv validate"
  baseline: "python inference.py --n 3"
# NOTE(review): nesting was reconstructed; api_version and framework are
# assumed to be top-level keys rather than children of setup — confirm.
api_version: "1.0"  # kept quoted so it is not read as the float 1.0
framework: "openenv"
# Repository paths for documentation and key entry points.
documentation:
  readme: "README.md"
  baseline: "inference.py"
  agents: "agents/"
  tasks: "tasks/"
  api_docs: "src/adaptive_alert_triage/"
  server: "src/adaptive_alert_triage/server.py"