# NOTE(review): the next four lines are web-page scrape residue (repo header,
# avatar caption, commit message, commit SHA) — commented out so the file parses as YAML.
# scaler-openenv / openenv.yaml
# Hacktrix-121's picture
# grader fixes
# c18a9d1
# openenv.yaml — OpenEnv Specification for Adaptive Alert Triage
# Matches the actual implementation in src/adaptive_alert_triage/
# Validated against: env.py, models.py, tasks/easy.py, tasks/medium.py, tasks/hard.py

name: "AdaptiveAlertTriage"
version: "0.1.0"

# Literal block scalar: line breaks are preserved verbatim.
description: |
  A partially-observable RL environment that simulates real-time IT alert triage
  and incident response. An agent receives a continuous stream of system alerts
  and must decide — for each one — whether to INVESTIGATE, IGNORE, ESCALATE, or
  DELAY, under time pressure, resource constraints, and the risk of cascading
  failures from unhandled correlated alerts.
  This environment models a task performed daily by DevOps and SOC engineers:
  triaging noisy monitoring signals while preventing real incidents from
  escalating into outages.

authors:
  - name: "Scalar Hackathon Team"
    email: "team@scalar.com"

license: "MIT"

tags:
  - reinforcement-learning
  - openenv
  - alert-triage
  - incident-response
  - partial-observability
  - resource-constraints
  - cascading-failures
# ── Environment class ─────────────────────────────────────────────────────────
environment:
  module: "adaptive_alert_triage.env"
  class: "AdaptiveAlertTriageEnv"
  # Constructor accepts: task_id ("easy"|"medium"|"hard"), seed (int, optional)
# ── OpenEnv interface ─────────────────────────────────────────────────────────
# All three methods are implemented in AdaptiveAlertTriageEnv
interface:
  reset:
    signature: "reset(seed=None, options=None) -> Observation"
    description: |
      Resets the episode. Generates an initial batch of synthetic alerts
      using the task-specific correlation_probability. Returns an Observation
      with alerts stripped of hidden fields (true_severity, is_correlated).
  step:
    signature: "step(action: Action) -> (Observation, Reward, done: bool, info: dict)"
    description: |
      Processes one Action, updates alert queue, checks for failures, generates
      new alerts, and returns the next observation. The info dict always
      contains: processed_alerts, correlation_groups, failures_this_step,
      system_failure, action_correct, cumulative_reward, step, failures_count.
  state:
    signature: "state() -> EpisodeState"
    description: |
      Returns the full internal EpisodeState including hidden ground-truth
      (true_severities, correlation_groups, false_positives, pending_failures).
      For evaluation and replay only — never exposed to the agent during training.
# ── Configuration ─────────────────────────────────────────────────────────────
config:
  # Discrete action vocabulary accepted by Action.action_type.
  actions:
    - "INVESTIGATE"
    - "IGNORE"
    - "ESCALATE"
    - "DELAY"
# ── Observation space ─────────────────────────────────────────────────────────
observation:
  type: "Pydantic BaseModel (Observation)"
  fields:
    alerts:
      type: "List[Alert]"
      description: "Active alerts awaiting triage. Each Alert has id, visible_severity, confidence, alert_type, age."
      hidden_fields: "true_severity, is_correlated — stripped before returned to agent"
    system_load:
      type: "float [0.0, 1.0]"
      description: "Current infrastructure utilisation"
    queue_length:
      type: "int >= 0"
      description: "Number of active alerts in queue"
    time_remaining:
      type: "int >= 0"
      description: "Steps left before episode ends"
    episode_step:
      type: "int >= 0"
      description: "Current step index (0-based)"
    resource_budget:
      type: "Optional[int]"
      description: "Remaining INVESTIGATE actions this step. None = unconstrained (easy task)."
# ── Action space ──────────────────────────────────────────────────────────────
action:
  type: "Pydantic BaseModel (Action)"
  fields:
    alert_id:
      type: "str"
      description: "ID of the target alert — must match an ID in current observation.alerts"
    action_type:
      type: "Literal['INVESTIGATE','IGNORE','ESCALATE','DELAY']"
      description: |
        INVESTIGATE — allocates resources to diagnose; counts against resource_budget
        IGNORE — dismisses alert as noise (best for false positives)
        ESCALATE — routes to specialist team (no budget cost)
        DELAY — keeps alert in queue for re-evaluation next step
    metadata:
      type: "Dict[str, Any]"
      description: "Optional context bag (e.g. reasoning from LLM agents)"
# ── Reward ────────────────────────────────────────────────────────────────────
reward:
  type: "Pydantic BaseModel (Reward)"
  description: "Dense, shaped reward decomposed into named components"
  schedule:
    critical_handled: "+10.0 — INVESTIGATE or ESCALATE on critical alert (true_severity >= 0.75)"
    failure_prevented: "+5.0 — correlated alert handled (prevents cascade)"
    false_positive_ignored: "+3.0 — IGNORE on a false positive"
    medium_handled: "+2.0 * true_severity — INVESTIGATE on medium alert"
    unnecessary_invest: "-2.0 — INVESTIGATE on a false positive"
    missed_critical: "-8.0 — IGNORE on a critical alert"
    risky_delay: "-2.4 — DELAY on a critical alert"
  task_multipliers: "easy=1.0, medium=1.1, hard=1.2"
  range: [-8.0, 15.0]  # per step before task multiplier; cascade bonus included in max
# ── Tasks ─────────────────────────────────────────────────────────────────────
tasks:
  - id: "easy"
    name: "Basic Alert Prioritisation"
    description: |
      Classify and respond to independent alerts with no resource constraint.
      The agent must learn to INVESTIGATE/ESCALATE critical alerts
      (true_severity >= 0.75) and IGNORE false positives (< 0.30).
      DELAY is always wrong in this task.
    difficulty: 1
    max_steps: 30
    failure_threshold: 5
    max_investigations_per_step: null  # unconstrained
    correlation_probability: 0.10
    success_threshold: 0.70  # correct_actions / total_actions >= 0.70
    grader: "tasks.easy.EasyTaskGrader"
    grading_formula: "score = (correct_actions / total_actions) * 0.98 + 0.01"

  - id: "medium"
    name: "Resource-Constrained Triage"
    description: |
      Triage under a hard per-step investigation budget of K=3.
      Agent must prioritise high-value investigations over false positives
      and use ESCALATE when budget is exhausted. Grader penalises wasting
      budget on FPs and missing critical alerts.
    difficulty: 2
    max_steps: 40
    failure_threshold: 5
    max_investigations_per_step: 3
    correlation_probability: 0.20
    success_threshold: 0.55
    grader: "tasks.medium.MediumTaskGrader"
    grading_formula: |
      raw = resolved_score / max_possible_score
      fp_penalty = 0.30 * (unnecessary_investigations / total_investigations)
      miss_penalty = 0.20 * (critical_missed / max(critical_total, 1))
      penalised = raw - fp_penalty - miss_penalty
      score = (penalised * 0.6) + 0.35

  - id: "hard"
    name: "Cascading Failure Prevention"
    description: |
      Detect and stop correlated alert chains before they cascade into
      system failures. Chains arrive sequentially: trigger at step N,
      child at step N+k if trigger was missed. Agent cannot observe
      is_correlated — must infer from visible patterns. Stability
      multiplier drops sharply with each system failure.
    difficulty: 3
    max_steps: 50
    failure_threshold: 3  # stricter than easy/medium
    max_investigations_per_step: 3
    correlation_probability: 0.40
    success_threshold: 0.50
    grader: "tasks.hard.HardTaskGrader"
    grading_formula: |
      chain_score = Σ stop_reward(position) × severity_weight
      stability = {0 failures: 1.0, 1: 0.80, 2: 0.60, 3: 0.30, 4+: 0.00}
      raw = (chain_score / max_possible) * stability
      score = (raw * 0.98) + 0.01
# ── Evaluation metrics (produced by graders) ──────────────────────────────────
metrics:
  - name: "correct_action_rate"
    description: "Fraction of actions matching the optimal ground-truth policy"
    range: [0.0, 1.0]
    tasks: ["easy"]
  - name: "resolved_score"
    description: "Weighted resolution quality normalised by max possible"
    range: [0.0, 1.0]
    tasks: ["medium"]
  - name: "resource_efficiency"
    description: "Ratio of productive investigations to total INVESTIGATE actions"
    range: [0.0, 1.0]
    tasks: ["medium"]
  - name: "chain_detection_rate"
    description: "Fraction of correlated chains stopped before system failure"
    range: [0.0, 1.0]
    tasks: ["hard"]
  - name: "system_failures"
    description: "Number of system failures triggered (lower is better)"
    range: [0, 10]
    tasks: ["hard"]
  - name: "stability_score"
    description: "Stability multiplier based on failure count"
    range: [0.0, 1.0]
    tasks: ["hard"]
# ── Baseline agents ───────────────────────────────────────────────────────────
baselines:
  - name: "rule_based"
    module: "agents.baseline"
    class: "RuleBasedAgent"
    type: "threshold"
    description: "Simple severity/confidence thresholding policy"
    scores:
      easy: 0.539
      medium: 0.618
      hard: 0.355
  - name: "improved_rule_based"
    module: "agents.baseline"
    class: "ImprovedRuleBasedAgent"
    type: "threshold"
    description: "Rule-based with age-urgency, system-load awareness, resource budget guard"
    scores:
      easy: 0.250
      medium: 0.355
      hard: 0.068
  - name: "ppo_lstm"
    module: "rl_agent"
    class: "PPOTrainer"
    type: "rl"
    description: "PPO with LSTM memory — pure numpy, trained 300+ episodes per task"
    scores:
      easy: 0.665
      medium: 0.931
      hard: 0.325
  - name: "llm_openai"
    module: "inference"
    class: "LLMTriageAgent"
    type: "llm"
    description: "OpenAI-compatible LLM agent via API_BASE_URL / MODEL_NAME / HF_TOKEN"
    # NOTE(review): no published scores for this baseline in the source — left unset.
# ── Infra / Docker ────────────────────────────────────────────────────────────
docker:
  image: "adaptive-alert-triage:latest"
  build: "docker build -t adaptive-alert-triage ."
  run: "docker run -p 8000:8000 adaptive-alert-triage"
  entrypoint: "uvicorn src.adaptive_alert_triage.server:app --host 0.0.0.0 --port 8000"
# ── Setup and validation ──────────────────────────────────────────────────────
setup:
  python: ">=3.9"
  install: "pip install -e ."
  pythonpath: "src"
  test: "pytest tests/"
  validate: "openenv validate"
  baseline: "python inference.py --n 3"

# NOTE(review): api_version/framework read as top-level spec fields, not setup
# children — confirm against the OpenEnv schema.
api_version: "1.0"
framework: "openenv"

documentation:
  readme: "README.md"
  baseline: "inference.py"
  agents: "agents/"
  tasks: "tasks/"
  api_docs: "src/adaptive_alert_triage/"
  server: "src/adaptive_alert_triage/server.py"