---
# Benchmark manifest header: the environment name and a prose summary.
name: NovaTechIncidentCommand
# Folded scalar (`>`): the lines below are joined with single spaces into one
# paragraph that ends with a single newline.
description: >
  Seeded OpenEnv incident-response benchmark built from a realistic NovaTech
  log corpus. Agents operate under partial observability: they must query
  logs, inspect dependencies, update a structured causal hypothesis, choose
  safe containment, and submit a final report.

# Task catalogue: one entry per scenario, each with an `id` and a one-line
# description of what the agent has to work out.
tasks:
  - id: easy
    description: 'Detect a clear login outage caused by auth-service heap exhaustion.'
  - id: medium
    description: 'Resolve competing hypotheses during a payment confirmation outage.'
  - id: hard
    description: 'Reconstruct a cascading multi-service incident under partial observability.'

# Structure of an action. Each field maps to either a type name or a free-text
# description of the expected content; descriptions beginning with "optional"
# mark fields that are described as optional.
action_space:
  type: structured
  fields:
    session_id: string
    # Action names are listed in a single string, separated by " | ".
    action_type: 'query_logs | inspect_dependencies | update_hypothesis | execute_containment | submit_report | request_more | no_anomalies'
    query: 'optional structured filter with service_name, server_id, levels, start_time, end_time, text_contains, limit'
    target_service: 'optional service name'
    hypothesis: 'optional structured tuple: primary_service, failure_mode, dependency, customer_impact, confidence'
    containment_plan: 'optional list of containment action names'
    report: 'optional structured report with evidence_log_ids, impacted_services, root_cause, containment_plan, summary'

# Structure of an observation. Each field maps to either a type name
# (string, integer, boolean) or a free-text description of its content.
observation_space:
  type: structured
  fields:
    session_id: string
    task_id: string
    task_title: string
    briefing: 'structured incident briefing with incident window, objective, suspected_services, customer_statement, operational_constraints'
    dependency_graph: 'service dependency map'
    # Only logs that have already been revealed are listed here.
    visible_logs: 'list of currently revealed log entries only'
    revealed_log_count: integer
    visited_services: 'list of services explored so far'
    submitted_containment: 'list of chosen containment actions'
    last_hypothesis: 'optional structured root-cause hypothesis'
    step_number: integer
    max_steps: integer
    feedback: string
    done: boolean
  # Free-text notes on what an observation does and does not expose.
  notes:
    - 'Observations expose only agent-revealed logs.'
    - 'The dependency graph is visible, but hidden logs and gold evidence remain private.'
    - 'The latest structured hypothesis is included so agents can reason iteratively.'

# Reward description: a scalar with a declared range of [0.0, 1.0].
# `components` and `techniques` are descriptive text.
reward_definition:
  type: scalar
  range: [0.0, 1.0]
  # Named reward terms, each with a one-sentence description.
  components:
    signal_reward: 'Rewards newly discovered relevant signals and evidence quality.'
    hypothesis_reward: 'Rewards improvement toward the gold causal tuple and safe containment alignment.'
    efficiency_reward: 'Rewards solving within the action budget.'
    penalty: 'Penalizes unseen evidence, contradictions, forbidden containment, loops, and empty queries.'
  # Shaping and grading techniques, one sentence each.
  techniques:
    - 'Information-gain shaping: focused discovery beats broad noisy retrieval.'
    - 'Best-hypothesis tracking: reward is tied to causal improvement across the episode.'
    - 'Observation-consistent grading: unseen evidence references are rejected.'
    - 'Contradiction penalties: evidence, cause, impact, and timeline must agree.'
    - 'Safety shaping: destructive containment is penalized even if diagnosis is partially correct.'

# Entry points, each written as a signature-style string of the form
# "call -> result".
interfaces:
  reset: 'reset() -> initial observation'
  step: 'step(action) -> observation, reward, done, info'
  # The state description is worded as "non-leaking" and "public".
  state: 'state() -> non-leaking public session state'