# Manifest for the sre-engineer-llm environment: package identity, a long-form
# description of the three tiers, and authorship/licensing.
name: sre-engineer-llm
version: 3.1.0
# `description` is a folded block scalar (`>`): line breaks inside a paragraph
# fold to spaces and blank lines separate paragraphs, so do not place `#`
# comments inside it -- they would become part of the text.
# NOTE(review): the final paragraph says training is not committed to this
# repo, but the `training` stanza further down declares `status: executed`
# with checkpoint/eval artifacts. Confirm which is current and reconcile.
description: >
  Tier-escalating SRE training environment. Three tiers escalate three
  different dimensions: Triage (compute), Strategy (horizon), Operations
  (realism).

  Triage ships 12 templates x 6 entries each (1 base + 5 procgen variants) =
  72 deterministic scenarios over a 4-service topology. The 5-component
  rubric (outcome 0.45 + action_validity 0.20 + format 0.10 + anticheat
  0.15 + efficiency 0.10) sums to exactly 1.0 and pins a heuristic ceiling
  of [0.65, 0.80] with a scripted-optimal reference at >=0.90.

  Strategy runs as a Python orchestrator that chains Triage episodes with
  persistent horizon state (unresolved alerts, pending deploys, tech-debt
  counter, horizon-decay reward). It does not simulate a 15-20 service
  topology faithfully -- the YAMLs declare a richer action set as a design
  spec only.

  Operations runs as a Python state-machine simulator over a 22-node service
  graph using the same 11-action interface as Triage. The docker-compose
  stack under sre_gym/operations/families/ references stub images that are
  not published; it is shipped as design spec, not as runnable
  infrastructure.

  Training is not committed to this repo. See notebooks/01 for the GRPO
  pipeline that needs to be executed externally (Colab A100). The README
  baseline tables show frontier-LLM measurements only; the trained-model
  row is intentionally absent until a real run is committed.
author: Daksh Verma
license: Apache-2.0

# Contract for the environment served by this manifest (the Triage tier, per
# `tier: triage` below): type names, episode limits, and scenario inventory.
environment:
  action_type: UnifiedIncidentAction
  observation_type: UnifiedIncidentObservation
  state_type: UnifiedIncidentState
  max_steps: 13
  difficulties: [easy, medium, hard]
  reward_type: dense
  # GRPO/TRL training contract: parallel env instances per training step.
  # Required for the OpenEnv batched-rollout pattern documented at
  # https://huggingface.co/docs/trl/openenv.
  max_concurrent_envs: 64
  # Scenario inventory. The three counts below must stay in agreement:
  # scenario_templates x (1 base + procgen_variants_per_template)
  # = 12 x (1 + 5) = 72 = scenario_count (also repeated under tiers.triage).
  scenario_count: 72                     # 12 templates x 6 entries (1 base + 5 procgen)
  scenario_templates: 12
  procgen_variants_per_template: 5       # 5 variants per template; 6 entries total each
  deterministic_seeded: true
  tier: triage                           # the runnable surface served by this Space
  tier_escalation_dimension: compute     # see docs/ARCHITECTURE.md for full design

# Per-tier declarations. Each tier escalates a different dimension
# (compute / horizon / realism). All three are marked runnable, but
# `runnable_kind` distinguishes what "runnable" means for each one.
tiers:
  triage:
    runnable: true
    runnable_kind: live_environment       # real /reset + /step routes against the live env
    escalation_dimension: compute
    persona: "ML student / Kaggle, $30 of HF credits"
    # Must match environment.scenario_count.
    scenario_count: 72
    docs: docs/TRIAGE_TIER.md
    # Arithmetic is 12 x (1 + 5) = 72; phrased to match the top-level description.
    notes: "12 templates x 6 entries each (1 base + 5 procgen variants) = 72 deterministic scenarios."
  strategy:
    runnable: true
    runnable_kind: python_orchestrator    # chained Triage episodes with horizon state
    escalation_dimension: horizon
    persona: "seed/Series A startup, $300-500 budget"
    scenario_count: 3
    docs: docs/STRATEGY_TIER.md
    notes: >
      Strategy runs each scenario as a sequence of Triage episodes glued
      together by the horizon-state object (unresolved alerts, pending
      deploys, tech-debt counter, horizon-decay reward). It is NOT a
      simulator of a 15-20 service topology; the wider action universe
      declared in the YAMLs (~28 actions / scenario) is design spec
      only and is not implemented in the env.
  operations:
    runnable: true
    runnable_kind: python_simulator       # graph state machine, not real cluster
    escalation_dimension: realism
    persona: "enterprise SRE platform, 8x A100/H100"
    scenario_count: 1                     # one specced family
    chaos_pattern_count: 12               # includes one alias (payment_webhook_storm)
    docs: docs/OPERATIONS_TIER.md
    notes: >
      Operations runs as an in-memory 22-node graph mutator. Reuses the
      Triage 11-action interface; correct_action across patterns is heavily
      skewed toward rollback_deploy (11/12) so the patterns are separable
      on observation alone -- not a hidden-information benchmark. The
      compose file under sre_gym/operations/families/ references stub
      images that are NOT published; do not attempt `docker compose up`
      expecting it to pull successfully.

# Reward rubric for the Triage tier.
reward:
  # Component weights; they add up to composite_total:
  # 0.45 + 0.20 + 0.10 + 0.15 + 0.10 = 1.00. Keep the sum in sync if any
  # weight changes.
  rubric_components:
    outcome: 0.45
    action_validity: 0.20
    format: 0.10
    anticheat: 0.15
    efficiency: 0.10
  composite_total: 1.0
  # These two values mirror the top-level description: a heuristic ceiling of
  # [0.65, 0.80] and a scripted-optimal reference at >= 0.90.
  heuristic_ceiling_band: [0.65, 0.80]
  scripted_expert_floor: 0.90
  docs: docs/REWARD_DESIGN.md

# Training pipeline status and pointers to notebooks, artifacts, and the
# trajectory corpus.
# NOTE(review): the top-level `description` states that training is not
# committed to this repo and must be executed externally, whereas this stanza
# declares `status: executed` and lists checkpoint/eval artifacts. Confirm
# which is current and update the other so the manifest is self-consistent.
training:
  status: executed                         # SFT + GRPO end-to-end run completed
  base_model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
  pipeline: notebooks/01_triage_train_grpo_qwen25_7b.ipynb
  comparison: notebooks/02_triage_eval_compare_all.ipynb
  pool_server: coliseum/                  # parallel-rollout lease pool for GRPO trainers
  # NOTE(review): paths are presumably relative to the repo root -- verify
  # they exist in the published tree.
  current_artifacts:
    - outputs/qwen25_7b_sft_final/        # SFT checkpoint, eval perplexity 1.755
    - outputs/qwen25_7b_grpo_final/       # GRPO checkpoint
    - eval/results/qwen25_7b_comparison_raw.csv
    - eval/results/qwen25_7b_comparison_hero.png
  trajectory_corpus:
    # Row breakdown: 72 + 24 + 24 = 120.
    seed_v2_120_jsonl_rows: 120           # 72 expert + 24 mediocre + 24 failure
    templates_covered: 12                 # all 12 Triage templates have ≥5 episodes

# Hosting coordinates for the published Space and its source repository.
# NOTE(review): `sdk` and `hardware` look like Hugging Face Space settings
# (Docker SDK, basic CPU tier) -- confirm they match the Space's own README
# front matter.
huggingface:
  space_id: Madhav189/SystemTruth             # canonical HF Space
  github_repo: Madhav-GPT/SystemTruth         # canonical GitHub
  sdk: docker
  hardware: cpu-basic