File size: 1,997 Bytes
6a71058
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Identity and descriptive metadata for this environment.
name: mlops-debug-env
# Quoted so the version is always read as a string.
version: '1.0.0'
# Folded scalar: the line breaks below collapse to single spaces when parsed.
description: >
  MLOps Pipeline Debugger: an AI agent acts as a senior ML engineer
  investigating a broken training run. The environment procedurally
  generates realistic training artifacts (logs, configs, preprocessing
  code, eval results) with one planted fault. The agent must
  systematically investigate and submit a structured diagnosis. Three
  tasks: config error (easy) → data leakage (medium) → silent evaluation
  bug (hard). All graders are fully deterministic.
author: Mohit Goyal
license: MIT
tags:
  - openenv
  - rl
  - mlops
  - debugging
  - machine-learning
  - agents
# Task catalogue: one entry per difficulty tier (easy, medium, hard).
# NOTE(review): bug_pool presumably lists the faults a run can be planted
# with, and max_steps the per-episode step budget — confirm with the
# environment implementation.
tasks:
  # Easy tier: configuration faults.
  - id: easy
    name: Config Error Diagnosis
    difficulty: easy
    max_steps: 20
    bug_pool:
      - exploding_lr
      - wrong_optimizer
      - batch_size_overflow
    reward_range: [0.0, 1.0]
  # Medium tier: data leakage / split faults.
  - id: medium
    name: Data Leakage Detection
    difficulty: medium
    max_steps: 30
    bug_pool:
      - data_leakage_scaler
      - data_leakage_overlap
      - wrong_split_ratio
    reward_range: [0.0, 1.0]
  # Hard tier: silent evaluation faults.
  - id: hard
    name: Silent Evaluation Bug
    difficulty: hard
    max_steps: 40
    bug_pool:
      - label_encoder_mismatch
      - silent_metric_swap
      - tokenizer_version_drift
    reward_range: [0.0, 1.0]
    # Only this task sets the flag; presumably tied to the "Hard task 1.5x
    # penalty" wording in reward.terminal — confirm.
    asymmetric_penalty: true
# Named actions available to the agent, one per list entry.
action_space:
  type: discrete_structured
  actions:
    - read_config
    - read_logs
    - check_dataset_stats
    - inspect_preprocessing
    - read_eval_results
    - run_sanity_check
    - query_artifact
    - submit_diagnosis
# Field names making up each observation, one per list entry.
observation_space:
  type: structured_text
  fields:
    - task_id
    - run_summary
    - available_artifacts
    - artifacts_read
    - last_action_result
    - step_count
    - max_steps
    - done
    - messages
# Human-readable description of the reward scheme.
# Both values are folded scalars with stripped trailing newline (>-), so
# each parses to a single-line string.
reward:
  type: dense_and_terminal
  per_step: >-
    +0.02 new artifact read, -0.02 duplicate read, +0.01 new sanity check
  # The four terminal weights sum to 1.00.
  terminal: >-
    0.15 category + 0.25 file + 0.30 field + 0.30 fix.
    Hard task 1.5x penalty.
# Endpoints exposed by the environment server.
# Each value reads as "<HTTP method> <path>"; websocket gives a path only.
api:
  reset: 'POST /reset'
  step: 'POST /step'
  state: 'GET /state'
  health: 'GET /health'
  websocket: '/ws'
# Runtime settings for serving the environment.
runtime:
  # NOTE(review): 7860 is the default app port on Hugging Face Spaces —
  # confirm that is the intended deployment target.
  port: 7860
  workers: 1
  framework: fastapi
  # Quoted so the version is read as a string, not a float.
  python: '3.11'