Rockerleo committed on
Commit
dc936ba
·
verified ·
1 Parent(s): 1e82f9d

Upload server/openenv.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. server/openenv.yaml +76 -13
server/openenv.yaml CHANGED
@@ -5,51 +5,114 @@ description: >
5
  investigating a broken training run. The environment procedurally generates
6
  realistic training artifacts (logs, configs, preprocessing code, eval results)
7
  with one planted fault. The agent must systematically investigate and submit
8
- a structured diagnosis. Three tasks: config error (easy) → data leakage (medium)
9
- → silent evaluation bug (hard). All graders are fully deterministic.
10
- author: Mohit Goyal
11
  license: MIT
12
- tags: [openenv, rl, mlops, debugging, machine-learning, agents]
 
 
 
 
 
 
 
13
  tasks:
14
  - id: easy
15
  name: Config Error Diagnosis
16
  difficulty: easy
17
  max_steps: 20
18
  bug_pool: [exploding_lr, wrong_optimizer, batch_size_overflow]
19
- reward_range: [0.0, 1.0]
 
 
 
 
20
  - id: medium
21
  name: Data Leakage Detection
22
  difficulty: medium
23
  max_steps: 30
24
  bug_pool: [data_leakage_scaler, data_leakage_overlap, wrong_split_ratio]
25
- reward_range: [0.0, 1.0]
 
 
 
 
 
26
  - id: hard
27
  name: Silent Evaluation Bug
28
  difficulty: hard
29
  max_steps: 40
30
  bug_pool: [label_encoder_mismatch, silent_metric_swap, tokenizer_version_drift]
31
- reward_range: [0.0, 1.0]
32
  asymmetric_penalty: true
 
 
 
 
 
 
33
  action_space:
34
  type: discrete_structured
35
- actions: [read_config, read_logs, check_dataset_stats, inspect_preprocessing,
36
- read_eval_results, run_sanity_check, query_artifact, submit_diagnosis]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  observation_space:
38
  type: structured_text
39
- fields: [task_id, run_summary, available_artifacts, artifacts_read,
40
- last_action_result, step_count, max_steps, done, messages]
 
 
 
 
 
 
 
 
 
 
 
41
  reward:
42
  type: dense_and_terminal
43
- per_step: "+0.02 new artifact read, -0.02 duplicate read, +0.01 new sanity check"
44
- terminal: "0.15 category + 0.25 file + 0.30 field + 0.30 fix. Hard task 1.5x penalty."
 
 
 
 
 
 
 
 
 
45
  api:
46
  reset: POST /reset
47
  step: POST /step
48
  state: GET /state
49
  health: GET /health
 
 
50
  websocket: /ws
 
51
  runtime:
52
  port: 7860
53
  workers: 1
54
  framework: fastapi
55
  python: "3.11"
 
 
5
  investigating a broken training run. The environment procedurally generates
6
  realistic training artifacts (logs, configs, preprocessing code, eval results)
7
  with one planted fault. The agent must systematically investigate and submit
8
+ a structured diagnosis. Three tasks: config error (easy) -> data leakage (medium)
9
+ -> silent evaluation bug (hard). All graders are fully deterministic.
10
+ author: Code Clashers
11
  license: MIT
12
+ tags: [openenv, rl, mlops, debugging, machine-learning, agents, pytorch]
13
+
14
+ grading:
15
+ type: deterministic
16
+ judge: none
17
+ method: keyword_and_substring_matching
18
+ reproducible: true
19
+
20
  tasks:
21
  - id: easy
22
  name: Config Error Diagnosis
23
  difficulty: easy
24
  max_steps: 20
25
  bug_pool: [exploding_lr, wrong_optimizer, batch_size_overflow]
26
+ reward_range: [0.01, 0.99]
27
+ description: >
28
+ Diagnose a training failure caused by a hyperparameter misconfiguration.
29
+ Symptoms are visible in training logs (loss explosion, oscillation, trivial overfitting).
30
+
31
  - id: medium
32
  name: Data Leakage Detection
33
  difficulty: medium
34
  max_steps: 30
35
  bug_pool: [data_leakage_scaler, data_leakage_overlap, wrong_split_ratio]
36
+ reward_range: [0.01, 0.99]
37
+ description: >
38
+ Identify data leakage in the preprocessing pipeline. Val accuracy is suspiciously
39
+ high from epoch 1, but test performance tells a different story. Requires correlating
40
+ logs, eval results, and preprocessing code.
41
+
42
  - id: hard
43
  name: Silent Evaluation Bug
44
  difficulty: hard
45
  max_steps: 40
46
  bug_pool: [label_encoder_mismatch, silent_metric_swap, tokenizer_version_drift]
47
+ reward_range: [0.01, 0.99]
48
  asymmetric_penalty: true
49
+ penalty_multiplier: 1.5
50
+ description: >
51
+ Find a silent bug in the evaluation pipeline. Training logs look completely normal.
52
+ No errors, no warnings. Only a val/test metric gap reveals the issue. Requires
53
+ reasoning about what is absent rather than what is present.
54
+
55
  action_space:
56
  type: discrete_structured
57
+ actions:
58
+ - read_config
59
+ - read_logs
60
+ - check_dataset_stats
61
+ - inspect_preprocessing
62
+ - read_eval_results
63
+ - run_sanity_check
64
+ - query_artifact
65
+ - submit_diagnosis
66
+ sanity_check_types:
67
+ - label_consistency
68
+ - data_leakage
69
+ - gradient_norms
70
+ - class_balance
71
+ - feature_statistics
72
+ - encoder_version_match
73
+ - loss_trajectory
74
+ - metric_gap_analysis
75
+
76
  observation_space:
77
  type: structured_text
78
+ fields:
79
+ - task_id
80
+ - task_description
81
+ - run_id
82
+ - run_summary
83
+ - available_artifacts
84
+ - artifacts_read
85
+ - last_action_result
86
+ - step_count
87
+ - max_steps
88
+ - done
89
+ - messages
90
+
91
  reward:
92
  type: dense_and_terminal
93
+ per_step:
94
+ new_artifact_read: +0.02
95
+ duplicate_read: -0.02
96
+ new_sanity_check: +0.01
97
+ terminal:
98
+ failure_category: +0.15
99
+ root_cause_file: +0.25
100
+ root_cause_field: +0.30
101
+ proposed_fix: +0.30
102
+ hard_task_penalty: "if score < 0.70, additional 0.5x on missed components"
103
+
104
  api:
105
  reset: POST /reset
106
  step: POST /step
107
  state: GET /state
108
  health: GET /health
109
+ tasks: GET /tasks
110
+ openenv_state: GET /openenv/state
111
  websocket: /ws
112
+
113
  runtime:
114
  port: 7860
115
  workers: 1
116
  framework: fastapi
117
  python: "3.11"
118
+ container: docker