File size: 5,486 Bytes
f30d05a
 
 
 
b2742eb
 
3c1b0c7
 
f30d05a
 
3c1b0c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f30d05a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
 
 
 
 
 
 
 
 
f30d05a
 
 
 
 
 
 
 
 
3c1b0c7
 
 
 
 
 
 
 
 
 
 
 
f30d05a
3c1b0c7
 
f30d05a
 
 
 
 
 
 
 
3c1b0c7
 
f30d05a
b2742eb
3c1b0c7
 
 
 
 
 
 
 
 
 
f30d05a
b2742eb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
name: sql-db-engineer-agent
version: '2.0.0'
description: 'SQL Database Engineer Agent β€” An OpenEnv environment where AI agents learn to act like senior database engineers. Evolved from SQL Query Debugger (Round 1). Agents manage production databases over 50+ steps: inspecting slow queries, creating indexes, rewriting queries, partitioning tables.'
tags: [openenv, real-world, sql, database, engineering, indexing, reinforcement-learning, long-horizon, world-modeling, self-improvement, wildcard]
hackathon: 'META x PyTorch x SST OpenEnv Hackathon 2026'
domain: 'SQL Database Engineering'

tasks:
  # ── Round 1: SQL Query Debugger (15 tasks β€” backward compatible) ──

  - id: easy_001
    difficulty: easy
    description: 'Fix SQL syntax error: missing commas in SELECT clause'

  - id: easy_002
    difficulty: easy
    description: 'Fix SQL syntax error: missing WHERE keyword'

  - id: easy_003
    difficulty: easy
    description: 'Fix SQL syntax error: unclosed string literal'

  - id: easy_004
    difficulty: easy
    description: 'Fix SQL syntax error: ORDER used instead of ORDER BY'

  - id: easy_005
    difficulty: easy
    description: 'Fix SQL syntax error: GROUP used instead of GROUP BY'

  - id: medium_001
    difficulty: medium
    description: 'Fix wrong JOIN type: INNER JOIN excludes users with no orders'

  - id: medium_002
    difficulty: medium
    description: 'Fix wrong JOIN condition causing incorrect product associations'

  - id: medium_003
    difficulty: medium
    description: 'Fix aggregation logic: HAVING clause used incorrectly as WHERE'

  - id: medium_004
    difficulty: medium
    description: 'Fix correlated subquery correlating on wrong column'

  - id: medium_005
    difficulty: medium
    description: 'Fix DISTINCT misuse with COUNT causing invalid query'

  - id: hard_001
    difficulty: hard
    description: 'Detect and fix N+1 query pattern with correlated subqueries'

  - id: hard_002
    difficulty: hard
    description: 'Fix function on indexed column preventing index usage'

  - id: hard_003
    difficulty: hard
    description: 'Fix implicit cartesian product from missing JOIN condition'

  - id: hard_004
    difficulty: hard
    description: 'Fix SELECT * in multi-table JOIN causing over-fetching'

  - id: hard_005
    difficulty: hard
    description: 'Fix window function misuse with missing PARTITION BY and ORDER BY'

  # ── Round 2: DB Engineering Scenarios (15 tasks β€” long-horizon) ──

  - id: easy_s001
    difficulty: easy
    description: 'User lookup query taking 2s on 10K users table β€” add email index'

  - id: easy_s002
    difficulty: easy
    description: 'Order status query scanning 50K orders β€” composite index needed'

  - id: easy_s003
    difficulty: easy
    description: 'Product search doing full table scan on 20K products'

  - id: easy_s004
    difficulty: easy
    description: 'Session lookup hitting 15K sessions without index'

  - id: easy_s005
    difficulty: easy
    description: 'Log table filter slow on 30K entries β€” compound index fix'

  - id: medium_s001
    difficulty: medium
    description: 'E-commerce DB: 50K orders + 8K users, two slow queries'

  - id: medium_s002
    difficulty: medium
    description: 'Blog platform: 100K posts + 20K authors, search and lookup slow'

  - id: medium_s003
    difficulty: medium
    description: 'Inventory: 80K products + 200K stock movements, rewrite + index'

  - id: medium_s004
    difficulty: medium
    description: 'Ticketing system: 60K tickets + 5K agents, status queue slow'

  - id: medium_s005
    difficulty: medium
    description: 'Analytics DB: 150K events + 10K users, funnel query slow'

  - id: hard_s001
    difficulty: hard
    description: 'Financial DB: 500K transactions across 4 tables, 3 slow queries'

  - id: hard_s002
    difficulty: hard
    description: 'SaaS platform: 8-table schema, 200K+ records, dashboard 20s+'

  - id: hard_s003
    difficulty: hard
    description: 'Healthcare: 1M patient records, compliance + clinical queries'

  - id: hard_s004
    difficulty: hard
    description: 'Gaming leaderboard: 2M players, 5M matches, ranking degraded'

  - id: hard_s005
    difficulty: hard
    description: 'Logistics: 6 tables, 3M shipments + 10M tracking records'

action_space:
  type: discrete
  actions:
    - identify_error
    - propose_fix
    - submit_answer
    - request_hint
    - explain_issue
    - optimize_query
    - inspect_query
    - analyze_indexes
    - create_index
    - rewrite_query
    - add_column
    - drop_index
    - partition_table
    - analyze_statistics
    - submit_report

observation_space:
  type: dict
  fields:
    - task_id
    - task_description
    - current_context
    - step_count
    - difficulty
    - max_steps
    - hints_used
    - previous_actions
    - metadata

reward:
  min: 0.001
  max: 0.999
  type: dense_with_milestones
  description: >
    Dense reward at every step.
    Round 1: partial credit for identification, fixing, explanation quality.
    Round 2: step reward + delta reward (DB performance change) +
    milestone bonuses at 25%/50%/75% improvement + terminal score.

episode:
  max_steps: 50
  termination: 'submit_report (Round 2) or submit_answer/optimize_query (Round 1) or max_steps reached'

api:
  port: 7860
  endpoints:
    - GET  /health
    - POST /reset
    - POST /step
    - GET  /state
    - GET  /tasks
    - POST /grader
    - POST /baseline
    - GET  /progress