File size: 2,944 Bytes
cc0da5c
c15d346
cc0da5c
 
c15d346
 
 
 
 
 
cc0da5c
 
 
 
 
 
 
c15d346
 
cc0da5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c15d346
cc0da5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c15d346
 
 
 
 
cc0da5c
 
 
 
 
 
c15d346
cc0da5c
c15d346
 
cc0da5c
 
c15d346
cc0da5c
c15d346
 
 
 
 
 
 
 
cc0da5c
 
c15d346
 
 
 
 
 
 
cc0da5c
 
c15d346
 
 
 
 
 
 
 
cc0da5c
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
name: sql-optim-env
version: "2.0.0"
description: >
  An OpenEnv-compliant reinforcement learning environment where AI agents
  learn to diagnose and optimize SQL queries. Unlike any other submission,
  optimized queries are ACTUALLY EXECUTED against a DuckDB in-memory
  database with realistic synthetic data (500k orders, 1M events).
  Reward is computed from real execution speedup + result correctness β€”
  not keyword heuristics. Five tasks from easy anti-patterns to expert
  window function audits.

tags:
  - openenv
  - sql
  - database
  - performance
  - optimization
  - duckdb
  - execution-grounded
  - llm-agent

language: python
framework: openenv

observation_space:
  type: object
  fields:
    task_id: string
    task_name: string
    task_description: string
    sql_query: string
    schema_info: string
    dialect: string
    difficulty: string
    step_count: integer
    max_steps: integer
    issues_found_so_far: array
    last_execution: object

action_space:
  type: object
  fields:
    suggestions: array
    optimized_query: string
    summary: string
    estimated_improvement: string
    approved: boolean

reward:
  type: float
  min: 0.0
  max: 1.0
  description: >
    Execution-grounded composite score:
    Real Speedup (35%) β€” actual DuckDB timing ratio,
    Result Correctness (20%) β€” both queries return identical data,
    Issue Detection (25%) β€” keyword match vs ground truth,
    Approval Correctness (8%), Summary Quality (7%), Severity Labels (5%).

tasks:
  - id: task_1_basic_antipatterns
    name: "Basic SQL Anti-pattern Detection"
    difficulty: easy
    max_steps: 3
    description: "SELECT *, CAST on filter column, YEAR() function β€” 3 classic anti-patterns on 500k rows"

  - id: task_2_correlated_subqueries
    name: "N+1 Correlated Subquery Elimination"
    difficulty: medium
    max_steps: 4
    description: "3 correlated subqueries causing ~10M row reads β€” rewrite to single aggregation JOIN"

  - id: task_3_wildcard_scan
    name: "Wildcard LIKE & Projection Optimization"
    difficulty: medium-hard
    max_steps: 4
    description: "Leading-wildcard LIKE on 1M events, SELECT *, pre-filter push-down"

  - id: task_4_implicit_join
    name: "Implicit Cross Join & Scalar Subquery Elimination"
    difficulty: hard
    max_steps: 5
    description: "Comma-syntax join risk + 2 correlated global aggregations β€” rewrite with CTE"

  - id: task_5_window_functions
    name: "Window Function & Full-Scan Audit"
    difficulty: expert
    max_steps: 5
    description: "5 window functions over 1M unfiltered rows including a global RANK() sort"

endpoints:
  reset:       POST /reset
  step:        POST /step
  state:       GET  /state
  tasks:       GET  /tasks
  grader:      POST /grader
  baseline:    POST /baseline
  execute:     POST /execute
  leaderboard: GET  /leaderboard

deployment:
  platform: huggingface-spaces
  port: 7860
  dockerfile: Dockerfile