File size: 5,644 Bytes
3c665d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
719c147
3c665d2
263261a
 
 
3c665d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
name: sql-agent-openenv
version: "1.0.0"
description: >
  A SQL generation and repair environment where an AI agent learns to write
  correct SQL queries through a self-debugging loop powered by a LinUCB
  contextual bandit and GEPA prompt evolution. Models real-world data analyst
  workflows β€” querying databases with natural language, handling errors, and
  iteratively improving.

author: sql-agent-openenv-team
tags:
  - openenv
  - sql
  - rl
  - nlp
  - contextual-bandit

# ── Endpoints ────────────────────────────────────────────────────────────────
api:
  reset: /reset
  step: /step
  state: /state

# ── Action Space ─────────────────────────────────────────────────────────────
action_space:
  type: discrete
  n: 9
  actions:
    - name: generate
      description: "Generate SQL from scratch (first attempt)"
    - name: rewrite_full
      description: "Completely rewrite the query from scratch"
    - name: fix_column
      description: "Fix wrong column name references using schema"
    - name: fix_table
      description: "Fix wrong table name references or JOIN structure"
    - name: add_groupby
      description: "Add or fix GROUP BY / aggregation clauses"
    - name: rewrite_cte
      description: "Restructure CTEs or subqueries"
    - name: fix_syntax
      description: "Fix syntax errors (brackets, commas, keywords)"
    - name: change_dialect
      description: "Convert to SQLite-compatible functions"
    - name: relax_filter
      description: "Broaden or remove overly strict WHERE conditions"

# ── Observation Space ────────────────────────────────────────────────────────
observation_space:
  type: dict
  fields:
    - name: question
      type: string
      description: "Natural language question to answer with SQL"
    - name: schema_info
      type: string
      description: "Full database schema (tables, columns, types, FK relationships)"
    - name: current_sql
      type: string
      nullable: true
      description: "The SQL generated on the last attempt (null on first step)"
    - name: error_message
      type: string
      nullable: true
      description: "SQLite error message from the last attempt (null on success)"
    - name: error_class
      type: string
      nullable: true
      description: "Classified error type (e.g. no_such_column, syntax_error)"
    - name: attempt_number
      type: integer
      description: "Current attempt number (0 at reset, increments each step)"
    - name: max_attempts
      type: integer
      description: "Maximum allowed attempts per episode (5)"
    - name: task_id
      type: string
      description: "Active task identifier"
    - name: task_difficulty
      type: string
      description: "Task difficulty level: easy | medium | hard"

# ── Reward ───────────────────────────────────────────────────────────────────
reward:
  range: [0.05, 0.95]
  description: >
    Task score is the grader output clamped strictly inside (0, 1). Graders
    score partial progress (column correctness, row-count match) and apply
    attempt penalties for multi-step repair episodes.

# ── Tasks ────────────────────────────────────────────────────────────────────
tasks:
  - id: simple_queries
    name: Simple SQL Queries
    difficulty: easy
    description: >
      Single-table SELECT queries. Agent must retrieve correct rows by applying
      basic filters and projections on the marketplace database.
    question_count: 5
    grader: >
      Checks that required output columns are present and row count falls
      within expected bounds. Attempt penalty not applied.

  - id: join_queries
    name: SQL Join Queries
    difficulty: medium
    description: >
      Multi-table JOIN queries with GROUP BY and aggregation. Agent must
      correctly join tables and compute aggregates over the marketplace data.
    question_count: 5
    grader: >
      Correct columns + row count score multiplied by (1.0 - 0.1*(attempts-1)).
      Rewards efficient, first-try solutions.

  - id: complex_queries
    name: Complex SQL Queries
    difficulty: hard
    description: >
      Advanced queries using CTEs, window functions, nested aggregations, and
      multi-level joins. Requires precise SQLite syntax knowledge.
    question_count: 5
    grader: >
      Strict correctness required. Score capped at 0.8 without first-attempt
      bonus. Attempt penalty of 0.1*(attempts-1) applied. Hard tasks genuinely
      challenge frontier models.

# ── Environment Metadata ─────────────────────────────────────────────────────
metadata:
  max_steps_per_episode: 5
  database: SQLite (marketplace schema β€” users, products, orders, reviews, sellers)
  rl_algorithm: LinUCB contextual bandit (feature_dim=20, 8 repair actions)
  prompt_optimizer: GEPA (Generative Evolutionary Prompt Adaptation)
  runtime_estimate_minutes: 5
  compute_requirements:
    vcpu: 2
    memory_gb: 4