File size: 4,677 Bytes
a39d8ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# nl2sql-bench/openenv.yaml
# OpenEnv environment manifest β€” validated by `openenv validate`

name: nl2sql-bench
version: "0.1.0"
description: >
  Natural Language to SQL query generation environment for RL training.
  An agent iteratively writes and refines SQLite queries against a synthetic
  e-commerce database to answer business questions. Multi-turn episodes with
  dense, shaped rewards. Three difficulty tasks: easy (single-table),
  medium (JOIN + GROUP BY), hard (window functions + CTEs).

author: "nl2sql-bench team"
license: MIT
tags:
  - openenv
  - nl2sql
  - sql
  - analytics
  - rl-training
  - deterministic
  - multi-turn

# ── Task definitions ────────────────────────────────────────────────────────
tasks:
  - name: simple-filter
    difficulty: easy
    description: >
      Single-table SELECT with WHERE, ORDER BY, and LIMIT.
      Tests basic SQL fluency. Expected solve rate: high.
    max_steps: 5
    reward_range: [0.0, 1.0]

  - name: join-aggregation
    difficulty: medium
    description: >
      Multi-table JOINs with GROUP BY, HAVING, and aggregation functions
      (COUNT, SUM, AVG, ROUND). Tests relational reasoning.
    max_steps: 5
    reward_range: [0.0, 1.0]

  - name: analytics-window
    difficulty: hard
    description: >
      Advanced analytics using CTEs, window functions (DENSE_RANK,
      ROW_NUMBER, running SUM), and nested subqueries. Tests multi-step
      planning and SQLite-specific syntax.
    max_steps: 5
    reward_range: [0.0, 1.0]

# ── Action / Observation space ──────────────────────────────────────────────
action_space:
  type: object
  properties:
    query:
      type: string
      description: "A SQLite SELECT query string."

observation_space:
  type: object
  properties:
    question:
      type: string
      description: "Natural-language question the agent must answer."
    schema_context:
      type: string
      description: "Compact database schema description for the agent."
    task_name:
      type: string
      description: "Active task identifier."
    last_query:
      type: string
      description: "The SQL query submitted on the previous step."
    last_result:
      type: array
      description: "Up to 10 rows returned by the last query (list of dicts)."
    last_error:
      type: string
      nullable: true
      description: "SQLite error string if last query failed, else null."
    result_columns:
      type: array
      description: "Column names of last_result."
    step:
      type: integer
      description: "Current step number (1-indexed; 0 after reset)."
    max_steps:
      type: integer
      description: "Maximum steps per episode."
    done:
      type: boolean
      description: "True when episode ends (exact match or step limit reached)."
    reward:
      type: number
      nullable: true
      description: "Reward for the most recent step [0.0, 1.0]."
    score:
      type: number
      description: "Normalised cumulative episode score [0.0, 1.0]."

# ── Reward function description ─────────────────────────────────────────────
reward:
  type: shaped
  range: [0.0, 1.0]
  components:
    - name: syntax_ok
      weight: 0.10
      description: "Query executes without SQLite error."
    - name: columns_match
      weight: 0.20
      description: "Returned column names match ground truth exactly."
    - name: row_count_match
      weight: 0.20
      description: "Number of returned rows matches ground truth."
    - name: exact_match
      weight: 0.50
      description: "Full result set matches ground truth (order-aware for ORDER BY)."
    - name: step_penalty
      weight: -0.05
      description: "Deducted per step beyond the first (encourages efficiency)."

# ── Deployment ──────────────────────────────────────────────────────────────
server:
  port: 7860
  dockerfile: Dockerfile
  healthcheck: /health

# ── Baseline ────────────────────────────────────────────────────────────────
baseline:
  script: inference.py
  model: Qwen/Qwen2.5-72B-Instruct
  expected_scores:
    simple-filter:    0.70
    join-aggregation: 0.45
    analytics-window: 0.25