Spaces:

ritvik360
/

nl2sql-bench

Running

File size: 4,677 Bytes

a39d8ef

# nl2sql-bench/openenv.yaml
# OpenEnv environment manifest — validated by `openenv validate`

name: nl2sql-bench
version: "0.1.0"
description: >
  Natural Language to SQL query generation environment for RL training.
  An agent iteratively writes and refines SQLite queries against a synthetic
  e-commerce database to answer business questions. Multi-turn episodes with
  dense, shaped rewards. Three difficulty tasks: easy (single-table),
  medium (JOIN + GROUP BY), hard (window functions + CTEs).

author: "nl2sql-bench team"
license: MIT
tags:
  - openenv
  - nl2sql
  - sql
  - analytics
  - rl-training
  - deterministic
  - multi-turn

# ── Task definitions ────────────────────────────────────────────────────────
tasks:
  - name: simple-filter
    difficulty: easy
    description: >
      Single-table SELECT with WHERE, ORDER BY, and LIMIT.
      Tests basic SQL fluency. Expected solve rate: high.
    max_steps: 5
    reward_range: [0.0, 1.0]

  - name: join-aggregation
    difficulty: medium
    description: >
      Multi-table JOINs with GROUP BY, HAVING, and aggregation functions
      (COUNT, SUM, AVG, ROUND). Tests relational reasoning.
    max_steps: 5
    reward_range: [0.0, 1.0]

  - name: analytics-window
    difficulty: hard
    description: >
      Advanced analytics using CTEs, window functions (DENSE_RANK,
      ROW_NUMBER, running SUM), and nested subqueries. Tests multi-step
      planning and SQLite-specific syntax.
    max_steps: 5
    reward_range: [0.0, 1.0]

# ── Action / Observation space ──────────────────────────────────────────────
action_space:
  type: object
  properties:
    query:
      type: string
      description: "A SQLite SELECT query string."

observation_space:
  type: object
  properties:
    question:
      type: string
      description: "Natural-language question the agent must answer."
    schema_context:
      type: string
      description: "Compact database schema description for the agent."
    task_name:
      type: string
      description: "Active task identifier."
    last_query:
      type: string
      description: "The SQL query submitted on the previous step."
    last_result:
      type: array
      description: "Up to 10 rows returned by the last query (list of dicts)."
    last_error:
      type: string
      nullable: true
      description: "SQLite error string if last query failed, else null."
    result_columns:
      type: array
      description: "Column names of last_result."
    step:
      type: integer
      description: "Current step number (1-indexed; 0 after reset)."
    max_steps:
      type: integer
      description: "Maximum steps per episode."
    done:
      type: boolean
      description: "True when episode ends (exact match or step limit reached)."
    reward:
      type: number
      nullable: true
      description: "Reward for the most recent step [0.0, 1.0]."
    score:
      type: number
      description: "Normalised cumulative episode score [0.0, 1.0]."

# ── Reward function description ─────────────────────────────────────────────
reward:
  type: shaped
  range: [0.0, 1.0]
  components:
    - name: syntax_ok
      weight: 0.10
      description: "Query executes without SQLite error."
    - name: columns_match
      weight: 0.20
      description: "Returned column names match ground truth exactly."
    - name: row_count_match
      weight: 0.20
      description: "Number of returned rows matches ground truth."
    - name: exact_match
      weight: 0.50
      description: "Full result set matches ground truth (order-aware for ORDER BY)."
    - name: step_penalty
      weight: -0.05
      description: "Deducted per step beyond the first (encourages efficiency)."

# ── Deployment ──────────────────────────────────────────────────────────────
server:
  port: 7860
  dockerfile: Dockerfile
  healthcheck: /health

# ── Baseline ────────────────────────────────────────────────────────────────
baseline:
  script: inference.py
  model: Qwen/Qwen2.5-72B-Instruct
  expected_scores:
    simple-filter:    0.70
    join-aggregation: 0.45
    analytics-window: 0.25