Spaces:

shreyas231219
/

Meta-Pytorch-Openenv

Sleeping

App Files Files Community

shreyas231219 commited on Mar 29

Commit

615101f

verified ·

1 Parent(s): 873275a

Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

Dockerfile +61 -0
README.md +104 -12
__init__.py +16 -0
baseline_inference_groq.py +193 -0
client.py +55 -0
inference.py +200 -0
models.py +38 -0
openenv.yaml +50 -0
pyproject.toml +32 -0
server/__init__.py +11 -0
server/app.py +76 -0
server/environment.py +397 -0
server/requirements.txt +2 -0
uv.lock +0 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,61 @@

+# SQL/Data Cleaning Sandbox - Dockerfile for Hugging Face Spaces
+# Use official Python 3.11 slim image
+FROM python:3.11-slim
+# Set working directory
+WORKDIR /app
+# Install required system packages
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git curl build-essential && \
+    rm -rf /var/lib/apt/lists/*
+# Copy project files
+COPY . /app/
+# Install python dependencies directly bypassing complex managers to ensure maximum Hugging Face compatibility.
+# Every requirement with a version specifier is quoted: unquoted, the shell
+# parses ">=0.2.2" as an output redirection, so the constraint is silently
+# dropped and pip's output is written to files named "=0.2.2", "=2.31.0", ...
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir \
+        uvicorn \
+        "openenv-core[core]>=0.2.2" \
+        "requests>=2.31.0" \
+        "openai>=1.0.0" \
+        "groq>=0.4.0" \
+        python-dotenv
+# OpenEnv needs the workspace in PYTHONPATH
+ENV PYTHONPATH="/app"
+# Default fallback task
+ENV TASK_ID="easy"
+# Hugging Face Spaces exposes port 7860
+EXPOSE 7860
+# Runtime flag consumed by the server (name suggests it toggles the OpenEnv web UI)
+ENV ENABLE_WEB_INTERFACE=true
+# Command to run the OpenEnv Server directly
+CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,12 +1,104 @@
----
-title: Meta Pytorch Openenv
-emoji: 🐠
-colorFrom: blue
-colorTo: purple
-sdk: docker
-pinned: false
-license: apache-2.0
-short_description: Openenv from scratch for testing
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Meta-Pytorch-Openenv
+emoji: 🦀
+colorFrom: blue
+colorTo: green
+sdk: docker
+app_port: 7860
+base_path: /web
+---
+# SQL / Data Cleaning Sandbox
+An **OpenEnv**-compliant environment where AI agents clean messy SQLite databases
+using SQL queries and Python code.
+## Overview
+| Feature | Details |
+|---|---|
+| **Interface** | `step()` / `reset()` / `state()` |
+| **Action space** | `{ tool: "sql" \| "python", command: "..." }` |
+| **Observation** | `{ output, error, current_step, max_steps, task_description }` |
+| **Reward** | 0.0 - 1.0 with **partial progress signals** |
+| **Tasks** | 3 (easy, medium, hard) |
+## Tasks
+### Easy - Data Triage
+> Find the total revenue from the `sales` table for January 2024.
+**Grader**: Checks if the computed total matches the expected float value (1000.00).
+### Medium - Data Cleaning
+> Fix duplicate emails, NULL ages, and uppercase emails in the `users` table.
+**Grader**: Partial scoring:
+- 0.3 for all emails lowercase
+- 0.4 for no duplicate emails
+- 0.3 for no NULL ages
+### Hard - Schema Migration
+> Normalize `flat_orders` into `customers` + `orders` tables with foreign keys.
+**Grader**: Partial scoring:
+- 0.2 for correct `customers` schema
+- 0.2 for correct `orders` schema
+- 0.2 for 4 unique customers
+- 0.2 for 6 orders migrated
+- 0.2 for valid FK integrity
+## Quick Start
+### Local Development
+```bash
+# Install dependencies
+pip install openenv-core
+# Run the server (defaults to the 'easy' task)
+cd sql_sandbox
+TASK_ID=easy python -m server.app
+# Switch tasks via env var
+TASK_ID=medium python -m server.app
+TASK_ID=hard python -m server.app
+```
+### Docker (Hugging Face Spaces Ready)
+```bash
+# Build
+docker build -t sql-sandbox:latest .
+# Run on HF Spaces default port 7860
+docker run -p 7860:7860 sql-sandbox:latest
+```
+## Baseline Inference
+Runs GPT-4o on all three tasks and prints reproducible scores:
+```bash
+export HF_TOKEN=sk-...
+export MODEL_NAME=gpt-4o
+python inference.py --url http://localhost:7860
+```
+## Project Structure
+```
+sql_sandbox/
+├── __init__.py             # Package exports
+├── models.py               # Action & Observation Pydantic models
+├── client.py               # EnvClient subclass
+├── openenv.yaml            # OpenEnv manifest
+├── pyproject.toml           # Dependencies
+├── inference.py            # GPT-4o baseline script
+├── README.md               # This file
+└── server/
+    ├── __init__.py
+    ├── app.py              # FastAPI application
+    ├── environment.py      # Core environment logic + graders
+    ├── requirements.txt
+    └── Dockerfile
+```

__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""SQL/Data Cleaning Sandbox Environment."""
+from .client import SqlSandboxEnv
+from .models import SqlSandboxAction, SqlSandboxObservation
+__all__ = [
+    "SqlSandboxAction",
+    "SqlSandboxObservation",
+    "SqlSandboxEnv",
+]

baseline_inference_groq.py ADDED Viewed

	@@ -0,0 +1,193 @@

+"""
+Baseline Inference Script for SQL/Data Cleaning Sandbox -- Groq Edition.
+Uses Groq (llama-3.3-70b-versatile) to solve all three tasks and prints
+reproducible scores via the OpenEnv WebSocket client.
+Usage:
+    set GROQ_API_KEY=gsk-...          # Windows
+    export GROQ_API_KEY=gsk-...       # Linux/macOS
+    python baseline_inference_groq.py                    # local server
+    python baseline_inference_groq.py --url https://...  # remote server
+"""
+import argparse
+import json
+import os
+import sys
+from dotenv import load_dotenv
+load_dotenv()
+from groq import Groq
+from client import SqlSandboxEnv
+from models import SqlSandboxAction
+# ---------------------------------------------------------------------------
+# System prompt shared across all tasks
+# ---------------------------------------------------------------------------
+SYSTEM_PROMPT = """\
+You are a data engineering assistant working inside a SQLite sandbox.
+You can execute two types of actions:
+1. {"tool": "sql",    "command": "<SQL query>"}
+2. {"tool": "python", "command": "<Python code>"}
+Rules:
+- Respond with EXACTLY ONE JSON object per turn -- no markdown, no explanation.
+- In Python code, the variables `conn` (sqlite3.Connection) and `cursor`
+  (sqlite3.Cursor) are already available. Do NOT call sqlite3.connect().
+- SQLite STRFTIME months are zero-padded: use '01' not '1', or use LIKE '2024-01-%'.
+- When you believe the task is fully complete, send:
+  {"tool": "sql", "command": "SELECT 'DONE'"}
+"""
+# ---------------------------------------------------------------------------
+# Core agent loop -- one task, one WebSocket session
+# ---------------------------------------------------------------------------
+def _run_task_agent(base_url: str, task_id: str, max_turns: int = 15) -> float:
+    """
+    Open a fresh WebSocket session, reset the environment to the given task,
+    then run an LLM agent loop until done or max_turns is reached.
+    Returns the final reward (0.0 - 1.0).
+    """
+    client_llm = Groq(api_key=os.environ["GROQ_API_KEY"])
+    final_reward = 0.0
+    # Each task gets its own WebSocket session to avoid state leakage
+    with SqlSandboxEnv(base_url=base_url).sync() as env:
+        # reset() with task_id seeds the correct DB table for this task
+        reset_resp = env.reset(task_id=task_id)
+        task_desc = reset_resp.observation.task_description
+        messages = [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user",   "content": f"Task: {task_desc}\n\nBegin."},
+        ]
+        print(f"\n  --- Session: {task_id} ---")
+        for turn in range(max_turns):
+            # 1. Ask the LLM
+            response = client_llm.chat.completions.create(
+                model="llama-3.3-70b-versatile",
+                messages=messages,
+                temperature=0.0,
+                max_tokens=512,
+            )
+            assistant_msg = response.choices[0].message.content.strip()
+            # 2. Parse action JSON (handle optional markdown fences)
+            try:
+                raw = assistant_msg
+                if raw.startswith("```"):
+                    raw = raw.split("```")[1]
+                    if raw.startswith("json"):
+                        raw = raw[4:]
+                action_data = json.loads(raw)
+                tool    = action_data["tool"]
+                command = action_data["command"]
+            except (json.JSONDecodeError, KeyError):
+                # Feed parse error back to LLM, do NOT count as a step
+                messages.append({"role": "assistant", "content": assistant_msg})
+                messages.append({
+                    "role": "user",
+                    "content": (
+                        'Invalid JSON. Reply with exactly one JSON object:\n'
+                        '{"tool": "sql" | "python", "command": "..."}'
+                    ),
+                })
+                continue
+            # 3. Execute the action via OpenEnv step()
+            step_resp = env.step(SqlSandboxAction(tool=tool, command=command))
+            reward = step_resp.reward or 0.0
+            done   = step_resp.done
+            output = step_resp.observation.output or ""
+            error  = step_resp.observation.error  or ""
+            final_reward = reward
+            print(f"  [Turn {turn+1:02d}] tool={tool:<6} | reward={reward:.4f} | done={done}")
+            if done:
+                break
+            # 4. Feed result back to LLM for the next turn
+            messages.append({"role": "assistant", "content": assistant_msg})
+            feedback = f"Output:\n{output[:1500]}"
+            if error:
+                feedback += f"\nError:\n{error[:500]}"
+            feedback += f"\nReward so far: {reward:.4f}"
+            messages.append({"role": "user", "content": feedback})
+    return final_reward
+# ---------------------------------------------------------------------------
+# Per-difficulty entry points (called by main, importable for custom use)
+# ---------------------------------------------------------------------------
+def easy_run(base_url: str, max_turns: int = 15) -> float:
+    print(f"\n{'='*50}\nRunning task: easy\n{'='*50}")
+    score = _run_task_agent(base_url, "easy", max_turns)
+    print(f"  Final score: {score:.4f}")
+    return score
+def med_run(base_url: str, max_turns: int = 15) -> float:
+    print(f"\n{'='*50}\nRunning task: medium\n{'='*50}")
+    score = _run_task_agent(base_url, "medium", max_turns)
+    print(f"  Final score: {score:.4f}")
+    return score
+def hard_run(base_url: str, max_turns: int = 15) -> float:
+    print(f"\n{'='*50}\nRunning task: hard\n{'='*50}")
+    score = _run_task_agent(base_url, "hard", max_turns)
+    print(f"  Final score: {score:.4f}")
+    return score
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+def main():
+    parser = argparse.ArgumentParser(
+        description="Groq baseline inference for the SQL/Data Cleaning Sandbox"
+    )
+    parser.add_argument(
+        "--url",
+        default="http://localhost:8000",
+        help="Base URL of the running environment server (default: http://localhost:8000)",
+    )
+    parser.add_argument(
+        "--max-turns",
+        type=int,
+        default=15,
+        help="Maximum agent turns per task (default: 15)",
+    )
+    args = parser.parse_args()
+    if "GROQ_API_KEY" not in os.environ:
+        print("ERROR: GROQ_API_KEY environment variable is not set.")
+        sys.exit(1)
+    results: dict[str, float] = {}
+    results["easy"]   = easy_run(args.url, args.max_turns)
+    results["medium"] = med_run(args.url,  args.max_turns)
+    results["hard"]   = hard_run(args.url, args.max_turns)
+    avg = sum(results.values()) / len(results)
+    print(f"\n{'='*50}")
+    print("RESULTS SUMMARY")
+    print(f"{'='*50}")
+    for task_id, score in results.items():
+        print(f"  {task_id:<10}: {score:.4f}")
+    print(f"  {'average':<10}: {avg:.4f}")
+if __name__ == "__main__":
+    main()

client.py ADDED Viewed

	@@ -0,0 +1,55 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""SQL Sandbox Environment Client."""
+from typing import Dict
+from openenv.core import EnvClient
+from openenv.core.client_types import StepResult
+from openenv.core.env_server.types import State
+from models import SqlSandboxAction, SqlSandboxObservation
+class SqlSandboxEnv(EnvClient[SqlSandboxAction, SqlSandboxObservation, State]):
+    """
+    Client for the SQL/Data Cleaning Sandbox.
+    Example:
+        >>> with SqlSandboxEnv(base_url="http://localhost:8000") as client:
+        ...     result = client.reset()
+        ...     print(result.observation.task_description)
+        ...     result = client.step(SqlSandboxAction(tool="sql", command="SELECT * FROM sales"))
+        ...     print(result.observation.output)
+    """
+    def _step_payload(self, action: SqlSandboxAction) -> Dict:
+        return {"tool": action.tool, "command": action.command}
+    def _parse_result(self, payload: Dict) -> StepResult[SqlSandboxObservation]:
+        obs_data = payload.get("observation", {})
+        observation = SqlSandboxObservation(
+            output=obs_data.get("output", ""),
+            error=obs_data.get("error"),
+            current_step=obs_data.get("current_step", 0),
+            max_steps=obs_data.get("max_steps", 20),
+            task_description=obs_data.get("task_description", ""),
+            done=payload.get("done", False),
+            reward=payload.get("reward"),
+            metadata=obs_data.get("metadata", {}),
+        )
+        return StepResult(
+            observation=observation,
+            reward=payload.get("reward"),
+            done=payload.get("done", False),
+        )
+    def _parse_state(self, payload: Dict) -> State:
+        return State(
+            episode_id=payload.get("episode_id"),
+            step_count=payload.get("step_count", 0),
+        )

inference.py ADDED Viewed

	@@ -0,0 +1,200 @@

+"""
+Baseline Inference Script for SQL/Data Cleaning Sandbox -- OpenAI Edition.
+Uses OpenAI (gpt-4o) to solve all three tasks and prints reproducible
+scores via the OpenEnv WebSocket client.
+Usage:
+    set HF_TOKEN=sk-...          # Windows
+    export HF_TOKEN=sk-...       # Linux/macOS
+    python inference.py                    # local server
+    python inference.py --url https://...  # remote server
+"""
+import argparse
+import json
+import os
+import sys
+from dotenv import load_dotenv
+load_dotenv()
+from openai import OpenAI
+from client import SqlSandboxEnv
+from models import SqlSandboxAction
+# ---------------------------------------------------------------------------
+# System prompt shared across all tasks
+# ---------------------------------------------------------------------------
+SYSTEM_PROMPT = """\
+You are a data engineering assistant working inside a SQLite sandbox.
+You can execute two types of actions:
+1. {"tool": "sql",    "command": "<SQL query>"}
+2. {"tool": "python", "command": "<Python code>"}
+Rules:
+- Respond with EXACTLY ONE JSON object per turn  no markdown, no explanation.
+- In Python code, the variables `conn` (sqlite3.Connection) and `cursor`
+  (sqlite3.Cursor) are already available. Do NOT call sqlite3.connect().
+- SQLite STRFTIME months are zero-padded: use '01' not '1', or use LIKE '2024-01-%'.
+- When you believe the task is fully complete, send:
+  {"tool": "sql", "command": "SELECT 'DONE'"}
+"""
+# ---------------------------------------------------------------------------
+# Core agent loop -- one task, one WebSocket session
+# ---------------------------------------------------------------------------
+def _run_task_agent(base_url: str, task_id: str, max_turns: int = 15) -> float:
+    """
+    Open a fresh WebSocket session, reset the environment to the given task,
+    then run an LLM agent loop until done or max_turns is reached.
+    Returns the final reward (0.0  1.0).
+    """
+    api_key = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY")
+    api_base_url = os.environ.get("API_BASE_URL")
+    model_name = os.environ.get("MODEL_NAME", "gpt-4o")
+    client_llm = OpenAI(
+        api_key=api_key,
+        base_url=api_base_url,
+    )
+    final_reward = 0.0
+    # Each task gets its own WebSocket session to avoid state leakage
+    with SqlSandboxEnv(base_url=base_url).sync() as env:
+        # reset() with task_id seeds the correct DB table for this task
+        reset_resp = env.reset(task_id=task_id)
+        task_desc = reset_resp.observation.task_description
+        messages = [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user",   "content": f"Task: {task_desc}\n\nBegin."},
+        ]
+        print(f"\n  --- Session: {task_id} ---")
+        for turn in range(max_turns):
+            # 1. Ask the LLM
+            response = client_llm.chat.completions.create(
+                model=model_name,
+                messages=messages,
+                temperature=0.0,
+                max_tokens=512,
+            )
+            assistant_msg = response.choices[0].message.content.strip()
+            # 2. Parse action JSON (handle optional markdown fences)
+            try:
+                raw = assistant_msg
+                if raw.startswith("```"):
+                    raw = raw.split("```")[1]
+                    if raw.startswith("json"):
+                        raw = raw[4:]
+                action_data = json.loads(raw)
+                tool    = action_data["tool"]
+                command = action_data["command"]
+            except (json.JSONDecodeError, KeyError):
+                # Feed parse error back to LLM, do NOT count as a step
+                messages.append({"role": "assistant", "content": assistant_msg})
+                messages.append({
+                    "role": "user",
+                    "content": (
+                        'Invalid JSON. Reply with exactly one JSON object:\n'
+                        '{"tool": "sql" | "python", "command": "..."}'
+                    ),
+                })
+                continue
+            # 3. Execute the action via OpenEnv step()
+            step_resp = env.step(SqlSandboxAction(tool=tool, command=command))
+            reward = step_resp.reward or 0.0
+            done   = step_resp.done
+            output = step_resp.observation.output or ""
+            error  = step_resp.observation.error  or ""
+            final_reward = reward
+            print(f"  [Turn {turn+1:02d}] tool={tool:<6} | reward={reward:.4f} | done={done}")
+            if done:
+                break
+            # 4. Feed result back to LLM for the next turn
+            messages.append({"role": "assistant", "content": assistant_msg})
+            feedback = f"Output:\n{output[:1500]}"
+            if error:
+                feedback += f"\nError:\n{error[:500]}"
+            feedback += f"\nReward so far: {reward:.4f}"
+            messages.append({"role": "user", "content": feedback})
+    return final_reward
+# ---------------------------------------------------------------------------
+# Per-difficulty entry points (called by main, importable for custom use)
+# ---------------------------------------------------------------------------
+def easy_run(base_url: str, max_turns: int = 15) -> float:
+    print(f"\n{'='*50}\nRunning task: easy\n{'='*50}")
+    score = _run_task_agent(base_url, "easy", max_turns)
+    print(f"  Final score: {score:.4f}")
+    return score
+def med_run(base_url: str, max_turns: int = 15) -> float:
+    print(f"\n{'='*50}\nRunning task: medium\n{'='*50}")
+    score = _run_task_agent(base_url, "medium", max_turns)
+    print(f"  Final score: {score:.4f}")
+    return score
+def hard_run(base_url: str, max_turns: int = 15) -> float:
+    print(f"\n{'='*50}\nRunning task: hard\n{'='*50}")
+    score = _run_task_agent(base_url, "hard", max_turns)
+    print(f"  Final score: {score:.4f}")
+    return score
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+def main():
+    parser = argparse.ArgumentParser(
+        description="OpenAI baseline inference for the SQL/Data Cleaning Sandbox"
+    )
+    parser.add_argument(
+        "--url",
+        default="http://localhost:8000",
+        help="Base URL of the running environment server (default: http://localhost:8000)",
+    )
+    parser.add_argument(
+        "--max-turns",
+        type=int,
+        default=15,
+        help="Maximum agent turns per task (default: 15)",
+    )
+    args = parser.parse_args()
+    if not os.environ.get("HF_TOKEN") and not os.environ.get("OPENAI_API_KEY"):
+        print("ERROR: HF_TOKEN (or OPENAI_API_KEY) environment variable is not set per checklist.")
+        sys.exit(1)
+    results: dict[str, float] = {}
+    results["easy"]   = easy_run(args.url, args.max_turns)
+    results["medium"] = med_run(args.url,  args.max_turns)
+    results["hard"]   = hard_run(args.url, args.max_turns)
+    avg = sum(results.values()) / len(results)
+    print(f"\n{'='*50}")
+    print("RESULTS SUMMARY")
+    print(f"{'='*50}")
+    for task_id, score in results.items():
+        print(f"  {task_id:<10}: {score:.4f}")
+    print(f"  {'average':<10}: {avg:.4f}")
+if __name__ == "__main__":
+    main()

models.py ADDED Viewed

	@@ -0,0 +1,38 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Data models for the SQL/Data Cleaning Sandbox Environment.
+Agents interact by sending SQL queries or Python snippets to clean
+messy databases and generate reports.
+"""
+from typing import Literal, Optional
+from openenv.core.env_server.types import Action, Observation
+from pydantic import Field
+class SqlSandboxAction(Action):
+    """Action for the SQL Sandbox -- run a SQL query or Python snippet."""
+    # Selects how `command` is interpreted; only the two literals are accepted.
+    tool: Literal["sql", "python"] = Field(
+        ..., description="Tool to use: 'sql' for SQLite queries, 'python' for Python scripts"
+    )
+    # Required: the text handed to the selected tool.
+    command: str = Field(
+        ..., description="The SQL query or Python code to execute"
+    )
+class SqlSandboxObservation(Observation):
+    """Observation returned after each step; every field has a default."""
+    # Text produced by the executed command (empty string when there is none).
+    output: str = Field(default="", description="stdout / query result")
+    # None when the command produced no error text.
+    error: Optional[str] = Field(default=None, description="stderr or error message")
+    current_step: int = Field(default=0, description="Current step number")
+    max_steps: int = Field(default=20, description="Maximum allowed steps")
+    task_description: str = Field(default="", description="Current task description")

openenv.yaml ADDED Viewed

	@@ -0,0 +1,50 @@

+spec_version: 1
+name: sql_sandbox
+type: space
+runtime: fastapi
+app: server.app:app
+port: 8000
+description: >
+  SQL/Data Cleaning Sandbox - a real-world OpenEnv environment where AI agents
+  clean messy databases via SQL and Python. Three tasks from easy to hard with
+  partial-progress grading (0.0-1.0).
+reward_range: [0.0, 1.0]
+tasks:
+  - id: easy
+    name: Data Triage
+    difficulty: easy
+    description: Find the total revenue from sales for January 2024.
+  - id: medium
+    name: Data Cleaning
+    difficulty: medium
+    description: Fix duplicate emails, null ages, and case inconsistencies in the users table.
+  - id: hard
+    name: Schema Migration
+    difficulty: hard
+    description: Normalize a flat orders table into customers + orders with foreign keys.
+action_space:
+  type: object
+  properties:
+    tool:
+      type: string
+      enum: [sql, python]
+    command:
+      type: string
+observation_space:
+  type: object
+  properties:
+    output:
+      type: string
+    error:
+      type: string
+    current_step:
+      type: integer
+    max_steps:
+      type: integer
+    task_description:
+      type: string

pyproject.toml ADDED Viewed

	@@ -0,0 +1,32 @@

+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "openenv-sql-sandbox"
+version = "0.1.0"
+description = "SQL/Data Cleaning Sandbox - An OpenEnv environment for agentic data engineering evaluation"
+requires-python = ">=3.10"
+dependencies = [
+    "openenv-core[core]>=0.2.2",
+    "requests>=2.31.0",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.0.0",
+]
+inference = [
+    "openai>=1.0.0",
+    "requests>=2.31.0",
+    "groq>=0.4.0",
+]
+[project.scripts]
+server = "sql_sandbox.server.app:main"
+[tool.setuptools]
+include-package-data = true
+packages = ["sql_sandbox", "sql_sandbox.server"]
+package-dir = { "sql_sandbox" = ".", "sql_sandbox.server" = "server" }

server/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Sql Sandbox environment server components."""
+from .environment import SqlSandboxEnvironment
+__all__ = ["SqlSandboxEnvironment"]

server/app.py ADDED Viewed

	@@ -0,0 +1,76 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+FastAPI application for the Sql Sandbox Environment.
+This module creates an HTTP server that exposes the SqlSandboxEnvironment
+over HTTP and WebSocket endpoints, compatible with EnvClient.
+Endpoints:
+    - POST /reset: Reset the environment
+    - POST /step: Execute an action
+    - GET /state: Get current environment state
+    - GET /schema: Get action/observation schemas
+    - WS /ws: WebSocket endpoint for persistent sessions
+Usage:
+    # Development (with auto-reload):
+    uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
+    # Production:
+    uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
+    # Or run directly:
+    python -m server.app
+"""
+try:
+    from openenv.core.env_server.http_server import create_app
+except Exception as e:  # pragma: no cover
+    raise ImportError(
+        "openenv is required for the web interface. Install dependencies with '\n    uv sync\n'"
+    ) from e
+try:
+    from ..models import SqlSandboxAction, SqlSandboxObservation
+    from .environment import SqlSandboxEnvironment
+except (ImportError, ModuleNotFoundError):
+    from models import SqlSandboxAction, SqlSandboxObservation
+    from server.environment import SqlSandboxEnvironment
+# Create the app with web interface and README integration
+app = create_app(
+    SqlSandboxEnvironment,
+    SqlSandboxAction,
+    SqlSandboxObservation,
+    env_name="sql_sandbox",
+    max_concurrent_envs=10,  # increase this number to allow more concurrent WebSocket sessions
+)
+import os
+# Stores the task id from the URL path in the TASK_ID environment variable of
+# this server process and echoes it back to the caller.
+# NOTE(review): task_id is stored without validation, and os.environ is shared
+# by every session in the process -- presumably the environment reads TASK_ID
+# when it resets; confirm, and confirm this is safe with concurrent sessions.
+@app.post("/set_task/{task_id}")
+def set_task(task_id: str):
+    os.environ["TASK_ID"] = task_id
+    return {"status": "ok", "task_id": task_id}
+def main():
+    """
+    Entry point for direct execution via uv run or python -m.
+    Starts uvicorn serving ``app`` on all interfaces, port 8000.
+    This function enables running the server without Docker:
+        uv run --project . server
+        python -m sql_sandbox.server.app
+    """
+    # Imported here so uvicorn is only needed when the server is run directly.
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
+if __name__ == "__main__":
+    main()

server/environment.py ADDED Viewed

	@@ -0,0 +1,397 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+SQL/Data Cleaning Sandbox Environment Implementation.
+Three tasks (easy -> medium -> hard) for AI agents:
+  1. Data Triage -- query revenue from sales data
+  2. Data Cleaning -- fix duplicates & nulls in a users table
+  3. Schema Migration -- normalize a flat table into two related tables
+"""
+import io
+import os
+import sqlite3
+import sys
+import tempfile
+import traceback
+from contextlib import redirect_stderr, redirect_stdout
+from uuid import uuid4
+from openenv.core.env_server.interfaces import Environment
+from openenv.core.env_server.types import State
+try:
+    from ..models import SqlSandboxAction, SqlSandboxObservation
+except ImportError:
+    from models import SqlSandboxAction, SqlSandboxObservation
+# ---------------------------------------------------------------------------
+# Task definitions
+# ---------------------------------------------------------------------------
+TASKS = {
+    "easy": {
+        "id": "easy",
+        "description": (
+            "Find the total revenue from the 'sales' table for January 2024. "
+            "The table has columns: id, product, amount, sale_date (YYYY-MM-DD). "
+            "Return the exact total as a single number by running a SQL query. "
+            "The expected result should be a SELECT query that returns one number."
+        ),
+        "max_steps": 10,
+    },
+    "medium": {
+        "id": "medium",
+        "description": (
+            "The 'users' table has duplicate emails and NULL values in the 'age' column. "
+            "Clean the data so that: (1) all emails are lowercase, "
+            "(2) duplicate emails are removed (keep the row with the lowest id), "
+            "(3) all NULL ages are replaced with 0. "
+            "Use SQL or Python to fix the table in-place."
+        ),
+        "max_steps": 15,
+    },
+    "hard": {
+        "id": "hard",
+        "description": (
+            "The 'flat_orders' table has columns: order_id, order_date, "
+            "customer_name, customer_email, product, quantity, price. "
+            "Normalize this into two tables: 'customers' (id INTEGER PRIMARY KEY, "
+            "name TEXT, email TEXT UNIQUE) and 'orders' (id INTEGER PRIMARY KEY, "
+            "customer_id INTEGER REFERENCES customers(id), order_date TEXT, "
+            "product TEXT, quantity INTEGER, price REAL). "
+            "Maintain foreign key integrity and migrate all data."
+        ),
+        "max_steps": 20,
+    },
+}
+# ---------------------------------------------------------------------------
+# Seed data generators
+# ---------------------------------------------------------------------------
+def _seed_easy(conn: sqlite3.Connection):
+    """Create sales table with known data."""
+    conn.execute("DROP TABLE IF EXISTS sales")
+    conn.execute(
+        "CREATE TABLE sales (id INTEGER PRIMARY KEY, product TEXT, amount REAL, sale_date TEXT)"
+    )
+    rows = [
+        (1, "Widget A", 150.00, "2024-01-05"),
+        (2, "Widget B", 250.50, "2024-01-12"),
+        (3, "Widget C", 99.99, "2024-01-20"),
+        (4, "Widget A", 150.00, "2024-01-28"),
+        (5, "Widget D", 349.51, "2024-01-15"),
+        (6, "Widget A", 200.00, "2024-02-03"),
+        (7, "Widget B", 75.00, "2023-12-30"),
+    ]
+    conn.executemany("INSERT INTO sales VALUES (?,?,?,?)", rows)
+    conn.commit()
+def _seed_medium(conn: sqlite3.Connection):
+    """Create users table with messy data."""
+    conn.execute("DROP TABLE IF EXISTS users")
+    conn.execute(
+        "CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT, email TEXT, age INTEGER)"
+    )
+    rows = [
+        (1, "Alice", "Alice@Example.com", 30),
+        (2, "Bob", "bob@example.com", None),
+        (3, "Charlie", "charlie@test.com", 25),
+        (4, "Alice Dup", "alice@example.com", 28),
+        (5, "Dave", "DAVE@Test.COM", None),
+        (6, "Eve", "eve@example.com", 35),
+        (7, "Dave Dup", "dave@test.com", 40),
+        (8, "Frank", "frank@example.com", None),
+    ]
+    conn.executemany("INSERT INTO users VALUES (?,?,?,?)", rows)
+    conn.commit()
+def _seed_hard(conn: sqlite3.Connection):
+    """Create flat_orders table."""
+    conn.execute("DROP TABLE IF EXISTS flat_orders")
+    conn.execute("DROP TABLE IF EXISTS customers")
+    conn.execute("DROP TABLE IF EXISTS orders")
+    conn.execute(
+        "CREATE TABLE flat_orders ("
+        "order_id INTEGER, order_date TEXT, customer_name TEXT, "
+        "customer_email TEXT, product TEXT, quantity INTEGER, price REAL)"
+    )
+    rows = [
+        (1, "2024-01-10", "Alice", "alice@example.com", "Laptop", 1, 999.99),
+        (2, "2024-01-11", "Bob", "bob@example.com", "Mouse", 2, 25.50),
+        (3, "2024-01-12", "Alice", "alice@example.com", "Keyboard", 1, 75.00),
+        (4, "2024-01-13", "Charlie", "charlie@example.com", "Monitor", 1, 300.00),
+        (5, "2024-01-14", "Bob", "bob@example.com", "Webcam", 1, 50.00),
+        (6, "2024-01-15", "Diana", "diana@example.com", "USB Hub", 3, 15.99),
+    ]
+    conn.executemany("INSERT INTO flat_orders VALUES (?,?,?,?,?,?,?)", rows)
+    conn.commit()
+SEED_FNS = {"easy": _seed_easy, "medium": _seed_medium, "hard": _seed_hard}
+# ---------------------------------------------------------------------------
+# Graders
+# ---------------------------------------------------------------------------
+EASY_EXPECTED = 1000.00  # 150 + 250.5 + 99.99 + 150 + 349.51
+def grade_easy(conn: sqlite3.Connection, last_output: str) -> float:
+    """Check if agent returned correct total revenue for Jan 2024."""
+    if not last_output:
+        return 0.0
+    # We inspect the agent's query execution result to see if 1000.0 is present.
+    try:
+        # Convert output strings to simple float checks.
+        import re
+        numbers = re.findall(r"[-+]?\d*\.\d+|\d+", last_output)
+        for num in numbers:
+            if abs(float(num) - EASY_EXPECTED) < 0.01:
+                return 1.0
+    except Exception:
+        pass
+    return 0.0
+def grade_medium(conn: sqlite3.Connection, last_output: str) -> float:
+    """Check cleaning quality: no duplicates, no nulls, lowercase emails."""
+    score = 0.0
+    try:
+        # Check table exists
+        cur = conn.execute("SELECT COUNT(*) FROM users")
+        total = cur.fetchone()[0]
+        if total == 0:
+            return 0.0
+        # Check lowercase emails (0.3)
+        cur = conn.execute("SELECT COUNT(*) FROM users WHERE email != LOWER(email)")
+        upper_count = cur.fetchone()[0]
+        if upper_count == 0:
+            score += 0.3
+        # Check no duplicate emails (0.4)
+        cur = conn.execute(
+            "SELECT COUNT(*) FROM (SELECT LOWER(email) as e FROM users GROUP BY e HAVING COUNT(*) > 1)"
+        )
+        dup_count = cur.fetchone()[0]
+        if dup_count == 0:
+            score += 0.4
+        # Check no NULL ages (0.3)
+        cur = conn.execute("SELECT COUNT(*) FROM users WHERE age IS NULL")
+        null_count = cur.fetchone()[0]
+        if null_count == 0:
+            score += 0.3
+    except Exception:
+        pass
+    return round(score, 2)
+def grade_hard(conn: sqlite3.Connection, last_output: str) -> float:
+    """Verify normalized schema and data integrity."""
+    score = 0.0
+    try:
+        # Check 'customers' table exists with correct columns (0.2)
+        cur = conn.execute("PRAGMA table_info(customers)")
+        cols = {r[1] for r in cur.fetchall()}
+        if {"id", "name", "email"}.issubset(cols):
+            score += 0.2
+        # Check 'orders' table exists with correct columns (0.2)
+        cur = conn.execute("PRAGMA table_info(orders)")
+        cols = {r[1] for r in cur.fetchall()}
+        if {"id", "customer_id", "order_date", "product", "quantity", "price"}.issubset(cols):
+            score += 0.2
+        # Check customer count = 4 unique customers (0.2)
+        cur = conn.execute("SELECT COUNT(*) FROM customers")
+        if cur.fetchone()[0] == 4:
+            score += 0.2
+        # Check orders count = 6 (0.2)
+        cur = conn.execute("SELECT COUNT(*) FROM orders")
+        if cur.fetchone()[0] == 6:
+            score += 0.2
+        # Check FK integrity: all customer_ids in orders exist in customers (0.2)
+        cur = conn.execute(
+            "SELECT COUNT(*) FROM orders WHERE customer_id NOT IN (SELECT id FROM customers)"
+        )
+        if cur.fetchone()[0] == 0:
+            score += 0.2
+    except Exception:
+        pass
+    return round(score, 2)
+GRADERS = {"easy": grade_easy, "medium": grade_medium, "hard": grade_hard}
+# ---------------------------------------------------------------------------
+# Environment
+# ---------------------------------------------------------------------------
+class SqlSandboxEnvironment(Environment):
+    """
+    SQL / Data Cleaning Sandbox  a real-world OpenEnv environment.
+    The agent sends SQL or Python commands to clean messy databases.
+    Partial progress rewards are given after each step.
+    """
+    SUPPORTS_CONCURRENT_SESSIONS: bool = True
+    def __init__(self):
+        self._state = State(episode_id=str(uuid4()), step_count=0)
+        self._db_path = os.path.join(tempfile.gettempdir(), f"sqlsandbox_{uuid4().hex[:8]}.db")
+        self._conn: sqlite3.Connection | None = None
+        self._task_id = os.environ.get("TASK_ID", "easy")
+        self._task = TASKS[self._task_id]
+        self._max_steps = self._task["max_steps"]
+        self._done = False
+        self._last_reward = 0.0
+    # ---- helpers -----------------------------------------------------------
+    def _get_conn(self) -> sqlite3.Connection:
+        if self._conn is None:
+            self._conn = sqlite3.connect(self._db_path)
+            self._conn.execute("PRAGMA foreign_keys = ON")
+        return self._conn
+    def _partial_reward(self, last_output: str) -> float:
+        """Run the grader to compute partial progress."""
+        return GRADERS[self._task_id](self._get_conn(), last_output)
+    def _exec_sql(self, query: str) -> tuple[str, str | None]:
+        try:
+            conn = self._get_conn()
+            cur = conn.execute(query)
+            if cur.description:
+                cols = [d[0] for d in cur.description]
+                rows = cur.fetchall()
+                header = " | ".join(cols)
+                body = "\n".join(" | ".join(str(c) for c in r) for r in rows)
+                output = f"{header}\n{body}" if rows else header + "\n(no rows)"
+            else:
+                output = f"OK  {conn.total_changes} row(s) affected"
+            conn.commit()
+            return output, None
+        except Exception as e:
+            return "", str(e)
+    def _exec_python(self, code: str) -> tuple[str, str | None]:
+        stdout_buf, stderr_buf = io.StringIO(), io.StringIO()
+        try:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+            globs = {
+                "__builtins__": __builtins__,
+                "sqlite3": sqlite3,
+                "DB_PATH": self._db_path,
+                "conn": conn,
+                "cursor": cursor,
+            }
+            with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf):
+                exec(code, globs)
+            # Automatically commit any schema changes the LLM's python code made
+            conn.commit()
+            out = stdout_buf.getvalue()
+            err = stderr_buf.getvalue() or None
+            return out, err
+        except Exception:
+            return stdout_buf.getvalue(), traceback.format_exc()
+    # ---- OpenEnv interface -------------------------------------------------
+    def reset(self, **kwargs) -> SqlSandboxObservation:
+        """Resets the environment and forces a task switch if task_id is provided."""
+        # 1. Close current connection to ensure file handles are released
+        if self._conn:
+            self._conn.close()
+            self._conn = None
+        # 2. Update task context from kwargs (primary) or environment (fallback)
+        # This is the fix for the 'Easy task persistence' bug.
+        self._task_id = kwargs.get("task_id", os.environ.get("TASK_ID", "easy"))
+        self._task = TASKS[self._task_id]
+        self._max_steps = self._task["max_steps"]
+        # 3. Re-initialize episode state
+        self._state = State(episode_id=str(uuid4()), step_count=0)
+        self._done = False
+        self._last_reward = 0.0
+        # 4. Open fresh connection and re-seed for the specific task_id
+        # Seed functions use 'DROP TABLE IF EXISTS' which handles cleanup.
+        conn = self._get_conn()
+        SEED_FNS[self._task_id](conn)
+        return SqlSandboxObservation(
+            output=f"Environment ready. Task: {self._task['description']}",
+            error=None,
+            current_step=0,
+            max_steps=self._max_steps,
+            task_description=self._task["description"],
+            done=False,
+            reward=0.0,
+        )
+    def step(self, action: SqlSandboxAction) -> SqlSandboxObservation:  # type: ignore[override]
+        self._state.step_count += 1
+        step = self._state.step_count
+        if self._done:
+            return SqlSandboxObservation(
+                output="Episode already finished. Call reset().",
+                error=None,
+                current_step=step,
+                max_steps=self._max_steps,
+                task_description=self._task["description"],
+                done=True,
+                reward=self._last_reward,
+            )
+        # Execute action
+        if action.tool == "sql":
+            output, error = self._exec_sql(action.command)
+        else:
+            output, error = self._exec_python(action.command)
+        # Compute partial reward
+        reward = self._partial_reward(output)
+        # Check termination
+        done = step >= self._max_steps or reward >= 1.0
+        if done:
+            self._done = True
+        self._last_reward = reward
+        # Small penalty for errors to discourage random guessing
+        if error:
+            reward = max(0.0, reward - 0.05)
+        return SqlSandboxObservation(
+            output=output[:4000],  # cap output size
+            error=error[:2000] if error else None,
+            current_step=step,
+            max_steps=self._max_steps,
+            task_description=self._task["description"],
+            done=done,
+            reward=round(reward, 4),
+        )
+    @property
+    def state(self) -> State:
+        return self._state

server/requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ openenv-core[core]>=0.2.2
2	+ requests>=2.31.0

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff