Spaces:
Sleeping
Sleeping
Env code
#1
by Mohammed-Altaf - opened
- .env.example +0 -21
- .gitattributes +35 -0
- .gitignore +0 -13
- .python-version +0 -1
- Dockerfile +0 -32
- README.md +0 -176
- __init__.py +0 -15
- baseline.py +0 -163
- client.py +0 -56
- datasets/sales.csv +0 -0
- datasets/store_data.db +0 -0
- helpers/__init__.py +0 -0
- helpers/constants.py +0 -9
- helpers/logging.py +0 -44
- helpers/prompts.py +0 -27
- helpers/response_parser.py +0 -181
- inference.py +0 -172
- models.py +0 -52
- openenv.yaml +0 -21
- pyproject.toml +0 -25
- server/Dockerfile +0 -58
- server/__init__.py +0 -0
- server/app.py +0 -15
- server/data_analysis_env.py +0 -296
- tasks/__init__.py +0 -29
- tasks/base_task.py +0 -51
- tasks/task_easy.py +0 -53
- tasks/task_hard.py +0 -103
- tasks/task_hard_2.py +0 -103
- tasks/task_hard_3.py +0 -107
- tasks/task_medium.py +0 -77
- tasks/task_medium_2.py +0 -88
- uv.lock +0 -0
.env.example
DELETED
|
@@ -1,21 +0,0 @@
|
|
| 1 |
-
# Copy this file to .env and fill in your values.
|
| 2 |
-
# .env is gitignored — never commit actual keys.
|
| 3 |
-
#
|
| 4 |
-
# Usage:
|
| 5 |
-
# cp .env.example .env
|
| 6 |
-
# # edit .env with your values
|
| 7 |
-
# uv run python inference.py
|
| 8 |
-
|
| 9 |
-
# OpenAI-compatible LLM API endpoint
|
| 10 |
-
API_BASE_URL=https://router.huggingface.co/v1
|
| 11 |
-
|
| 12 |
-
# Model identifier
|
| 13 |
-
MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
|
| 14 |
-
|
| 15 |
-
# API key (Hugging Face token or other provider key)
|
| 16 |
-
HF_TOKEN=hf_...
|
| 17 |
-
|
| 18 |
-
# (Optional) Override the environment server URL
|
| 19 |
-
# Default is the deployed HF Space: https://mohammed-altaf-dataanalysis-env.hf.space
|
| 20 |
-
# Override for local testing:
|
| 21 |
-
# ENV_SERVER_URL=http://localhost:8000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
DELETED
|
@@ -1,13 +0,0 @@
|
|
| 1 |
-
__pycache__/
|
| 2 |
-
*.py[oc]
|
| 3 |
-
build/
|
| 4 |
-
dist/
|
| 5 |
-
wheels/
|
| 6 |
-
*.egg-info
|
| 7 |
-
.venv
|
| 8 |
-
OpenEnv/
|
| 9 |
-
*.ipynb
|
| 10 |
-
personal/
|
| 11 |
-
.env
|
| 12 |
-
CLAUDE.md
|
| 13 |
-
.claude
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.python-version
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
3.13
|
|
|
|
|
|
Dockerfile
DELETED
|
@@ -1,32 +0,0 @@
|
|
| 1 |
-
ARG BASE_IMAGE=python:3.13-slim
|
| 2 |
-
FROM ${BASE_IMAGE}
|
| 3 |
-
|
| 4 |
-
WORKDIR /app
|
| 5 |
-
|
| 6 |
-
# Install uv
|
| 7 |
-
RUN pip install uv --no-cache-dir
|
| 8 |
-
|
| 9 |
-
# Copy project files
|
| 10 |
-
COPY pyproject.toml uv.lock* ./
|
| 11 |
-
COPY models.py client.py __init__.py baseline.py inference.py openenv.yaml ./
|
| 12 |
-
COPY server/ ./server/
|
| 13 |
-
COPY tasks/ ./tasks/
|
| 14 |
-
COPY datasets/ ./datasets/
|
| 15 |
-
COPY helpers/ ./helpers/
|
| 16 |
-
|
| 17 |
-
# Install dependencies into the uv-managed venv
|
| 18 |
-
RUN uv sync --frozen --no-dev
|
| 19 |
-
|
| 20 |
-
# Make the venv's python/pip the default so `python inference.py` works
|
| 21 |
-
# without needing `uv run` as a prefix
|
| 22 |
-
ENV PATH="/app/.venv/bin:$PATH"
|
| 23 |
-
|
| 24 |
-
# Ensure local modules (client, models, helpers, tasks) are always importable
|
| 25 |
-
# regardless of the working directory the evaluator uses
|
| 26 |
-
ENV PYTHONPATH="/app:$PYTHONPATH"
|
| 27 |
-
|
| 28 |
-
# HF Spaces runs containers as a non-root user on port 7860
|
| 29 |
-
ENV PORT=7860
|
| 30 |
-
EXPOSE 7860
|
| 31 |
-
|
| 32 |
-
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
DELETED
|
@@ -1,176 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Data Analysis Agent Environment
|
| 3 |
-
emoji: 📊
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: indigo
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
---
|
| 9 |
-
|
| 10 |
-
# Data Analysis Agent Environment
|
| 11 |
-
|
| 12 |
-
An OpenEnv-compliant RL environment for training and evaluating data analysis agents. Agents execute pandas code against a business dataset to answer analytical questions, graded by deterministic programmatic graders.
|
| 13 |
-
|
| 14 |
-
## Motivation
|
| 15 |
-
|
| 16 |
-
Data analysis is a universal real-world task. Every business needs analysts who can query datasets, compute metrics, and extract insights. This environment lets RL agents practice that exact workflow — explore a dataset with code, then submit a precise answer — with automatic scoring.
|
| 17 |
-
|
| 18 |
-
## Action & Observation Spaces
|
| 19 |
-
|
| 20 |
-
### Action (`DataAction`)
|
| 21 |
-
|
| 22 |
-
| Field | Type | Description |
|
| 23 |
-
|---|---|---|
|
| 24 |
-
| `action_type` | `"execute_code"` or `"submit_answer"` | What the agent wants to do |
|
| 25 |
-
| `code` | `str` (optional) | Python/pandas code to execute |
|
| 26 |
-
| `answer` | `str` (optional) | Final answer to submit for grading |
|
| 27 |
-
|
| 28 |
-
### Observation (`DataObservation`)
|
| 29 |
-
|
| 30 |
-
| Field | Type | Description |
|
| 31 |
-
|---|---|---|
|
| 32 |
-
| `output` | `str` | Stdout from code execution or environment messages |
|
| 33 |
-
| `success` | `bool` | Whether the action succeeded |
|
| 34 |
-
| `error` | `str` (optional) | Error message if action failed |
|
| 35 |
-
| `task_description` | `str` | The question to answer (set on reset) |
|
| 36 |
-
| `dataset_info` | `str` | Dataset schema summary (set on reset) |
|
| 37 |
-
| `done` | `bool` | Whether the episode is over |
|
| 38 |
-
| `reward` | `float` | Step reward |
|
| 39 |
-
|
| 40 |
-
### State (`DataState`)
|
| 41 |
-
|
| 42 |
-
| Field | Type | Description |
|
| 43 |
-
|---|---|---|
|
| 44 |
-
| `episode_id` | `str` | Unique episode identifier |
|
| 45 |
-
| `step_count` | `int` | Current step number |
|
| 46 |
-
| `task_id` | `int` | Active task (1–6) |
|
| 47 |
-
| `answer_submitted` | `bool` | Whether final answer was submitted |
|
| 48 |
-
| `final_score` | `float` | Graded score after submission |
|
| 49 |
-
|
| 50 |
-
## Tasks
|
| 51 |
-
|
| 52 |
-
Tasks use two data sources:
|
| 53 |
-
- **`df`** — synthetic e-commerce sales CSV (~2000 orders): `order_id`, `customer_id`, `product_name`, `category`, `quantity`, `unit_price`, `total_price`, `order_date`, `city`, `country`
|
| 54 |
-
- **SQLite DB** (`store_data.db`) — additional tables for cross-source tasks: `customer_profiles` (300 rows), `product_catalog` (25 rows)
|
| 55 |
-
|
| 56 |
-
### Task 1 — Easy: Top Revenue Category
|
| 57 |
-
- **Question**: What is the top-selling product category by total revenue?
|
| 58 |
-
- **Grading**: Containment match (case-insensitive) → 1.0 or 0.0
|
| 59 |
-
- **Expected difficulty**: Single groupby + sum + argmax
|
| 60 |
-
|
| 61 |
-
### Task 2 — Medium: City Revenue Share
|
| 62 |
-
- **Question**: Which city generates the most revenue? What percentage of total revenue does it represent?
|
| 63 |
-
- **Grading**: 0.5 for correct city + 0.5 for percentage within ±0.1%
|
| 64 |
-
- **Expected difficulty**: Groupby + percentage calculation + formatting
|
| 65 |
-
|
| 66 |
-
### Task 3 — Medium: Repeat Customer Cohort Analysis
|
| 67 |
-
- **Question**: How many unique customers ordered in both January and December? Compare their average order value to all other customers.
|
| 68 |
-
- **Grading**: 0.33 per correct field (count, cohort AOV, other AOV)
|
| 69 |
-
- **Expected difficulty**: Temporal filtering, set intersection, conditional aggregation
|
| 70 |
-
|
| 71 |
-
### Task 4 — Hard: Monthly Revenue Ratio
|
| 72 |
-
- **Question**: Which month had the highest vs. lowest total revenue? What is the ratio between them?
|
| 73 |
-
- **Grading**: 0.33 for best month + 0.33 for worst month + 0.34 for ratio within ±0.01
|
| 74 |
-
- **Expected difficulty**: Monthly resample/groupby, min/max comparison, ratio formatting
|
| 75 |
-
|
| 76 |
-
### Task 5 — Hard: Customer Loyalty Tier Revenue (cross-source)
|
| 77 |
-
- **Question**: Which customer loyalty tier generates the highest total revenue and what percentage does it represent?
|
| 78 |
-
- **Data**: Requires joining `df` with `customer_profiles` table from SQLite on `customer_id`
|
| 79 |
-
- **Grading**: 0.33 for tier name + 0.33 for revenue within ±0.5% + 0.34 for percentage within ±0.1
|
| 80 |
-
- **Expected difficulty**: SQLite query → pandas merge → groupby aggregation
|
| 81 |
-
|
| 82 |
-
### Task 6 — Hard: Supplier Profitability (cross-source)
|
| 83 |
-
- **Question**: Which supplier has the highest total profit? What is their average profit margin?
|
| 84 |
-
- **Data**: Requires joining `df` with `product_catalog` table from SQLite on `product_name`
|
| 85 |
-
- **Grading**: 0.33 for supplier name + 0.34 for total profit within ±0.5% + 0.33 for avg margin within ±0.1
|
| 86 |
-
- **Expected difficulty**: SQLite query → pandas merge → per-order profit/margin calculation → group aggregation
|
| 87 |
-
|
| 88 |
-
## Reward Function
|
| 89 |
-
|
| 90 |
-
| Event | Reward |
|
| 91 |
-
|---|---|
|
| 92 |
-
| Successful code execution | +0.05 |
|
| 93 |
-
| Code execution error | -0.05 |
|
| 94 |
-
| Final answer (graded) | 0.0 — 1.0 based on task grader |
|
| 95 |
-
| Max steps (20) exceeded | 0.0 |
|
| 96 |
-
|
| 97 |
-
## Setup & Usage
|
| 98 |
-
|
| 99 |
-
### Prerequisites
|
| 100 |
-
- Python 3.13+
|
| 101 |
-
- [uv](https://docs.astral.sh/uv/) package manager
|
| 102 |
-
|
| 103 |
-
### Install
|
| 104 |
-
```bash
|
| 105 |
-
uv sync
|
| 106 |
-
```
|
| 107 |
-
|
| 108 |
-
### Run the server
|
| 109 |
-
```bash
|
| 110 |
-
uv run uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 111 |
-
```
|
| 112 |
-
|
| 113 |
-
### Run the inference
|
| 114 |
-
- First export all the required env variables mentioned in the .env.example. Then run below command
|
| 115 |
-
```bash
|
| 116 |
-
uv run python inference.py
|
| 117 |
-
```
|
| 118 |
-
|
| 119 |
-
### Run the baseline
|
| 120 |
-
```bash
|
| 121 |
-
OPENAI_API_KEY=sk-... uv run python baseline.py
|
| 122 |
-
# Against a deployed HF Space:
|
| 123 |
-
OPENAI_API_KEY=sk-... uv run python baseline.py --base-url https://<your-username>-<space-name>.hf.space
|
| 124 |
-
```
|
| 125 |
-
|
| 126 |
-
### Docker (local)
|
| 127 |
-
```bash
|
| 128 |
-
docker build -t data-analysis-env .
|
| 129 |
-
docker run -p 7860:7860 data-analysis-env
|
| 130 |
-
```
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
### Client usage (Python)
|
| 134 |
-
```python
|
| 135 |
-
from client import DataAnalysisClient
|
| 136 |
-
from models import DataAction
|
| 137 |
-
|
| 138 |
-
# Async
|
| 139 |
-
async with DataAnalysisClient(base_url="http://localhost:8000") as client:
|
| 140 |
-
result = await client.reset(task_id=1)
|
| 141 |
-
result = await client.step(DataAction(action_type="execute_code", code="print(df.head())"))
|
| 142 |
-
result = await client.step(DataAction(action_type="submit_answer", answer="Electronics"))
|
| 143 |
-
|
| 144 |
-
# Sync
|
| 145 |
-
with DataAnalysisClient(base_url="http://localhost:8000").sync() as client:
|
| 146 |
-
result = client.reset(task_id=2)
|
| 147 |
-
result = client.step(DataAction(action_type="execute_code", code="print(df.groupby('city')['total_price'].sum())"))
|
| 148 |
-
```
|
| 149 |
-
|
| 150 |
-
## Project Structure
|
| 151 |
-
|
| 152 |
-
```
|
| 153 |
-
├── models.py # DataAction, DataObservation, DataState
|
| 154 |
-
├── client.py # DataAnalysisClient (EnvClient subclass)
|
| 155 |
-
├── inference.py # HF inference script (uses HF Inference API)
|
| 156 |
-
├── baseline.py # OpenAI baseline inference script
|
| 157 |
-
├── helpers/
|
| 158 |
-
│ └── response_parser.py # Robust LLM JSON response parser
|
| 159 |
-
├── tasks/
|
| 160 |
-
│ ├── base_task.py # Task ABC with grade() interface
|
| 161 |
-
│ ├── task_easy.py # Task 1 (Easy): Top revenue category
|
| 162 |
-
│ ├── task_medium.py # Task 2 (Medium): City revenue share
|
| 163 |
-
│ ├── task_medium_2.py # Task 4 (Hard): Monthly revenue ratio
|
| 164 |
-
│ ├── task_hard.py # Task 3 (Medium): Repeat customer cohort
|
| 165 |
-
│ ├── task_hard_2.py # Task 5 (Hard): Customer loyalty tier revenue
|
| 166 |
-
│ └── task_hard_3.py # Task 6 (Hard): Supplier profitability
|
| 167 |
-
├── datasets/
|
| 168 |
-
│ ├── sales.csv # Synthetic e-commerce sales dataset
|
| 169 |
-
│ └── store_data.db # SQLite DB: customer_profiles, product_catalog
|
| 170 |
-
├── server/
|
| 171 |
-
│ ├── app.py # FastAPI app entry point
|
| 172 |
-
│ └── data_analysis_env.py # Environment implementation
|
| 173 |
-
├── Dockerfile # HF Spaces Docker build (port 7860)
|
| 174 |
-
├── openenv.yaml # OpenEnv spec metadata
|
| 175 |
-
└── pyproject.toml # Dependencies and project config
|
| 176 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__init__.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
"""Data Analysis Agent Environment for OpenEnv.
|
| 2 |
-
|
| 3 |
-
An RL environment where agents execute pandas code against a business
|
| 4 |
-
dataset to answer analytical questions with programmatic grading.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
from client import DataAnalysisClient
|
| 8 |
-
from models import DataAction, DataObservation, DataState
|
| 9 |
-
|
| 10 |
-
__all__ = [
|
| 11 |
-
"DataAnalysisClient",
|
| 12 |
-
"DataAction",
|
| 13 |
-
"DataObservation",
|
| 14 |
-
"DataState",
|
| 15 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
baseline.py
DELETED
|
@@ -1,163 +0,0 @@
|
|
| 1 |
-
"""Baseline inference script for the Data Analysis Agent environment.
|
| 2 |
-
|
| 3 |
-
Uses the OpenAI API to run a model (gpt-4o-mini) against all 6 tasks
|
| 4 |
-
and produces reproducible baseline scores.
|
| 5 |
-
|
| 6 |
-
The script uses DataAnalysisClient (WebSocket) because the HTTP endpoints
|
| 7 |
-
are stateless — each request gets a fresh env instance. State (namespace,
|
| 8 |
-
task, dataset) only persists within a WebSocket session.
|
| 9 |
-
|
| 10 |
-
Tasks 1-3 use only the pandas DataFrame (df). Tasks 4-6 are cross-source:
|
| 11 |
-
they also require querying a SQLite database via sqlite3.connect(db_path).
|
| 12 |
-
|
| 13 |
-
Usage:
|
| 14 |
-
OPENAI_API_KEY=sk-... uv run python baseline.py
|
| 15 |
-
OPENAI_API_KEY=sk-... uv run python baseline.py --base-url http://localhost:8000
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
-
import argparse
|
| 19 |
-
import json
|
| 20 |
-
import os
|
| 21 |
-
import sys
|
| 22 |
-
|
| 23 |
-
from openai import OpenAI
|
| 24 |
-
|
| 25 |
-
from client import DataAnalysisClient
|
| 26 |
-
from helpers.prompts import SYSTEM_PROMPT
|
| 27 |
-
from models import DataAction
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def run_task(openai_client: OpenAI, env_client: DataAnalysisClient, task_id: int, max_steps: int = 15) -> float:
|
| 31 |
-
"""Run a single task using the OpenAI API as the agent.
|
| 32 |
-
|
| 33 |
-
Args:
|
| 34 |
-
openai_client: The OpenAI client instance.
|
| 35 |
-
env_client: The connected DataAnalysisClient (sync wrapper).
|
| 36 |
-
task_id: Which task to run (1–6).
|
| 37 |
-
max_steps: Maximum agent steps before giving up.
|
| 38 |
-
|
| 39 |
-
Returns:
|
| 40 |
-
The final score for this task (0.0 to 1.0).
|
| 41 |
-
"""
|
| 42 |
-
result = env_client.reset(task_id=task_id)
|
| 43 |
-
obs = result.observation
|
| 44 |
-
|
| 45 |
-
messages = [
|
| 46 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 47 |
-
{
|
| 48 |
-
"role": "user",
|
| 49 |
-
"content": f"Task: {obs.task_description}\n\nDataset Info:\n{obs.dataset_info}",
|
| 50 |
-
},
|
| 51 |
-
]
|
| 52 |
-
|
| 53 |
-
print(f"\n--- Task {task_id} ---")
|
| 54 |
-
print(f"Question: {obs.task_description}")
|
| 55 |
-
|
| 56 |
-
for step in range(max_steps):
|
| 57 |
-
response = openai_client.chat.completions.create(
|
| 58 |
-
model="gpt-4o-mini",
|
| 59 |
-
messages=messages,
|
| 60 |
-
temperature=0.0,
|
| 61 |
-
)
|
| 62 |
-
assistant_msg = response.choices[0].message.content.strip()
|
| 63 |
-
|
| 64 |
-
# Parse the agent's JSON response
|
| 65 |
-
try:
|
| 66 |
-
# Handle markdown code blocks if present
|
| 67 |
-
if assistant_msg.startswith("```"):
|
| 68 |
-
assistant_msg = assistant_msg.split("```")[1]
|
| 69 |
-
if assistant_msg.startswith("json"):
|
| 70 |
-
assistant_msg = assistant_msg[4:]
|
| 71 |
-
assistant_msg = assistant_msg.strip()
|
| 72 |
-
action = json.loads(assistant_msg)
|
| 73 |
-
except json.JSONDecodeError:
|
| 74 |
-
messages.append({"role": "assistant", "content": assistant_msg})
|
| 75 |
-
messages.append(
|
| 76 |
-
{
|
| 77 |
-
"role": "user",
|
| 78 |
-
"content": "Invalid JSON. Please respond with valid JSON only.",
|
| 79 |
-
}
|
| 80 |
-
)
|
| 81 |
-
continue
|
| 82 |
-
|
| 83 |
-
action_type = action.get("action", "")
|
| 84 |
-
|
| 85 |
-
if action_type == "execute_code":
|
| 86 |
-
result = env_client.step(DataAction(action_type="execute_code", code=action.get("code", "")))
|
| 87 |
-
obs = result.observation
|
| 88 |
-
result_text = f"Output: {obs.output}" if not obs.error else f"Error: {obs.error}"
|
| 89 |
-
print(f" Step {step + 1}: execute_code -> {result_text[:120]}")
|
| 90 |
-
messages.append({"role": "assistant", "content": assistant_msg})
|
| 91 |
-
messages.append({"role": "user", "content": result_text})
|
| 92 |
-
|
| 93 |
-
elif action_type == "submit_answer":
|
| 94 |
-
result = env_client.step(DataAction(action_type="submit_answer", answer=action.get("answer", "")))
|
| 95 |
-
obs = result.observation
|
| 96 |
-
score = obs.metadata.get("score", 0.0) if obs.metadata else result.reward
|
| 97 |
-
print(f" Step {step + 1}: submit_answer -> '{action.get('answer', '')}'")
|
| 98 |
-
print(f" Score: {score:.2f}")
|
| 99 |
-
return score
|
| 100 |
-
|
| 101 |
-
else:
|
| 102 |
-
messages.append({"role": "assistant", "content": assistant_msg})
|
| 103 |
-
messages.append(
|
| 104 |
-
{
|
| 105 |
-
"role": "user",
|
| 106 |
-
"content": f"Unknown action '{action_type}'. Use 'execute_code' or 'submit_answer'.",
|
| 107 |
-
}
|
| 108 |
-
)
|
| 109 |
-
|
| 110 |
-
print(" Max steps reached without submitting an answer.")
|
| 111 |
-
return 0.0
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
def main():
|
| 115 |
-
"""Run baseline inference across all 6 tasks and report scores."""
|
| 116 |
-
parser = argparse.ArgumentParser(description="Baseline inference for Data Analysis Env")
|
| 117 |
-
parser.add_argument(
|
| 118 |
-
"--base-url",
|
| 119 |
-
default="http://localhost:8000",
|
| 120 |
-
help="Environment server URL (default: http://localhost:8000)",
|
| 121 |
-
)
|
| 122 |
-
args = parser.parse_args()
|
| 123 |
-
|
| 124 |
-
api_key = os.environ.get("OPENAI_API_KEY")
|
| 125 |
-
if not api_key:
|
| 126 |
-
print("Error: OPENAI_API_KEY environment variable is required.")
|
| 127 |
-
sys.exit(1)
|
| 128 |
-
|
| 129 |
-
openai_client = OpenAI(api_key=api_key)
|
| 130 |
-
|
| 131 |
-
print("=" * 55)
|
| 132 |
-
print("Data Analysis Agent - Baseline Inference")
|
| 133 |
-
print(f"Server: {args.base_url}")
|
| 134 |
-
print("Model: gpt-4o-mini")
|
| 135 |
-
print("=" * 55)
|
| 136 |
-
|
| 137 |
-
scores = {}
|
| 138 |
-
difficulties = {
|
| 139 |
-
1: "Easy",
|
| 140 |
-
2: "Medium",
|
| 141 |
-
3: "Medium",
|
| 142 |
-
4: "Hard",
|
| 143 |
-
5: "Hard",
|
| 144 |
-
6: "Hard",
|
| 145 |
-
}
|
| 146 |
-
|
| 147 |
-
with DataAnalysisClient(base_url=args.base_url).sync() as env_client:
|
| 148 |
-
for task_id in [1, 2, 3, 4, 5, 6]:
|
| 149 |
-
score = run_task(openai_client, env_client, task_id)
|
| 150 |
-
scores[task_id] = score
|
| 151 |
-
|
| 152 |
-
print("\n" + "=" * 55)
|
| 153 |
-
print("RESULTS")
|
| 154 |
-
print("=" * 55)
|
| 155 |
-
for task_id, score in scores.items():
|
| 156 |
-
print(f" Task {task_id} ({difficulties[task_id]:6s}): {score:.2f}")
|
| 157 |
-
avg = sum(scores.values()) / len(scores)
|
| 158 |
-
print(f"\n Average Score: {avg:.2f}")
|
| 159 |
-
print("=" * 55)
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
if __name__ == "__main__":
|
| 163 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
client.py
DELETED
|
@@ -1,56 +0,0 @@
|
|
| 1 |
-
from models import DataAction, DataObservation, DataState
|
| 2 |
-
from openenv.core.client_types import StepResult
|
| 3 |
-
from openenv.core.env_client import EnvClient
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class DataAnalysisClient(EnvClient[DataAction, DataObservation, DataState]):
|
| 7 |
-
"""Client for interacting with the Data Analysis environment server.
|
| 8 |
-
|
| 9 |
-
Supports both async and sync usage patterns:
|
| 10 |
-
- Async: ``async with DataAnalysisClient(base_url=...) as client:``
|
| 11 |
-
- Sync: ``with DataAnalysisClient(base_url=...).sync() as client:``
|
| 12 |
-
"""
|
| 13 |
-
|
| 14 |
-
def _step_payload(self, action: DataAction) -> dict:
|
| 15 |
-
"""Convert a DataAction into a JSON-serializable payload.
|
| 16 |
-
|
| 17 |
-
Args:
|
| 18 |
-
action: The action to send to the server.
|
| 19 |
-
|
| 20 |
-
Returns:
|
| 21 |
-
A dictionary representation of the action.
|
| 22 |
-
"""
|
| 23 |
-
payload = {"action_type": action.action_type}
|
| 24 |
-
if action.code is not None:
|
| 25 |
-
payload["code"] = action.code
|
| 26 |
-
if action.answer is not None:
|
| 27 |
-
payload["answer"] = action.answer
|
| 28 |
-
return payload
|
| 29 |
-
|
| 30 |
-
def _parse_result(self, payload: dict) -> StepResult[DataObservation]:
|
| 31 |
-
"""Parse the server's JSON response into a StepResult.
|
| 32 |
-
|
| 33 |
-
Args:
|
| 34 |
-
payload: The raw JSON response from the server.
|
| 35 |
-
|
| 36 |
-
Returns:
|
| 37 |
-
A StepResult containing the parsed observation, reward, and done flag.
|
| 38 |
-
"""
|
| 39 |
-
obs_data = payload.get("observation", payload)
|
| 40 |
-
obs = DataObservation(**obs_data)
|
| 41 |
-
return StepResult(
|
| 42 |
-
observation=obs,
|
| 43 |
-
reward=payload.get("reward", obs.reward),
|
| 44 |
-
done=payload.get("done", obs.done),
|
| 45 |
-
)
|
| 46 |
-
|
| 47 |
-
def _parse_state(self, payload: dict) -> DataState:
|
| 48 |
-
"""Parse the server's state response into a DataState.
|
| 49 |
-
|
| 50 |
-
Args:
|
| 51 |
-
payload: The raw JSON state response from the server.
|
| 52 |
-
|
| 53 |
-
Returns:
|
| 54 |
-
A DataState object reflecting the current episode state.
|
| 55 |
-
"""
|
| 56 |
-
return DataState(**payload)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
datasets/sales.csv
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/store_data.db
DELETED
|
Binary file (24.6 kB)
|
|
|
helpers/__init__.py
DELETED
|
File without changes
|
helpers/constants.py
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
|
| 3 |
-
TEMPERATURE = 0.0
|
| 4 |
-
MAX_TOKENS = 1024
|
| 5 |
-
MAX_STEPS = 15
|
| 6 |
-
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 7 |
-
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
| 8 |
-
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 9 |
-
ENV_SERVER_URL = os.getenv("ENV_SERVER_URL") or "https://mohammed-altaf-dataanalysis-env.hf.space"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
helpers/logging.py
DELETED
|
@@ -1,44 +0,0 @@
|
|
| 1 |
-
from typing import List, Optional
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
def safe_score(raw: float) -> float:
|
| 5 |
-
"""Clamp a raw score to the strictly-open interval (0.05, 0.95).
|
| 6 |
-
|
| 7 |
-
Args:
|
| 8 |
-
raw: Unclamped score value.
|
| 9 |
-
|
| 10 |
-
Returns:
|
| 11 |
-
Score guaranteed to be in [0.05, 0.95].
|
| 12 |
-
"""
|
| 13 |
-
return max(0.05, min(0.95, float(raw)))
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def log_start(task: str, env: str, model: str) -> None:
|
| 17 |
-
"""Emit the [START] line at episode begin."""
|
| 18 |
-
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 22 |
-
"""Emit one [STEP] line immediately after env.step() returns.
|
| 23 |
-
|
| 24 |
-
Args:
|
| 25 |
-
step: 1-based step number.
|
| 26 |
-
action: Compact single-line action label (e.g. 'execute_code').
|
| 27 |
-
reward: Step reward, formatted to 2 decimal places.
|
| 28 |
-
done: Whether the episode ended after this step.
|
| 29 |
-
error: Raw error string from the env, or None.
|
| 30 |
-
"""
|
| 31 |
-
error_val = error.replace("\n", " ") if error else "null"
|
| 32 |
-
done_val = str(done).lower()
|
| 33 |
-
print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def log_end(task_id: int, score: float, steps: int) -> None:
|
| 37 |
-
"""Emit the [END] line after the episode completes.
|
| 38 |
-
|
| 39 |
-
Args:
|
| 40 |
-
task_id: The task number that just ran.
|
| 41 |
-
score: Final clamped score in [0.05, 0.95].
|
| 42 |
-
steps: Total number of steps taken.
|
| 43 |
-
"""
|
| 44 |
-
print(f"[END] task={task_id} score={score:.2f} steps={steps}", flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
helpers/prompts.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
| 1 |
-
SYSTEM_PROMPT = """
|
| 2 |
-
<ROLE>
|
| 3 |
-
You are a data analyst. You have two data sources available:
|
| 4 |
-
1. `df` — a pandas DataFrame (sales CSV, pre-loaded)
|
| 5 |
-
2. A SQLite database at `db_path` — contains additional tables (e.g. customer_profiles, product_catalog)
|
| 6 |
-
</ROLE>
|
| 7 |
-
|
| 8 |
-
<RULES>
|
| 9 |
-
- Use `print()` to output results
|
| 10 |
-
- `pd`, `np`, `sqlite3`, and `db_path` are already in scope — NEVER use import statements (they will fail)
|
| 11 |
-
- `df` is a pandas DataFrame — use pandas operations on it, NEVER SQL
|
| 12 |
-
- To query the SQLite database use: `conn = sqlite3.connect(db_path)` then `pd.read_sql(query, conn)`
|
| 13 |
-
- For cross-source tasks: query SQLite for the extra data, then merge with `df` using pandas
|
| 14 |
-
- When you have the answer, submit it in the exact format requested
|
| 15 |
-
- Be precise with numbers and formatting
|
| 16 |
-
</RULES>
|
| 17 |
-
|
| 18 |
-
<RESPONSE>
|
| 19 |
-
Respond with JSON in one of these formats:
|
| 20 |
-
1. To execute code: {"action": "execute_code", "code": "your python code here"}
|
| 21 |
-
2. To submit answer: {"action": "submit_answer", "answer": "your answer here"}
|
| 22 |
-
</RESPONSE>
|
| 23 |
-
|
| 24 |
-
<NOTE>
|
| 25 |
-
Respond with ONLY the JSON, no other text.
|
| 26 |
-
</NOTE>
|
| 27 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
helpers/response_parser.py
DELETED
|
@@ -1,181 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import re
|
| 3 |
-
from typing import Any
|
| 4 |
-
|
| 5 |
-
FALLBACK_ACTION = json.dumps({"action": "submit_answer", "answer": "unknown"})
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
def _sanitize_string_value(match: re.Match) -> str:
|
| 9 |
-
"""
|
| 10 |
-
Receives a regex match of ("key": "value") and cleans only the value part.
|
| 11 |
-
Escapes unescaped newlines, tabs, carriage returns, and inner double quotes.
|
| 12 |
-
NOTE: This is the core trick LangChain uses in _replace_new_line / _custom_parser.
|
| 13 |
-
"""
|
| 14 |
-
opening = match.group(1)
|
| 15 |
-
value = match.group(2)
|
| 16 |
-
closing = match.group(3)
|
| 17 |
-
|
| 18 |
-
value = re.sub(r"\n", r"\\n", value)
|
| 19 |
-
value = re.sub(r"\r", r"\\r", value)
|
| 20 |
-
value = re.sub(r"\t", r"\\t", value)
|
| 21 |
-
value = re.sub(r'(?<!\\)"', r'\\"', value) # escape unescaped inner quotes
|
| 22 |
-
|
| 23 |
-
return opening + value + closing
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def _sanitize_all_string_values(text: str) -> str:
|
| 27 |
-
"""
|
| 28 |
-
Apply _sanitize_string_value to every JSON string value in the text.
|
| 29 |
-
Uses re.DOTALL so values that span multiple lines are handled correctly.
|
| 30 |
-
NOTE: Generalised version of LangChain's _custom_parser (which only targeted action_input).
|
| 31 |
-
"""
|
| 32 |
-
return re.sub(
|
| 33 |
-
r'("[\w]+"\s*:\s*")(.*?)(")',
|
| 34 |
-
_sanitize_string_value,
|
| 35 |
-
text,
|
| 36 |
-
flags=re.DOTALL,
|
| 37 |
-
)
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def _preprocess(text: str) -> str:
|
| 41 |
-
"""Fix common LLM response quirks before attempting JSON parsing."""
|
| 42 |
-
|
| 43 |
-
# Strip markdown code fences (```json ... ``` or ``` ... ```)
|
| 44 |
-
match = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL)
|
| 45 |
-
if match:
|
| 46 |
-
text = match.group(1).strip()
|
| 47 |
-
|
| 48 |
-
# Double curly braces {{"k": "v"}} → {"k": "v"}
|
| 49 |
-
text = text.replace("{{", "{").replace("}}", "}")
|
| 50 |
-
text = re.sub(r"\bTrue\b", "true", text)
|
| 51 |
-
text = re.sub(r"\bFalse\b", "false", text)
|
| 52 |
-
text = re.sub(r"\bNone\b", "null", text)
|
| 53 |
-
text = re.sub(r",\s*([}\]])", r"\1", text)
|
| 54 |
-
|
| 55 |
-
# Outer single-quote wrap '{"k": "v"}' → {"k": "v"}
|
| 56 |
-
if text.startswith("'") and text.endswith("'"):
|
| 57 |
-
text = text[1:-1].replace("\\'", "'")
|
| 58 |
-
|
| 59 |
-
return text.strip()
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
def _extract_json_blob(text: str) -> str:
|
| 63 |
-
"""
|
| 64 |
-
Pull out the first {...} or [...] blob from text that has prose around it.
|
| 65 |
-
Inspired by LangChain's _json_markdown_re fallback in parse_json_markdown.
|
| 66 |
-
"""
|
| 67 |
-
match = re.search(r"(\{.*\}|\[.*\])", text, re.DOTALL)
|
| 68 |
-
return match.group(1) if match else text
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
def _parse_partial_json(s: str) -> Any:
|
| 72 |
-
"""
|
| 73 |
-
Parse JSON that may be truncated / missing closing brackets.
|
| 74 |
-
Adapted from LangChain's parse_partial_json (originally from open-interpreter).
|
| 75 |
-
Uses a stack to track open containers and closes them before parsing.
|
| 76 |
-
"""
|
| 77 |
-
s = s.strip()
|
| 78 |
-
try:
|
| 79 |
-
return json.loads(s)
|
| 80 |
-
except json.JSONDecodeError:
|
| 81 |
-
pass
|
| 82 |
-
|
| 83 |
-
stack = []
|
| 84 |
-
is_inside = False
|
| 85 |
-
position = 0
|
| 86 |
-
|
| 87 |
-
for i, char in enumerate(s):
|
| 88 |
-
if is_inside:
|
| 89 |
-
if char == '"' and s[i - 1] != "\\":
|
| 90 |
-
is_inside = False
|
| 91 |
-
else:
|
| 92 |
-
if char == '"':
|
| 93 |
-
is_inside = True
|
| 94 |
-
stack.append('"')
|
| 95 |
-
elif char in "{[":
|
| 96 |
-
stack.append(char)
|
| 97 |
-
elif char in "}]":
|
| 98 |
-
if stack and stack[-1] in "{[":
|
| 99 |
-
stack.pop()
|
| 100 |
-
position = i
|
| 101 |
-
|
| 102 |
-
completed = s[: position + 1]
|
| 103 |
-
for bracket in reversed(stack):
|
| 104 |
-
if bracket == '"':
|
| 105 |
-
completed += '"'
|
| 106 |
-
elif bracket == "{":
|
| 107 |
-
completed += "}"
|
| 108 |
-
elif bracket == "[":
|
| 109 |
-
completed += "]"
|
| 110 |
-
|
| 111 |
-
return json.loads(completed)
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
def _extract_fields_direct(text: str) -> dict:
|
| 115 |
-
"""Extract action fields using greedy regex anchored to the last closing quote.
|
| 116 |
-
|
| 117 |
-
Handles the case where the model emits unescaped double-quote characters inside
|
| 118 |
-
a "code" or "answer" value (e.g. df["col"]). The non-greedy `(.*?)` in
|
| 119 |
-
_sanitize_all_string_values stops at the *first* inner quote and corrupts the
|
| 120 |
-
output. By using a greedy `(.*)` anchored with a lookahead for the last `"}`
|
| 121 |
-
boundary we capture the full value regardless of inner quotes.
|
| 122 |
-
|
| 123 |
-
Args:
|
| 124 |
-
text: Pre-processed JSON-like string.
|
| 125 |
-
|
| 126 |
-
Returns:
|
| 127 |
-
Dict with 'action' and 'code'/'answer' keys.
|
| 128 |
-
|
| 129 |
-
Raises:
|
| 130 |
-
ValueError: If the action field cannot be found or the value cannot be
|
| 131 |
-
extracted for the detected action type.
|
| 132 |
-
"""
|
| 133 |
-
action_match = re.search(r'"action"\s*:\s*"(\w+)"', text)
|
| 134 |
-
if not action_match:
|
| 135 |
-
raise ValueError("No 'action' field found")
|
| 136 |
-
action_type = action_match.group(1)
|
| 137 |
-
|
| 138 |
-
if action_type == "execute_code":
|
| 139 |
-
m = re.search(r'"code"\s*:\s*"(.*)"(?=\s*})', text, re.DOTALL)
|
| 140 |
-
if m:
|
| 141 |
-
return {"action": "execute_code", "code": m.group(1)}
|
| 142 |
-
elif action_type == "submit_answer":
|
| 143 |
-
m = re.search(r'"answer"\s*:\s*"(.*)"(?=\s*})', text, re.DOTALL)
|
| 144 |
-
if m:
|
| 145 |
-
return {"action": "submit_answer", "answer": m.group(1)}
|
| 146 |
-
|
| 147 |
-
raise ValueError(f"Could not extract value for action_type={action_type!r}")
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
def parse_model_action(response_text: str) -> dict:
|
| 151 |
-
"""
|
| 152 |
-
Parse a raw LLM response into an action dict.
|
| 153 |
-
|
| 154 |
-
Pipeline (mirrors LangChain's JsonOutputParser internals):
|
| 155 |
-
1. _preprocess – fix markdown fences, double braces, Python literals …
|
| 156 |
-
2. _sanitize_all_string_values – escape unescaped quotes/newlines inside values
|
| 157 |
-
3. _extract_json_blob – strip surrounding prose
|
| 158 |
-
4. _parse_partial_json – close truncated JSON with a stack algorithm
|
| 159 |
-
|
| 160 |
-
Each strategy is tried independently so a failure in one doesn't block others.
|
| 161 |
-
"""
|
| 162 |
-
text = response_text.strip()
|
| 163 |
-
|
| 164 |
-
strategies = [
|
| 165 |
-
lambda t: _parse_partial_json(t),
|
| 166 |
-
lambda t: _parse_partial_json(_sanitize_all_string_values(_preprocess(t))),
|
| 167 |
-
lambda t: _parse_partial_json(_sanitize_all_string_values(_preprocess(_extract_json_blob(t)))),
|
| 168 |
-
lambda t: _parse_partial_json(_sanitize_all_string_values(_extract_json_blob(_preprocess(t)))),
|
| 169 |
-
lambda t: _parse_partial_json(_sanitize_all_string_values(t)),
|
| 170 |
-
lambda t: _extract_fields_direct(_preprocess(_extract_json_blob(t))),
|
| 171 |
-
lambda t: _extract_fields_direct(_extract_json_blob(t)),
|
| 172 |
-
]
|
| 173 |
-
|
| 174 |
-
for strategy in strategies:
|
| 175 |
-
try:
|
| 176 |
-
return strategy(text)
|
| 177 |
-
except (json.JSONDecodeError, ValueError):
|
| 178 |
-
continue
|
| 179 |
-
|
| 180 |
-
print(f"JSON Decoding Error while parsing action in response text: {response_text}")
|
| 181 |
-
return json.loads(FALLBACK_ACTION)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inference.py
DELETED
|
@@ -1,172 +0,0 @@
|
|
| 1 |
-
from typing import Any, List
|
| 2 |
-
|
| 3 |
-
try:
|
| 4 |
-
from dotenv import load_dotenv
|
| 5 |
-
|
| 6 |
-
load_dotenv()
|
| 7 |
-
except ImportError:
|
| 8 |
-
pass
|
| 9 |
-
|
| 10 |
-
from openai import OpenAI
|
| 11 |
-
|
| 12 |
-
from client import DataAnalysisClient
|
| 13 |
-
from helpers.constants import *
|
| 14 |
-
from helpers.logging import log_end, log_start, log_step, safe_score
|
| 15 |
-
from helpers.prompts import SYSTEM_PROMPT
|
| 16 |
-
from helpers.response_parser import FALLBACK_ACTION, parse_model_action
|
| 17 |
-
from models import DataAction
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
|
| 21 |
-
"""Run a single task episode using the language model as the agent.
|
| 22 |
-
|
| 23 |
-
Args:
|
| 24 |
-
openai_client: Configured OpenAI-compatible client.
|
| 25 |
-
env_client: Connected DataAnalysisClient (sync wrapper).
|
| 26 |
-
task_id: Task to evaluate (1 - 6)
|
| 27 |
-
|
| 28 |
-
Returns:
|
| 29 |
-
Final clamped score for this task in [0.05, 0.95].
|
| 30 |
-
"""
|
| 31 |
-
try:
|
| 32 |
-
result = env_client.reset(task_id=task_id)
|
| 33 |
-
except Exception as exc:
|
| 34 |
-
print(f"[DEBUG] env reset failed: {exc}", flush=True)
|
| 35 |
-
log_start(task=str(task_id), env=ENV_SERVER_URL, model=MODEL_NAME)
|
| 36 |
-
log_end(task_id=task_id, score=safe_score(0.0), steps=0)
|
| 37 |
-
return safe_score(0.0)
|
| 38 |
-
|
| 39 |
-
obs = result.observation
|
| 40 |
-
rewards: List[float] = []
|
| 41 |
-
|
| 42 |
-
messages = [
|
| 43 |
-
{"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
|
| 44 |
-
{
|
| 45 |
-
"role": "user",
|
| 46 |
-
"content": [
|
| 47 |
-
{
|
| 48 |
-
"type": "text",
|
| 49 |
-
"text": f"Task: {obs.task_description}\n\nDataset Info:\n{obs.dataset_info}",
|
| 50 |
-
}
|
| 51 |
-
],
|
| 52 |
-
},
|
| 53 |
-
]
|
| 54 |
-
|
| 55 |
-
log_start(task=str(task_id), env=ENV_SERVER_URL, model=MODEL_NAME)
|
| 56 |
-
|
| 57 |
-
for step in range(MAX_STEPS):
|
| 58 |
-
try:
|
| 59 |
-
completion = openai_client.chat.completions.create(
|
| 60 |
-
model=MODEL_NAME,
|
| 61 |
-
messages=messages,
|
| 62 |
-
temperature=TEMPERATURE,
|
| 63 |
-
max_tokens=MAX_TOKENS,
|
| 64 |
-
stream=False,
|
| 65 |
-
)
|
| 66 |
-
response_text = completion.choices[0].message.content or ""
|
| 67 |
-
except Exception as exc:
|
| 68 |
-
print(f"[DEBUG] Model request failed: {exc}", flush=True)
|
| 69 |
-
response_text = FALLBACK_ACTION
|
| 70 |
-
|
| 71 |
-
action = parse_model_action(response_text)
|
| 72 |
-
action_type = action.get("action", "")
|
| 73 |
-
|
| 74 |
-
if action_type == "execute_code":
|
| 75 |
-
try:
|
| 76 |
-
exec_result = env_client.step(DataAction(action_type="execute_code", code=action.get("code", "")))
|
| 77 |
-
exec_obs = exec_result.observation
|
| 78 |
-
reward = exec_result.reward or 0.0
|
| 79 |
-
done = exec_result.done
|
| 80 |
-
except Exception as exc:
|
| 81 |
-
print(f"[DEBUG] env step failed: {exc}", flush=True)
|
| 82 |
-
log_step(step=step + 1, action=action_type, reward=0.0, done=False, error=str(exc))
|
| 83 |
-
rewards.append(0.0)
|
| 84 |
-
continue
|
| 85 |
-
|
| 86 |
-
rewards.append(reward)
|
| 87 |
-
error = exec_obs.error if not exec_obs.success else None
|
| 88 |
-
result_text = f"Output: {exec_obs.output}" if not exec_obs.error else f"Error: {exec_obs.error}"
|
| 89 |
-
log_step(step=step + 1, action=action_type, reward=reward, done=done, error=error)
|
| 90 |
-
|
| 91 |
-
messages.append({"role": "assistant", "content": response_text})
|
| 92 |
-
messages.append({"role": "user", "content": [{"type": "text", "text": result_text}]})
|
| 93 |
-
|
| 94 |
-
elif action_type == "submit_answer":
|
| 95 |
-
try:
|
| 96 |
-
submit_result = env_client.step(
|
| 97 |
-
DataAction(action_type="submit_answer", answer=action.get("answer", ""))
|
| 98 |
-
)
|
| 99 |
-
submit_obs = submit_result.observation
|
| 100 |
-
raw_score = float(submit_obs.metadata.get("score", 0.0) if submit_obs.metadata else submit_result.reward)
|
| 101 |
-
except Exception as exc:
|
| 102 |
-
print(f"[DEBUG] env step failed: {exc}", flush=True)
|
| 103 |
-
log_step(step=step + 1, action=action_type, reward=0.0, done=True, error=str(exc))
|
| 104 |
-
final_score = safe_score(sum(rewards) / len(rewards)) if rewards else safe_score(0.0)
|
| 105 |
-
log_end(task_id=task_id, score=final_score, steps=step + 1)
|
| 106 |
-
return final_score
|
| 107 |
-
|
| 108 |
-
clamped = safe_score(raw_score)
|
| 109 |
-
rewards.append(clamped)
|
| 110 |
-
log_step(step=step + 1, action=action_type, reward=clamped, done=True, error=None)
|
| 111 |
-
final_score = safe_score(sum(rewards) / len(rewards))
|
| 112 |
-
log_end(task_id=task_id, score=final_score, steps=step + 1)
|
| 113 |
-
return final_score
|
| 114 |
-
|
| 115 |
-
else:
|
| 116 |
-
log_step(
|
| 117 |
-
step=step + 1,
|
| 118 |
-
action=action_type or "unknown",
|
| 119 |
-
reward=0.0,
|
| 120 |
-
done=False,
|
| 121 |
-
error=f"unknown action '{action_type}'",
|
| 122 |
-
)
|
| 123 |
-
messages.append({"role": "assistant", "content": response_text})
|
| 124 |
-
messages.append(
|
| 125 |
-
{
|
| 126 |
-
"role": "user",
|
| 127 |
-
"content": [
|
| 128 |
-
{
|
| 129 |
-
"type": "text",
|
| 130 |
-
"text": f"Unknown action '{action_type}'. Use 'execute_code' or 'submit_answer'.",
|
| 131 |
-
}
|
| 132 |
-
],
|
| 133 |
-
}
|
| 134 |
-
)
|
| 135 |
-
|
| 136 |
-
# Max steps reached without submission
|
| 137 |
-
final_score = safe_score(sum(rewards) / len(rewards)) if rewards else safe_score(0.0)
|
| 138 |
-
log_end(task_id=task_id, score=final_score, steps=MAX_STEPS)
|
| 139 |
-
return final_score
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
def main():
|
| 143 |
-
"""Run inference across all 6 tasks and report scores."""
|
| 144 |
-
print("Executing Data Analysis Environment")
|
| 145 |
-
openai_client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
|
| 146 |
-
scores = {}
|
| 147 |
-
difficulties = {
|
| 148 |
-
1: "Easy_TopRevenueCategoryTask",
|
| 149 |
-
2: "Medium_CityRevenueShareTask",
|
| 150 |
-
3: "Medium_RepeatCustomerCohortTask",
|
| 151 |
-
4: "Hard_MonthlyRevenueRatioTask",
|
| 152 |
-
5: "Hard_CustomerLoyaltyRevenueTask",
|
| 153 |
-
6: "Hard_SupplierProfitabilityTask",
|
| 154 |
-
}
|
| 155 |
-
|
| 156 |
-
with DataAnalysisClient(base_url=ENV_SERVER_URL).sync() as env_client:
|
| 157 |
-
for task_id in difficulties.keys():
|
| 158 |
-
score = run_task(openai_client=openai_client, env_client=env_client, task_id=task_id)
|
| 159 |
-
scores[task_id] = score
|
| 160 |
-
|
| 161 |
-
print("\n" + "=" * 55)
|
| 162 |
-
print("RESULTS")
|
| 163 |
-
print("=" * 55)
|
| 164 |
-
for task_id, score in scores.items():
|
| 165 |
-
print(f" Task {task_id} ({difficulties[task_id]:6s}): {score:.2f}")
|
| 166 |
-
avg = sum(scores.values()) / len(scores)
|
| 167 |
-
print(f"\n Average Score : {avg:.2f}")
|
| 168 |
-
print("=" * 55)
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
if __name__ == "__main__":
|
| 172 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
models.py
DELETED
|
@@ -1,52 +0,0 @@
|
|
| 1 |
-
from typing import Literal, Optional
|
| 2 |
-
|
| 3 |
-
from openenv.core.env_server import Action, Observation, State
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class DataAction(Action):
|
| 7 |
-
"""Agent action for the data analysis environment.
|
| 8 |
-
|
| 9 |
-
The agent can either execute pandas code against the loaded dataset
|
| 10 |
-
or submit a final answer to be graded.
|
| 11 |
-
|
| 12 |
-
Attributes:
|
| 13 |
-
action_type: Whether to execute code or submit an answer.
|
| 14 |
-
code: Python/pandas code to execute (required when action_type is "execute_code").
|
| 15 |
-
answer: Final answer string (required when action_type is "submit_answer").
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
-
action_type: Literal["execute_code", "submit_answer"]
|
| 19 |
-
code: Optional[str] = None
|
| 20 |
-
answer: Optional[str] = None
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
class DataObservation(Observation):
|
| 24 |
-
"""Observation returned after each step or reset.
|
| 25 |
-
|
| 26 |
-
Attributes:
|
| 27 |
-
output: String output from code execution or environment messages.
|
| 28 |
-
success: Whether the last action executed without errors.
|
| 29 |
-
error: Error message if the last action failed.
|
| 30 |
-
task_description: The task question, populated on reset.
|
| 31 |
-
dataset_info: Column names and dtypes summary, populated on reset.
|
| 32 |
-
"""
|
| 33 |
-
|
| 34 |
-
output: str = ""
|
| 35 |
-
success: bool = True
|
| 36 |
-
error: Optional[str] = None
|
| 37 |
-
task_description: str = ""
|
| 38 |
-
dataset_info: str = ""
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
class DataState(State):
|
| 42 |
-
"""Episode state for the data analysis environment.
|
| 43 |
-
|
| 44 |
-
Attributes:
|
| 45 |
-
task_id: The current task being evaluated (1, 2, or 3).
|
| 46 |
-
answer_submitted: Whether the agent has submitted a final answer.
|
| 47 |
-
final_score: The graded score after answer submission (0.0 to 1.0).
|
| 48 |
-
"""
|
| 49 |
-
|
| 50 |
-
task_id: int = 1
|
| 51 |
-
answer_submitted: bool = False
|
| 52 |
-
final_score: float = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
openenv.yaml
DELETED
|
@@ -1,21 +0,0 @@
|
|
| 1 |
-
spec_version: 1
|
| 2 |
-
name: data_analysis_env
|
| 3 |
-
version: "0.1.0"
|
| 4 |
-
description: "RL environment for training data analysis agents on business datasets"
|
| 5 |
-
type: space
|
| 6 |
-
runtime: fastapi
|
| 7 |
-
app: server.app:app
|
| 8 |
-
port: 8000
|
| 9 |
-
tasks:
|
| 10 |
-
- id: task_easy
|
| 11 |
-
grader: tasks.task_easy:TopRevenueCategoryTask
|
| 12 |
-
- id: task_medium
|
| 13 |
-
grader: tasks.task_medium:CityRevenueShareTask
|
| 14 |
-
- id: task_medium_2
|
| 15 |
-
grader: tasks.task_medium_2:MonthlyRevenueRatioTask
|
| 16 |
-
- id: task_hard
|
| 17 |
-
grader: tasks.task_hard:RepeatCustomerCohortTask
|
| 18 |
-
- id: task_hard_2
|
| 19 |
-
grader: tasks.task_hard_2:CustomerLoyaltyRevenueTask
|
| 20 |
-
- id: task_hard_3
|
| 21 |
-
grader: tasks.task_hard_3:SupplierProfitabilityTask
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
DELETED
|
@@ -1,25 +0,0 @@
|
|
| 1 |
-
[project]
|
| 2 |
-
name = "openenv-data-analysis-env"
|
| 3 |
-
version = "0.1.0"
|
| 4 |
-
description = "RL environment for training data analysis agents on business datasets"
|
| 5 |
-
readme = "README.md"
|
| 6 |
-
requires-python = ">=3.13"
|
| 7 |
-
dependencies = [
|
| 8 |
-
"openenv-core>=0.2.3",
|
| 9 |
-
"fastapi>=0.115.0",
|
| 10 |
-
"pydantic>=2.0.0",
|
| 11 |
-
"uvicorn>=0.24.0",
|
| 12 |
-
"pandas>=2.0.0",
|
| 13 |
-
"numpy>=1.24.0",
|
| 14 |
-
"openai>=1.0.0",
|
| 15 |
-
"black>=26.3.1",
|
| 16 |
-
"isort>=8.0.1",
|
| 17 |
-
"python-dotenv>=1.2.2",
|
| 18 |
-
]
|
| 19 |
-
|
| 20 |
-
[project.scripts]
|
| 21 |
-
server = "server.app:main"
|
| 22 |
-
|
| 23 |
-
[tool.black]
|
| 24 |
-
line-length = 120
|
| 25 |
-
target-version = ["py313"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/Dockerfile
DELETED
|
@@ -1,58 +0,0 @@
|
|
| 1 |
-
# Multi-stage build for the Data Analysis Agent environment
|
| 2 |
-
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 3 |
-
FROM ${BASE_IMAGE} AS builder
|
| 4 |
-
|
| 5 |
-
WORKDIR /app
|
| 6 |
-
|
| 7 |
-
# Copy environment code
|
| 8 |
-
COPY .. /app/env
|
| 9 |
-
|
| 10 |
-
WORKDIR /app/env
|
| 11 |
-
|
| 12 |
-
# Ensure uv is available
|
| 13 |
-
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 14 |
-
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 15 |
-
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 16 |
-
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 17 |
-
fi
|
| 18 |
-
|
| 19 |
-
# Install git for build-time dependencies
|
| 20 |
-
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 21 |
-
git \
|
| 22 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 23 |
-
|
| 24 |
-
# Install dependencies with cache
|
| 25 |
-
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 26 |
-
if [ -f uv.lock ]; then \
|
| 27 |
-
uv sync --frozen --no-install-project --no-editable; \
|
| 28 |
-
else \
|
| 29 |
-
uv sync --no-install-project --no-editable; \
|
| 30 |
-
fi
|
| 31 |
-
|
| 32 |
-
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 33 |
-
if [ -f uv.lock ]; then \
|
| 34 |
-
uv sync --frozen --no-editable; \
|
| 35 |
-
else \
|
| 36 |
-
uv sync --no-editable; \
|
| 37 |
-
fi
|
| 38 |
-
|
| 39 |
-
# Final runtime stage
|
| 40 |
-
FROM ${BASE_IMAGE}
|
| 41 |
-
|
| 42 |
-
WORKDIR /app
|
| 43 |
-
|
| 44 |
-
# Copy virtual environment and code
|
| 45 |
-
COPY --from=builder /app/env/.venv /app/.venv
|
| 46 |
-
COPY --from=builder /app/env /app/env
|
| 47 |
-
|
| 48 |
-
ENV PATH="/app/.venv/bin:$PATH"
|
| 49 |
-
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 50 |
-
|
| 51 |
-
# Health check — uses /docs which FastAPI always exposes
|
| 52 |
-
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 53 |
-
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/docs')" || exit 1
|
| 54 |
-
|
| 55 |
-
EXPOSE 8000
|
| 56 |
-
|
| 57 |
-
# Run server
|
| 58 |
-
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/__init__.py
DELETED
|
File without changes
|
server/app.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
from models import DataAction, DataObservation
|
| 2 |
-
from openenv.core.env_server import create_app
|
| 3 |
-
from server.data_analysis_env import DataAnalysisEnv
|
| 4 |
-
|
| 5 |
-
app = create_app(DataAnalysisEnv, DataAction, DataObservation, env_name="data_analysis_env", max_concurrent_envs=3)
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
def main():
|
| 9 |
-
import uvicorn
|
| 10 |
-
|
| 11 |
-
uvicorn.run(app, host="0.0.0.0", port=8000)
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
if __name__ == "__main__":
|
| 15 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/data_analysis_env.py
DELETED
|
@@ -1,296 +0,0 @@
|
|
| 1 |
-
import io
|
| 2 |
-
import sqlite3
|
| 3 |
-
import sys
|
| 4 |
-
import uuid
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
from typing import Any, Optional
|
| 7 |
-
|
| 8 |
-
import numpy as np
|
| 9 |
-
import pandas as pd
|
| 10 |
-
|
| 11 |
-
from models import DataAction, DataObservation, DataState
|
| 12 |
-
from openenv.core.env_server import Environment
|
| 13 |
-
from tasks import TASKS
|
| 14 |
-
|
| 15 |
-
DATASET_PATH = Path(__file__).resolve().parent.parent / "datasets" / "sales.csv"
|
| 16 |
-
DB_PATH = Path(__file__).resolve().parent.parent / "datasets" / "store_data.db"
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
class DataAnalysisEnv(Environment):
|
| 20 |
-
"""Environment for training data analysis agents on business datasets.
|
| 21 |
-
|
| 22 |
-
The agent receives a task question and can execute pandas code against
|
| 23 |
-
a pre-loaded DataFrame. The episode ends when the agent submits an answer
|
| 24 |
-
or exceeds the maximum number of steps.
|
| 25 |
-
|
| 26 |
-
Attributes:
|
| 27 |
-
MAX_STEPS: Maximum steps before forced episode termination.
|
| 28 |
-
"""
|
| 29 |
-
|
| 30 |
-
MAX_STEPS = 20
|
| 31 |
-
SUPPORTS_CONCURRENT_SESSIONS = True
|
| 32 |
-
|
| 33 |
-
def __init__(self):
|
| 34 |
-
super().__init__()
|
| 35 |
-
self._source_df = pd.read_csv(DATASET_PATH)
|
| 36 |
-
self._df = self._source_df.copy()
|
| 37 |
-
self._state = DataState()
|
| 38 |
-
self._task = None
|
| 39 |
-
self._exec_namespace = {}
|
| 40 |
-
|
| 41 |
-
def _build_namespace(self) -> dict:
|
| 42 |
-
"""Build a restricted execution namespace for agent code.
|
| 43 |
-
|
| 44 |
-
The namespace includes only pandas, numpy, and the dataset copy.
|
| 45 |
-
Dangerous builtins like open, exec, eval, and __import__ are removed.
|
| 46 |
-
|
| 47 |
-
Returns:
|
| 48 |
-
A dictionary to use as the globals for exec().
|
| 49 |
-
"""
|
| 50 |
-
safe_builtins = (
|
| 51 |
-
{
|
| 52 |
-
k: v
|
| 53 |
-
for k, v in __builtins__.items()
|
| 54 |
-
if k not in ("open", "exec", "eval", "__import__", "compile", "exit", "quit")
|
| 55 |
-
}
|
| 56 |
-
if isinstance(__builtins__, dict)
|
| 57 |
-
else {
|
| 58 |
-
k: getattr(__builtins__, k)
|
| 59 |
-
for k in dir(__builtins__)
|
| 60 |
-
if k not in ("open", "exec", "eval", "__import__", "compile", "exit", "quit") and not k.startswith("_")
|
| 61 |
-
}
|
| 62 |
-
)
|
| 63 |
-
return {
|
| 64 |
-
"__builtins__": safe_builtins,
|
| 65 |
-
"df": self._df.copy(),
|
| 66 |
-
"pd": pd,
|
| 67 |
-
"np": np,
|
| 68 |
-
"sqlite3": sqlite3,
|
| 69 |
-
"db_path": str(DB_PATH),
|
| 70 |
-
}
|
| 71 |
-
|
| 72 |
-
def _dataset_info(self) -> str:
|
| 73 |
-
"""Generate a summary of the dataset schema for the agent.
|
| 74 |
-
|
| 75 |
-
Includes the sales DataFrame schema plus the SQLite database table schemas
|
| 76 |
-
so the agent knows what data is available and where to find it.
|
| 77 |
-
|
| 78 |
-
Returns:
|
| 79 |
-
A string describing column names, dtypes, row count, a sample for df,
|
| 80 |
-
and table schemas for the SQLite database.
|
| 81 |
-
"""
|
| 82 |
-
buf = io.StringIO()
|
| 83 |
-
self._df.info(buf=buf)
|
| 84 |
-
info_str = buf.getvalue()
|
| 85 |
-
sample = self._df.head(3).to_string()
|
| 86 |
-
df_section = f"=== df (pandas DataFrame, pre-loaded from sales CSV) ===\nShape: {self._df.shape}\n{info_str}\nSample rows:\n{sample}"
|
| 87 |
-
|
| 88 |
-
try:
|
| 89 |
-
conn = sqlite3.connect(DB_PATH)
|
| 90 |
-
cursor = conn.cursor()
|
| 91 |
-
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
|
| 92 |
-
tables = [row[0] for row in cursor.fetchall()]
|
| 93 |
-
db_lines = ["\n=== SQLite database (accessible via sqlite3.connect(db_path)) ==="]
|
| 94 |
-
for table in tables:
|
| 95 |
-
cursor.execute(f"PRAGMA table_info({table})")
|
| 96 |
-
cols = [(row[1], row[2]) for row in cursor.fetchall()]
|
| 97 |
-
cursor.execute(f"SELECT COUNT(*) FROM {table}")
|
| 98 |
-
count = cursor.fetchone()[0]
|
| 99 |
-
col_str = ", ".join(f"{c} ({t})" for c, t in cols)
|
| 100 |
-
db_lines.append(f" Table '{table}' ({count} rows): {col_str}")
|
| 101 |
-
conn.close()
|
| 102 |
-
db_section = "\n".join(db_lines)
|
| 103 |
-
except Exception:
|
| 104 |
-
db_section = "\n=== SQLite database: schema unavailable ==="
|
| 105 |
-
|
| 106 |
-
return f"{df_section}\n{db_section}"
|
| 107 |
-
|
| 108 |
-
def reset(
|
| 109 |
-
self,
|
| 110 |
-
seed: Optional[int] = None,
|
| 111 |
-
episode_id: Optional[str] = None,
|
| 112 |
-
**kwargs: Any,
|
| 113 |
-
) -> DataObservation:
|
| 114 |
-
"""Reset the environment for a new episode.
|
| 115 |
-
|
| 116 |
-
Args:
|
| 117 |
-
seed: Optional random seed (unused, kept for interface compliance).
|
| 118 |
-
episode_id: Optional episode identifier; generated if not provided.
|
| 119 |
-
**kwargs: Additional keyword arguments. Supports 'task_id' (int, 1-6).
|
| 120 |
-
|
| 121 |
-
Returns:
|
| 122 |
-
An initial observation with the task description and dataset info.
|
| 123 |
-
"""
|
| 124 |
-
task_id = kwargs.get("task_id", 1)
|
| 125 |
-
eid = episode_id or str(uuid.uuid4())
|
| 126 |
-
|
| 127 |
-
self._df = self._source_df.copy()
|
| 128 |
-
self._state = DataState(episode_id=eid, step_count=0, task_id=task_id)
|
| 129 |
-
self._exec_namespace = self._build_namespace()
|
| 130 |
-
|
| 131 |
-
task_cls = TASKS.get(task_id)
|
| 132 |
-
if task_cls is None:
|
| 133 |
-
return DataObservation(
|
| 134 |
-
done=True,
|
| 135 |
-
reward=0.0,
|
| 136 |
-
success=False,
|
| 137 |
-
error=f"Invalid task_id: {task_id}. Must be 1–6.",
|
| 138 |
-
)
|
| 139 |
-
self._task = task_cls(self._df)
|
| 140 |
-
|
| 141 |
-
return DataObservation(
|
| 142 |
-
done=False,
|
| 143 |
-
reward=0.0,
|
| 144 |
-
output="Environment ready. Use 'execute_code' actions to explore the dataset, then 'submit_answer' with your result.",
|
| 145 |
-
task_description=self._task.description,
|
| 146 |
-
dataset_info=self._dataset_info(),
|
| 147 |
-
metadata={"task_id": task_id, "difficulty": self._task.difficulty},
|
| 148 |
-
)
|
| 149 |
-
|
| 150 |
-
def step(
|
| 151 |
-
self,
|
| 152 |
-
action: DataAction,
|
| 153 |
-
timeout_s: Optional[float] = None,
|
| 154 |
-
**kwargs: Any,
|
| 155 |
-
) -> DataObservation:
|
| 156 |
-
"""Execute one step in the environment.
|
| 157 |
-
|
| 158 |
-
Handles two action types:
|
| 159 |
-
- execute_code: runs pandas code in a sandboxed namespace
|
| 160 |
-
- submit_answer: grades the agent's final answer and ends the episode
|
| 161 |
-
|
| 162 |
-
Args:
|
| 163 |
-
action: The agent's action (execute_code or submit_answer).
|
| 164 |
-
timeout_s: Optional timeout in seconds (unused).
|
| 165 |
-
**kwargs: Additional keyword arguments.
|
| 166 |
-
|
| 167 |
-
Returns:
|
| 168 |
-
An observation with execution output, reward, and done flag.
|
| 169 |
-
"""
|
| 170 |
-
self._state.step_count += 1
|
| 171 |
-
|
| 172 |
-
if self._state.answer_submitted:
|
| 173 |
-
return DataObservation(
|
| 174 |
-
done=True,
|
| 175 |
-
reward=0.0,
|
| 176 |
-
output="Episode is already finished. Call reset() to start a new one.",
|
| 177 |
-
success=False,
|
| 178 |
-
)
|
| 179 |
-
|
| 180 |
-
# Check max steps
|
| 181 |
-
if self._state.step_count >= self.MAX_STEPS and action.action_type != "submit_answer":
|
| 182 |
-
self._state.answer_submitted = True
|
| 183 |
-
return DataObservation(
|
| 184 |
-
done=True,
|
| 185 |
-
reward=0.0,
|
| 186 |
-
output=f"Maximum steps ({self.MAX_STEPS}) exceeded without submitting an answer.",
|
| 187 |
-
success=False,
|
| 188 |
-
metadata={"reason": "max_steps_exceeded"},
|
| 189 |
-
)
|
| 190 |
-
|
| 191 |
-
if action.action_type == "execute_code":
|
| 192 |
-
return self._handle_execute_code(action)
|
| 193 |
-
elif action.action_type == "submit_answer":
|
| 194 |
-
return self._handle_submit_answer(action)
|
| 195 |
-
else:
|
| 196 |
-
return DataObservation(
|
| 197 |
-
done=False,
|
| 198 |
-
reward=-0.05,
|
| 199 |
-
success=False,
|
| 200 |
-
error=f"Unknown action_type: {action.action_type}",
|
| 201 |
-
)
|
| 202 |
-
|
| 203 |
-
def _handle_execute_code(self, action: DataAction) -> DataObservation:
    """Execute pandas code in the sandboxed namespace.

    The code is split with the AST: all statements are exec'd once and, if
    the final statement is a bare expression, its value is captured so it
    can be echoed when nothing was printed. (The previous implementation
    re-ran the last line under eval() after exec() had already executed
    it, duplicating any side effects of that line.)

    Args:
        action: The action containing the code to execute.

    Returns:
        An observation with stdout output / expression value, or an error.
    """
    import ast  # local import: the file-level import block is outside this view

    if not action.code:
        return DataObservation(
            done=False,
            reward=-0.05,
            success=False,
            error="No code provided for execute_code action.",
        )

    stdout_capture = io.StringIO()
    old_stdout = sys.stdout
    try:
        tree = ast.parse(action.code)
        # Peel off a trailing bare expression so it runs exactly once
        # and its value can still be echoed back.
        last_expr = None
        if tree.body and isinstance(tree.body[-1], ast.Expr):
            last_expr = ast.Expression(tree.body.pop(-1).value)

        sys.stdout = stdout_capture
        exec(compile(tree, "<agent_code>", "exec"), self._exec_namespace)
        result = None
        if last_expr is not None:
            result = eval(compile(last_expr, "<agent_code>", "eval"), self._exec_namespace)

        output = stdout_capture.getvalue()
        if not output.strip():
            if result is not None:
                output = str(result)
            else:
                output = "(Code executed successfully with no output)"

        return DataObservation(
            done=False,
            reward=0.05,
            output=output[:5000],
            success=True,
            metadata={"steps_remaining": self.MAX_STEPS - self._state.step_count},
        )
    except Exception as e:
        return DataObservation(
            done=False,
            reward=-0.05,
            success=False,
            error=f"{type(e).__name__}: {e}",
            output="",
            metadata={"steps_remaining": self.MAX_STEPS - self._state.step_count},
        )
    finally:
        # Always restore stdout, even if parse/exec/eval raised.
        sys.stdout = old_stdout
|
| 254 |
-
|
| 255 |
-
def _handle_submit_answer(self, action: DataAction) -> DataObservation:
    """Grade the agent's submitted answer and terminate the episode.

    Args:
        action: The action carrying the answer to grade.

    Returns:
        A terminal observation (done=True) with the clamped score as reward.
    """
    if not action.answer:
        return DataObservation(
            done=False,
            reward=-0.05,
            success=False,
            error="No answer provided for submit_answer action.",
        )

    self._state.answer_submitted = True
    # Clamp into [0.05, 0.95] so no episode yields an extreme reward.
    clamped = min(0.95, max(0.05, self._task.grade(action.answer)))
    self._state.final_score = clamped

    return DataObservation(
        done=True,
        reward=clamped,
        output=f"Answer submitted. Score: {clamped:.2f}/1.00",
        success=True,
        metadata={
            "score": clamped,
            "expected_answer": self._task.expected_answer(),
            "submitted_answer": action.answer,
        },
    )
|
| 288 |
-
|
| 289 |
-
@property
def state(self) -> DataState:
    """Return the current episode state.

    Returns:
        The current DataState with episode_id, step_count, task_id, etc.
    """
    # Returns the live state object, not a copy — callers share mutations.
    return self._state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tasks/__init__.py
DELETED
|
@@ -1,29 +0,0 @@
|
|
| 1 |
-
"""Task definitions for the Data Analysis Agent environment."""
|
| 2 |
-
|
| 3 |
-
from tasks.base_task import BaseTask
|
| 4 |
-
from tasks.task_easy import TopRevenueCategoryTask
|
| 5 |
-
from tasks.task_hard import RepeatCustomerCohortTask
|
| 6 |
-
from tasks.task_hard_2 import CustomerLoyaltyRevenueTask
|
| 7 |
-
from tasks.task_hard_3 import SupplierProfitabilityTask
|
| 8 |
-
from tasks.task_medium import CityRevenueShareTask
|
| 9 |
-
from tasks.task_medium_2 import MonthlyRevenueRatioTask
|
| 10 |
-
|
| 11 |
-
# Registry mapping task_id -> task class. Keys should match each class's
# task_id property so lookups and self-reported ids stay consistent.
TASKS = {
    1: TopRevenueCategoryTask,
    2: CityRevenueShareTask,
    3: RepeatCustomerCohortTask,
    4: MonthlyRevenueRatioTask,
    5: CustomerLoyaltyRevenueTask,
    6: SupplierProfitabilityTask,
}

# Public API of the tasks package.
__all__ = [
    "BaseTask",
    "TASKS",
    "TopRevenueCategoryTask",
    "CityRevenueShareTask",
    "RepeatCustomerCohortTask",
    "MonthlyRevenueRatioTask",
    "CustomerLoyaltyRevenueTask",
    "SupplierProfitabilityTask",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tasks/base_task.py
DELETED
|
@@ -1,51 +0,0 @@
|
|
| 1 |
-
from abc import ABC, abstractmethod
|
| 2 |
-
|
| 3 |
-
import pandas as pd
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class BaseTask(ABC):
    """Abstract interface shared by every data analysis task.

    A concrete task supplies the question text, derives the ground-truth
    answer from its dataset, and scores submitted answers.

    Attributes:
        df: The pandas DataFrame holding the dataset the task operates on.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df

    @property
    @abstractmethod
    def task_id(self) -> int:
        """Unique integer identifier for this task."""

    @property
    @abstractmethod
    def difficulty(self) -> str:
        """Difficulty label: 'easy', 'medium', or 'hard'."""

    @property
    @abstractmethod
    def description(self) -> str:
        """Question text shown to the agent."""

    @abstractmethod
    def expected_answer(self) -> str:
        """Derive and return the ground-truth answer.

        Returns:
            The expected answer as a formatted string.
        """

    @abstractmethod
    def grade(self, answer: str) -> float:
        """Score a submitted answer.

        Args:
            answer: The agent's submitted answer string.

        Returns:
            A score between 0.0 and 1.0.
        """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tasks/task_easy.py
DELETED
|
@@ -1,53 +0,0 @@
|
|
| 1 |
-
from tasks.base_task import BaseTask
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
class TopRevenueCategoryTask(BaseTask):
    """Easy task: identify the product category with the most revenue.

    The dataset is grouped by ``category``, ``total_price`` is summed,
    and the category with the largest sum is the expected answer.
    """

    @property
    def task_id(self) -> int:
        """Return the task identifier."""
        return 1

    @property
    def difficulty(self) -> str:
        """Return the difficulty level."""
        return "easy"

    @property
    def description(self) -> str:
        """Return the task question."""
        return (
            "What is the top-selling product category by total revenue? "
            "Submit just the category name as your answer."
        )

    def expected_answer(self) -> str:
        """Return the category whose summed total_price is largest."""
        revenue_by_category = self.df.groupby("category")["total_price"].sum()
        return revenue_by_category.idxmax()

    def grade(self, answer: str) -> float:
        """Grade via case-insensitive substring containment.

        Full credit whenever the expected category name occurs anywhere in
        the submission, so responses like 'The top category is Clothing' or
        'Clothing ($74,792.74)' still pass. The raw 0/1 result is then
        clamped into [0.05, 0.95].

        Args:
            answer: The agent's submitted category name.

        Returns:
            0.95 when the expected category appears in the answer, 0.05 otherwise.
        """
        target = self.expected_answer().strip().lower()
        hit = target in answer.strip().lower()
        return min(0.95, max(0.05, 1.0 if hit else 0.0))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tasks/task_hard.py
DELETED
|
@@ -1,103 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
import pandas as pd
|
| 4 |
-
|
| 5 |
-
from tasks.base_task import BaseTask
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class RepeatCustomerCohortTask(BaseTask):
    """Hard task: analyze customers who ordered in both January and December.

    The agent must identify customers present in both months, count them,
    and compare their average order value against all other customers.
    """

    @property
    def task_id(self) -> int:
        return 3

    @property
    def difficulty(self) -> str:
        return "hard"

    @property
    def description(self) -> str:
        return (
            "How many unique customers placed orders in BOTH January and December? "
            "What is their average order value compared to all other customers? "
            "Submit your answer in the format: "
            "'Cohort: N customers, Cohort AOV: $X.XX, Other AOV: $X.XX'"
        )

    def _compute_cohort(self) -> tuple[set, float, float]:
        """Compute the January-and-December cohort and the two AOVs.

        Returns:
            A tuple of (cohort_customer_ids, cohort_aov, other_aov),
            with both AOVs rounded to cents.
        """
        data = self.df.copy()
        data["order_date"] = pd.to_datetime(data["order_date"])
        months = data["order_date"].dt.month
        cohort = set(data.loc[months == 1, "customer_id"]) & set(
            data.loc[months == 12, "customer_id"]
        )

        in_cohort = data["customer_id"].isin(cohort)
        cohort_aov = data.loc[in_cohort, "total_price"].mean()
        other_aov = data.loc[~in_cohort, "total_price"].mean()
        return cohort, round(cohort_aov, 2), round(other_aov, 2)

    def expected_answer(self) -> str:
        """Compute the expected cohort analysis answer.

        Returns:
            Formatted string like 'Cohort: 57 customers, Cohort AOV: $126.57, Other AOV: $122.94'.
        """
        cohort, cohort_aov, other_aov = self._compute_cohort()
        return f"Cohort: {len(cohort)} customers, Cohort AOV: ${cohort_aov}, Other AOV: ${other_aov}"

    def grade(self, answer: str) -> float:
        """Grade with partial credit for each of the three fields.

        Scoring:
            - 0.33 for the correct customer count (exact match)
            - 0.33 for cohort AOV within ±0.5% of expected
            - 0.34 for other AOV within ±0.5% of expected
        The total is clamped into [0.05, 0.95].

        Args:
            answer: The agent's submitted answer string.

        Returns:
            A score between 0.05 and 0.95.
        """
        cohort, expected_cohort_aov, expected_other_aov = self._compute_cohort()
        score = 0.0

        count_match = re.search(r"Cohort:\s*(\d+)\s*customers?", answer, re.IGNORECASE)
        if count_match and int(count_match.group(1)) == len(cohort):
            score += 0.33

        # Both AOV fields share the same tolerance rule; check them uniformly.
        aov_checks = (
            (r"Cohort\s+AOV:\s*\$?([\d.]+)", expected_cohort_aov, 0.33),
            (r"Other\s+AOV:\s*\$?([\d.]+)", expected_other_aov, 0.34),
        )
        for pattern, expected, points in aov_checks:
            match = re.search(pattern, answer, re.IGNORECASE)
            if not match:
                continue
            try:
                submitted = float(match.group(1))
            except ValueError:
                continue
            if abs(submitted - expected) <= expected * 0.005:
                score += points

        return min(0.95, max(0.05, score))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tasks/task_hard_2.py
DELETED
|
@@ -1,103 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
import sqlite3
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
|
| 5 |
-
import pandas as pd
|
| 6 |
-
|
| 7 |
-
from tasks.base_task import BaseTask
|
| 8 |
-
|
| 9 |
-
DB_PATH = Path(__file__).resolve().parent.parent / "datasets" / "store_data.db"


class CustomerLoyaltyRevenueTask(BaseTask):
    """Hard task: find the highest-revenue customer loyalty tier using cross-source data.

    The agent must query the customer_profiles table from the SQLite database,
    join it with the sales DataFrame on customer_id, and compute revenue by tier.
    The database is accessible via sqlite3.connect(db_path) in the sandbox.
    """

    @property
    def task_id(self) -> int:
        return 5

    @property
    def difficulty(self) -> str:
        return "hard"

    @property
    def description(self) -> str:
        return (
            "Using the customer profiles database (connect with sqlite3.connect(db_path)), "
            "which customer loyalty tier generates the highest total revenue? "
            "What percentage of total revenue does it represent? "
            "Round percentage to 2 decimal places. "
            "Submit your answer in the format: "
            "'Top tier: <name>, Revenue: $X.XX, Percentage: X.XX%'"
        )

    def _load_profiles(self) -> pd.DataFrame:
        """Read (customer_id, loyalty_tier) rows, always closing the connection.

        The previous implementation leaked the sqlite3 connection whenever
        pd.read_sql raised; the finally block guarantees cleanup.
        """
        conn = sqlite3.connect(DB_PATH)
        try:
            return pd.read_sql(
                "SELECT customer_id, loyalty_tier FROM customer_profiles", conn
            )
        finally:
            conn.close()

    def _compute(self) -> tuple:
        """Compute the top loyalty tier and its revenue share.

        Returns:
            A tuple of (top_tier, tier_revenue, percentage).
        """
        profiles = self._load_profiles()
        merged = self.df.merge(profiles, on="customer_id", how="left")
        tier_rev = merged.groupby("loyalty_tier")["total_price"].sum()
        total = merged["total_price"].sum()
        top = tier_rev.idxmax()
        rev = tier_rev[top]
        pct = rev / total * 100
        return top, rev, pct

    def expected_answer(self) -> str:
        """Compute the expected formatted answer.

        Returns:
            Formatted string like 'Top tier: Bronze, Revenue: $97210.91, Percentage: 39.28%'.
        """
        top, rev, pct = self._compute()
        return f"Top tier: {top}, Revenue: ${rev:.2f}, Percentage: {round(pct, 2)}%"

    def grade(self, answer: str) -> float:
        """Grade with partial credit for each of the three fields.

        Scoring:
            - 0.33 for correct tier name (case-insensitive)
            - 0.33 for revenue within ±0.5% of expected
            - 0.34 for percentage within ±0.1 of expected

        Args:
            answer: The agent's submitted answer string.

        Returns:
            A score between 0.05 and 0.95 (clamped).
        """
        top, expected_rev, expected_pct = self._compute()
        score = 0.0

        tier_match = re.search(r"Top tier:\s*([^,]+)", answer, re.IGNORECASE)
        if tier_match and tier_match.group(1).strip().lower() == top.lower():
            score += 0.33

        rev_match = re.search(r"Revenue:\s*\$?([\d.]+)", answer, re.IGNORECASE)
        if rev_match:
            try:
                submitted = float(rev_match.group(1))
                if abs(submitted - expected_rev) <= expected_rev * 0.005:
                    score += 0.33
            except ValueError:
                pass

        pct_match = re.search(r"Percentage:\s*([\d.]+)%?", answer, re.IGNORECASE)
        if pct_match:
            try:
                if abs(float(pct_match.group(1)) - expected_pct) <= 0.1:
                    score += 0.34
            except ValueError:
                pass

        return max(0.05, min(0.95, score))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tasks/task_hard_3.py
DELETED
|
@@ -1,107 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
import sqlite3
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
|
| 5 |
-
import pandas as pd
|
| 6 |
-
|
| 7 |
-
from tasks.base_task import BaseTask
|
| 8 |
-
|
| 9 |
-
DB_PATH = Path(__file__).resolve().parent.parent / "datasets" / "store_data.db"


class SupplierProfitabilityTask(BaseTask):
    """Hard task: find the most profitable supplier using cross-source data.

    The agent must query the product_catalog table from the SQLite database,
    join it with the sales DataFrame on product_name, compute per-order profit
    and margin, then aggregate by supplier.
    The database is accessible via sqlite3.connect(db_path) in the sandbox.
    """

    @property
    def task_id(self) -> int:
        return 6

    @property
    def difficulty(self) -> str:
        return "hard"

    @property
    def description(self) -> str:
        return (
            "Using the product catalog database (connect with sqlite3.connect(db_path)), "
            "which supplier has the highest total profit from orders? "
            "(profit per order = (unit_price - cost_price) * quantity) "
            "What is their total profit and average profit margin? "
            "(margin % = (unit_price - cost_price) / unit_price * 100, "
            "averaged across all their orders) "
            "Round total profit to 2 decimal places and avg margin to 2 decimal places. "
            "Submit your answer in the format: "
            "'Supplier: <name>, Total profit: $X.XX, Avg margin: X.XX%'"
        )

    def _load_catalog(self) -> pd.DataFrame:
        """Read (product_name, supplier, cost_price) rows, always closing the connection.

        The previous implementation leaked the sqlite3 connection whenever
        pd.read_sql raised; the finally block guarantees cleanup.
        """
        conn = sqlite3.connect(DB_PATH)
        try:
            return pd.read_sql(
                "SELECT product_name, supplier, cost_price FROM product_catalog", conn
            )
        finally:
            conn.close()

    def _compute(self) -> tuple:
        """Compute the top supplier by profit and their average margin.

        Returns:
            A tuple of (supplier_name, total_profit, avg_margin_pct).
        """
        catalog = self._load_catalog()
        merged = self.df.merge(catalog, on="product_name", how="left")
        merged["profit"] = (merged["unit_price"] - merged["cost_price"]) * merged["quantity"]
        merged["margin"] = (merged["unit_price"] - merged["cost_price"]) / merged["unit_price"] * 100
        sup_profit = merged.groupby("supplier")["profit"].sum()
        sup_margin = merged.groupby("supplier")["margin"].mean()
        top = sup_profit.idxmax()
        return top, sup_profit[top], sup_margin[top]

    def expected_answer(self) -> str:
        """Compute the expected formatted answer.

        Returns:
            Formatted string like 'Supplier: FashionWorld, Total profit: $38292.08, Avg margin: 52.08%'.
        """
        top, profit, margin = self._compute()
        return f"Supplier: {top}, Total profit: ${profit:.2f}, Avg margin: {round(margin, 2)}%"

    def grade(self, answer: str) -> float:
        """Grade with partial credit for each of the three fields.

        Scoring:
            - 0.33 for correct supplier name (case-insensitive)
            - 0.34 for total profit within ±0.5% of expected
            - 0.33 for avg margin within ±0.1 of expected

        Args:
            answer: The agent's submitted answer string.

        Returns:
            A score between 0.05 and 0.95 (clamped).
        """
        top, expected_profit, expected_margin = self._compute()
        score = 0.0

        sup_match = re.search(r"Supplier:\s*([^,]+)", answer, re.IGNORECASE)
        if sup_match and sup_match.group(1).strip().lower() == top.lower():
            score += 0.33

        profit_match = re.search(r"Total profit:\s*\$?([\d.]+)", answer, re.IGNORECASE)
        if profit_match:
            try:
                submitted = float(profit_match.group(1))
                if abs(submitted - expected_profit) <= expected_profit * 0.005:
                    score += 0.34
            except ValueError:
                pass

        margin_match = re.search(r"Avg margin:\s*([\d.]+)%?", answer, re.IGNORECASE)
        if margin_match:
            try:
                if abs(float(margin_match.group(1)) - expected_margin) <= 0.1:
                    score += 0.33
            except ValueError:
                pass

        return max(0.05, min(0.95, score))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tasks/task_medium.py
DELETED
|
@@ -1,77 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
import pandas as pd
|
| 4 |
-
|
| 5 |
-
from tasks.base_task import BaseTask
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class CityRevenueShareTask(BaseTask):
    """Medium task: identify the city with the highest revenue and its percentage share.

    Revenue is summed per city; the top contributor and its share of overall
    revenue (rounded to two decimals) form the expected answer.
    """

    @property
    def task_id(self) -> int:
        return 2

    @property
    def difficulty(self) -> str:
        return "medium"

    @property
    def description(self) -> str:
        return (
            "Which city generates the most revenue? What percentage of total revenue "
            "does it represent? Round to 2 decimal places. "
            "Submit your answer in the format: 'City: <name>, Percentage: <X.XX>%'"
        )

    def _top_city_and_share(self) -> tuple:
        """Return (top_city, rounded_percentage_of_total_revenue)."""
        revenue = self.df.groupby("city")["total_price"].sum()
        winner = revenue.idxmax()
        share = round(revenue[winner] / revenue.sum() * 100, 2)
        return winner, share

    def expected_answer(self) -> str:
        """Compute the top city and its revenue share.

        Returns:
            Formatted string like 'City: London, Percentage: 10.81%'.
        """
        winner, share = self._top_city_and_share()
        return f"City: {winner}, Percentage: {share}%"

    def grade(self, answer: str) -> float:
        """Grade with partial credit for the city and the percentage.

        Scoring:
            - 0.5 for the correct city name (case-insensitive)
            - 0.5 for a percentage within ±0.1 of expected
        The total is clamped into [0.05, 0.95].

        Args:
            answer: The agent's submitted answer string.

        Returns:
            A score between 0.05 and 0.95.
        """
        expected_city, expected_pct = self._top_city_and_share()
        score = 0.0

        city_match = re.search(r"City:\s*([^,]+)", answer, re.IGNORECASE)
        if city_match and city_match.group(1).strip().lower() == expected_city.lower():
            score += 0.5

        pct_match = re.search(r"Percentage:\s*([\d.]+)%?", answer, re.IGNORECASE)
        if pct_match:
            try:
                if abs(float(pct_match.group(1)) - expected_pct) <= 0.1:
                    score += 0.5
            except ValueError:
                pass

        return min(0.95, max(0.05, score))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tasks/task_medium_2.py
DELETED
|
@@ -1,88 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
import pandas as pd
|
| 4 |
-
|
| 5 |
-
from tasks.base_task import BaseTask
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class MonthlyRevenueRatioTask(BaseTask):
    """Medium task: find the best and worst months by revenue and compute their ratio.

    The agent must parse order_date, group by month, find the extremes,
    and compute how many times larger the best month is versus the worst.
    """

    @property
    def task_id(self) -> int:
        return 4

    @property
    def difficulty(self) -> str:
        return "medium"

    @property
    def description(self) -> str:
        return (
            "What is the best and worst performing month by total revenue in 2024? "
            "What is the ratio of best to worst month revenue? Round ratio to 2 decimal places. "
            "Submit your answer in the format: "
            "'Best: YYYY-MM ($X.XX), Worst: YYYY-MM ($X.XX), Ratio: X.XX'"
        )

    def _compute(self) -> tuple:
        """Compute the best month, worst month, and their revenue ratio.

        NOTE(review): the question says 'in 2024' but no year filter is
        applied here — this assumes the dataset only contains 2024 orders;
        confirm against the dataset.

        Returns:
            A tuple of (best_month_str, best_rev, worst_month_str, worst_rev, ratio).
        """
        data = self.df.copy()
        data["order_date"] = pd.to_datetime(data["order_date"])
        monthly = data.groupby(data["order_date"].dt.to_period("M"))["total_price"].sum()
        best_period = monthly.idxmax()
        worst_period = monthly.idxmin()
        ratio = round(monthly[best_period] / monthly[worst_period], 2)
        return (
            str(best_period),
            monthly[best_period],
            str(worst_period),
            monthly[worst_period],
            ratio,
        )

    def expected_answer(self) -> str:
        """Compute the expected formatted answer.

        Returns:
            Formatted string like 'Best: 2024-12 ($23025.82), Worst: 2024-05 ($16871.48), Ratio: 1.36'.
        """
        best, best_rev, worst, worst_rev, ratio = self._compute()
        return f"Best: {best} (${best_rev:.2f}), Worst: {worst} (${worst_rev:.2f}), Ratio: {ratio}"

    def grade(self, answer: str) -> float:
        """Grade with partial credit for each of the three fields.

        Scoring:
            - 0.33 for the correct best month (exact YYYY-MM match)
            - 0.33 for the correct worst month (exact YYYY-MM match)
            - 0.34 for a ratio within ±0.01 of expected
        The total is clamped into [0.05, 0.95].

        Args:
            answer: The agent's submitted answer string.

        Returns:
            A score between 0.05 and 0.95.
        """
        best, _, worst, _, expected_ratio = self._compute()
        score = 0.0

        # Best/worst month checks share the same shape; run them uniformly.
        for label, expected_month in (("Best", best), ("Worst", worst)):
            month_match = re.search(
                rf"{label}:\s*([\d]{{4}}-[\d]{{2}})", answer, re.IGNORECASE
            )
            if month_match and month_match.group(1).strip() == expected_month:
                score += 0.33

        ratio_match = re.search(r"Ratio:\s*([\d.]+)", answer, re.IGNORECASE)
        if ratio_match:
            try:
                if abs(float(ratio_match.group(1)) - expected_ratio) <= 0.01:
                    score += 0.34
            except ValueError:
                pass

        return min(0.95, max(0.05, score))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uv.lock
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|