Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +2 -35
- Dockerfile +34 -0
- README.md +125 -5
- codedark/.pytest_cache/.gitignore +2 -0
- codedark/.pytest_cache/CACHEDIR.TAG +4 -0
- codedark/.pytest_cache/README.md +8 -0
- codedark/.pytest_cache/v/cache/lastfailed +6 -0
- codedark/.pytest_cache/v/cache/nodeids +25 -0
- codedark/README.md +174 -0
- codedark/__init__.py +38 -0
- codedark/client.py +197 -0
- codedark/data/bank.csv +3 -0
- codedark/data/road.csv +3 -0
- codedark/data/tasks/final_25_tasks.jsonl +25 -0
- codedark/models.py +150 -0
- codedark/openenv.yaml +62 -0
- codedark/pyproject.toml +60 -0
- codedark/server/Dockerfile +34 -0
- codedark/server/__init__.py +1 -0
- codedark/server/app.py +195 -0
- codedark/server/environment.py +319 -0
- codedark/server/requirements.txt +7 -0
- codedark/server/scoring.py +332 -0
- codedark/server/tools.py +308 -0
- codedark/tests/__init__.py +1 -0
- codedark/tests/test_environment.py +181 -0
- data/bank.csv +3 -0
- data/road.csv +3 -0
- data/tasks/final_25_tasks.jsonl +25 -0
- requirements.txt +6 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,2 @@
|
|
| 1 |
-
*.
|
| 2 |
-
*.
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.csv filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.jsonl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
# Set working directory
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install system dependencies
|
| 7 |
+
RUN apt-get update && \
|
| 8 |
+
apt-get install -y --no-install-recommends curl && \
|
| 9 |
+
rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
# Copy requirements and install Python dependencies
|
| 12 |
+
COPY requirements.txt .
|
| 13 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 14 |
+
|
| 15 |
+
# Copy application code
|
| 16 |
+
COPY codedark/ ./codedark/
|
| 17 |
+
COPY data/ ./data/
|
| 18 |
+
|
| 19 |
+
# Environment variables
|
| 20 |
+
ENV PYTHONUNBUFFERED=1
|
| 21 |
+
ENV CODEDARK_DATA_DIR=/app/data
|
| 22 |
+
ENV CODEDARK_TASKS_PATH=/app/data/tasks/final_25_tasks.jsonl
|
| 23 |
+
ENV HOST=0.0.0.0
|
| 24 |
+
ENV PORT=7860
|
| 25 |
+
|
| 26 |
+
# Expose HuggingFace Spaces port
|
| 27 |
+
EXPOSE 7860
|
| 28 |
+
|
| 29 |
+
# Healthcheck
|
| 30 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
|
| 31 |
+
CMD curl -f http://localhost:7860/health || exit 1
|
| 32 |
+
|
| 33 |
+
# Run server
|
| 34 |
+
CMD ["python", "-m", "uvicorn", "codedark.server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,130 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: CodeDark Environment Server
|
| 3 |
+
emoji: 📊
|
| 4 |
+
colorFrom: yellow
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
tags:
|
| 10 |
+
- openenv
|
| 11 |
+
- reinforcement-learning
|
| 12 |
+
- data-analytics
|
| 13 |
+
- agents
|
| 14 |
+
- benchmark
|
| 15 |
---
|
| 16 |
|
| 17 |
+
# CodeDark: Data Analytics Environment for RL Agents
|
| 18 |
+
|
| 19 |
+
**OpenEnv-compatible multi-turn environment for training AI agents on real business analytics tasks.**
|
| 20 |
+
|
| 21 |
+
## Overview
|
| 22 |
+
|
| 23 |
+
CodeDark is the first data analytics environment in the OpenEnv ecosystem. It challenges AI agents to analyze CSV datasets using Python/Pandas, testing their ability to be data scientists rather than just code executors.
|
| 24 |
+
|
| 25 |
+
### Key Features
|
| 26 |
+
|
| 27 |
+
- **Real Business Tasks**: Bank marketing and road safety datasets with genuine analytical questions
|
| 28 |
+
- **Multi-Turn Interaction**: Agents explore data, save notes, ask clarifications, and submit answers
|
| 29 |
+
- **Shaped Rewards**: 80% correctness + 10% efficiency + 10% token cost
|
| 30 |
+
- **Pre-Benchmarked**: 25 curated L5-L6 difficulty tasks validated on 11+ models
|
| 31 |
+
|
| 32 |
+
## Quick Start
|
| 33 |
+
|
| 34 |
+
### Connect to the Environment
|
| 35 |
+
|
| 36 |
+
```python
|
| 37 |
+
from openenv import EnvClient
|
| 38 |
+
|
| 39 |
+
# Connect to this Space
|
| 40 |
+
env = EnvClient.from_hub("openenv/codedark")
|
| 41 |
+
|
| 42 |
+
# Reset for a new task
|
| 43 |
+
obs = env.reset()
|
| 44 |
+
print(f"Task: {obs['question']}")
|
| 45 |
+
|
| 46 |
+
# Execute Python code
|
| 47 |
+
obs = env.step({"tool": "run_python", "args": "<code>result = df.shape</code>"})
|
| 48 |
+
print(f"Result: {obs['stdout']}")
|
| 49 |
+
|
| 50 |
+
# Submit answer
|
| 51 |
+
obs = env.step({"tool": "submit_answer", "args": "<answer>42.5</answer>"})
|
| 52 |
+
print(f"Reward: {obs['reward']}")
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
### Available Tools
|
| 56 |
+
|
| 57 |
+
| Tool | Description |
|
| 58 |
+
| --------------- | -------------------------------------------------------------- |
|
| 59 |
+
| `run_python` | Execute Python/pandas code. Store result in `result` variable. |
|
| 60 |
+
| `read_notes` | Read saved notes from previous turns. |
|
| 61 |
+
| `save_note` | Save observations for later recall. |
|
| 62 |
+
| `clarify` | Ask clarifying questions (max 2 per episode). |
|
| 63 |
+
| `submit_answer` | Submit final answer. Ends episode. |
|
| 64 |
+
|
| 65 |
+
## Datasets
|
| 66 |
+
|
| 67 |
+
### Bank Marketing (750K rows)
|
| 68 |
+
|
| 69 |
+
- **Target**: Term deposit subscription prediction
|
| 70 |
+
- **Features**: age, job, marital, education, balance, housing, loan, contact, day, month, duration, campaign
|
| 71 |
+
|
| 72 |
+
### Road Safety (500K rows)
|
| 73 |
+
|
| 74 |
+
- **Target**: Accident risk assessment
|
| 75 |
+
- **Features**: road_type, num_lanes, curvature, speed_limit, lighting, weather, time_of_day
|
| 76 |
+
|
| 77 |
+
## Task Difficulty
|
| 78 |
+
|
| 79 |
+
| Level | Complexity | Example |
|
| 80 |
+
| ----- | --------------- | -------------------------------------------- |
|
| 81 |
+
| L4 | Quartile/binned | "Subscription rate in Q1 balance?" |
|
| 82 |
+
| L5 | Multi-condition | "Rate for month='may' AND job='management'?" |
|
| 83 |
+
| L6 | Nested extrema | "In lowest subscription month, avg day?" |
|
| 84 |
+
|
| 85 |
+
## Reward Structure
|
| 86 |
+
|
| 87 |
+
| Component | Weight | Description |
|
| 88 |
+
| ----------- | ------ | ----------------------------------------------- |
|
| 89 |
+
| Correctness | 80% | Binary correct/incorrect with numeric tolerance |
|
| 90 |
+
| Efficiency | 10% | Fewer turns = better score |
|
| 91 |
+
| Token Cost | 10% | Lower token usage = better score |
|
| 92 |
+
|
| 93 |
+
## API Endpoints
|
| 94 |
+
|
| 95 |
+
| Endpoint | Method | Description |
|
| 96 |
+
| ----------- | ------ | --------------------- |
|
| 97 |
+
| `/health` | GET | Health check |
|
| 98 |
+
| `/reset` | POST | Reset for new episode |
|
| 99 |
+
| `/step` | POST | Execute action |
|
| 100 |
+
| `/state` | GET | Current state |
|
| 101 |
+
| `/metadata` | GET | Environment metadata |
|
| 102 |
+
| `/schema` | GET | Type schemas |
|
| 103 |
+
|
| 104 |
+
## Benchmark Results
|
| 105 |
+
|
| 106 |
+
Pre-benchmarked on 11+ models with 1,844 completions:
|
| 107 |
+
|
| 108 |
+
| Model | Accuracy | Avg Turns |
|
| 109 |
+
| ---------------- | -------- | --------- |
|
| 110 |
+
| Claude Opus 4.5 | 77.3% | 4.2 |
|
| 111 |
+
| Qwen3 Max | 46.7% | 5.1 |
|
| 112 |
+
| Mistral Large | 45.3% | 5.8 |
|
| 113 |
+
| Llama 4 Maverick | 38.7% | 6.2 |
|
| 114 |
+
|
| 115 |
+
## Links
|
| 116 |
+
|
| 117 |
+
- **GitHub**: [vj-09/codeblue-env](https://github.com/vj-09/codeblue-env)
|
| 118 |
+
- **Leaderboard**: [analytics-rl.com](https://www.analytics-rl.com)
|
| 119 |
+
- **OpenEnv Spec**: [meta-pytorch/OpenEnv](https://github.com/meta-pytorch/OpenEnv)
|
| 120 |
+
|
| 121 |
+
## License
|
| 122 |
+
|
| 123 |
+
MIT License
|
| 124 |
+
|
| 125 |
+
## Author
|
| 126 |
+
|
| 127 |
+
**Vijay Athithya**
|
| 128 |
+
|
| 129 |
+
- GitHub: [@vj-09](https://github.com/vj-09)
|
| 130 |
+
- LinkedIn: [vijay-athithya](https://www.linkedin.com/in/vijay-athithya/)
|
codedark/.pytest_cache/.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Created by pytest automatically.
|
| 2 |
+
*
|
codedark/.pytest_cache/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
codedark/.pytest_cache/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
codedark/.pytest_cache/v/cache/lastfailed
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tests/test_environment.py::TestModels": true,
|
| 3 |
+
"tests/test_environment.py::TestTools": true,
|
| 4 |
+
"tests/test_environment.py::TestScoring": true,
|
| 5 |
+
"tests/test_environment.py::TestEnvironment": true
|
| 6 |
+
}
|
codedark/.pytest_cache/v/cache/nodeids
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
"tests/test_environment.py::TestEnvironment::test_reset_loads_task",
|
| 3 |
+
"tests/test_environment.py::TestEnvironment::test_reset_specific_task",
|
| 4 |
+
"tests/test_environment.py::TestEnvironment::test_step_read_notes",
|
| 5 |
+
"tests/test_environment.py::TestEnvironment::test_step_run_python",
|
| 6 |
+
"tests/test_environment.py::TestEnvironment::test_step_save_note",
|
| 7 |
+
"tests/test_environment.py::TestEnvironment::test_step_submit_answer",
|
| 8 |
+
"tests/test_environment.py::TestEnvironment::test_turn_counting",
|
| 9 |
+
"tests/test_environment.py::TestModels::test_action_creation",
|
| 10 |
+
"tests/test_environment.py::TestModels::test_observation_defaults",
|
| 11 |
+
"tests/test_environment.py::TestModels::test_state_defaults",
|
| 12 |
+
"tests/test_environment.py::TestScoring::test_compute_reward_correct",
|
| 13 |
+
"tests/test_environment.py::TestScoring::test_correctness_exact_match",
|
| 14 |
+
"tests/test_environment.py::TestScoring::test_correctness_scale_error",
|
| 15 |
+
"tests/test_environment.py::TestScoring::test_correctness_within_tolerance",
|
| 16 |
+
"tests/test_environment.py::TestScoring::test_correctness_wrong",
|
| 17 |
+
"tests/test_environment.py::TestScoring::test_efficiency_correct_answer",
|
| 18 |
+
"tests/test_environment.py::TestScoring::test_efficiency_incorrect_answer",
|
| 19 |
+
"tests/test_environment.py::TestTools::test_parse_run_python",
|
| 20 |
+
"tests/test_environment.py::TestTools::test_parse_run_python_missing_tag",
|
| 21 |
+
"tests/test_environment.py::TestTools::test_parse_submit_answer",
|
| 22 |
+
"tests/test_environment.py::TestTools::test_read_notes_empty",
|
| 23 |
+
"tests/test_environment.py::TestTools::test_read_notes_with_content",
|
| 24 |
+
"tests/test_environment.py::TestTools::test_save_note"
|
| 25 |
+
]
|
codedark/README.md
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeDark
|
| 2 |
+
|
| 3 |
+
**OpenEnv-compatible multi-turn data analytics environment for RL agent training.**
|
| 4 |
+
|
| 5 |
+
Train AI agents to be data scientists, not just code executors. CodeDark features real business analytics tasks with pandas/numpy, multi-metric reward shaping, and skill-based curriculum.
|
| 6 |
+
|
| 7 |
+
## Quick Start
|
| 8 |
+
|
| 9 |
+
### Server
|
| 10 |
+
|
| 11 |
+
```bash
|
| 12 |
+
# Install
|
| 13 |
+
pip install -e .
|
| 14 |
+
|
| 15 |
+
# Run server
|
| 16 |
+
python -m codedark.server.app
|
| 17 |
+
# Server runs at http://localhost:8000
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
### Client
|
| 21 |
+
|
| 22 |
+
```python
|
| 23 |
+
from codedark import CodeDarkEnv
|
| 24 |
+
|
| 25 |
+
env = CodeDarkEnv("http://localhost:8000")
|
| 26 |
+
|
| 27 |
+
# Reset for new episode
|
| 28 |
+
obs = env.reset()
|
| 29 |
+
print(f"Task: {obs['question']}")
|
| 30 |
+
|
| 31 |
+
# Execute Python code
|
| 32 |
+
obs = env.run_python("result = df.shape")
|
| 33 |
+
print(f"Shape: {obs['stdout']}")
|
| 34 |
+
|
| 35 |
+
# Explore the data
|
| 36 |
+
obs = env.run_python("result = df.columns.tolist()")
|
| 37 |
+
print(f"Columns: {obs['stdout']}")
|
| 38 |
+
|
| 39 |
+
# Calculate and submit answer
|
| 40 |
+
obs = env.run_python("result = df['y'].mean() * 100")
|
| 41 |
+
obs = env.submit_answer(11.26)
|
| 42 |
+
print(f"Reward: {obs['reward']}")
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
### Docker
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
# Build
|
| 49 |
+
docker build -t codedark:latest -f server/Dockerfile .
|
| 50 |
+
|
| 51 |
+
# Run
|
| 52 |
+
docker run -p 8000:8000 codedark:latest
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
## Tools
|
| 56 |
+
|
| 57 |
+
Agents have access to 5 tools:
|
| 58 |
+
|
| 59 |
+
| Tool | Description |
|
| 60 |
+
| --------------- | --------------------------------------------------------------- |
|
| 61 |
+
| `run_python` | Execute Python/pandas code. Store output in `result` variable. |
|
| 62 |
+
| `read_notes` | Read all saved notes from previous turns. |
|
| 63 |
+
| `save_note` | Save observations for later recall. Notes persist across turns. |
|
| 64 |
+
| `clarify` | Ask clarifying questions about the task (max 2 per episode). |
|
| 65 |
+
| `submit_answer` | Submit final answer. Ends episode. |
|
| 66 |
+
|
| 67 |
+
## Reward Structure
|
| 68 |
+
|
| 69 |
+
Total reward is computed from three components (max 1.0):
|
| 70 |
+
|
| 71 |
+
| Component | Weight | Description |
|
| 72 |
+
| ----------- | ------ | ----------------------------------------------- |
|
| 73 |
+
| Correctness | 80% | Binary correct/incorrect with numeric tolerance |
|
| 74 |
+
| Efficiency | 10% | Fewer turns = better score |
|
| 75 |
+
| Token Cost | 10% | Lower token usage = better score |
|
| 76 |
+
|
| 77 |
+
## Datasets
|
| 78 |
+
|
| 79 |
+
### Bank Marketing
|
| 80 |
+
|
| 81 |
+
- **Records**: 750,000 customers
|
| 82 |
+
- **Target**: Term deposit subscription (y = 0/1)
|
| 83 |
+
- **Features**: age, job, marital, education, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome
|
| 84 |
+
|
| 85 |
+
### Road Safety
|
| 86 |
+
|
| 87 |
+
- **Records**: 500,000 road segments
|
| 88 |
+
- **Target**: Accident risk (continuous)
|
| 89 |
+
- **Features**: road_type, num_lanes, curvature, speed_limit, lighting, weather, road_signs_present, time_of_day, num_reported_accidents
|
| 90 |
+
|
| 91 |
+
## Task Difficulty
|
| 92 |
+
|
| 93 |
+
| Level | Complexity | Example |
|
| 94 |
+
| ----- | --------------- | -------------------------------------------- |
|
| 95 |
+
| L4 | Quartile/binned | "Subscription rate in Q1 balance?" |
|
| 96 |
+
| L5 | Multi-condition | "Rate for month='may' AND job='management'?" |
|
| 97 |
+
| L6 | Nested extrema | "In lowest subscription month, avg day?" |
|
| 98 |
+
|
| 99 |
+
## API Endpoints
|
| 100 |
+
|
| 101 |
+
| Endpoint | Method | Description |
|
| 102 |
+
| ----------- | ------ | --------------------- |
|
| 103 |
+
| `/health` | GET | Health check |
|
| 104 |
+
| `/reset` | POST | Reset for new episode |
|
| 105 |
+
| `/step` | POST | Execute action |
|
| 106 |
+
| `/state` | GET | Current state |
|
| 107 |
+
| `/metadata` | GET | Environment metadata |
|
| 108 |
+
| `/schema` | GET | Type schemas |
|
| 109 |
+
|
| 110 |
+
## Benchmark Results
|
| 111 |
+
|
| 112 |
+
Pre-benchmarked on 11+ models with 1,844 completions:
|
| 113 |
+
|
| 114 |
+
| Model | Accuracy | Cost/Task |
|
| 115 |
+
| ---------------- | -------- | --------- |
|
| 116 |
+
| Claude Opus 4.5 | 77.3% | $0.89 |
|
| 117 |
+
| Qwen3 Max | 46.7% | $0.12 |
|
| 118 |
+
| Mistral Large | 45.3% | $0.18 |
|
| 119 |
+
| Llama 4 Maverick | 38.7% | $0.08 |
|
| 120 |
+
|
| 121 |
+
## Environment Variables
|
| 122 |
+
|
| 123 |
+
| Variable | Default | Description |
|
| 124 |
+
| --------------------- | --------------------------------- | ------------------------- |
|
| 125 |
+
| `CODEDARK_DATA_DIR` | `data/` | Path to CSV files |
|
| 126 |
+
| `CODEDARK_TASKS_PATH` | `data/tasks/final_25_tasks.jsonl` | Path to tasks file |
|
| 127 |
+
| `CODEDARK_MAX_TURNS` | `10` | Maximum turns per episode |
|
| 128 |
+
| `HOST` | `0.0.0.0` | Server host |
|
| 129 |
+
| `PORT` | `8000` | Server port |
|
| 130 |
+
|
| 131 |
+
## Project Structure
|
| 132 |
+
|
| 133 |
+
```
|
| 134 |
+
codedark/
|
| 135 |
+
├── __init__.py # Package exports
|
| 136 |
+
├── models.py # Action, Observation, State dataclasses
|
| 137 |
+
├── client.py # HTTP client
|
| 138 |
+
├── openenv.yaml # OpenEnv manifest
|
| 139 |
+
├── pyproject.toml # Package config
|
| 140 |
+
├── server/
|
| 141 |
+
│ ├── app.py # FastAPI application
|
| 142 |
+
│ ├── environment.py # Core environment logic
|
| 143 |
+
│ ├── tools.py # Tool implementations
|
| 144 |
+
│ ├── scoring.py # Reward computation
|
| 145 |
+
│ ├── Dockerfile # Container spec
|
| 146 |
+
│ └── requirements.txt # Dependencies
|
| 147 |
+
├── data/
|
| 148 |
+
│ ├── bank.csv # Bank marketing dataset
|
| 149 |
+
│ ├── road.csv # Road safety dataset
|
| 150 |
+
│ └── tasks/
|
| 151 |
+
│ └── final_25_tasks.jsonl
|
| 152 |
+
└── tests/
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
## OpenEnv Compatibility
|
| 156 |
+
|
| 157 |
+
CodeDark follows the [OpenEnv specification](https://huggingface.co/openenv):
|
| 158 |
+
|
| 159 |
+
- Gymnasium-style `reset()` / `step()` API
|
| 160 |
+
- Pydantic models for Action, Observation, State
|
| 161 |
+
- FastAPI server with standard endpoints
|
| 162 |
+
- Docker containerization for isolated execution
|
| 163 |
+
- HTTP + WebSocket transport
|
| 164 |
+
|
| 165 |
+
## License
|
| 166 |
+
|
| 167 |
+
MIT
|
| 168 |
+
|
| 169 |
+
## Author
|
| 170 |
+
|
| 171 |
+
Vijay Athithya
|
| 172 |
+
|
| 173 |
+
- GitHub: [vj-09](https://github.com/vj-09)
|
| 174 |
+
- LinkedIn: [vijay-athithya](https://www.linkedin.com/in/vijay-athithya/)
|
codedark/__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CodeDark - OpenEnv-Compatible Data Analytics Environment
|
| 3 |
+
|
| 4 |
+
Multi-turn RL environment for training AI agents on data analytics tasks.
|
| 5 |
+
|
| 6 |
+
Example usage:
|
| 7 |
+
from codedark import CodeDarkEnv
|
| 8 |
+
|
| 9 |
+
env = CodeDarkEnv("http://localhost:8000")
|
| 10 |
+
obs = env.reset()
|
| 11 |
+
print(f"Task: {obs['question']}")
|
| 12 |
+
|
| 13 |
+
obs = env.run_python("result = df['y'].mean() * 100")
|
| 14 |
+
obs = env.submit_answer(11.26)
|
| 15 |
+
print(f"Reward: {obs['reward']}")
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from .client import CodeDarkEnv
|
| 19 |
+
from .models import (
|
| 20 |
+
CodeDarkAction,
|
| 21 |
+
CodeDarkObservation,
|
| 22 |
+
CodeDarkState,
|
| 23 |
+
ResetRequest,
|
| 24 |
+
StepRequest,
|
| 25 |
+
HealthResponse,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
__all__ = [
|
| 29 |
+
"CodeDarkEnv",
|
| 30 |
+
"CodeDarkAction",
|
| 31 |
+
"CodeDarkObservation",
|
| 32 |
+
"CodeDarkState",
|
| 33 |
+
"ResetRequest",
|
| 34 |
+
"StepRequest",
|
| 35 |
+
"HealthResponse",
|
| 36 |
+
]
|
| 37 |
+
|
| 38 |
+
__version__ = "0.1.0"
|
codedark/client.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CodeDark Client
|
| 3 |
+
|
| 4 |
+
HTTP client for interacting with CodeDark environment server.
|
| 5 |
+
Follows OpenEnv EnvClient pattern.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Any, Dict, Optional
|
| 9 |
+
import requests
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class CodeDarkEnv:
|
| 13 |
+
"""Client for CodeDark environment.
|
| 14 |
+
|
| 15 |
+
Example usage:
|
| 16 |
+
env = CodeDarkEnv("http://localhost:8000")
|
| 17 |
+
obs = env.reset()
|
| 18 |
+
print(f"Task: {obs['question']}")
|
| 19 |
+
|
| 20 |
+
obs = env.step("run_python", "<code>result = df.shape</code>")
|
| 21 |
+
print(f"Result: {obs['stdout']}")
|
| 22 |
+
|
| 23 |
+
obs = env.step("submit_answer", "<answer>11.26</answer>")
|
| 24 |
+
print(f"Reward: {obs['reward']}")
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
def __init__(self, base_url: str = "http://localhost:8000", timeout: int = 30):
|
| 28 |
+
"""Initialize client.
|
| 29 |
+
|
| 30 |
+
Args:
|
| 31 |
+
base_url: Server URL
|
| 32 |
+
timeout: Request timeout in seconds
|
| 33 |
+
"""
|
| 34 |
+
self.base_url = base_url.rstrip("/")
|
| 35 |
+
self.timeout = timeout
|
| 36 |
+
self._session = requests.Session()
|
| 37 |
+
|
| 38 |
+
def reset(
|
| 39 |
+
self, task_id: Optional[str] = None, seed: Optional[int] = None
|
| 40 |
+
) -> Dict[str, Any]:
|
| 41 |
+
"""Reset environment for a new episode.
|
| 42 |
+
|
| 43 |
+
Args:
|
| 44 |
+
task_id: Specific task to load (optional)
|
| 45 |
+
seed: Random seed for task selection (optional)
|
| 46 |
+
|
| 47 |
+
Returns:
|
| 48 |
+
Initial observation dict
|
| 49 |
+
"""
|
| 50 |
+
payload = {}
|
| 51 |
+
if task_id is not None:
|
| 52 |
+
payload["task_id"] = task_id
|
| 53 |
+
if seed is not None:
|
| 54 |
+
payload["seed"] = seed
|
| 55 |
+
|
| 56 |
+
response = self._session.post(
|
| 57 |
+
f"{self.base_url}/reset",
|
| 58 |
+
json=payload if payload else None,
|
| 59 |
+
timeout=self.timeout,
|
| 60 |
+
)
|
| 61 |
+
response.raise_for_status()
|
| 62 |
+
return response.json()
|
| 63 |
+
|
| 64 |
+
def step(self, tool: str, args: str = "") -> Dict[str, Any]:
|
| 65 |
+
"""Execute an action.
|
| 66 |
+
|
| 67 |
+
Args:
|
| 68 |
+
tool: Tool name (run_python, read_notes, save_note, clarify, submit_answer)
|
| 69 |
+
args: Tool-specific arguments
|
| 70 |
+
|
| 71 |
+
Returns:
|
| 72 |
+
Observation dict
|
| 73 |
+
"""
|
| 74 |
+
response = self._session.post(
|
| 75 |
+
f"{self.base_url}/step",
|
| 76 |
+
json={"tool": tool, "args": args},
|
| 77 |
+
timeout=self.timeout,
|
| 78 |
+
)
|
| 79 |
+
response.raise_for_status()
|
| 80 |
+
return response.json()
|
| 81 |
+
|
| 82 |
+
def state(self) -> Dict[str, Any]:
|
| 83 |
+
"""Get current environment state.
|
| 84 |
+
|
| 85 |
+
Returns:
|
| 86 |
+
State dict
|
| 87 |
+
"""
|
| 88 |
+
response = self._session.get(
|
| 89 |
+
f"{self.base_url}/state",
|
| 90 |
+
timeout=self.timeout,
|
| 91 |
+
)
|
| 92 |
+
response.raise_for_status()
|
| 93 |
+
return response.json()
|
| 94 |
+
|
| 95 |
+
def health(self) -> Dict[str, Any]:
|
| 96 |
+
"""Check server health.
|
| 97 |
+
|
| 98 |
+
Returns:
|
| 99 |
+
Health status dict
|
| 100 |
+
"""
|
| 101 |
+
response = self._session.get(
|
| 102 |
+
f"{self.base_url}/health",
|
| 103 |
+
timeout=self.timeout,
|
| 104 |
+
)
|
| 105 |
+
response.raise_for_status()
|
| 106 |
+
return response.json()
|
| 107 |
+
|
| 108 |
+
def metadata(self) -> Dict[str, Any]:
|
| 109 |
+
"""Get environment metadata.
|
| 110 |
+
|
| 111 |
+
Returns:
|
| 112 |
+
Metadata dict
|
| 113 |
+
"""
|
| 114 |
+
response = self._session.get(
|
| 115 |
+
f"{self.base_url}/metadata",
|
| 116 |
+
timeout=self.timeout,
|
| 117 |
+
)
|
| 118 |
+
response.raise_for_status()
|
| 119 |
+
return response.json()
|
| 120 |
+
|
| 121 |
+
def schema(self) -> Dict[str, Any]:
|
| 122 |
+
"""Get environment type schemas.
|
| 123 |
+
|
| 124 |
+
Returns:
|
| 125 |
+
Schema dict for action, observation, state
|
| 126 |
+
"""
|
| 127 |
+
response = self._session.get(
|
| 128 |
+
f"{self.base_url}/schema",
|
| 129 |
+
timeout=self.timeout,
|
| 130 |
+
)
|
| 131 |
+
response.raise_for_status()
|
| 132 |
+
return response.json()
|
| 133 |
+
|
| 134 |
+
# Convenience methods for common tools
|
| 135 |
+
|
| 136 |
+
def run_python(self, code: str) -> Dict[str, Any]:
|
| 137 |
+
"""Execute Python code.
|
| 138 |
+
|
| 139 |
+
Args:
|
| 140 |
+
code: Python code to execute
|
| 141 |
+
|
| 142 |
+
Returns:
|
| 143 |
+
Observation dict
|
| 144 |
+
"""
|
| 145 |
+
return self.step("run_python", f"<code>{code}</code>")
|
| 146 |
+
|
| 147 |
+
def read_notes(self) -> Dict[str, Any]:
|
| 148 |
+
"""Read all saved notes.
|
| 149 |
+
|
| 150 |
+
Returns:
|
| 151 |
+
Observation dict
|
| 152 |
+
"""
|
| 153 |
+
return self.step("read_notes", "")
|
| 154 |
+
|
| 155 |
+
def save_note(self, content: str) -> Dict[str, Any]:
|
| 156 |
+
"""Save a note.
|
| 157 |
+
|
| 158 |
+
Args:
|
| 159 |
+
content: Note content
|
| 160 |
+
|
| 161 |
+
Returns:
|
| 162 |
+
Observation dict
|
| 163 |
+
"""
|
| 164 |
+
return self.step("save_note", content)
|
| 165 |
+
|
| 166 |
+
def clarify(self, question: str) -> Dict[str, Any]:
|
| 167 |
+
"""Ask a clarifying question.
|
| 168 |
+
|
| 169 |
+
Args:
|
| 170 |
+
question: Clarifying question
|
| 171 |
+
|
| 172 |
+
Returns:
|
| 173 |
+
Observation dict
|
| 174 |
+
"""
|
| 175 |
+
return self.step("clarify", f"<question>{question}</question>")
|
| 176 |
+
|
| 177 |
+
def submit_answer(self, answer: Any) -> Dict[str, Any]:
|
| 178 |
+
"""Submit final answer.
|
| 179 |
+
|
| 180 |
+
Args:
|
| 181 |
+
answer: Answer value
|
| 182 |
+
|
| 183 |
+
Returns:
|
| 184 |
+
Final observation with reward
|
| 185 |
+
"""
|
| 186 |
+
return self.step("submit_answer", f"<answer>{answer}</answer>")
|
| 187 |
+
|
| 188 |
+
def close(self):
|
| 189 |
+
"""Close the session."""
|
| 190 |
+
self._session.close()
|
| 191 |
+
|
| 192 |
+
def __enter__(self):
|
| 193 |
+
return self
|
| 194 |
+
|
| 195 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 196 |
+
self.close()
|
| 197 |
+
return False
|
codedark/data/bank.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a071417203c9e1434df5fe794fffae6a55327502b00da9c4e9754d2ab7f7cede
|
| 3 |
+
size 65698328
|
codedark/data/road.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ee5955af18eca0d4b53e13f23bd6436422e40ca84077fb8cdcfa467aa62b68f
|
| 3 |
+
size 37936892
|
codedark/data/tasks/final_25_tasks.jsonl
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"id": "bank_hard_001", "dataset": "bank", "goal": "What's the subscription rate for month='may' AND job='management' AND balance in Q1?", "expected_output_type": "scalar", "level": "L6", "template": "multi_condition_filter", "golden": {"answer_value": 2.44, "answer_type": "scalar", "verification_code": "df['_q'] = pd.qcut(df['balance'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nfiltered = df[(df['month'] == 'may') & (df['job'] == 'management') & (df['_q'] == 'Q1')]\nresult = round((filtered['y'] == 1).mean() * 100, 2) if len(filtered) > 0 else 0.0\ndf.drop('_q', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "null", "missing", "nan", "empty", "na"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "multi_condition_filter", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"col_a": "month", "val_a": "may", "col_b": "job", "val_b": "management", "metric": "balance", "quartile": "Q1", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 2 |
+
{"id": "bank_hard_005", "dataset": "bank", "goal": "Among customers in top 95% of age AND bottom 10% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.59, "answer_type": "scalar", "verification_code": "p_high = df['age'].quantile(95 / 100)\np_low = df['duration'].quantile(10 / 100)\ncohort = df[(df['age'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "age", "metric_b": "duration", "pct_high": 95, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 3 |
+
{"id": "bank_hard_012", "dataset": "bank", "goal": "Find the job with lowest average balance. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 8.27, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['balance'].mean()\nextrema_val = group_stats.max() if 'lowest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "balance", "extrema": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 4 |
+
{"id": "bank_hard_019", "dataset": "bank", "goal": "What's the subscription rate for month='may' AND job='management' AND age in Q4?", "expected_output_type": "scalar", "level": "L6", "template": "multi_condition_filter", "golden": {"answer_value": 7.04, "answer_type": "scalar", "verification_code": "df['_q'] = pd.qcut(df['age'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nfiltered = df[(df['month'] == 'may') & (df['job'] == 'management') & (df['_q'] == 'Q4')]\nresult = round((filtered['y'] == 1).mean() * 100, 2) if len(filtered) > 0 else 0.0\ndf.drop('_q', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "null", "missing", "nan", "empty", "na"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "multi_condition_filter", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"col_a": "month", "val_a": "may", "col_b": "job", "val_b": "management", "metric": "age", "quartile": "Q4", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 5 |
+
{"id": "bank_hard_020", "dataset": "bank", "goal": "For month with above-average day, which have subscription rate > 25%? Return sorted list.", "expected_output_type": "list", "level": "L5", "template": "top_n_in_segment", "golden": {"answer_value": ["oct"], "answer_type": "list", "verification_code": "avg_metric = df.groupby('month')['day'].mean()\nhigh_metric = avg_metric[avg_metric > avg_metric.mean()].index\nrates = df[df['month'].isin(high_metric)].groupby('month')['y'].apply(\n lambda x: (x == 1).mean() * 100)\nresult = sorted(rates[rates > 25].index.tolist())", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["tie", "ties", "equal", "same", "duplicate"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "top_n_in_segment", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "month", "metric": "day", "threshold": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 6 |
+
{"id": "bank_hard_021", "dataset": "bank", "goal": "Find the job with highest average balance. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 24.62, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['balance'].mean()\nextrema_val = group_stats.max() if 'highest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "balance", "extrema": "highest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 7 |
+
{"id": "bank_hard_023", "dataset": "bank", "goal": "Among customers in top 90% of balance AND bottom 10% of day, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 33.53, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(90 / 100)\np_low = df['day'].quantile(10 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['day'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "day", "pct_high": 90, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 8 |
+
{"id": "bank_hard_026", "dataset": "bank", "goal": "For each job, compute volatility score: std(age) / mean(age). Return top 5 with [group, mean, std, volatility].", "expected_output_type": "dataframe", "level": "L6", "template": "segment_volatility", "golden": {"answer_value": [{"job": "unemployed", "mean": 40.97, "std": 9.74, "volatility": 0.2377}, {"job": "self-employed", "mean": 40.42, "std": 9.46, "volatility": 0.234}, {"job": "admin.", "mean": 39.68, "std": 9.23, "volatility": 0.2326}, {"job": "services", "mean": 38.94, "std": 8.86, "volatility": 0.2275}, {"job": "management", "mean": 40.2, "std": 9.13, "volatility": 0.2271}], "answer_type": "dataframe", "verification_code": "stats = df.groupby('job')['age'].agg(['mean', 'std']).round(2)\nstats['volatility'] = round(stats['std'] / stats['mean'], 4)\nresult = stats.nlargest(5, 'volatility').reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits", "volatility", "cv", "coefficient", "variation"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "segment_volatility", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "age"}}}
|
| 9 |
+
{"id": "bank_hard_028", "dataset": "bank", "goal": "Among customers in top 95% of balance AND bottom 10% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.84, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(95 / 100)\np_low = df['duration'].quantile(10 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 95, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 10 |
+
{"id": "bank_hard_029", "dataset": "bank", "goal": "Among customers in top 90% of balance AND bottom 25% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.54, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(90 / 100)\np_low = df['duration'].quantile(25 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 90, "pct_low": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 11 |
+
{"id": "bank_hard_030", "dataset": "bank", "goal": "Rank job by subscription rate. Which bottom-3 have above-median age?", "expected_output_type": "list", "level": "L6", "template": "ranked_anomaly", "golden": {"answer_value": ["blue-collar", "entrepreneur"], "answer_type": "list", "verification_code": "stats = df.groupby('job').agg(\n rate=('y', lambda x: (x == 1).mean()),\n avg_metric=('age', 'mean'))\nstats['rank'] = stats['rate'].rank()\nbottom_3 = stats[stats['rank'] <= 3]\nresult = sorted(bottom_3[bottom_3['avg_metric'] > stats['avg_metric'].median()].index.tolist())", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rank", "order", "sort", "ascending", "descending"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "ranked_anomaly", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "age", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 12 |
+
{"id": "bank_hard_031", "dataset": "bank", "goal": "For each month, compute volatility score: std(day) / mean(day). Return top 5 with [group, mean, std, volatility].", "expected_output_type": "dataframe", "level": "L6", "template": "segment_volatility", "golden": {"answer_value": [{"month": "feb", "mean": 6.05, "std": 5.46, "volatility": 0.9025}, {"month": "sep", "mean": 11.67, "std": 8.04, "volatility": 0.6889}, {"month": "mar", "mean": 13.45, "std": 9.18, "volatility": 0.6825}, {"month": "jun", "mean": 11.32, "std": 7.33, "volatility": 0.6475}, {"month": "dec", "mean": 14.19, "std": 8.83, "volatility": 0.6223}], "answer_type": "dataframe", "verification_code": "stats = df.groupby('month')['day'].agg(['mean', 'std']).round(2)\nstats['volatility'] = round(stats['std'] / stats['mean'], 4)\nresult = stats.nlargest(5, 'volatility').reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits", "volatility", "cv", "coefficient", "variation"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "segment_volatility", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "month", "metric": "day"}}}
|
| 13 |
+
{"id": "bank_hard_033", "dataset": "bank", "goal": "Show the average balance breakdown by job. Include count and mean balance for each category, sorted by mean descending.", "expected_output_type": "dataframe", "level": "L4", "template": "metric_breakdown", "golden": {"answer_value": [{"job": "retired", "count": 35185, "mean_balance": 1812.07}, {"job": "unknown", "count": 2917, "mean_balance": 1678.96}, {"job": "self-employed", "count": 19020, "mean_balance": 1598.27}, {"job": "student", "count": 11767, "mean_balance": 1577.32}, {"job": "management", "count": 175541, "mean_balance": 1510.39}, {"job": "unemployed", "count": 17634, "mean_balance": 1440.57}, {"job": "entrepreneur", "count": 17718, "mean_balance": 1306.75}, {"job": "housemaid", "count": 15912, "mean_balance": 1281.22}, {"job": "technician", "count": 138107, "mean_balance": 1071.57}, {"job": "admin.", "count": 81492, "mean_balance": 1019.92}, {"job": "blue-collar", "count": 170498, "mean_balance": 977.49}, {"job": "services", "count": 64209, "mean_balance": 834.63}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('job').agg(\n count=('balance', 'size'),\n mean_balance=('balance', lambda x: round(x.mean(), 2))\n).sort_values('mean_balance', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "metric_breakdown", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "balance"}}}
|
| 14 |
+
{"id": "bank_hard_035", "dataset": "bank", "goal": "Find the job with lowest average age. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 34.08, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['age'].mean()\nextrema_val = group_stats.max() if 'lowest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "age", "extrema": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 15 |
+
{"id": "bank_hard_038", "dataset": "bank", "goal": "Find the month with the lowest subscription rate. Within that group, what is the average day?", "expected_output_type": "scalar", "level": "L5", "template": "nested_extrema", "golden": {"answer_value": 16.09, "answer_type": "scalar", "verification_code": "group_rates = df.groupby('month')['y'].apply(lambda x: (x == 1).mean())\nouter_val = group_rates.max() if 'lowest' == 'highest' else group_rates.min()\nouter_tied = group_rates[group_rates == outer_val].sort_index()\nouter_group = outer_tied.index[0]\nsubset = df[df['month'] == outer_group]\nresult = round(subset['day'].mean(), 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["tie", "ties", "equal", "same", "duplicate", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "nested_extrema", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "month", "metric": "day", "extrema_outer": "lowest", "extrema_inner": "highest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 16 |
+
{"id": "bank_hard_039", "dataset": "bank", "goal": "Divide customers into 4 day quartiles. What is the subscription percentage (0-100) in the highest (top 25%) (Q4) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "quartile_conversion", "golden": {"answer_value": 11.55, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['day'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q4']\nresult = round((bin_data['y'] == 1).mean() * 100, 2)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "quartile_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "day", "quartile": "Q4", "quartile_desc": "highest (top 25%)", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 17 |
+
{"id": "bank_hard_040", "dataset": "bank", "goal": "Among customers in top 95% of balance AND bottom 25% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.65, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(95 / 100)\np_low = df['duration'].quantile(25 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 95, "pct_low": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 18 |
+
{"id": "bank_hard_041", "dataset": "bank", "goal": "Which job categories would have the biggest impact if brought to average subscription rate? Return top 3 by potential gain (count * rate gap), sorted by impact.", "expected_output_type": "list", "level": "L5", "template": "segment_improvement_potential", "golden": {"answer_value": ["blue-collar", "services", "entrepreneur"], "answer_type": "list", "verification_code": "overall_rate = (df['y'] == 1).mean()\ngroup_stats = df.groupby('job').agg(\n rate=('y', lambda x: (x == 1).mean()),\n count=('y', 'size')\n)\ngroup_stats['gap'] = overall_rate - group_stats['rate']\ngroup_stats['potential'] = group_stats['count'] * group_stats['gap']\ntop_potential = group_stats[group_stats['gap'] > 0].nlargest(3, 'potential')\nresult = top_potential.index.tolist()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "segment_improvement_potential", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 19 |
+
{"id": "bank_hard_044", "dataset": "bank", "goal": "Find the month with the lowest subscription rate. Within that group, what is the average age?", "expected_output_type": "scalar", "level": "L5", "template": "nested_extrema", "golden": {"answer_value": 38.98, "answer_type": "scalar", "verification_code": "group_rates = df.groupby('month')['y'].apply(lambda x: (x == 1).mean())\nouter_val = group_rates.max() if 'lowest' == 'highest' else group_rates.min()\nouter_tied = group_rates[group_rates == outer_val].sort_index()\nouter_group = outer_tied.index[0]\nsubset = df[df['month'] == outer_group]\nresult = round(subset['age'].mean(), 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["tie", "ties", "equal", "same", "duplicate", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "nested_extrema", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "month", "metric": "age", "extrema_outer": "lowest", "extrema_inner": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 20 |
+
{"id": "road_hard_014", "dataset": "road", "goal": "Which lighting categories have the highest total reported accidents? Show breakdown with [lighting, count, total_num_reported_accidents, avg_num_reported_accidents] sorted by total descending.", "expected_output_type": "dataframe", "level": "L4", "template": "count_segment_total", "golden": {"answer_value": [{"lighting": "dim", "count": 183826, "total_num_reported_accidents": 211283, "avg_num_reported_accidents": 1.15}, {"lighting": "daylight", "count": 178015, "total_num_reported_accidents": 207579, "avg_num_reported_accidents": 1.17}, {"lighting": "night", "count": 155913, "total_num_reported_accidents": 196214, "avg_num_reported_accidents": 1.26}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('lighting').agg(\n count=('num_reported_accidents', 'size'),\n total_num_reported_accidents=('num_reported_accidents', 'sum'),\n avg_num_reported_accidents=('num_reported_accidents', lambda x: round(x.mean(), 2))\n).sort_values('total_num_reported_accidents', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": [], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "count_segment_total", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "lighting", "target_col": "num_reported_accidents", "target_desc": "reported accidents"}}}
|
| 21 |
+
{"id": "road_hard_021", "dataset": "road", "goal": "Which weather categories have the highest average accident risk? Show breakdown with [weather, count, avg_accident_risk] sorted by average descending.", "expected_output_type": "dataframe", "level": "L4", "template": "continuous_segment_breakdown", "golden": {"answer_value": [{"weather": "foggy", "count": 181463, "avg_accident_risk": 0.3863}, {"weather": "rainy", "count": 156985, "avg_accident_risk": 0.3615}, {"weather": "clear", "count": 179306, "avg_accident_risk": 0.3101}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('weather').agg(\n count=('accident_risk', 'size'),\n avg_accident_risk=('accident_risk', lambda x: round(x.mean(), 4))\n).sort_values('avg_accident_risk', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": [], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_segment_breakdown", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "weather", "target_col": "accident_risk", "target_desc": "accident risk"}}}
|
| 22 |
+
{"id": "road_hard_015", "dataset": "road", "goal": "Divide records into 4 speed_limit quartiles. What is the average accident risk in the lower-middle (25-50%) (Q2) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "continuous_quartile_analysis", "golden": {"answer_value": 0.29, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['speed_limit'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q2']\nresult = round(bin_data['accident_risk'].mean(), 4)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_quartile_analysis", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "speed_limit", "quartile": "Q2", "quartile_desc": "lower-middle (25-50%)", "target_col": "accident_risk", "target_desc": "accident risk"}}}
|
| 23 |
+
{"id": "road_hard_002", "dataset": "road", "goal": "Divide records into 4 curvature quartiles. What is the average accident risk in the upper-middle (50-75%) (Q3) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "continuous_quartile_analysis", "golden": {"answer_value": 0.41, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['curvature'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q3']\nresult = round(bin_data['accident_risk'].mean(), 4)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_quartile_analysis", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "curvature", "quartile": "Q3", "quartile_desc": "upper-middle (50-75%)", "target_col": "accident_risk", "target_desc": "accident risk"}}}
|
| 24 |
+
{"id": "road_hard_007", "dataset": "road", "goal": "How much higher is the average accident risk for lighting='daylight' compared to 'night'? Return difference.", "expected_output_type": "scalar", "level": "L5", "template": "continuous_comparison", "golden": {"answer_value": -0.17, "answer_type": "scalar", "verification_code": "avg_a = df[df['lighting'] == 'daylight']['accident_risk'].mean()\navg_b = df[df['lighting'] == 'night']['accident_risk'].mean()\nresult = round(avg_a - avg_b, 4)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "continuous_comparison", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "lighting", "val_a": "daylight", "val_b": "night", "target_col": "accident_risk", "target_desc": "accident risk"}}}
|
| 25 |
+
{"id": "road_hard_004", "dataset": "road", "goal": "How much higher is the average accident risk for road_type='rural' compared to 'urban'? Return difference.", "expected_output_type": "scalar", "level": "L5", "template": "continuous_comparison", "golden": {"answer_value": -0.01, "answer_type": "scalar", "verification_code": "avg_a = df[df['road_type'] == 'rural']['accident_risk'].mean()\navg_b = df[df['road_type'] == 'urban']['accident_risk'].mean()\nresult = round(avg_a - avg_b, 4)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "continuous_comparison", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "road_type", "val_a": "rural", "val_b": "urban", "target_col": "accident_risk", "target_desc": "accident risk"}}}
|
codedark/models.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CodeDark Data Models
|
| 3 |
+
|
| 4 |
+
Pydantic models for Action, Observation, and State following OpenEnv spec.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
from typing import Optional, List, Any, Literal
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class CodeDarkAction(BaseModel):
    """A single agent action: one tool invocation per turn.

    Available tools:
    - run_python: execute Python code with pandas/numpy
    - read_notes: read all saved notes
    - save_note: save a note for later recall
    - clarify: ask a clarifying question (max 2 per episode)
    - submit_answer: submit the final answer (ends the episode)
    """

    # Which tool to invoke this turn.
    tool: Literal["run_python", "read_notes", "save_note", "clarify", "submit_answer"]
    # Tool-specific payload (code for run_python, text for save_note/clarify,
    # the answer string for submit_answer; empty for read_notes).
    args: str = ""

    model_config = {
        "json_schema_extra": {
            "examples": [
                {"tool": "run_python", "args": "result = df['y'].mean() * 100"},
                {"tool": "read_notes", "args": ""},
                {"tool": "save_note", "args": "Average subscription rate is 11.26%"},
                {"tool": "clarify", "args": "What does Q1 mean in this context?"},
                {"tool": "submit_answer", "args": "11.26"},
            ]
        }
    }
|
| 41 |
+
class CodeDarkObservation(BaseModel):
    """Observation returned to the agent after each action.

    Carries execution output, episode bookkeeping, and — once ``done`` is
    True — the reward breakdown for the finished episode.
    """

    # Output of the last tool call.
    stdout: str = ""
    stderr: str = ""
    exit_code: int = 0

    # Turn tracking.
    turn: int = 0
    max_turns: int = 10

    # Notes persisted across turns.
    notes: List[str] = Field(default_factory=list)

    # Task identification.
    task_id: str = ""
    question: str = ""
    difficulty: str = ""  # L4, L5, L6
    dataset: str = ""  # bank, road

    # Episode status flags.
    done: bool = False
    submitted: bool = False

    # Reward components; populated only when done=True, otherwise None.
    reward: Optional[float] = None
    correctness: Optional[float] = None
    efficiency: Optional[float] = None

    # Extra episode details (submitted/expected answers, counts, etc.).
    metadata: dict = Field(default_factory=dict)

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "stdout": "run_python Result:\n(45211, 17)",
                    "stderr": "",
                    "exit_code": 0,
                    "turn": 1,
                    "max_turns": 10,
                    "notes": [],
                    "task_id": "bank_hard_001",
                    "question": "What's the subscription rate for month='may'?",
                    "difficulty": "L5",
                    "dataset": "bank",
                    "done": False,
                    "submitted": False,
                    "reward": None,
                }
            ]
        }
    }
| 101 |
+
|
| 102 |
+
class CodeDarkState(BaseModel):
    """Server-side episode state.

    Tracks episode progress, accumulated notes, and the submitted answer
    used for scoring.
    """

    episode_id: str = ""
    step_count: int = 0

    # Task identification.
    task_id: str = ""
    dataset: str = ""

    # Accumulated per-episode counters and notes.
    notes: List[str] = Field(default_factory=list)
    turn_count: int = 0
    error_count: int = 0
    clarify_count: int = 0

    # Submission status.
    submitted: bool = False
    submitted_answer: Optional[Any] = None

    # Scoring inputs taken from the task's golden record.
    expected_answer: Optional[Any] = None
    tolerance: float = 0.01
|
| 131 |
+
class ResetRequest(BaseModel):
    """Body accepted by the /reset endpoint; both fields are optional."""

    # Specific task to load; when omitted the environment picks one.
    task_id: Optional[str] = None
    # Seed for random task selection.
    seed: Optional[int] = None
| 137 |
+
|
| 138 |
+
class StepRequest(BaseModel):
    """Body accepted by the /step endpoint: a tool name plus its arguments."""

    tool: str
    args: str = ""
| 144 |
+
|
| 145 |
+
class HealthResponse(BaseModel):
    """Payload returned by the /health endpoint."""

    status: str = "healthy"
    environment: str = "codedark"
    version: str = "0.1.0"
|
codedark/openenv.yaml
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: codedark
|
| 2 |
+
version: "0.1.0"
|
| 3 |
+
description: |
|
| 4 |
+
CodeDark: Multi-turn data analytics environment for training RL agents.
|
| 5 |
+
|
| 6 |
+
Train AI agents to be data scientists, not just code executors.
|
| 7 |
+
Features real business analytics tasks with pandas/numpy,
|
| 8 |
+
multi-metric reward shaping, and skill-based curriculum.
|
| 9 |
+
|
| 10 |
+
author: Vijay Athithya
|
| 11 |
+
license: MIT
|
| 12 |
+
|
| 13 |
+
# Environment interface types
|
| 14 |
+
action: CodeDarkAction
|
| 15 |
+
observation: CodeDarkObservation
|
| 16 |
+
state: CodeDarkState
|
| 17 |
+
|
| 18 |
+
# Environment configuration
|
| 19 |
+
config:
|
| 20 |
+
max_turns: 10
|
| 21 |
+
datasets:
|
| 22 |
+
- bank
|
| 23 |
+
- road
|
| 24 |
+
difficulty_levels:
|
| 25 |
+
- L4
|
| 26 |
+
- L5
|
| 27 |
+
- L6
|
| 28 |
+
|
| 29 |
+
# Tools available to agents
|
| 30 |
+
tools:
|
| 31 |
+
- name: run_python
|
| 32 |
+
description: Execute Python/pandas code. Store output in 'result' variable.
|
| 33 |
+
- name: read_notes
|
| 34 |
+
description: Read all saved notes from previous turns.
|
| 35 |
+
- name: save_note
|
| 36 |
+
description: Save observations for later recall. Notes persist across turns.
|
| 37 |
+
- name: clarify
|
| 38 |
+
description: Ask clarifying questions about the task (max 2 per episode).
|
| 39 |
+
- name: submit_answer
|
| 40 |
+
description: Submit final answer. Ends episode.
|
| 41 |
+
|
| 42 |
+
# Reward structure
|
| 43 |
+
reward:
|
| 44 |
+
max_reward: 1.0
|
| 45 |
+
components:
|
| 46 |
+
- name: correctness
|
| 47 |
+
weight: 0.80
|
| 48 |
+
description: Binary correct/incorrect with numeric tolerance
|
| 49 |
+
- name: efficiency
|
| 50 |
+
weight: 0.10
|
| 51 |
+
description: Fewer turns = better score
|
| 52 |
+
- name: token_cost
|
| 53 |
+
weight: 0.10
|
| 54 |
+
description: Lower token usage = better score
|
| 55 |
+
|
| 56 |
+
# Benchmarking info
|
| 57 |
+
benchmark:
|
| 58 |
+
tasks: 25
|
| 59 |
+
models_evaluated: 11
|
| 60 |
+
completions: 1844
|
| 61 |
+
best_accuracy: "77.3%"
|
| 62 |
+
best_model: "claude-opus-4.5"
|
codedark/pyproject.toml
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=45", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-codedark"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "Multi-turn data analytics environment for RL agent training - OpenEnv compatible"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.10"
|
| 11 |
+
license = {text = "MIT"}
|
| 12 |
+
authors = [
|
| 13 |
+
{name = "Vijay Athithya", email = "vijay@analytics-rl.com"}
|
| 14 |
+
]
|
| 15 |
+
keywords = ["openenv", "reinforcement-learning", "data-analytics", "llm", "agents"]
|
| 16 |
+
classifiers = [
|
| 17 |
+
"Development Status :: 4 - Beta",
|
| 18 |
+
"Intended Audience :: Developers",
|
| 19 |
+
"Intended Audience :: Science/Research",
|
| 20 |
+
"License :: OSI Approved :: MIT License",
|
| 21 |
+
"Programming Language :: Python :: 3",
|
| 22 |
+
"Programming Language :: Python :: 3.10",
|
| 23 |
+
"Programming Language :: Python :: 3.11",
|
| 24 |
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 25 |
+
]
|
| 26 |
+
dependencies = [
|
| 27 |
+
"fastapi>=0.115.0",
|
| 28 |
+
"pydantic>=2.0.0",
|
| 29 |
+
"uvicorn[standard]>=0.24.0",
|
| 30 |
+
"pandas>=2.0.0",
|
| 31 |
+
"numpy>=1.24.0",
|
| 32 |
+
"requests>=2.31.0",
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
[project.optional-dependencies]
|
| 36 |
+
dev = [
|
| 37 |
+
"pytest>=8.0.0",
|
| 38 |
+
"pytest-cov>=4.0.0",
|
| 39 |
+
"pytest-asyncio>=0.23.0",
|
| 40 |
+
"httpx>=0.27.0",
|
| 41 |
+
]
|
| 42 |
+
|
| 43 |
+
[project.urls]
|
| 44 |
+
Homepage = "https://github.com/vj-09/codedark"
|
| 45 |
+
Documentation = "https://huggingface.co/spaces/openenv/codedark"
|
| 46 |
+
Repository = "https://github.com/vj-09/codedark"
|
| 47 |
+
|
| 48 |
+
[project.scripts]
|
| 49 |
+
codedark-server = "codedark.server.app:main"
|
| 50 |
+
|
| 51 |
+
[tool.setuptools]
|
| 52 |
+
packages = ["codedark", "codedark.server"]
|
| 53 |
+
package-dir = {"codedark" = ".", "codedark.server" = "server"}
|
| 54 |
+
|
| 55 |
+
[tool.setuptools.package-data]
|
| 56 |
+
codedark = ["data/*.csv", "data/tasks/*.jsonl", "openenv.yaml"]
|
| 57 |
+
|
| 58 |
+
[tool.pytest.ini_options]
|
| 59 |
+
asyncio_mode = "auto"
|
| 60 |
+
testpaths = ["tests"]
|
codedark/server/Dockerfile
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
# Set working directory
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install system dependencies
|
| 7 |
+
RUN apt-get update && \
|
| 8 |
+
apt-get install -y --no-install-recommends curl && \
|
| 9 |
+
rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
# Copy requirements first for better caching
|
| 12 |
+
COPY server/requirements.txt ./requirements.txt
|
| 13 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 14 |
+
|
| 15 |
+
# Copy package
|
| 16 |
+
COPY . .
|
| 17 |
+
|
| 18 |
+
# Install package in editable mode
|
| 19 |
+
RUN pip install --no-cache-dir -e .
|
| 20 |
+
|
| 21 |
+
# Environment variables
|
| 22 |
+
ENV PYTHONUNBUFFERED=1
|
| 23 |
+
ENV HOST=0.0.0.0
|
| 24 |
+
ENV PORT=8000
|
| 25 |
+
|
| 26 |
+
# Expose port
|
| 27 |
+
EXPOSE 8000
|
| 28 |
+
|
| 29 |
+
# Healthcheck
|
| 30 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 31 |
+
CMD curl -f http://localhost:8000/health || exit 1
|
| 32 |
+
|
| 33 |
+
# Run server
|
| 34 |
+
CMD ["python", "-m", "codedark.server.app"]
|
codedark/server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""CodeDark Server Package."""
|
codedark/server/app.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CodeDark FastAPI Server
|
| 3 |
+
|
| 4 |
+
OpenEnv-compatible HTTP server for CodeDark environment.
|
| 5 |
+
Provides /reset, /step, /state, and /health endpoints.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from contextlib import asynccontextmanager
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
import uvicorn
|
| 13 |
+
from fastapi import FastAPI, HTTPException
|
| 14 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 15 |
+
|
| 16 |
+
from ..models import (
|
| 17 |
+
CodeDarkAction,
|
| 18 |
+
CodeDarkObservation,
|
| 19 |
+
CodeDarkState,
|
| 20 |
+
ResetRequest,
|
| 21 |
+
StepRequest,
|
| 22 |
+
HealthResponse,
|
| 23 |
+
)
|
| 24 |
+
from .environment import CodeDarkEnvironment
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Global environment instance
|
| 28 |
+
_env: Optional[CodeDarkEnvironment] = None
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def get_env() -> CodeDarkEnvironment:
    """Lazily construct and return the process-wide environment singleton.

    Configuration is read from the CODEDARK_DATA_DIR, CODEDARK_TASKS_PATH,
    and CODEDARK_MAX_TURNS environment variables on first construction.
    """
    global _env
    if _env is not None:
        return _env
    _env = CodeDarkEnvironment(
        data_dir=os.environ.get("CODEDARK_DATA_DIR"),
        tasks_path=os.environ.get("CODEDARK_TASKS_PATH"),
        max_turns=int(os.environ.get("CODEDARK_MAX_TURNS", "10")),
    )
    return _env
+
|
| 42 |
+
|
| 43 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: build the environment at startup, drop at shutdown."""
    global _env
    get_env()  # eager initialization so the first request doesn't pay for it
    yield
    _env = None  # release the environment on shutdown
+
|
| 53 |
+
|
| 54 |
+
# Create FastAPI app
|
| 55 |
+
# FastAPI application; the lifespan handler manages the environment singleton.
app = FastAPI(
    title="CodeDark Environment",
    description="Multi-turn data analytics environment for RL agent training",
    version="0.1.0",
    lifespan=lifespan,
)

# Add CORS middleware.
# Wide-open CORS so browser clients and notebooks can reach the server.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
+
|
| 71 |
+
|
| 72 |
+
@app.get("/health", response_model=HealthResponse)
async def health():
    """Liveness probe reporting environment name and version."""
    return HealthResponse(status="healthy", environment="codedark", version="0.1.0")
+
|
| 81 |
+
|
| 82 |
+
@app.post("/reset", response_model=CodeDarkObservation)
async def reset(request: Optional[ResetRequest] = None):
    """Reset environment for a new episode.

    Args:
        request: Optional reset request with task_id and seed.

    Returns:
        Initial observation.
    """
    # FIX: the parameter was annotated `request: ResetRequest = None`, which is
    # an invalid annotation/default pairing (None is not a ResetRequest) and is
    # rejected by strict type checkers. Optional[ResetRequest] keeps the same
    # FastAPI behavior (optional JSON body) while being correctly typed.
    env = get_env()

    if request is None:
        request = ResetRequest()

    obs = env.reset(task_id=request.task_id, seed=request.seed)
    return obs
+
|
| 100 |
+
|
| 101 |
+
@app.post("/step", response_model=CodeDarkObservation)
async def step(request: StepRequest):
    """Execute one action and return the resulting observation.

    Args:
        request: Step request with tool and args.

    Returns:
        Observation after action execution.

    Raises:
        HTTPException: 400 when an unknown tool name is supplied.
    """
    env = get_env()

    # Reject unknown tool names with a clear 400 before touching the env.
    valid_tools = ["run_python", "read_notes", "save_note", "clarify", "submit_answer"]
    if request.tool not in valid_tools:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid tool: {request.tool}. Valid tools: {valid_tools}",
        )

    return env.step(CodeDarkAction(tool=request.tool, args=request.args))
+
|
| 125 |
+
|
| 126 |
+
@app.get("/state", response_model=CodeDarkState)
async def state():
    """Expose the environment's current internal state.

    Returns:
        Current CodeDarkState.
    """
    return get_env().state
+
|
| 136 |
+
|
| 137 |
+
@app.get("/metadata")
async def metadata():
    """Describe the environment: identity, limits, tools, and reward weights.

    Returns:
        Environment metadata dict.
    """
    env = get_env()

    tool_docs = [
        {"name": "run_python", "description": "Execute Python/pandas code"},
        {"name": "read_notes", "description": "Read all saved notes"},
        {"name": "save_note", "description": "Save a note for later recall"},
        {"name": "clarify", "description": "Ask clarifying question (max 2)"},
        {"name": "submit_answer", "description": "Submit final answer"},
    ]
    reward_structure = {
        "max_reward": 1.0,
        "components": [
            {"name": "correctness", "weight": 0.80},
            {"name": "efficiency", "weight": 0.10},
            {"name": "token_cost", "weight": 0.10},
        ],
    }

    return {
        "name": "codedark",
        "version": "0.1.0",
        "description": "Multi-turn data analytics environment for RL agent training",
        "max_turns": env.max_turns,
        "max_clarifications": env.max_clarifications,
        "num_tasks": len(env.tasks),
        "tools": tool_docs,
        "reward_structure": reward_structure,
    }
+
|
| 169 |
+
|
| 170 |
+
@app.get("/schema")
async def schema():
    """Return the JSON schemas for the Action, Observation, and State models.

    Returns:
        JSON schemas for all types.
    """
    return {
        "action": CodeDarkAction.model_json_schema(),
        "observation": CodeDarkObservation.model_json_schema(),
        "state": CodeDarkState.model_json_schema(),
    }
+
|
| 183 |
+
|
| 184 |
+
def main():
    """Launch uvicorn with host/port/reload taken from the environment."""
    host = os.environ.get("HOST", "0.0.0.0")
    port = int(os.environ.get("PORT", "8000"))
    reload_enabled = os.environ.get("RELOAD", "false").lower() == "true"
    uvicorn.run(
        "codedark.server.app:app",
        host=host,
        port=port,
        reload=reload_enabled,
    )


if __name__ == "__main__":
    main()
|
codedark/server/environment.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CodeDark Environment
|
| 3 |
+
|
| 4 |
+
OpenEnv-compatible environment for multi-turn data analytics tasks.
|
| 5 |
+
Agents analyze CSV data using Python/Pandas tools and submit answers.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import uuid
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Any, Dict, List, Optional
|
| 12 |
+
|
| 13 |
+
import pandas as pd
|
| 14 |
+
|
| 15 |
+
from ..models import CodeDarkAction, CodeDarkObservation, CodeDarkState
|
| 16 |
+
from .tools import (
|
| 17 |
+
run_python,
|
| 18 |
+
read_notes,
|
| 19 |
+
save_note,
|
| 20 |
+
clarify,
|
| 21 |
+
submit_answer,
|
| 22 |
+
parse_tool_call,
|
| 23 |
+
)
|
| 24 |
+
from .scoring import compute_reward
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class CodeDarkEnvironment:
|
| 28 |
+
"""CodeDark environment for multi-turn data analytics.
|
| 29 |
+
|
| 30 |
+
Features:
|
| 31 |
+
- Multi-turn agent evaluation
|
| 32 |
+
- 5 tools: run_python, read_notes, save_note, clarify, submit_answer
|
| 33 |
+
- Shaped rewards: correctness (80%) + efficiency (10%) + token cost (10%)
|
| 34 |
+
- Supports bank and road datasets
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
def __init__(
|
| 38 |
+
self,
|
| 39 |
+
data_dir: Optional[str] = None,
|
| 40 |
+
tasks_path: Optional[str] = None,
|
| 41 |
+
max_turns: int = 10,
|
| 42 |
+
max_clarifications: int = 2,
|
| 43 |
+
):
|
| 44 |
+
"""Initialize CodeDark environment.
|
| 45 |
+
|
| 46 |
+
Args:
|
| 47 |
+
data_dir: Path to directory containing CSV files
|
| 48 |
+
tasks_path: Path to tasks.jsonl file
|
| 49 |
+
max_turns: Maximum turns per episode (default: 10)
|
| 50 |
+
max_clarifications: Maximum clarifications per episode (default: 2)
|
| 51 |
+
"""
|
| 52 |
+
self.max_turns = max_turns
|
| 53 |
+
self.max_clarifications = max_clarifications
|
| 54 |
+
|
| 55 |
+
# Resolve paths
|
| 56 |
+
if data_dir:
|
| 57 |
+
self.data_dir = Path(data_dir)
|
| 58 |
+
else:
|
| 59 |
+
# Default to data/ relative to this file's parent
|
| 60 |
+
self.data_dir = Path(__file__).parent.parent / "data"
|
| 61 |
+
|
| 62 |
+
if tasks_path:
|
| 63 |
+
self.tasks_path = Path(tasks_path)
|
| 64 |
+
else:
|
| 65 |
+
self.tasks_path = self.data_dir / "tasks" / "final_25_tasks.jsonl"
|
| 66 |
+
|
| 67 |
+
# Load tasks
|
| 68 |
+
self.tasks = self._load_tasks()
|
| 69 |
+
self._tasks_by_id = {t["id"]: t for t in self.tasks}
|
| 70 |
+
self._task_index = 0
|
| 71 |
+
|
| 72 |
+
# Current episode state
|
| 73 |
+
self._state: Optional[CodeDarkState] = None
|
| 74 |
+
self._df: Optional[pd.DataFrame] = None
|
| 75 |
+
self._current_task: Optional[Dict] = None
|
| 76 |
+
|
| 77 |
+
def _load_tasks(self) -> List[Dict]:
|
| 78 |
+
"""Load tasks from JSONL file."""
|
| 79 |
+
if not self.tasks_path.exists():
|
| 80 |
+
return []
|
| 81 |
+
|
| 82 |
+
tasks = []
|
| 83 |
+
with open(self.tasks_path) as f:
|
| 84 |
+
for line in f:
|
| 85 |
+
if line.strip():
|
| 86 |
+
tasks.append(json.loads(line))
|
| 87 |
+
return tasks
|
| 88 |
+
|
| 89 |
+
def _load_data_for_task(self, task: Dict) -> Optional[pd.DataFrame]:
|
| 90 |
+
"""Load the appropriate CSV for a task.
|
| 91 |
+
|
| 92 |
+
Args:
|
| 93 |
+
task: Task dictionary with 'dataset' field
|
| 94 |
+
|
| 95 |
+
Returns:
|
| 96 |
+
DataFrame or None if not found
|
| 97 |
+
"""
|
| 98 |
+
dataset = task.get("dataset", "bank")
|
| 99 |
+
csv_path = self.data_dir / f"{dataset}.csv"
|
| 100 |
+
|
| 101 |
+
if csv_path.exists():
|
| 102 |
+
return pd.read_csv(csv_path)
|
| 103 |
+
return None
|
| 104 |
+
|
| 105 |
+
@property
|
| 106 |
+
def state(self) -> CodeDarkState:
|
| 107 |
+
"""Return current environment state."""
|
| 108 |
+
if self._state is None:
|
| 109 |
+
self._state = CodeDarkState()
|
| 110 |
+
return self._state
|
| 111 |
+
|
| 112 |
+
def reset(
|
| 113 |
+
self, task_id: Optional[str] = None, seed: Optional[int] = None
|
| 114 |
+
) -> CodeDarkObservation:
|
| 115 |
+
"""Reset environment for a new episode.
|
| 116 |
+
|
| 117 |
+
Args:
|
| 118 |
+
task_id: Specific task to load (optional)
|
| 119 |
+
seed: Random seed for task selection (optional)
|
| 120 |
+
|
| 121 |
+
Returns:
|
| 122 |
+
Initial observation with task question
|
| 123 |
+
"""
|
| 124 |
+
# Select task
|
| 125 |
+
if task_id and task_id in self._tasks_by_id:
|
| 126 |
+
task = self._tasks_by_id[task_id]
|
| 127 |
+
elif self.tasks:
|
| 128 |
+
if seed is not None:
|
| 129 |
+
import random
|
| 130 |
+
|
| 131 |
+
random.seed(seed)
|
| 132 |
+
task = random.choice(self.tasks)
|
| 133 |
+
else:
|
| 134 |
+
# Round-robin through tasks
|
| 135 |
+
task = self.tasks[self._task_index % len(self.tasks)]
|
| 136 |
+
self._task_index += 1
|
| 137 |
+
else:
|
| 138 |
+
# No tasks loaded - return error observation
|
| 139 |
+
return CodeDarkObservation(
|
| 140 |
+
stderr="Error: No tasks loaded",
|
| 141 |
+
exit_code=1,
|
| 142 |
+
done=True,
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
self._current_task = task
|
| 146 |
+
|
| 147 |
+
# Load data for this task
|
| 148 |
+
self._df = self._load_data_for_task(task)
|
| 149 |
+
if self._df is None:
|
| 150 |
+
return CodeDarkObservation(
|
| 151 |
+
stderr=f"Error: Could not load data for dataset '{task.get('dataset', 'bank')}'",
|
| 152 |
+
exit_code=1,
|
| 153 |
+
done=True,
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
# Initialize state
|
| 157 |
+
self._state = CodeDarkState(
|
| 158 |
+
episode_id=str(uuid.uuid4()),
|
| 159 |
+
step_count=0,
|
| 160 |
+
task_id=task["id"],
|
| 161 |
+
dataset=task.get("dataset", "bank"),
|
| 162 |
+
notes=[],
|
| 163 |
+
turn_count=0,
|
| 164 |
+
error_count=0,
|
| 165 |
+
clarify_count=0,
|
| 166 |
+
submitted=False,
|
| 167 |
+
submitted_answer=None,
|
| 168 |
+
expected_answer=task["golden"]["answer_value"],
|
| 169 |
+
tolerance=task["golden"].get("tolerance", 0.01),
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
# Return initial observation
|
| 173 |
+
return CodeDarkObservation(
|
| 174 |
+
stdout=f"Task loaded. DataFrame shape: {self._df.shape}",
|
| 175 |
+
turn=0,
|
| 176 |
+
max_turns=self.max_turns,
|
| 177 |
+
notes=[],
|
| 178 |
+
task_id=task["id"],
|
| 179 |
+
question=task["goal"],
|
| 180 |
+
difficulty=task.get("level", "L5"),
|
| 181 |
+
dataset=task.get("dataset", "bank"),
|
| 182 |
+
done=False,
|
| 183 |
+
submitted=False,
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
def step(self, action: CodeDarkAction) -> CodeDarkObservation:
|
| 187 |
+
"""Execute an action and return observation.
|
| 188 |
+
|
| 189 |
+
Args:
|
| 190 |
+
action: CodeDarkAction with tool name and args
|
| 191 |
+
|
| 192 |
+
Returns:
|
| 193 |
+
CodeDarkObservation with results
|
| 194 |
+
"""
|
| 195 |
+
if self._state is None or self._current_task is None:
|
| 196 |
+
return CodeDarkObservation(
|
| 197 |
+
stderr="Error: Environment not reset. Call reset() first.",
|
| 198 |
+
exit_code=1,
|
| 199 |
+
done=True,
|
| 200 |
+
)
|
| 201 |
+
|
| 202 |
+
if self._state.submitted:
|
| 203 |
+
return self._make_final_observation()
|
| 204 |
+
|
| 205 |
+
# Increment turn
|
| 206 |
+
self._state.turn_count += 1
|
| 207 |
+
self._state.step_count += 1
|
| 208 |
+
|
| 209 |
+
# Check turn limit
|
| 210 |
+
if self._state.turn_count > self.max_turns:
|
| 211 |
+
self._state.submitted = True
|
| 212 |
+
return self._make_final_observation()
|
| 213 |
+
|
| 214 |
+
# Parse tool-specific args
|
| 215 |
+
parsed_content, parse_error = parse_tool_call(action.args, action.tool)
|
| 216 |
+
|
| 217 |
+
if parse_error:
|
| 218 |
+
self._state.error_count += 1
|
| 219 |
+
return CodeDarkObservation(
|
| 220 |
+
stderr=f"{action.tool} Error: {parse_error}",
|
| 221 |
+
exit_code=1,
|
| 222 |
+
turn=self._state.turn_count,
|
| 223 |
+
max_turns=self.max_turns,
|
| 224 |
+
notes=self._state.notes.copy(),
|
| 225 |
+
task_id=self._state.task_id,
|
| 226 |
+
question=self._current_task["goal"],
|
| 227 |
+
difficulty=self._current_task.get("level", "L5"),
|
| 228 |
+
dataset=self._state.dataset,
|
| 229 |
+
done=False,
|
| 230 |
+
submitted=False,
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
# Execute tool
|
| 234 |
+
stdout, stderr, exit_code = "", "", 0
|
| 235 |
+
|
| 236 |
+
if action.tool == "run_python":
|
| 237 |
+
stdout, stderr, exit_code = run_python(parsed_content, self._df)
|
| 238 |
+
|
| 239 |
+
elif action.tool == "read_notes":
|
| 240 |
+
stdout, stderr, exit_code = read_notes(self._state.notes)
|
| 241 |
+
|
| 242 |
+
elif action.tool == "save_note":
|
| 243 |
+
stdout, stderr, exit_code = save_note(parsed_content, self._state.notes)
|
| 244 |
+
|
| 245 |
+
elif action.tool == "clarify":
|
| 246 |
+
stdout, stderr, exit_code, new_count = clarify(
|
| 247 |
+
question=parsed_content,
|
| 248 |
+
clarify_count=self._state.clarify_count,
|
| 249 |
+
max_clarifications=self.max_clarifications,
|
| 250 |
+
ambiguities=self._current_task.get("ambiguities", []),
|
| 251 |
+
answer_type=self._current_task.get("golden", {}).get(
|
| 252 |
+
"answer_type", "scalar"
|
| 253 |
+
),
|
| 254 |
+
)
|
| 255 |
+
self._state.clarify_count = new_count
|
| 256 |
+
|
| 257 |
+
elif action.tool == "submit_answer":
|
| 258 |
+
stdout, stderr, exit_code, answer = submit_answer(parsed_content)
|
| 259 |
+
if exit_code == 0:
|
| 260 |
+
self._state.submitted = True
|
| 261 |
+
self._state.submitted_answer = answer
|
| 262 |
+
return self._make_final_observation()
|
| 263 |
+
|
| 264 |
+
# Track errors
|
| 265 |
+
if exit_code != 0:
|
| 266 |
+
self._state.error_count += 1
|
| 267 |
+
|
| 268 |
+
return CodeDarkObservation(
|
| 269 |
+
stdout=stdout,
|
| 270 |
+
stderr=stderr,
|
| 271 |
+
exit_code=exit_code,
|
| 272 |
+
turn=self._state.turn_count,
|
| 273 |
+
max_turns=self.max_turns,
|
| 274 |
+
notes=self._state.notes.copy(),
|
| 275 |
+
task_id=self._state.task_id,
|
| 276 |
+
question=self._current_task["goal"],
|
| 277 |
+
difficulty=self._current_task.get("level", "L5"),
|
| 278 |
+
dataset=self._state.dataset,
|
| 279 |
+
done=False,
|
| 280 |
+
submitted=False,
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
    def _make_final_observation(self) -> CodeDarkObservation:
        """Create the terminal observation for the episode, including reward.

        Called when the episode ends (answer submitted or turn budget
        exhausted by the caller). Computes the composite reward from the
        submitted vs. expected answer and the turns used, and packages the
        score breakdown into the observation's metadata.

        Returns:
            A CodeDarkObservation with done=True. If reset() was never
            called (no state/task), a bare done-observation is returned
            with no reward fields populated.
        """
        if self._state is None or self._current_task is None:
            return CodeDarkObservation(done=True)

        # Compute reward
        # Token counts are not passed here, so compute_reward falls back to
        # its turn-based token estimate.
        reward, correctness, efficiency, token_cost = compute_reward(
            submitted=self._state.submitted_answer,
            expected=self._state.expected_answer,
            tolerance=self._state.tolerance,
            turns=self._state.turn_count,
            max_turns=self.max_turns,
        )

        # notes is copied so later episode-state mutation cannot alter the
        # returned observation.
        return CodeDarkObservation(
            stdout="[EPISODE COMPLETE]",
            turn=self._state.turn_count,
            max_turns=self.max_turns,
            notes=self._state.notes.copy(),
            task_id=self._state.task_id,
            question=self._current_task["goal"],
            difficulty=self._current_task.get("level", "L5"),
            dataset=self._state.dataset,
            done=True,
            submitted=self._state.submitted,
            reward=reward,
            correctness=correctness,
            efficiency=efficiency,
            # Score provenance for debugging/analysis of episode outcomes.
            metadata={
                "submitted_answer": self._state.submitted_answer,
                "expected_answer": self._state.expected_answer,
                "tolerance": self._state.tolerance,
                "error_count": self._state.error_count,
                "clarify_count": self._state.clarify_count,
                "token_cost_usd": token_cost,
            },
        )
|
codedark/server/requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeDark Server Requirements
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
pydantic>=2.0.0
|
| 4 |
+
uvicorn[standard]>=0.24.0
|
| 5 |
+
pandas>=2.0.0
|
| 6 |
+
numpy>=1.24.0
|
| 7 |
+
requests>=2.31.0
|
codedark/server/scoring.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CodeDark Scoring System
|
| 3 |
+
|
| 4 |
+
Reward computation with multi-metric scoring:
|
| 5 |
+
- 80% correctness (binary: exact match within tolerance)
|
| 6 |
+
- 10% efficiency (fewer turns = better)
|
| 7 |
+
- 10% token cost (lower usage = better)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from typing import Any, Optional, Tuple
|
| 11 |
+
import ast
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def normalize_value(val: Any) -> Any:
    """Coerce *val* into a canonical form for answer comparison.

    Floats are rounded to 4 decimal places, ints are widened to float,
    and strings are parsed as Python literals or numbers where possible
    (a trailing "%" is stripped before numeric parsing); unparseable
    strings are lowercased. Lists, dicts and None pass through unchanged.
    """
    if val is None:
        return None

    # Numeric types: unify on float, rounded for stable comparison.
    if isinstance(val, float):
        return round(val, 4)
    if isinstance(val, int):  # NOTE: bool is an int subclass (True -> 1.0)
        return float(val)
    if isinstance(val, (list, dict)):
        return val

    if not isinstance(val, str):
        return val

    text = val.strip()

    # Literal list/dict, e.g. "[1, 2]" or "{'a': 1}".
    if text[:1] in ("[", "{"):
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            pass

    # Numeric string, optionally percent-suffixed ("42.5%").
    try:
        return round(float(text.rstrip("%")), 4)
    except ValueError:
        # Fall back to case-insensitive string form.
        return text.lower()
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def parse_markdown_table(text: str) -> Optional[list]:
    """Convert a markdown pipe-table into a list of row dicts.

    Header cells become lowercase keys. Numeric cells are parsed to
    int/float (commas and currency symbols stripped); other cells are
    lowercased. Rows whose cell count does not match the header are
    dropped.

    Example input::

        | job | mean | std |
        |-----|------|-----|
        | retired | 40.97 | 9.74 |

    Returns:
        List of row dicts, or None when *text* is not a recognisable table.
    """
    if not isinstance(text, str):
        return None

    piped = [ln for ln in text.strip().split("\n") if "|" in ln]
    # Minimum viable table: header + separator + at least one data row.
    if len(piped) < 3:
        return None

    headers = [cell.strip().lower() for cell in piped[0].split("|") if cell.strip()]
    if not headers:
        return None

    # The second line is usually the |---|---| separator; skip it if so.
    body = piped[2:] if ("---" in piped[1] or "--|" in piped[1]) else piped[1:]

    parsed = []
    for ln in body:
        cells = [cell.strip() for cell in ln.split("|") if cell.strip()]
        if len(cells) != len(headers):
            continue

        record = {}
        for key, raw in zip(headers, cells):
            # Strip number formatting (thousands commas, currency symbols).
            cleaned = raw.replace(",", "").replace("€", "").replace("$", "").strip()
            try:
                if "." in cleaned:
                    record[key] = round(float(cleaned), 4)
                else:
                    record[key] = int(cleaned)
            except ValueError:
                record[key] = raw.lower()
        parsed.append(record)

    return parsed or None
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def compare_answers(submitted: Any, expected: Any, tolerance: float = 0.01) -> bool:
    """Compare answers with support for structured data and numeric tolerance.

    Both sides are first passed through normalize_value (floats rounded to
    4 decimals, numeric strings parsed, strings lowercased), then compared
    by shape: list/dict recursion, float-with-tolerance, or string fallback.

    Handles:
    - Type mismatches (string "33.53" vs float 33.53)
    - Floating point precision (rounds to 4 decimals)
    - Nested structures (lists, dicts)
    - String parsing for lists/dicts
    - A submitted markdown table when a list of rows is expected

    Returns:
        True when the answers match under these rules.
    """
    # Normalize both values
    submitted_n = normalize_value(submitted)
    expected_n = normalize_value(expected)

    # Null checks
    if submitted_n is None and expected_n is None:
        return True
    if submitted_n is None or expected_n is None:
        return False

    # Same type comparison after normalization
    # (exact-type check, not isinstance: normalization has already collapsed
    # int->float, so remaining type differences are meaningful here)
    if type(submitted_n) == type(expected_n):
        if isinstance(expected_n, list):
            if len(submitted_n) != len(expected_n):
                return False
            # Check if list contains dicts (structured data) - use order-sensitive
            if expected_n and isinstance(expected_n[0], dict):
                return all(
                    compare_answers(s, e, tolerance)
                    for s, e in zip(submitted_n, expected_n)
                )
            # Simple values list - order-insensitive (we don't tell models to sort)
            # Compared as lowercased strings, so 1.0 vs "1.0" also matches here.
            submitted_sorted = sorted([str(x).lower().strip() for x in submitted_n])
            expected_sorted = sorted([str(x).lower().strip() for x in expected_n])
            return submitted_sorted == expected_sorted

        if isinstance(expected_n, dict):
            # Keys must match exactly; values are compared recursively.
            if set(submitted_n.keys()) != set(expected_n.keys()):
                return False
            return all(
                compare_answers(submitted_n[k], expected_n[k], tolerance)
                for k in expected_n
            )

        if isinstance(expected_n, float):
            return abs(submitted_n - expected_n) <= tolerance

        # String comparison
        return str(submitted_n) == str(expected_n)

    # Type mismatch after normalization - try numeric comparison
    try:
        sub_f = (
            float(submitted_n) if not isinstance(submitted_n, (list, dict)) else None
        )
        exp_f = float(expected_n) if not isinstance(expected_n, (list, dict)) else None
        if sub_f is not None and exp_f is not None:
            return abs(sub_f - exp_f) <= tolerance
    except (ValueError, TypeError):
        pass

    # Try markdown table parsing if expected is list and submitted is string
    if isinstance(expected_n, list) and isinstance(submitted_n, str):
        parsed = parse_markdown_table(submitted)  # Use original, not normalized
        if parsed is not None:
            # Recurse with the originals so tolerance applies to table cells.
            return compare_answers(parsed, expected, tolerance)

    # Fallback: string comparison
    return str(submitted_n).lower() == str(expected_n).lower()
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def score_correctness(submitted: Any, expected: Any, tolerance: float = 0.01) -> float:
    """Score the submitted answer. Weight: 0.80 of the total reward.

    Scoring:
    - 0.80: match (numeric within *tolerance*, or structural equality via
      compare_answers for non-numeric answers)
    - 0.20: near miss (rounding error within 1%, or a 100x scale slip
      such as 0.0959 submitted where 9.59 was expected)
    - 0.00: wrong (or nothing submitted)

    Args:
        submitted: Submitted answer
        expected: Expected answer
        tolerance: Numeric tolerance for comparison

    Returns:
        Correctness score (0.0, 0.20, or 0.80)
    """
    if submitted is None:
        return 0.0

    try:
        got = float(submitted)
        want = float(expected)
    except (ValueError, TypeError):
        # Non-numeric answers: fall back to structured/string comparison.
        if compare_answers(submitted, expected, tolerance=tolerance):
            return 0.80
        return 0.0

    # Exact match (within tolerance).
    if abs(got - want) < tolerance:
        return 0.80

    if want != 0:
        ratio = got / want
        # Partial credit: 100x scale error (decimal vs percentage, either
        # direction) or a rounding error within 1% of the expected value.
        if 0.009 < ratio < 0.011 or 99 < ratio < 101 or 0.99 < ratio < 1.01:
            return 0.20

    return 0.0
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def score_efficiency(turns: int, max_turns: int, is_correct: bool) -> float:
    """Score based on turns used (fewer = better). Weight: 0.10

    Only applies if the answer is correct; incorrect answers earn no
    efficiency credit.

    Args:
        turns: Number of turns used
        max_turns: Maximum turns allowed
        is_correct: Whether the answer was correct

    Returns:
        Efficiency score in [0.0, 0.10], computed as
        0.10 * (1 - turns / max_turns). E.g. 1 of 10 turns -> 0.09,
        and using the full turn budget -> 0.0.
    """
    if not is_correct:
        return 0.0

    # Linear decay: each turn costs 0.10 / max_turns of the efficiency
    # component; clamped at 0 in case turns ever exceeds max_turns.
    efficiency = max(0.0, 1.0 - (turns / max_turns))
    return 0.10 * efficiency
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def score_token_cost(
    input_tokens: int,
    output_tokens: int,
    is_correct: bool,
    input_price: float = 1.0,
    output_price: float = 5.0,
    target_cost: float = 0.01,
    max_cost: float = 0.10,
) -> Tuple[float, float]:
    """Score based on token spend (cheaper = better). Weight: 0.10

    Only applies if the answer is correct; incorrect answers get a zero
    score and a zero reported cost.

    Args:
        input_tokens: Number of input tokens used
        output_tokens: Number of output tokens used
        is_correct: Whether the answer was correct
        input_price: Price per 1M input tokens (default $1)
        output_price: Price per 1M output tokens (default $5)
        target_cost: Cost at or below which the full score is earned (default $0.01)
        max_cost: Cost at or above which the score is zero (default $0.10)

    Returns:
        Tuple of (token_score, cost_usd)
    """
    if not is_correct:
        return 0.0, 0.0

    # Dollar cost at per-million-token prices.
    cost = (
        input_tokens * input_price / 1_000_000
        + output_tokens * output_price / 1_000_000
    )

    # Linear ramp: full credit at/below target_cost, none at/above max_cost.
    if cost <= target_cost:
        fraction = 1.0
    elif cost >= max_cost:
        fraction = 0.0
    else:
        fraction = 1.0 - ((cost - target_cost) / (max_cost - target_cost))

    return 0.10 * fraction, cost
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def compute_reward(
    submitted: Any,
    expected: Any,
    tolerance: float,
    turns: int,
    max_turns: int,
    input_tokens: int = 0,
    output_tokens: int = 0,
) -> Tuple[float, float, float, float]:
    """Combine all scoring components into the episode reward.

    Components: correctness (0.80 weight), turn efficiency (0.10) and
    token cost (0.10). Efficiency and token scores are only awarded when
    the answer earned any correctness credit.

    Args:
        submitted: Submitted answer
        expected: Expected answer
        tolerance: Numeric tolerance
        turns: Number of turns used
        max_turns: Maximum turns allowed
        input_tokens: Number of input tokens (optional)
        output_tokens: Number of output tokens (optional)

    Returns:
        Tuple of (total_reward, correctness, efficiency, token_cost_usd)
    """
    # Correctness (0.80 weight); any non-zero credit counts as "correct"
    # for the purposes of the secondary components.
    correctness = score_correctness(submitted, expected, tolerance)
    answered_correctly = correctness > 0

    # Efficiency (0.10 weight)
    efficiency = score_efficiency(turns, max_turns, answered_correctly)

    # Token cost (0.10 weight). Without real accounting, estimate usage
    # from the turn count (1000 in / 500 out per turn).
    if input_tokens == 0 and output_tokens == 0:
        input_tokens = turns * 1000
        output_tokens = turns * 500

    token_score, cost_usd = score_token_cost(
        input_tokens, output_tokens, answered_correctly
    )

    return correctness + efficiency + token_score, correctness, efficiency, cost_usd
|
codedark/server/tools.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CodeDark Tool Implementations
|
| 3 |
+
|
| 4 |
+
Tools available to agents:
|
| 5 |
+
- run_python: Execute Python/pandas code in sandboxed environment
|
| 6 |
+
- read_notes: Read all saved notes from current episode
|
| 7 |
+
- save_note: Save a note for later recall
|
| 8 |
+
- clarify: Ask clarifying question (max 2 per episode)
|
| 9 |
+
- submit_answer: Submit final answer (ends episode)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import re
|
| 13 |
+
import ast
|
| 14 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 15 |
+
|
| 16 |
+
import pandas as pd
|
| 17 |
+
import numpy as np
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Safe builtins for sandboxed code execution
|
| 21 |
+
# Safe builtins for sandboxed code execution.
# NOTE(security): this allowlist is only a soft sandbox. It omits
# __import__, open and exec/eval, but it does expose "getattr", "type",
# "vars", "globals" and "locals", which allow executed code to walk
# object internals (e.g. attribute chains through __class__). Treat this
# as a convenience guard, not an isolation boundary — untrusted code
# should additionally run in an isolated process/container.
SAFE_BUILTINS = {
    "len": len,
    "sum": sum,
    "min": min,
    "max": max,
    "abs": abs,
    "round": round,
    "sorted": sorted,
    "range": range,
    "int": int,
    "float": float,
    "str": str,
    "bool": bool,
    "list": list,
    "dict": dict,
    "set": set,
    "tuple": tuple,
    "enumerate": enumerate,
    "zip": zip,
    "True": True,
    "False": False,
    "None": None,
    # "print" is the real builtin: output goes to the host process stdout,
    # not into the tool result (results are returned via the 'result' var).
    "print": print,
    "type": type,
    "isinstance": isinstance,
    "map": map,
    "filter": filter,
    "any": any,
    "all": all,
    "hasattr": hasattr,
    "getattr": getattr,
    "repr": repr,
    "locals": locals,
    "globals": globals,
    "dir": dir,
    "vars": vars,
    "reversed": reversed,
    "slice": slice,
    "format": format,
    # Common exception types so sandboxed code can use try/except.
    "Exception": Exception,
    "ValueError": ValueError,
    "TypeError": TypeError,
    "KeyError": KeyError,
    "IndexError": IndexError,
    "AttributeError": AttributeError,
}
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def run_python(
    code: str, df: pd.DataFrame, max_output_chars: int = 200
) -> Tuple[str, str, int]:
    """Execute Python code in a soft-sandboxed environment.

    The code runs via exec() with builtins restricted to SAFE_BUILTINS and
    with 'pd', 'np' and a *copy* of 'df' in scope (so mutations do not
    persist between calls). The snippet must store its output in a variable
    named 'result'; DataFrame/Series results are previewed (3/5 rows) and
    all previews are truncated to *max_output_chars*.

    NOTE(security): SAFE_BUILTINS is not a hard isolation boundary (see its
    definition) — run untrusted code in an isolated process/container.

    Args:
        code: Python code to execute
        df: DataFrame available as 'df' in execution context
        max_output_chars: Maximum characters for output truncation

    Returns:
        Tuple of (stdout, stderr, exit_code); exit_code 1 on any error,
        including a missing 'result' variable.
    """
    if df is None:
        return "", "Error: No dataframe loaded", 1

    local_vars = {
        "pd": pd,
        "np": np,
        "df": df.copy(),
    }

    try:
        exec(code, {"__builtins__": SAFE_BUILTINS}, local_vars)
        result = local_vars.get("result")

        # The 'result' convention makes output explicit; setting it to None
        # is indistinguishable from not setting it and is treated as an error.
        if result is None:
            return (
                "",
                "Error: No 'result' variable set. Store your result in 'result'.",
                1,
            )

        # Format output with truncation
        if isinstance(result, pd.DataFrame):
            preview = result.head(3).to_string()
        elif isinstance(result, pd.Series):
            preview = result.head(5).to_string()
        else:
            preview = str(result)

        # Truncate if needed
        if len(preview) > max_output_chars:
            preview = preview[:max_output_chars] + "..."

        return f"run_python Result:\n{preview}", "", 0

    except Exception as e:
        # Any runtime error in the snippet is reported via stderr.
        return "", f"run_python Error: {e}", 1
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def read_notes(notes: List[str]) -> Tuple[str, str, int]:
    """Render all saved notes as a bulleted listing.

    Args:
        notes: Previously saved note strings.

    Returns:
        Tuple of (stdout, stderr, exit_code); always succeeds.
    """
    if not notes:
        return "No notes saved yet.", "", 0

    bullets = "\n".join("- " + note for note in notes)
    return f"Saved notes:\n{bullets}", "", 0
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def save_note(content: str, notes: List[str]) -> Tuple[str, str, int]:
    """Append a note to the episode's persistent memory.

    Args:
        content: Note content; surrounding whitespace is stripped.
        notes: Note list to append to (mutated in place).

    Returns:
        Tuple of (stdout, stderr, exit_code); exit_code 1 when the note
        is empty after stripping (list left unchanged).
    """
    trimmed = content.strip()
    if not trimmed:
        return "", "Error: Empty note content", 1

    notes.append(trimmed)
    listing = "\n".join("- " + note for note in notes)
    return f"Note saved.\n\nAll notes:\n{listing}", "", 0
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def clarify(
    question: str,
    clarify_count: int,
    max_clarifications: int,
    ambiguities: Optional[List[str]] = None,
    answer_type: str = "scalar",
) -> Tuple[str, str, int, int]:
    """Ask a clarifying question about the task.

    Builds a keyword->response map from the task's declared ambiguities
    plus the expected answer type, then returns the first entry whose key
    appears in the (lowercased) question. Dict insertion order therefore
    sets match priority: percentile, rate, target, boundary, then format.
    Unmatched questions get a generic "use your best interpretation"
    response; either way one clarification is consumed.

    Args:
        question: The clarifying question
        clarify_count: Current number of clarifications used
        max_clarifications: Maximum allowed clarifications
        ambiguities: List of known ambiguities from task metadata
        answer_type: Expected answer type ("scalar", "list", etc.)

    Returns:
        Tuple of (stdout, stderr, exit_code, new_clarify_count); the count
        is unchanged (and exit_code is 1) when the budget is exhausted.
    """
    if clarify_count >= max_clarifications:
        return (
            "",
            f"Error: Maximum {max_clarifications} clarifications per episode. Please proceed with your best interpretation.",
            1,
            clarify_count,
        )

    question_lower = question.lower()
    ambiguities = ambiguities or []

    # Build clarification responses from task metadata
    clarifications = {}

    for amb in ambiguities:
        amb_lower = amb.lower()
        if (
            "percentile" in amb_lower
            or "inclusive" in amb_lower
            or "exclusive" in amb_lower
        ):
            clarifications["percentile"] = (
                "Use >= for 'top X%' (inclusive of threshold) and <= for 'bottom X%'."
            )
        if "rate" in amb_lower or "percentage" in amb_lower:
            clarifications["rate"] = (
                "Express rates as percentages 0-100, rounded to 2 decimal places."
            )
        if (
            "positive" in amb_lower
            or "success" in amb_lower
            or "target" in amb_lower
            or "y=1" in amb_lower
        ):
            clarifications["target"] = "Subscription/success means y=1 in the dataset."
        if "boundary" in amb_lower:
            clarifications["boundary"] = (
                "Include boundary values (>=, <=) when filtering."
            )

    # Add format clarifications from answer type
    if answer_type == "scalar":
        clarifications["format"] = (
            "Return a single numeric value, rounded to 2 decimal places."
        )
    elif answer_type == "list":
        clarifications["format"] = (
            "Return as a list/DataFrame with the specified columns."
        )

    # Try to match question to a clarification
    # (all keys are single words, so the key.split() clause is currently
    # redundant with the direct substring test; kept for multi-word keys)
    response = None
    for key, value in clarifications.items():
        if key in question_lower or any(word in question_lower for word in key.split()):
            response = value
            break

    if response:
        return f"Clarification: {response}", "", 0, clarify_count + 1
    else:
        return (
            "Clarification: Please proceed with your best interpretation based on standard data analysis conventions.",
            "",
            0,
            clarify_count + 1,
        )
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def submit_answer(answer_str: str) -> Tuple[str, str, int, Any]:
    """Parse and submit the final answer.

    The raw string is trimmed and a trailing "%" dropped, then parsed as
    a Python literal (lists/dicts/numbers), then as a float, finally
    falling back to the cleaned string itself.

    Args:
        answer_str: Answer string to parse and submit

    Returns:
        Tuple of (stdout, stderr, exit_code, parsed_answer); exit_code 1
        and answer None when the string is empty after cleaning.
    """
    cleaned = answer_str.strip().rstrip("%").strip()

    if not cleaned:
        return "", "Error: Empty answer", 1, None

    # Structured answers first (lists, dicts, quoted strings, numbers).
    try:
        parsed: Any = ast.literal_eval(cleaned)
    except (ValueError, SyntaxError):
        try:
            parsed = float(cleaned)
        except ValueError:
            parsed = cleaned

    return "[SUBMITTED]", "", 0, parsed
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def parse_tool_call(args: str, tool_name: str) -> Tuple[Optional[str], Optional[str]]:
    """Extract a tool's payload from its raw args string.

    run_python, clarify and submit_answer wrap their payload in
    <code>/<question>/<answer> tags respectively; read_notes and
    save_note take the raw args verbatim.

    Args:
        args: Raw args string
        tool_name: Name of the tool being called

    Returns:
        Tuple of (parsed_content, error_message) — exactly one is None.
    """
    # Tag-wrapped tools: tag name plus the error shown when the tag is missing.
    tagged_tools = {
        "run_python": ("code", "No <code> tag found. Use: <code>your_code</code>"),
        "clarify": (
            "question",
            "No <question> tag found. Use: <question>your question</question>",
        ),
        "submit_answer": (
            "answer",
            "No <answer> tag found. Use: <answer>value</answer>",
        ),
    }

    if tool_name in tagged_tools:
        tag, missing_msg = tagged_tools[tool_name]
        found = re.search(rf"<{tag}>(.*?)</{tag}>", args, re.DOTALL)
        if found is None:
            return None, missing_msg
        return found.group(1).strip(), None

    if tool_name in ("read_notes", "save_note"):
        # These tools consume the raw args directly.
        return args.strip(), None

    return None, f"Unknown tool: {tool_name}"
|
codedark/tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""CodeDark Tests."""
|
codedark/tests/test_environment.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for CodeDark environment."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from codedark.models import CodeDarkAction, CodeDarkObservation, CodeDarkState
|
| 7 |
+
from codedark.server.environment import CodeDarkEnvironment
|
| 8 |
+
from codedark.server.scoring import score_correctness, score_efficiency, compute_reward
|
| 9 |
+
from codedark.server.tools import run_python, read_notes, save_note, parse_tool_call
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class TestModels:
    """Test Pydantic models (action, observation, state)."""

    def test_action_creation(self):
        """An action stores the tool name and raw args verbatim."""
        action = CodeDarkAction(tool="run_python", args="<code>result = 1</code>")
        assert action.tool == "run_python"
        assert "result = 1" in action.args

    def test_observation_defaults(self):
        """A bare observation starts empty, not-done, and unscored."""
        obs = CodeDarkObservation()
        assert obs.stdout == ""
        assert obs.done is False
        assert obs.reward is None

    def test_state_defaults(self):
        """Fresh episode state has no notes and a zero turn counter."""
        state = CodeDarkState()
        assert state.notes == []
        assert state.turn_count == 0
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class TestTools:
    """Test tool implementations (arg parsing and note helpers)."""

    def test_parse_run_python(self):
        """run_python args are extracted from the <code> tag."""
        content, error = parse_tool_call("<code>result = df.shape</code>", "run_python")
        assert content == "result = df.shape"
        assert error is None

    def test_parse_run_python_missing_tag(self):
        """A missing <code> tag yields an error message, not content."""
        content, error = parse_tool_call("result = df.shape", "run_python")
        assert content is None
        assert "No <code> tag" in error

    def test_parse_submit_answer(self):
        """submit_answer args are extracted from the <answer> tag."""
        content, error = parse_tool_call("<answer>42.5</answer>", "submit_answer")
        assert content == "42.5"
        assert error is None

    def test_read_notes_empty(self):
        """Reading with no notes succeeds with a placeholder message."""
        stdout, stderr, exit_code = read_notes([])
        assert "No notes saved" in stdout
        assert exit_code == 0

    def test_read_notes_with_content(self):
        """All saved notes appear in the listing."""
        notes = ["Note 1", "Note 2"]
        stdout, stderr, exit_code = read_notes(notes)
        assert "Note 1" in stdout
        assert "Note 2" in stdout
        assert exit_code == 0

    def test_save_note(self):
        """Saving appends to the notes list in place."""
        notes = []
        stdout, stderr, exit_code = save_note("Test note", notes)
        assert "Note saved" in stdout
        assert len(notes) == 1
        assert notes[0] == "Test note"
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class TestScoring:
    """Unit tests for the correctness / efficiency / reward functions."""

    def test_correctness_exact_match(self):
        """An exact answer earns the full correctness score."""
        assert score_correctness(42.5, 42.5, tolerance=0.01) == 0.80

    def test_correctness_within_tolerance(self):
        """An answer inside the tolerance band still earns full credit."""
        assert score_correctness(42.505, 42.5, tolerance=0.01) == 0.80

    def test_correctness_wrong(self):
        """A clearly wrong answer earns nothing."""
        assert score_correctness(100.0, 42.5, tolerance=0.01) == 0.0

    def test_correctness_scale_error(self):
        """A 100x scale error (0.425 vs 42.5) earns partial credit."""
        assert score_correctness(0.425, 42.5, tolerance=0.01) == 0.20

    def test_efficiency_correct_answer(self):
        """A correct single-turn solve earns a high efficiency bonus."""
        assert score_efficiency(turns=1, max_turns=10, is_correct=True) > 0.08

    def test_efficiency_incorrect_answer(self):
        """No efficiency bonus is granted when the answer is wrong."""
        assert score_efficiency(turns=1, max_turns=10, is_correct=False) == 0.0

    def test_compute_reward_correct(self):
        """A correct submission yields full correctness plus a positive total."""
        total, correctness, _efficiency, _cost = compute_reward(
            submitted=42.5,
            expected=42.5,
            tolerance=0.01,
            turns=3,
            max_turns=10,
        )
        # Total must exceed the bare correctness component.
        assert total > 0.8
        assert correctness == 0.80
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class TestEnvironment:
    """Integration-style tests for the CodeDark environment loop."""

    @pytest.fixture
    def env(self):
        """Build an environment backed by the bundled test datasets."""
        base = Path(__file__).parent.parent / "data"
        return CodeDarkEnvironment(
            data_dir=str(base),
            tasks_path=str(base / "tasks" / "final_25_tasks.jsonl"),
            max_turns=10,
        )

    def test_reset_loads_task(self, env):
        """reset() with no arguments selects some task and starts fresh."""
        first = env.reset()
        assert first.task_id != ""
        assert first.question != ""
        assert first.done is False

    def test_reset_specific_task(self, env):
        """reset(task_id=...) loads exactly the requested task."""
        first = env.reset(task_id="bank_hard_001")
        assert first.task_id == "bank_hard_001"
        assert "subscription rate" in first.question.lower()

    def test_step_run_python(self, env):
        """run_python executes code against the task's dataframe."""
        env.reset(task_id="bank_hard_001")
        step_obs = env.step(
            CodeDarkAction(tool="run_python", args="<code>result = df.shape</code>")
        )
        assert step_obs.exit_code == 0
        assert "run_python Result" in step_obs.stdout

    def test_step_save_note(self, env):
        """save_note records one note and confirms via stdout."""
        env.reset(task_id="bank_hard_001")
        step_obs = env.step(CodeDarkAction(tool="save_note", args="Test observation"))
        assert "Note saved" in step_obs.stdout
        assert len(step_obs.notes) == 1

    def test_step_read_notes(self, env):
        """read_notes echoes a previously saved note."""
        env.reset(task_id="bank_hard_001")
        env.step(CodeDarkAction(tool="save_note", args="Important finding"))
        step_obs = env.step(CodeDarkAction(tool="read_notes", args=""))
        assert "Important finding" in step_obs.stdout

    def test_step_submit_answer(self, env):
        """submit_answer ends the episode and scores the submission."""
        env.reset(task_id="bank_hard_001")
        step_obs = env.step(
            CodeDarkAction(tool="submit_answer", args="<answer>2.44</answer>")
        )
        assert step_obs.done is True
        assert step_obs.submitted is True
        assert step_obs.reward is not None
        # 2.44 is the golden answer for bank_hard_001, so full correctness.
        assert step_obs.correctness == 0.80

    def test_turn_counting(self, env):
        """Each step increments the environment's turn counter."""
        env.reset()
        assert env.state.turn_count == 0

        env.step(CodeDarkAction(tool="run_python", args="<code>result = 1</code>"))
        assert env.state.turn_count == 1

        env.step(CodeDarkAction(tool="run_python", args="<code>result = 2</code>"))
        assert env.state.turn_count == 2
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
if __name__ == "__main__":
    # Allow running this test module directly, outside a pytest invocation.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)
|
data/bank.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a071417203c9e1434df5fe794fffae6a55327502b00da9c4e9754d2ab7f7cede
|
| 3 |
+
size 65698328
|
data/road.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ee5955af18eca0d4b53e13f23bd6436422e40ca84077fb8cdcfa467aa62b68f
|
| 3 |
+
size 37936892
|
data/tasks/final_25_tasks.jsonl
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"id": "bank_hard_001", "dataset": "bank", "goal": "What's the subscription rate for month='may' AND job='management' AND balance in Q1?", "expected_output_type": "scalar", "level": "L6", "template": "multi_condition_filter", "golden": {"answer_value": 2.44, "answer_type": "scalar", "verification_code": "df['_q'] = pd.qcut(df['balance'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nfiltered = df[(df['month'] == 'may') & (df['job'] == 'management') & (df['_q'] == 'Q1')]\nresult = round((filtered['y'] == 1).mean() * 100, 2) if len(filtered) > 0 else 0.0\ndf.drop('_q', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "null", "missing", "nan", "empty", "na"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "multi_condition_filter", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"col_a": "month", "val_a": "may", "col_b": "job", "val_b": "management", "metric": "balance", "quartile": "Q1", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 2 |
+
{"id": "bank_hard_005", "dataset": "bank", "goal": "Among customers in top 95% of age AND bottom 10% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.59, "answer_type": "scalar", "verification_code": "p_high = df['age'].quantile(95 / 100)\np_low = df['duration'].quantile(10 / 100)\ncohort = df[(df['age'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "age", "metric_b": "duration", "pct_high": 95, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 3 |
+
{"id": "bank_hard_012", "dataset": "bank", "goal": "Find the job with lowest average balance. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 8.27, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['balance'].mean()\nextrema_val = group_stats.max() if 'lowest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "balance", "extrema": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 4 |
+
{"id": "bank_hard_019", "dataset": "bank", "goal": "What's the subscription rate for month='may' AND job='management' AND age in Q4?", "expected_output_type": "scalar", "level": "L6", "template": "multi_condition_filter", "golden": {"answer_value": 7.04, "answer_type": "scalar", "verification_code": "df['_q'] = pd.qcut(df['age'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nfiltered = df[(df['month'] == 'may') & (df['job'] == 'management') & (df['_q'] == 'Q4')]\nresult = round((filtered['y'] == 1).mean() * 100, 2) if len(filtered) > 0 else 0.0\ndf.drop('_q', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "null", "missing", "nan", "empty", "na"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "multi_condition_filter", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"col_a": "month", "val_a": "may", "col_b": "job", "val_b": "management", "metric": "age", "quartile": "Q4", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 5 |
+
{"id": "bank_hard_020", "dataset": "bank", "goal": "For month with above-average day, which have subscription rate > 25%? Return sorted list.", "expected_output_type": "list", "level": "L5", "template": "top_n_in_segment", "golden": {"answer_value": ["oct"], "answer_type": "list", "verification_code": "avg_metric = df.groupby('month')['day'].mean()\nhigh_metric = avg_metric[avg_metric > avg_metric.mean()].index\nrates = df[df['month'].isin(high_metric)].groupby('month')['y'].apply(\n lambda x: (x == 1).mean() * 100)\nresult = sorted(rates[rates > 25].index.tolist())", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["tie", "ties", "equal", "same", "duplicate"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "top_n_in_segment", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "month", "metric": "day", "threshold": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 6 |
+
{"id": "bank_hard_021", "dataset": "bank", "goal": "Find the job with highest average balance. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 24.62, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['balance'].mean()\nextrema_val = group_stats.max() if 'highest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "balance", "extrema": "highest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 7 |
+
{"id": "bank_hard_023", "dataset": "bank", "goal": "Among customers in top 90% of balance AND bottom 10% of day, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 33.53, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(90 / 100)\np_low = df['day'].quantile(10 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['day'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "day", "pct_high": 90, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 8 |
+
{"id": "bank_hard_026", "dataset": "bank", "goal": "For each job, compute volatility score: std(age) / mean(age). Return top 5 with [group, mean, std, volatility].", "expected_output_type": "dataframe", "level": "L6", "template": "segment_volatility", "golden": {"answer_value": [{"job": "unemployed", "mean": 40.97, "std": 9.74, "volatility": 0.2377}, {"job": "self-employed", "mean": 40.42, "std": 9.46, "volatility": 0.234}, {"job": "admin.", "mean": 39.68, "std": 9.23, "volatility": 0.2326}, {"job": "services", "mean": 38.94, "std": 8.86, "volatility": 0.2275}, {"job": "management", "mean": 40.2, "std": 9.13, "volatility": 0.2271}], "answer_type": "dataframe", "verification_code": "stats = df.groupby('job')['age'].agg(['mean', 'std']).round(2)\nstats['volatility'] = round(stats['std'] / stats['mean'], 4)\nresult = stats.nlargest(5, 'volatility').reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits", "volatility", "cv", "coefficient", "variation"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "segment_volatility", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "age"}}}
|
| 9 |
+
{"id": "bank_hard_028", "dataset": "bank", "goal": "Among customers in top 95% of balance AND bottom 10% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.84, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(95 / 100)\np_low = df['duration'].quantile(10 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 95, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 10 |
+
{"id": "bank_hard_029", "dataset": "bank", "goal": "Among customers in top 90% of balance AND bottom 25% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.54, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(90 / 100)\np_low = df['duration'].quantile(25 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 90, "pct_low": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 11 |
+
{"id": "bank_hard_030", "dataset": "bank", "goal": "Rank job by subscription rate. Which bottom-3 have above-median age?", "expected_output_type": "list", "level": "L6", "template": "ranked_anomaly", "golden": {"answer_value": ["blue-collar", "entrepreneur"], "answer_type": "list", "verification_code": "stats = df.groupby('job').agg(\n rate=('y', lambda x: (x == 1).mean()),\n avg_metric=('age', 'mean'))\nstats['rank'] = stats['rate'].rank()\nbottom_3 = stats[stats['rank'] <= 3]\nresult = sorted(bottom_3[bottom_3['avg_metric'] > stats['avg_metric'].median()].index.tolist())", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rank", "order", "sort", "ascending", "descending"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "ranked_anomaly", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "age", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 12 |
+
{"id": "bank_hard_031", "dataset": "bank", "goal": "For each month, compute volatility score: std(day) / mean(day). Return top 5 with [group, mean, std, volatility].", "expected_output_type": "dataframe", "level": "L6", "template": "segment_volatility", "golden": {"answer_value": [{"month": "feb", "mean": 6.05, "std": 5.46, "volatility": 0.9025}, {"month": "sep", "mean": 11.67, "std": 8.04, "volatility": 0.6889}, {"month": "mar", "mean": 13.45, "std": 9.18, "volatility": 0.6825}, {"month": "jun", "mean": 11.32, "std": 7.33, "volatility": 0.6475}, {"month": "dec", "mean": 14.19, "std": 8.83, "volatility": 0.6223}], "answer_type": "dataframe", "verification_code": "stats = df.groupby('month')['day'].agg(['mean', 'std']).round(2)\nstats['volatility'] = round(stats['std'] / stats['mean'], 4)\nresult = stats.nlargest(5, 'volatility').reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits", "volatility", "cv", "coefficient", "variation"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "segment_volatility", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "month", "metric": "day"}}}
|
| 13 |
+
{"id": "bank_hard_033", "dataset": "bank", "goal": "Show the average balance breakdown by job. Include count and mean balance for each category, sorted by mean descending.", "expected_output_type": "dataframe", "level": "L4", "template": "metric_breakdown", "golden": {"answer_value": [{"job": "retired", "count": 35185, "mean_balance": 1812.07}, {"job": "unknown", "count": 2917, "mean_balance": 1678.96}, {"job": "self-employed", "count": 19020, "mean_balance": 1598.27}, {"job": "student", "count": 11767, "mean_balance": 1577.32}, {"job": "management", "count": 175541, "mean_balance": 1510.39}, {"job": "unemployed", "count": 17634, "mean_balance": 1440.57}, {"job": "entrepreneur", "count": 17718, "mean_balance": 1306.75}, {"job": "housemaid", "count": 15912, "mean_balance": 1281.22}, {"job": "technician", "count": 138107, "mean_balance": 1071.57}, {"job": "admin.", "count": 81492, "mean_balance": 1019.92}, {"job": "blue-collar", "count": 170498, "mean_balance": 977.49}, {"job": "services", "count": 64209, "mean_balance": 834.63}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('job').agg(\n count=('balance', 'size'),\n mean_balance=('balance', lambda x: round(x.mean(), 2))\n).sort_values('mean_balance', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "metric_breakdown", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "balance"}}}
|
| 14 |
+
{"id": "bank_hard_035", "dataset": "bank", "goal": "Find the job with lowest average age. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 34.08, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['age'].mean()\nextrema_val = group_stats.max() if 'lowest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "age", "extrema": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 15 |
+
{"id": "bank_hard_038", "dataset": "bank", "goal": "Find the month with the lowest subscription rate. Within that group, what is the average day?", "expected_output_type": "scalar", "level": "L5", "template": "nested_extrema", "golden": {"answer_value": 16.09, "answer_type": "scalar", "verification_code": "group_rates = df.groupby('month')['y'].apply(lambda x: (x == 1).mean())\nouter_val = group_rates.max() if 'lowest' == 'highest' else group_rates.min()\nouter_tied = group_rates[group_rates == outer_val].sort_index()\nouter_group = outer_tied.index[0]\nsubset = df[df['month'] == outer_group]\nresult = round(subset['day'].mean(), 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["tie", "ties", "equal", "same", "duplicate", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "nested_extrema", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "month", "metric": "day", "extrema_outer": "lowest", "extrema_inner": "highest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 16 |
+
{"id": "bank_hard_039", "dataset": "bank", "goal": "Divide customers into 4 day quartiles. What is the subscription percentage (0-100) in the highest (top 25%) (Q4) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "quartile_conversion", "golden": {"answer_value": 11.55, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['day'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q4']\nresult = round((bin_data['y'] == 1).mean() * 100, 2)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "quartile_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "day", "quartile": "Q4", "quartile_desc": "highest (top 25%)", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 17 |
+
{"id": "bank_hard_040", "dataset": "bank", "goal": "Among customers in top 95% of balance AND bottom 25% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.65, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(95 / 100)\np_low = df['duration'].quantile(25 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 95, "pct_low": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 18 |
+
{"id": "bank_hard_041", "dataset": "bank", "goal": "Which job categories would have the biggest impact if brought to average subscription rate? Return top 3 by potential gain (count * rate gap), sorted by impact.", "expected_output_type": "list", "level": "L5", "template": "segment_improvement_potential", "golden": {"answer_value": ["blue-collar", "services", "entrepreneur"], "answer_type": "list", "verification_code": "overall_rate = (df['y'] == 1).mean()\ngroup_stats = df.groupby('job').agg(\n rate=('y', lambda x: (x == 1).mean()),\n count=('y', 'size')\n)\ngroup_stats['gap'] = overall_rate - group_stats['rate']\ngroup_stats['potential'] = group_stats['count'] * group_stats['gap']\ntop_potential = group_stats[group_stats['gap'] > 0].nlargest(3, 'potential')\nresult = top_potential.index.tolist()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "segment_improvement_potential", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 19 |
+
{"id": "bank_hard_044", "dataset": "bank", "goal": "Find the month with the lowest subscription rate. Within that group, what is the average age?", "expected_output_type": "scalar", "level": "L5", "template": "nested_extrema", "golden": {"answer_value": 38.98, "answer_type": "scalar", "verification_code": "group_rates = df.groupby('month')['y'].apply(lambda x: (x == 1).mean())\nouter_val = group_rates.max() if 'lowest' == 'highest' else group_rates.min()\nouter_tied = group_rates[group_rates == outer_val].sort_index()\nouter_group = outer_tied.index[0]\nsubset = df[df['month'] == outer_group]\nresult = round(subset['age'].mean(), 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["tie", "ties", "equal", "same", "duplicate", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "nested_extrema", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "month", "metric": "age", "extrema_outer": "lowest", "extrema_inner": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
|
| 20 |
+
{"id": "road_hard_014", "dataset": "road", "goal": "Which lighting categories have the highest total reported accidents? Show breakdown with [lighting, count, total_num_reported_accidents, avg_num_reported_accidents] sorted by total descending.", "expected_output_type": "dataframe", "level": "L4", "template": "count_segment_total", "golden": {"answer_value": [{"lighting": "dim", "count": 183826, "total_num_reported_accidents": 211283, "avg_num_reported_accidents": 1.15}, {"lighting": "daylight", "count": 178015, "total_num_reported_accidents": 207579, "avg_num_reported_accidents": 1.17}, {"lighting": "night", "count": 155913, "total_num_reported_accidents": 196214, "avg_num_reported_accidents": 1.26}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('lighting').agg(\n count=('num_reported_accidents', 'size'),\n total_num_reported_accidents=('num_reported_accidents', 'sum'),\n avg_num_reported_accidents=('num_reported_accidents', lambda x: round(x.mean(), 2))\n).sort_values('total_num_reported_accidents', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": [], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "count_segment_total", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "lighting", "target_col": "num_reported_accidents", "target_desc": "reported accidents"}}}
|
| 21 |
+
{"id": "road_hard_021", "dataset": "road", "goal": "Which weather categories have the highest average accident risk? Show breakdown with [weather, count, avg_accident_risk] sorted by average descending.", "expected_output_type": "dataframe", "level": "L4", "template": "continuous_segment_breakdown", "golden": {"answer_value": [{"weather": "foggy", "count": 181463, "avg_accident_risk": 0.3863}, {"weather": "rainy", "count": 156985, "avg_accident_risk": 0.3615}, {"weather": "clear", "count": 179306, "avg_accident_risk": 0.3101}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('weather').agg(\n count=('accident_risk', 'size'),\n avg_accident_risk=('accident_risk', lambda x: round(x.mean(), 4))\n).sort_values('avg_accident_risk', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": [], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_segment_breakdown", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "weather", "target_col": "accident_risk", "target_desc": "accident risk"}}}
|
| 22 |
+
{"id": "road_hard_015", "dataset": "road", "goal": "Divide records into 4 speed_limit quartiles. What is the average accident risk in the lower-middle (25-50%) (Q2) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "continuous_quartile_analysis", "golden": {"answer_value": 0.29, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['speed_limit'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q2']\nresult = round(bin_data['accident_risk'].mean(), 4)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_quartile_analysis", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "speed_limit", "quartile": "Q2", "quartile_desc": "lower-middle (25-50%)", "target_col": "accident_risk", "target_desc": "accident risk"}}}
|
| 23 |
+
{"id": "road_hard_002", "dataset": "road", "goal": "Divide records into 4 curvature quartiles. What is the average accident risk in the upper-middle (50-75%) (Q3) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "continuous_quartile_analysis", "golden": {"answer_value": 0.41, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['curvature'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q3']\nresult = round(bin_data['accident_risk'].mean(), 4)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_quartile_analysis", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "curvature", "quartile": "Q3", "quartile_desc": "upper-middle (50-75%)", "target_col": "accident_risk", "target_desc": "accident risk"}}}
|
| 24 |
+
{"id": "road_hard_007", "dataset": "road", "goal": "How much higher is the average accident risk for lighting='daylight' compared to 'night'? Return difference.", "expected_output_type": "scalar", "level": "L5", "template": "continuous_comparison", "golden": {"answer_value": -0.17, "answer_type": "scalar", "verification_code": "avg_a = df[df['lighting'] == 'daylight']['accident_risk'].mean()\navg_b = df[df['lighting'] == 'night']['accident_risk'].mean()\nresult = round(avg_a - avg_b, 4)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "continuous_comparison", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "lighting", "val_a": "daylight", "val_b": "night", "target_col": "accident_risk", "target_desc": "accident risk"}}}
|
| 25 |
+
{"id": "road_hard_004", "dataset": "road", "goal": "How much higher is the average accident risk for road_type='rural' compared to 'urban'? Return difference.", "expected_output_type": "scalar", "level": "L5", "template": "continuous_comparison", "golden": {"answer_value": -0.01, "answer_type": "scalar", "verification_code": "avg_a = df[df['road_type'] == 'rural']['accident_risk'].mean()\navg_b = df[df['road_type'] == 'urban']['accident_risk'].mean()\nresult = round(avg_a - avg_b, 4)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "continuous_comparison", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "road_type", "val_a": "rural", "val_b": "urban", "target_col": "accident_risk", "target_desc": "accident risk"}}}
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.115.0
|
| 2 |
+
pydantic>=2.0.0
|
| 3 |
+
uvicorn[standard]>=0.24.0
|
| 4 |
+
pandas>=2.0.0
|
| 5 |
+
numpy>=1.24.0
|
| 6 |
+
requests>=2.31.0
|