albert-einstein-09 commited on
Commit
95d976b
·
verified ·
1 Parent(s): fa1e87d

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.csv filter=lfs diff=lfs merge=lfs -text
2
+ *.jsonl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # Set working directory
4
+ WORKDIR /app
5
+
6
+ # Install system dependencies
7
+ RUN apt-get update && \
8
+ apt-get install -y --no-install-recommends curl && \
9
+ rm -rf /var/lib/apt/lists/*
10
+
11
+ # Copy requirements and install Python dependencies
12
+ COPY requirements.txt .
13
+ RUN pip install --no-cache-dir -r requirements.txt
14
+
15
+ # Copy application code
16
+ COPY codedark/ ./codedark/
17
+ COPY data/ ./data/
18
+
19
+ # Environment variables
20
+ ENV PYTHONUNBUFFERED=1
21
+ ENV CODEDARK_DATA_DIR=/app/data
22
+ ENV CODEDARK_TASKS_PATH=/app/data/tasks/final_25_tasks.jsonl
23
+ ENV HOST=0.0.0.0
24
+ ENV PORT=7860
25
+
26
+ # Expose HuggingFace Spaces port
27
+ EXPOSE 7860
28
+
29
+ # Healthcheck
30
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
31
+ CMD curl -f http://localhost:7860/health || exit 1
32
+
33
+ # Run server
34
+ CMD ["python", "-m", "uvicorn", "codedark.server.app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,130 @@
1
  ---
2
- title: Codedark
3
- emoji: 🔥
4
- colorFrom: gray
5
- colorTo: green
6
  sdk: docker
7
  pinned: false
 
 
 
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: CodeDark Environment Server
3
+ emoji: 📊
4
+ colorFrom: yellow
5
+ colorTo: purple
6
  sdk: docker
7
  pinned: false
8
+ license: mit
9
+ tags:
10
+ - openenv
11
+ - reinforcement-learning
12
+ - data-analytics
13
+ - agents
14
+ - benchmark
15
  ---
16
 
17
+ # CodeDark: Data Analytics Environment for RL Agents
18
+
19
+ **OpenEnv-compatible multi-turn environment for training AI agents on real business analytics tasks.**
20
+
21
+ ## Overview
22
+
23
+ CodeDark is the first data analytics environment in the OpenEnv ecosystem. It challenges AI agents to analyze CSV datasets using Python/Pandas, testing their ability to be data scientists rather than just code executors.
24
+
25
+ ### Key Features
26
+
27
+ - **Real Business Tasks**: Bank marketing and road safety datasets with genuine analytical questions
28
+ - **Multi-Turn Interaction**: Agents explore data, save notes, ask clarifications, and submit answers
29
+ - **Shaped Rewards**: 80% correctness + 10% efficiency + 10% token cost
30
+ - **Pre-Benchmarked**: 25 curated L5-L6 difficulty tasks validated on 11+ models
31
+
32
+ ## Quick Start
33
+
34
+ ### Connect to the Environment
35
+
36
+ ```python
37
+ from openenv import EnvClient
38
+
39
+ # Connect to this Space
40
+ env = EnvClient.from_hub("openenv/codedark")
41
+
42
+ # Reset for a new task
43
+ obs = env.reset()
44
+ print(f"Task: {obs['question']}")
45
+
46
+ # Execute Python code
47
+ obs = env.step({"tool": "run_python", "args": "<code>result = df.shape</code>"})
48
+ print(f"Result: {obs['stdout']}")
49
+
50
+ # Submit answer
51
+ obs = env.step({"tool": "submit_answer", "args": "<answer>42.5</answer>"})
52
+ print(f"Reward: {obs['reward']}")
53
+ ```
54
+
55
+ ### Available Tools
56
+
57
+ | Tool | Description |
58
+ | --------------- | -------------------------------------------------------------- |
59
+ | `run_python` | Execute Python/pandas code. Store result in `result` variable. |
60
+ | `read_notes` | Read saved notes from previous turns. |
61
+ | `save_note` | Save observations for later recall. |
62
+ | `clarify` | Ask clarifying questions (max 2 per episode). |
63
+ | `submit_answer` | Submit final answer. Ends episode. |
64
+
65
+ ## Datasets
66
+
67
+ ### Bank Marketing (750K rows)
68
+
69
+ - **Target**: Term deposit subscription prediction
70
+ - **Features**: age, job, marital, education, balance, housing, loan, contact, day, month, duration, campaign
71
+
72
+ ### Road Safety (500K rows)
73
+
74
+ - **Target**: Accident risk assessment
75
+ - **Features**: road_type, num_lanes, curvature, speed_limit, lighting, weather, time_of_day
76
+
77
+ ## Task Difficulty
78
+
79
+ | Level | Complexity | Example |
80
+ | ----- | --------------- | -------------------------------------------- |
81
+ | L4 | Quartile/binned | "Subscription rate in Q1 balance?" |
82
+ | L5 | Multi-condition | "Rate for month='may' AND job='management'?" |
83
+ | L6 | Nested extrema | "In lowest subscription month, avg day?" |
84
+
85
+ ## Reward Structure
86
+
87
+ | Component | Weight | Description |
88
+ | ----------- | ------ | ----------------------------------------------- |
89
+ | Correctness | 80% | Binary correct/incorrect with numeric tolerance |
90
+ | Efficiency | 10% | Fewer turns = better score |
91
+ | Token Cost | 10% | Lower token usage = better score |
92
+
93
+ ## API Endpoints
94
+
95
+ | Endpoint | Method | Description |
96
+ | ----------- | ------ | --------------------- |
97
+ | `/health` | GET | Health check |
98
+ | `/reset` | POST | Reset for new episode |
99
+ | `/step` | POST | Execute action |
100
+ | `/state` | GET | Current state |
101
+ | `/metadata` | GET | Environment metadata |
102
+ | `/schema` | GET | Type schemas |
103
+
104
+ ## Benchmark Results
105
+
106
+ Pre-benchmarked on 11+ models with 1,844 completions:
107
+
108
+ | Model | Accuracy | Avg Turns |
109
+ | ---------------- | -------- | --------- |
110
+ | Claude Opus 4.5 | 77.3% | 4.2 |
111
+ | Qwen3 Max | 46.7% | 5.1 |
112
+ | Mistral Large | 45.3% | 5.8 |
113
+ | Llama 4 Maverick | 38.7% | 6.2 |
114
+
115
+ ## Links
116
+
117
+ - **GitHub**: [vj-09/codeblue-env](https://github.com/vj-09/codeblue-env)
118
+ - **Leaderboard**: [analytics-rl.com](https://www.analytics-rl.com)
119
+ - **OpenEnv Spec**: [meta-pytorch/OpenEnv](https://github.com/meta-pytorch/OpenEnv)
120
+
121
+ ## License
122
+
123
+ MIT License
124
+
125
+ ## Author
126
+
127
+ **Vijay Athithya**
128
+
129
+ - GitHub: [@vj-09](https://github.com/vj-09)
130
+ - LinkedIn: [vijay-athithya](https://www.linkedin.com/in/vijay-athithya/)
codedark/.pytest_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Created by pytest automatically.
2
+ *
codedark/.pytest_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
codedark/.pytest_cache/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
codedark/.pytest_cache/v/cache/lastfailed ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "tests/test_environment.py::TestModels": true,
3
+ "tests/test_environment.py::TestTools": true,
4
+ "tests/test_environment.py::TestScoring": true,
5
+ "tests/test_environment.py::TestEnvironment": true
6
+ }
codedark/.pytest_cache/v/cache/nodeids ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "tests/test_environment.py::TestEnvironment::test_reset_loads_task",
3
+ "tests/test_environment.py::TestEnvironment::test_reset_specific_task",
4
+ "tests/test_environment.py::TestEnvironment::test_step_read_notes",
5
+ "tests/test_environment.py::TestEnvironment::test_step_run_python",
6
+ "tests/test_environment.py::TestEnvironment::test_step_save_note",
7
+ "tests/test_environment.py::TestEnvironment::test_step_submit_answer",
8
+ "tests/test_environment.py::TestEnvironment::test_turn_counting",
9
+ "tests/test_environment.py::TestModels::test_action_creation",
10
+ "tests/test_environment.py::TestModels::test_observation_defaults",
11
+ "tests/test_environment.py::TestModels::test_state_defaults",
12
+ "tests/test_environment.py::TestScoring::test_compute_reward_correct",
13
+ "tests/test_environment.py::TestScoring::test_correctness_exact_match",
14
+ "tests/test_environment.py::TestScoring::test_correctness_scale_error",
15
+ "tests/test_environment.py::TestScoring::test_correctness_within_tolerance",
16
+ "tests/test_environment.py::TestScoring::test_correctness_wrong",
17
+ "tests/test_environment.py::TestScoring::test_efficiency_correct_answer",
18
+ "tests/test_environment.py::TestScoring::test_efficiency_incorrect_answer",
19
+ "tests/test_environment.py::TestTools::test_parse_run_python",
20
+ "tests/test_environment.py::TestTools::test_parse_run_python_missing_tag",
21
+ "tests/test_environment.py::TestTools::test_parse_submit_answer",
22
+ "tests/test_environment.py::TestTools::test_read_notes_empty",
23
+ "tests/test_environment.py::TestTools::test_read_notes_with_content",
24
+ "tests/test_environment.py::TestTools::test_save_note"
25
+ ]
codedark/README.md ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CodeDark
2
+
3
+ **OpenEnv-compatible multi-turn data analytics environment for RL agent training.**
4
+
5
+ Train AI agents to be data scientists, not just code executors. CodeDark features real business analytics tasks with pandas/numpy, multi-metric reward shaping, and skill-based curriculum.
6
+
7
+ ## Quick Start
8
+
9
+ ### Server
10
+
11
+ ```bash
12
+ # Install
13
+ pip install -e .
14
+
15
+ # Run server
16
+ python -m codedark.server.app
17
+ # Server runs at http://localhost:8000
18
+ ```
19
+
20
+ ### Client
21
+
22
+ ```python
23
+ from codedark import CodeDarkEnv
24
+
25
+ env = CodeDarkEnv("http://localhost:8000")
26
+
27
+ # Reset for new episode
28
+ obs = env.reset()
29
+ print(f"Task: {obs['question']}")
30
+
31
+ # Execute Python code
32
+ obs = env.run_python("result = df.shape")
33
+ print(f"Shape: {obs['stdout']}")
34
+
35
+ # Explore the data
36
+ obs = env.run_python("result = df.columns.tolist()")
37
+ print(f"Columns: {obs['stdout']}")
38
+
39
+ # Calculate and submit answer
40
+ obs = env.run_python("result = df['y'].mean() * 100")
41
+ obs = env.submit_answer(11.26)
42
+ print(f"Reward: {obs['reward']}")
43
+ ```
44
+
45
+ ### Docker
46
+
47
+ ```bash
48
+ # Build
49
+ docker build -t codedark:latest -f server/Dockerfile .
50
+
51
+ # Run
52
+ docker run -p 8000:8000 codedark:latest
53
+ ```
54
+
55
+ ## Tools
56
+
57
+ Agents have access to 5 tools:
58
+
59
+ | Tool | Description |
60
+ | --------------- | --------------------------------------------------------------- |
61
+ | `run_python` | Execute Python/pandas code. Store output in `result` variable. |
62
+ | `read_notes` | Read all saved notes from previous turns. |
63
+ | `save_note` | Save observations for later recall. Notes persist across turns. |
64
+ | `clarify` | Ask clarifying questions about the task (max 2 per episode). |
65
+ | `submit_answer` | Submit final answer. Ends episode. |
66
+
67
+ ## Reward Structure
68
+
69
+ Total reward is computed from three components (max 1.0):
70
+
71
+ | Component | Weight | Description |
72
+ | ----------- | ------ | ----------------------------------------------- |
73
+ | Correctness | 80% | Binary correct/incorrect with numeric tolerance |
74
+ | Efficiency | 10% | Fewer turns = better score |
75
+ | Token Cost | 10% | Lower token usage = better score |
76
+
77
+ ## Datasets
78
+
79
+ ### Bank Marketing
80
+
81
+ - **Records**: 750,000 customers
82
+ - **Target**: Term deposit subscription (y = 0/1)
83
+ - **Features**: age, job, marital, education, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome
84
+
85
+ ### Road Safety
86
+
87
+ - **Records**: 500,000 road segments
88
+ - **Target**: Accident risk (continuous)
89
+ - **Features**: road_type, num_lanes, curvature, speed_limit, lighting, weather, road_signs_present, time_of_day, num_reported_accidents
90
+
91
+ ## Task Difficulty
92
+
93
+ | Level | Complexity | Example |
94
+ | ----- | --------------- | -------------------------------------------- |
95
+ | L4 | Quartile/binned | "Subscription rate in Q1 balance?" |
96
+ | L5 | Multi-condition | "Rate for month='may' AND job='management'?" |
97
+ | L6 | Nested extrema | "In lowest subscription month, avg day?" |
98
+
99
+ ## API Endpoints
100
+
101
+ | Endpoint | Method | Description |
102
+ | ----------- | ------ | --------------------- |
103
+ | `/health` | GET | Health check |
104
+ | `/reset` | POST | Reset for new episode |
105
+ | `/step` | POST | Execute action |
106
+ | `/state` | GET | Current state |
107
+ | `/metadata` | GET | Environment metadata |
108
+ | `/schema` | GET | Type schemas |
109
+
110
+ ## Benchmark Results
111
+
112
+ Pre-benchmarked on 11+ models with 1,844 completions:
113
+
114
+ | Model | Accuracy | Cost/Task |
115
+ | ---------------- | -------- | --------- |
116
+ | Claude Opus 4.5 | 77.3% | $0.89 |
117
+ | Qwen3 Max | 46.7% | $0.12 |
118
+ | Mistral Large | 45.3% | $0.18 |
119
+ | Llama 4 Maverick | 38.7% | $0.08 |
120
+
121
+ ## Environment Variables
122
+
123
+ | Variable | Default | Description |
124
+ | --------------------- | --------------------------------- | ------------------------- |
125
+ | `CODEDARK_DATA_DIR` | `data/` | Path to CSV files |
126
+ | `CODEDARK_TASKS_PATH` | `data/tasks/final_25_tasks.jsonl` | Path to tasks file |
127
+ | `CODEDARK_MAX_TURNS` | `10` | Maximum turns per episode |
128
+ | `HOST` | `0.0.0.0` | Server host |
129
+ | `PORT` | `8000` | Server port |
130
+
131
+ ## Project Structure
132
+
133
+ ```
134
+ codedark/
135
+ ├── __init__.py # Package exports
136
+ ├── models.py # Action, Observation, State dataclasses
137
+ ├── client.py # HTTP client
138
+ ├── openenv.yaml # OpenEnv manifest
139
+ ├── pyproject.toml # Package config
140
+ ├── server/
141
+ │ ├── app.py # FastAPI application
142
+ │ ├── environment.py # Core environment logic
143
+ │ ├── tools.py # Tool implementations
144
+ │ ├── scoring.py # Reward computation
145
+ │ ├── Dockerfile # Container spec
146
+ │ └── requirements.txt # Dependencies
147
+ ├── data/
148
+ │ ├── bank.csv # Bank marketing dataset
149
+ │ ├── road.csv # Road safety dataset
150
+ │ └── tasks/
151
+ │ └── final_25_tasks.jsonl
152
+ └── tests/
153
+ ```
154
+
155
+ ## OpenEnv Compatibility
156
+
157
+ CodeDark follows the [OpenEnv specification](https://huggingface.co/openenv):
158
+
159
+ - Gymnasium-style `reset()` / `step()` API
160
+ - Pydantic models for Action, Observation, State
161
+ - FastAPI server with standard endpoints
162
+ - Docker containerization for isolated execution
163
+ - HTTP + WebSocket transport
164
+
165
+ ## License
166
+
167
+ MIT
168
+
169
+ ## Author
170
+
171
+ Vijay Athithya
172
+
173
+ - GitHub: [vj-09](https://github.com/vj-09)
174
+ - LinkedIn: [vijay-athithya](https://www.linkedin.com/in/vijay-athithya/)
codedark/__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CodeDark - OpenEnv-Compatible Data Analytics Environment
3
+
4
+ Multi-turn RL environment for training AI agents on data analytics tasks.
5
+
6
+ Example usage:
7
+ from codedark import CodeDarkEnv
8
+
9
+ env = CodeDarkEnv("http://localhost:8000")
10
+ obs = env.reset()
11
+ print(f"Task: {obs['question']}")
12
+
13
+ obs = env.run_python("result = df['y'].mean() * 100")
14
+ obs = env.submit_answer(11.26)
15
+ print(f"Reward: {obs['reward']}")
16
+ """
17
+
18
+ from .client import CodeDarkEnv
19
+ from .models import (
20
+ CodeDarkAction,
21
+ CodeDarkObservation,
22
+ CodeDarkState,
23
+ ResetRequest,
24
+ StepRequest,
25
+ HealthResponse,
26
+ )
27
+
28
+ __all__ = [
29
+ "CodeDarkEnv",
30
+ "CodeDarkAction",
31
+ "CodeDarkObservation",
32
+ "CodeDarkState",
33
+ "ResetRequest",
34
+ "StepRequest",
35
+ "HealthResponse",
36
+ ]
37
+
38
+ __version__ = "0.1.0"
codedark/client.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CodeDark Client
3
+
4
+ HTTP client for interacting with CodeDark environment server.
5
+ Follows OpenEnv EnvClient pattern.
6
+ """
7
+
8
+ from typing import Any, Dict, Optional
9
+ import requests
10
+
11
+
12
+ class CodeDarkEnv:
13
+ """Client for CodeDark environment.
14
+
15
+ Example usage:
16
+ env = CodeDarkEnv("http://localhost:8000")
17
+ obs = env.reset()
18
+ print(f"Task: {obs['question']}")
19
+
20
+ obs = env.step("run_python", "<code>result = df.shape</code>")
21
+ print(f"Result: {obs['stdout']}")
22
+
23
+ obs = env.step("submit_answer", "<answer>11.26</answer>")
24
+ print(f"Reward: {obs['reward']}")
25
+ """
26
+
27
+ def __init__(self, base_url: str = "http://localhost:8000", timeout: int = 30):
28
+ """Initialize client.
29
+
30
+ Args:
31
+ base_url: Server URL
32
+ timeout: Request timeout in seconds
33
+ """
34
+ self.base_url = base_url.rstrip("/")
35
+ self.timeout = timeout
36
+ self._session = requests.Session()
37
+
38
+ def reset(
39
+ self, task_id: Optional[str] = None, seed: Optional[int] = None
40
+ ) -> Dict[str, Any]:
41
+ """Reset environment for a new episode.
42
+
43
+ Args:
44
+ task_id: Specific task to load (optional)
45
+ seed: Random seed for task selection (optional)
46
+
47
+ Returns:
48
+ Initial observation dict
49
+ """
50
+ payload = {}
51
+ if task_id is not None:
52
+ payload["task_id"] = task_id
53
+ if seed is not None:
54
+ payload["seed"] = seed
55
+
56
+ response = self._session.post(
57
+ f"{self.base_url}/reset",
58
+ json=payload if payload else None,
59
+ timeout=self.timeout,
60
+ )
61
+ response.raise_for_status()
62
+ return response.json()
63
+
64
+ def step(self, tool: str, args: str = "") -> Dict[str, Any]:
65
+ """Execute an action.
66
+
67
+ Args:
68
+ tool: Tool name (run_python, read_notes, save_note, clarify, submit_answer)
69
+ args: Tool-specific arguments
70
+
71
+ Returns:
72
+ Observation dict
73
+ """
74
+ response = self._session.post(
75
+ f"{self.base_url}/step",
76
+ json={"tool": tool, "args": args},
77
+ timeout=self.timeout,
78
+ )
79
+ response.raise_for_status()
80
+ return response.json()
81
+
82
+ def state(self) -> Dict[str, Any]:
83
+ """Get current environment state.
84
+
85
+ Returns:
86
+ State dict
87
+ """
88
+ response = self._session.get(
89
+ f"{self.base_url}/state",
90
+ timeout=self.timeout,
91
+ )
92
+ response.raise_for_status()
93
+ return response.json()
94
+
95
+ def health(self) -> Dict[str, Any]:
96
+ """Check server health.
97
+
98
+ Returns:
99
+ Health status dict
100
+ """
101
+ response = self._session.get(
102
+ f"{self.base_url}/health",
103
+ timeout=self.timeout,
104
+ )
105
+ response.raise_for_status()
106
+ return response.json()
107
+
108
+ def metadata(self) -> Dict[str, Any]:
109
+ """Get environment metadata.
110
+
111
+ Returns:
112
+ Metadata dict
113
+ """
114
+ response = self._session.get(
115
+ f"{self.base_url}/metadata",
116
+ timeout=self.timeout,
117
+ )
118
+ response.raise_for_status()
119
+ return response.json()
120
+
121
+ def schema(self) -> Dict[str, Any]:
122
+ """Get environment type schemas.
123
+
124
+ Returns:
125
+ Schema dict for action, observation, state
126
+ """
127
+ response = self._session.get(
128
+ f"{self.base_url}/schema",
129
+ timeout=self.timeout,
130
+ )
131
+ response.raise_for_status()
132
+ return response.json()
133
+
134
+ # Convenience methods for common tools
135
+
136
+ def run_python(self, code: str) -> Dict[str, Any]:
137
+ """Execute Python code.
138
+
139
+ Args:
140
+ code: Python code to execute
141
+
142
+ Returns:
143
+ Observation dict
144
+ """
145
+ return self.step("run_python", f"<code>{code}</code>")
146
+
147
+ def read_notes(self) -> Dict[str, Any]:
148
+ """Read all saved notes.
149
+
150
+ Returns:
151
+ Observation dict
152
+ """
153
+ return self.step("read_notes", "")
154
+
155
+ def save_note(self, content: str) -> Dict[str, Any]:
156
+ """Save a note.
157
+
158
+ Args:
159
+ content: Note content
160
+
161
+ Returns:
162
+ Observation dict
163
+ """
164
+ return self.step("save_note", content)
165
+
166
+ def clarify(self, question: str) -> Dict[str, Any]:
167
+ """Ask a clarifying question.
168
+
169
+ Args:
170
+ question: Clarifying question
171
+
172
+ Returns:
173
+ Observation dict
174
+ """
175
+ return self.step("clarify", f"<question>{question}</question>")
176
+
177
+ def submit_answer(self, answer: Any) -> Dict[str, Any]:
178
+ """Submit final answer.
179
+
180
+ Args:
181
+ answer: Answer value
182
+
183
+ Returns:
184
+ Final observation with reward
185
+ """
186
+ return self.step("submit_answer", f"<answer>{answer}</answer>")
187
+
188
+ def close(self):
189
+ """Close the session."""
190
+ self._session.close()
191
+
192
+ def __enter__(self):
193
+ return self
194
+
195
+ def __exit__(self, exc_type, exc_val, exc_tb):
196
+ self.close()
197
+ return False
codedark/data/bank.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a071417203c9e1434df5fe794fffae6a55327502b00da9c4e9754d2ab7f7cede
3
+ size 65698328
codedark/data/road.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ee5955af18eca0d4b53e13f23bd6436422e40ca84077fb8cdcfa467aa62b68f
3
+ size 37936892
codedark/data/tasks/final_25_tasks.jsonl ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"id": "bank_hard_001", "dataset": "bank", "goal": "What's the subscription rate for month='may' AND job='management' AND balance in Q1?", "expected_output_type": "scalar", "level": "L6", "template": "multi_condition_filter", "golden": {"answer_value": 2.44, "answer_type": "scalar", "verification_code": "df['_q'] = pd.qcut(df['balance'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nfiltered = df[(df['month'] == 'may') & (df['job'] == 'management') & (df['_q'] == 'Q1')]\nresult = round((filtered['y'] == 1).mean() * 100, 2) if len(filtered) > 0 else 0.0\ndf.drop('_q', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "null", "missing", "nan", "empty", "na"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "multi_condition_filter", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"col_a": "month", "val_a": "may", "col_b": "job", "val_b": "management", "metric": "balance", "quartile": "Q1", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
2
+ {"id": "bank_hard_005", "dataset": "bank", "goal": "Among customers in top 95% of age AND bottom 10% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.59, "answer_type": "scalar", "verification_code": "p_high = df['age'].quantile(95 / 100)\np_low = df['duration'].quantile(10 / 100)\ncohort = df[(df['age'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "age", "metric_b": "duration", "pct_high": 95, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
3
+ {"id": "bank_hard_012", "dataset": "bank", "goal": "Find the job with lowest average balance. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 8.27, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['balance'].mean()\nextrema_val = group_stats.max() if 'lowest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "balance", "extrema": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
4
+ {"id": "bank_hard_019", "dataset": "bank", "goal": "What's the subscription rate for month='may' AND job='management' AND age in Q4?", "expected_output_type": "scalar", "level": "L6", "template": "multi_condition_filter", "golden": {"answer_value": 7.04, "answer_type": "scalar", "verification_code": "df['_q'] = pd.qcut(df['age'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nfiltered = df[(df['month'] == 'may') & (df['job'] == 'management') & (df['_q'] == 'Q4')]\nresult = round((filtered['y'] == 1).mean() * 100, 2) if len(filtered) > 0 else 0.0\ndf.drop('_q', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "null", "missing", "nan", "empty", "na"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "multi_condition_filter", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"col_a": "month", "val_a": "may", "col_b": "job", "val_b": "management", "metric": "age", "quartile": "Q4", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
5
+ {"id": "bank_hard_020", "dataset": "bank", "goal": "For month with above-average day, which have subscription rate > 25%? Return sorted list.", "expected_output_type": "list", "level": "L5", "template": "top_n_in_segment", "golden": {"answer_value": ["oct"], "answer_type": "list", "verification_code": "avg_metric = df.groupby('month')['day'].mean()\nhigh_metric = avg_metric[avg_metric > avg_metric.mean()].index\nrates = df[df['month'].isin(high_metric)].groupby('month')['y'].apply(\n lambda x: (x == 1).mean() * 100)\nresult = sorted(rates[rates > 25].index.tolist())", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["tie", "ties", "equal", "same", "duplicate"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "top_n_in_segment", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "month", "metric": "day", "threshold": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
6
+ {"id": "bank_hard_021", "dataset": "bank", "goal": "Find the job with highest average balance. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 24.62, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['balance'].mean()\nextrema_val = group_stats.max() if 'highest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "balance", "extrema": "highest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
7
+ {"id": "bank_hard_023", "dataset": "bank", "goal": "Among customers in top 90% of balance AND bottom 10% of day, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 33.53, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(90 / 100)\np_low = df['day'].quantile(10 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['day'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "day", "pct_high": 90, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
8
+ {"id": "bank_hard_026", "dataset": "bank", "goal": "For each job, compute volatility score: std(age) / mean(age). Return top 5 with [group, mean, std, volatility].", "expected_output_type": "dataframe", "level": "L6", "template": "segment_volatility", "golden": {"answer_value": [{"job": "unemployed", "mean": 40.97, "std": 9.74, "volatility": 0.2377}, {"job": "self-employed", "mean": 40.42, "std": 9.46, "volatility": 0.234}, {"job": "admin.", "mean": 39.68, "std": 9.23, "volatility": 0.2326}, {"job": "services", "mean": 38.94, "std": 8.86, "volatility": 0.2275}, {"job": "management", "mean": 40.2, "std": 9.13, "volatility": 0.2271}], "answer_type": "dataframe", "verification_code": "stats = df.groupby('job')['age'].agg(['mean', 'std']).round(2)\nstats['volatility'] = round(stats['std'] / stats['mean'], 4)\nresult = stats.nlargest(5, 'volatility').reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits", "volatility", "cv", "coefficient", "variation"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "segment_volatility", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "age"}}}
9
+ {"id": "bank_hard_028", "dataset": "bank", "goal": "Among customers in top 95% of balance AND bottom 10% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.84, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(95 / 100)\np_low = df['duration'].quantile(10 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 95, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
10
+ {"id": "bank_hard_029", "dataset": "bank", "goal": "Among customers in top 90% of balance AND bottom 25% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.54, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(90 / 100)\np_low = df['duration'].quantile(25 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 90, "pct_low": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
11
+ {"id": "bank_hard_030", "dataset": "bank", "goal": "Rank job by subscription rate. Which bottom-3 have above-median age?", "expected_output_type": "list", "level": "L6", "template": "ranked_anomaly", "golden": {"answer_value": ["blue-collar", "entrepreneur"], "answer_type": "list", "verification_code": "stats = df.groupby('job').agg(\n rate=('y', lambda x: (x == 1).mean()),\n avg_metric=('age', 'mean'))\nstats['rank'] = stats['rate'].rank()\nbottom_3 = stats[stats['rank'] <= 3]\nresult = sorted(bottom_3[bottom_3['avg_metric'] > stats['avg_metric'].median()].index.tolist())", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rank", "order", "sort", "ascending", "descending"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "ranked_anomaly", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "age", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
12
+ {"id": "bank_hard_031", "dataset": "bank", "goal": "For each month, compute volatility score: std(day) / mean(day). Return top 5 with [group, mean, std, volatility].", "expected_output_type": "dataframe", "level": "L6", "template": "segment_volatility", "golden": {"answer_value": [{"month": "feb", "mean": 6.05, "std": 5.46, "volatility": 0.9025}, {"month": "sep", "mean": 11.67, "std": 8.04, "volatility": 0.6889}, {"month": "mar", "mean": 13.45, "std": 9.18, "volatility": 0.6825}, {"month": "jun", "mean": 11.32, "std": 7.33, "volatility": 0.6475}, {"month": "dec", "mean": 14.19, "std": 8.83, "volatility": 0.6223}], "answer_type": "dataframe", "verification_code": "stats = df.groupby('month')['day'].agg(['mean', 'std']).round(2)\nstats['volatility'] = round(stats['std'] / stats['mean'], 4)\nresult = stats.nlargest(5, 'volatility').reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits", "volatility", "cv", "coefficient", "variation"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "segment_volatility", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "month", "metric": "day"}}}
13
+ {"id": "bank_hard_033", "dataset": "bank", "goal": "Show the average balance breakdown by job. Include count and mean balance for each category, sorted by mean descending.", "expected_output_type": "dataframe", "level": "L4", "template": "metric_breakdown", "golden": {"answer_value": [{"job": "retired", "count": 35185, "mean_balance": 1812.07}, {"job": "unknown", "count": 2917, "mean_balance": 1678.96}, {"job": "self-employed", "count": 19020, "mean_balance": 1598.27}, {"job": "student", "count": 11767, "mean_balance": 1577.32}, {"job": "management", "count": 175541, "mean_balance": 1510.39}, {"job": "unemployed", "count": 17634, "mean_balance": 1440.57}, {"job": "entrepreneur", "count": 17718, "mean_balance": 1306.75}, {"job": "housemaid", "count": 15912, "mean_balance": 1281.22}, {"job": "technician", "count": 138107, "mean_balance": 1071.57}, {"job": "admin.", "count": 81492, "mean_balance": 1019.92}, {"job": "blue-collar", "count": 170498, "mean_balance": 977.49}, {"job": "services", "count": 64209, "mean_balance": 834.63}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('job').agg(\n count=('balance', 'size'),\n mean_balance=('balance', lambda x: round(x.mean(), 2))\n).sort_values('mean_balance', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "metric_breakdown", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "balance"}}}
14
+ {"id": "bank_hard_035", "dataset": "bank", "goal": "Find the job with lowest average age. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 34.08, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['age'].mean()\nextrema_val = group_stats.max() if 'lowest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "age", "extrema": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
15
+ {"id": "bank_hard_038", "dataset": "bank", "goal": "Find the month with the lowest subscription rate. Within that group, what is the average day?", "expected_output_type": "scalar", "level": "L5", "template": "nested_extrema", "golden": {"answer_value": 16.09, "answer_type": "scalar", "verification_code": "group_rates = df.groupby('month')['y'].apply(lambda x: (x == 1).mean())\nouter_val = group_rates.max() if 'lowest' == 'highest' else group_rates.min()\nouter_tied = group_rates[group_rates == outer_val].sort_index()\nouter_group = outer_tied.index[0]\nsubset = df[df['month'] == outer_group]\nresult = round(subset['day'].mean(), 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["tie", "ties", "equal", "same", "duplicate", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "nested_extrema", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "month", "metric": "day", "extrema_outer": "lowest", "extrema_inner": "highest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
16
+ {"id": "bank_hard_039", "dataset": "bank", "goal": "Divide customers into 4 day quartiles. What is the subscription percentage (0-100) in the highest (top 25%) (Q4) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "quartile_conversion", "golden": {"answer_value": 11.55, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['day'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q4']\nresult = round((bin_data['y'] == 1).mean() * 100, 2)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "quartile_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "day", "quartile": "Q4", "quartile_desc": "highest (top 25%)", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
17
+ {"id": "bank_hard_040", "dataset": "bank", "goal": "Among customers in top 95% of balance AND bottom 25% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.65, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(95 / 100)\np_low = df['duration'].quantile(25 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 95, "pct_low": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
18
+ {"id": "bank_hard_041", "dataset": "bank", "goal": "Which job categories would have the biggest impact if brought to average subscription rate? Return top 3 by potential gain (count * rate gap), sorted by impact.", "expected_output_type": "list", "level": "L5", "template": "segment_improvement_potential", "golden": {"answer_value": ["blue-collar", "services", "entrepreneur"], "answer_type": "list", "verification_code": "overall_rate = (df['y'] == 1).mean()\ngroup_stats = df.groupby('job').agg(\n rate=('y', lambda x: (x == 1).mean()),\n count=('y', 'size')\n)\ngroup_stats['gap'] = overall_rate - group_stats['rate']\ngroup_stats['potential'] = group_stats['count'] * group_stats['gap']\ntop_potential = group_stats[group_stats['gap'] > 0].nlargest(3, 'potential')\nresult = top_potential.index.tolist()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "segment_improvement_potential", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
19
+ {"id": "bank_hard_044", "dataset": "bank", "goal": "Find the month with the lowest subscription rate. Within that group, what is the average age?", "expected_output_type": "scalar", "level": "L5", "template": "nested_extrema", "golden": {"answer_value": 38.98, "answer_type": "scalar", "verification_code": "group_rates = df.groupby('month')['y'].apply(lambda x: (x == 1).mean())\nouter_val = group_rates.max() if 'lowest' == 'highest' else group_rates.min()\nouter_tied = group_rates[group_rates == outer_val].sort_index()\nouter_group = outer_tied.index[0]\nsubset = df[df['month'] == outer_group]\nresult = round(subset['age'].mean(), 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["tie", "ties", "equal", "same", "duplicate", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "nested_extrema", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "month", "metric": "age", "extrema_outer": "lowest", "extrema_inner": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
20
+ {"id": "road_hard_014", "dataset": "road", "goal": "Which lighting categories have the highest total reported accidents? Show breakdown with [lighting, count, total_num_reported_accidents, avg_num_reported_accidents] sorted by total descending.", "expected_output_type": "dataframe", "level": "L4", "template": "count_segment_total", "golden": {"answer_value": [{"lighting": "dim", "count": 183826, "total_num_reported_accidents": 211283, "avg_num_reported_accidents": 1.15}, {"lighting": "daylight", "count": 178015, "total_num_reported_accidents": 207579, "avg_num_reported_accidents": 1.17}, {"lighting": "night", "count": 155913, "total_num_reported_accidents": 196214, "avg_num_reported_accidents": 1.26}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('lighting').agg(\n count=('num_reported_accidents', 'size'),\n total_num_reported_accidents=('num_reported_accidents', 'sum'),\n avg_num_reported_accidents=('num_reported_accidents', lambda x: round(x.mean(), 2))\n).sort_values('total_num_reported_accidents', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": [], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "count_segment_total", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "lighting", "target_col": "num_reported_accidents", "target_desc": "reported accidents"}}}
21
+ {"id": "road_hard_021", "dataset": "road", "goal": "Which weather categories have the highest average accident risk? Show breakdown with [weather, count, avg_accident_risk] sorted by average descending.", "expected_output_type": "dataframe", "level": "L4", "template": "continuous_segment_breakdown", "golden": {"answer_value": [{"weather": "foggy", "count": 181463, "avg_accident_risk": 0.3863}, {"weather": "rainy", "count": 156985, "avg_accident_risk": 0.3615}, {"weather": "clear", "count": 179306, "avg_accident_risk": 0.3101}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('weather').agg(\n count=('accident_risk', 'size'),\n avg_accident_risk=('accident_risk', lambda x: round(x.mean(), 4))\n).sort_values('avg_accident_risk', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": [], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_segment_breakdown", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "weather", "target_col": "accident_risk", "target_desc": "accident risk"}}}
22
+ {"id": "road_hard_015", "dataset": "road", "goal": "Divide records into 4 speed_limit quartiles. What is the average accident risk in the lower-middle (25-50%) (Q2) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "continuous_quartile_analysis", "golden": {"answer_value": 0.29, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['speed_limit'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q2']\nresult = round(bin_data['accident_risk'].mean(), 4)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_quartile_analysis", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "speed_limit", "quartile": "Q2", "quartile_desc": "lower-middle (25-50%)", "target_col": "accident_risk", "target_desc": "accident risk"}}}
23
+ {"id": "road_hard_002", "dataset": "road", "goal": "Divide records into 4 curvature quartiles. What is the average accident risk in the upper-middle (50-75%) (Q3) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "continuous_quartile_analysis", "golden": {"answer_value": 0.41, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['curvature'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q3']\nresult = round(bin_data['accident_risk'].mean(), 4)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_quartile_analysis", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "curvature", "quartile": "Q3", "quartile_desc": "upper-middle (50-75%)", "target_col": "accident_risk", "target_desc": "accident risk"}}}
24
+ {"id": "road_hard_007", "dataset": "road", "goal": "How much higher is the average accident risk for lighting='daylight' compared to 'night'? Return difference.", "expected_output_type": "scalar", "level": "L5", "template": "continuous_comparison", "golden": {"answer_value": -0.17, "answer_type": "scalar", "verification_code": "avg_a = df[df['lighting'] == 'daylight']['accident_risk'].mean()\navg_b = df[df['lighting'] == 'night']['accident_risk'].mean()\nresult = round(avg_a - avg_b, 4)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "continuous_comparison", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "lighting", "val_a": "daylight", "val_b": "night", "target_col": "accident_risk", "target_desc": "accident risk"}}}
25
+ {"id": "road_hard_004", "dataset": "road", "goal": "How much higher is the average accident risk for road_type='rural' compared to 'urban'? Return difference.", "expected_output_type": "scalar", "level": "L5", "template": "continuous_comparison", "golden": {"answer_value": -0.01, "answer_type": "scalar", "verification_code": "avg_a = df[df['road_type'] == 'rural']['accident_risk'].mean()\navg_b = df[df['road_type'] == 'urban']['accident_risk'].mean()\nresult = round(avg_a - avg_b, 4)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "continuous_comparison", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "road_type", "val_a": "rural", "val_b": "urban", "target_col": "accident_risk", "target_desc": "accident risk"}}}
codedark/models.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CodeDark Data Models
3
+
4
+ Pydantic models for Action, Observation, and State following OpenEnv spec.
5
+ """
6
+
7
+ from pydantic import BaseModel, Field
8
+ from typing import Optional, List, Any, Literal
9
+
10
+
11
class CodeDarkAction(BaseModel):
    """
    Action for CodeDark environment.

    Agents send actions with a tool name and arguments.

    Tools available:
    - run_python: Execute Python code with pandas/numpy
    - read_notes: Read all saved notes
    - save_note: Save a note for later recall
    - clarify: Ask clarifying question (max 2 per episode)
    - submit_answer: Submit final answer (ends episode)
    """

    # Tool name; Literal restricts it to the five supported tools, so pydantic
    # rejects anything else at validation time.
    tool: Literal["run_python", "read_notes", "save_note", "clarify", "submit_answer"]
    args: str = ""  # Tool-specific arguments (code, note text, question, or answer)

    # Example payloads surfaced in the generated JSON schema / OpenAPI docs.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {"tool": "run_python", "args": "result = df['y'].mean() * 100"},
                {"tool": "read_notes", "args": ""},
                {"tool": "save_note", "args": "Average subscription rate is 11.26%"},
                {"tool": "clarify", "args": "What does Q1 mean in this context?"},
                {"tool": "submit_answer", "args": "11.26"},
            ]
        }
    }
39
+
40
+
41
class CodeDarkObservation(BaseModel):
    """
    Observation returned after each action.

    Contains execution results, environment state, and episode info.
    Reward is only populated when done=True.
    """

    # Execution results of the last tool call
    stdout: str = ""
    stderr: str = ""
    exit_code: int = 0  # 0 = success, non-zero = tool error

    # Turn tracking
    turn: int = 0
    max_turns: int = 10

    # Persistent state: notes saved via save_note, visible every turn
    notes: List[str] = Field(default_factory=list)

    # Task info
    task_id: str = ""
    question: str = ""
    difficulty: str = ""  # L4, L5, L6
    dataset: str = ""  # bank, road

    # Episode status
    done: bool = False
    submitted: bool = False

    # Reward components (only set when done=True)
    reward: Optional[float] = None
    correctness: Optional[float] = None
    efficiency: Optional[float] = None

    # Additional metadata (e.g. submitted vs expected answer at episode end)
    metadata: dict = Field(default_factory=dict)

    # Example payload surfaced in the generated JSON schema / OpenAPI docs.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "stdout": "run_python Result:\n(45211, 17)",
                    "stderr": "",
                    "exit_code": 0,
                    "turn": 1,
                    "max_turns": 10,
                    "notes": [],
                    "task_id": "bank_hard_001",
                    "question": "What's the subscription rate for month='may'?",
                    "difficulty": "L5",
                    "dataset": "bank",
                    "done": False,
                    "submitted": False,
                    "reward": None,
                }
            ]
        }
    }
100
+
101
+
102
class CodeDarkState(BaseModel):
    """
    Internal state for CodeDark environment.

    Tracks episode progress, accumulated notes, and submission status.
    """

    episode_id: str = ""  # UUID assigned by the environment on reset
    step_count: int = 0

    # Task info
    task_id: str = ""
    dataset: str = ""

    # Accumulated state
    notes: List[str] = Field(default_factory=list)
    turn_count: int = 0
    error_count: int = 0  # number of tool calls that returned non-zero exit codes
    clarify_count: int = 0  # clarifications used so far (environment caps this)

    # Submission
    submitted: bool = False
    submitted_answer: Optional[Any] = None

    # For scoring: golden answer and numeric tolerance copied from the task
    expected_answer: Optional[Any] = None
    tolerance: float = 0.01
129
+
130
+
131
class ResetRequest(BaseModel):
    """Request body for /reset endpoint."""

    task_id: Optional[str] = None  # specific task to load; None = auto-select
    seed: Optional[int] = None  # random seed for task selection
136
+
137
+
138
class StepRequest(BaseModel):
    """Request body for /step endpoint."""

    # Tool name as a plain string; the server validates it against the
    # supported tool list before building a CodeDarkAction.
    tool: str
    args: str = ""
143
+
144
+
145
class HealthResponse(BaseModel):
    """Response for /health endpoint."""

    status: str = "healthy"
    environment: str = "codedark"
    version: str = "0.1.0"
codedark/openenv.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: codedark
2
+ version: "0.1.0"
3
+ description: |
4
+ CodeDark: Multi-turn data analytics environment for training RL agents.
5
+
6
+ Train AI agents to be data scientists, not just code executors.
7
+ Features real business analytics tasks with pandas/numpy,
8
+ multi-metric reward shaping, and skill-based curriculum.
9
+
10
+ author: Vijay Athithya
11
+ license: MIT
12
+
13
+ # Environment interface types
14
+ action: CodeDarkAction
15
+ observation: CodeDarkObservation
16
+ state: CodeDarkState
17
+
18
+ # Environment configuration
19
+ config:
20
+ max_turns: 10
21
+ datasets:
22
+ - bank
23
+ - road
24
+ difficulty_levels:
25
+ - L4
26
+ - L5
27
+ - L6
28
+
29
+ # Tools available to agents
30
+ tools:
31
+ - name: run_python
32
+ description: Execute Python/pandas code. Store output in 'result' variable.
33
+ - name: read_notes
34
+ description: Read all saved notes from previous turns.
35
+ - name: save_note
36
+ description: Save observations for later recall. Notes persist across turns.
37
+ - name: clarify
38
+ description: Ask clarifying questions about the task (max 2 per episode).
39
+ - name: submit_answer
40
+ description: Submit final answer. Ends episode.
41
+
42
+ # Reward structure
43
+ reward:
44
+ max_reward: 1.0
45
+ components:
46
+ - name: correctness
47
+ weight: 0.80
48
+ description: Binary correct/incorrect with numeric tolerance
49
+ - name: efficiency
50
+ weight: 0.10
51
+ description: Fewer turns = better score
52
+ - name: token_cost
53
+ weight: 0.10
54
+ description: Lower token usage = better score
55
+
56
+ # Benchmarking info
57
+ benchmark:
58
+ tasks: 25
59
+ models_evaluated: 11
60
+ completions: 1844
61
+ best_accuracy: "77.3%"
62
+ best_model: "claude-opus-4.5"
codedark/pyproject.toml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=45", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "openenv-codedark"
7
+ version = "0.1.0"
8
+ description = "Multi-turn data analytics environment for RL agent training - OpenEnv compatible"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Vijay Athithya", email = "vijay@analytics-rl.com"}
14
+ ]
15
+ keywords = ["openenv", "reinforcement-learning", "data-analytics", "llm", "agents"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Science/Research",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
25
+ ]
26
+ dependencies = [
27
+ "fastapi>=0.115.0",
28
+ "pydantic>=2.0.0",
29
+ "uvicorn[standard]>=0.24.0",
30
+ "pandas>=2.0.0",
31
+ "numpy>=1.24.0",
32
+ "requests>=2.31.0",
33
+ ]
34
+
35
+ [project.optional-dependencies]
36
+ dev = [
37
+ "pytest>=8.0.0",
38
+ "pytest-cov>=4.0.0",
39
+ "pytest-asyncio>=0.23.0",
40
+ "httpx>=0.27.0",
41
+ ]
42
+
43
+ [project.urls]
44
+ Homepage = "https://github.com/vj-09/codedark"
45
+ Documentation = "https://huggingface.co/spaces/openenv/codedark"
46
+ Repository = "https://github.com/vj-09/codedark"
47
+
48
+ [project.scripts]
49
+ codedark-server = "codedark.server.app:main"
50
+
51
+ [tool.setuptools]
52
+ packages = ["codedark", "codedark.server"]
53
+ package-dir = {"codedark" = ".", "codedark.server" = "server"}
54
+
55
+ [tool.setuptools.package-data]
56
+ codedark = ["data/*.csv", "data/tasks/*.jsonl", "openenv.yaml"]
57
+
58
+ [tool.pytest.ini_options]
59
+ asyncio_mode = "auto"
60
+ testpaths = ["tests"]
codedark/server/Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies (curl is required by the HEALTHCHECK below)
RUN apt-get update && \
    apt-get install -y --no-install-recommends curl && \
    rm -rf /var/lib/apt/lists/*

# Copy requirements first for better layer caching
COPY server/requirements.txt ./requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy package
COPY . .

# Install package in editable mode
RUN pip install --no-cache-dir -e .

# Environment variables (HOST/PORT are read by the server entrypoint)
ENV PYTHONUNBUFFERED=1
ENV HOST=0.0.0.0
ENV PORT=8000

# Expose port
EXPOSE 8000

# Healthcheck against the server's /health endpoint
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run server
CMD ["python", "-m", "codedark.server.app"]
codedark/server/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """CodeDark Server Package."""
codedark/server/app.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CodeDark FastAPI Server
3
+
4
+ OpenEnv-compatible HTTP server for CodeDark environment.
5
+ Provides /reset, /step, /state, and /health endpoints.
6
+ """
7
+
8
+ import os
9
+ from contextlib import asynccontextmanager
10
+ from typing import Optional
11
+
12
+ import uvicorn
13
+ from fastapi import FastAPI, HTTPException
14
+ from fastapi.middleware.cors import CORSMiddleware
15
+
16
+ from ..models import (
17
+ CodeDarkAction,
18
+ CodeDarkObservation,
19
+ CodeDarkState,
20
+ ResetRequest,
21
+ StepRequest,
22
+ HealthResponse,
23
+ )
24
+ from .environment import CodeDarkEnvironment
25
+
26
+
27
+ # Global environment instance
28
+ _env: Optional[CodeDarkEnvironment] = None
29
+
30
+
31
def get_env() -> CodeDarkEnvironment:
    """Get or create environment instance.

    Lazily builds a module-level singleton configured from the
    CODEDARK_DATA_DIR, CODEDARK_TASKS_PATH and CODEDARK_MAX_TURNS
    environment variables.
    """
    global _env
    if _env is None:
        _env = CodeDarkEnvironment(
            data_dir=os.environ.get("CODEDARK_DATA_DIR"),
            tasks_path=os.environ.get("CODEDARK_TASKS_PATH"),
            max_turns=int(os.environ.get("CODEDARK_MAX_TURNS", "10")),
        )
    return _env
41
+
42
+
43
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager for startup/shutdown.

    Startup eagerly constructs the singleton environment so the first request
    does not pay the task/data loading cost; shutdown drops the reference.
    """
    # Startup: initialize environment
    get_env()
    yield
    # Shutdown: release the singleton so a restarted app re-initializes cleanly
    global _env
    _env = None
52
+
53
+
54
# Create FastAPI app; lifespan hooks handle environment init/teardown.
app = FastAPI(
    title="CodeDark Environment",
    description="Multi-turn data analytics environment for RL agent training",
    version="0.1.0",
    lifespan=lifespan,
)

# Add CORS middleware — fully open: any origin/method/header may call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
70
+
71
+
72
+ @app.get("/health", response_model=HealthResponse)
73
+ async def health():
74
+ """Health check endpoint."""
75
+ return HealthResponse(
76
+ status="healthy",
77
+ environment="codedark",
78
+ version="0.1.0",
79
+ )
80
+
81
+
82
+ @app.post("/reset", response_model=CodeDarkObservation)
83
+ async def reset(request: ResetRequest = None):
84
+ """Reset environment for a new episode.
85
+
86
+ Args:
87
+ request: Optional reset request with task_id and seed
88
+
89
+ Returns:
90
+ Initial observation
91
+ """
92
+ env = get_env()
93
+
94
+ if request is None:
95
+ request = ResetRequest()
96
+
97
+ obs = env.reset(task_id=request.task_id, seed=request.seed)
98
+ return obs
99
+
100
+
101
+ @app.post("/step", response_model=CodeDarkObservation)
102
+ async def step(request: StepRequest):
103
+ """Execute an action and return observation.
104
+
105
+ Args:
106
+ request: Step request with tool and args
107
+
108
+ Returns:
109
+ Observation after action execution
110
+ """
111
+ env = get_env()
112
+
113
+ # Validate tool
114
+ valid_tools = ["run_python", "read_notes", "save_note", "clarify", "submit_answer"]
115
+ if request.tool not in valid_tools:
116
+ raise HTTPException(
117
+ status_code=400,
118
+ detail=f"Invalid tool: {request.tool}. Valid tools: {valid_tools}",
119
+ )
120
+
121
+ action = CodeDarkAction(tool=request.tool, args=request.args)
122
+ obs = env.step(action)
123
+ return obs
124
+
125
+
126
+ @app.get("/state", response_model=CodeDarkState)
127
+ async def state():
128
+ """Get current environment state.
129
+
130
+ Returns:
131
+ Current CodeDarkState
132
+ """
133
+ env = get_env()
134
+ return env.state
135
+
136
+
137
+ @app.get("/metadata")
138
+ async def metadata():
139
+ """Get environment metadata.
140
+
141
+ Returns:
142
+ Environment metadata dict
143
+ """
144
+ env = get_env()
145
+ return {
146
+ "name": "codedark",
147
+ "version": "0.1.0",
148
+ "description": "Multi-turn data analytics environment for RL agent training",
149
+ "max_turns": env.max_turns,
150
+ "max_clarifications": env.max_clarifications,
151
+ "num_tasks": len(env.tasks),
152
+ "tools": [
153
+ {"name": "run_python", "description": "Execute Python/pandas code"},
154
+ {"name": "read_notes", "description": "Read all saved notes"},
155
+ {"name": "save_note", "description": "Save a note for later recall"},
156
+ {"name": "clarify", "description": "Ask clarifying question (max 2)"},
157
+ {"name": "submit_answer", "description": "Submit final answer"},
158
+ ],
159
+ "reward_structure": {
160
+ "max_reward": 1.0,
161
+ "components": [
162
+ {"name": "correctness", "weight": 0.80},
163
+ {"name": "efficiency", "weight": 0.10},
164
+ {"name": "token_cost", "weight": 0.10},
165
+ ],
166
+ },
167
+ }
168
+
169
+
170
+ @app.get("/schema")
171
+ async def schema():
172
+ """Get environment schema for Action, Observation, State.
173
+
174
+ Returns:
175
+ JSON schemas for all types
176
+ """
177
+ return {
178
+ "action": CodeDarkAction.model_json_schema(),
179
+ "observation": CodeDarkObservation.model_json_schema(),
180
+ "state": CodeDarkState.model_json_schema(),
181
+ }
182
+
183
+
184
def main():
    """Run the server.

    Host/port/reload come from the HOST, PORT and RELOAD environment
    variables (defaults: 0.0.0.0:8000, reload off).
    """
    uvicorn.run(
        "codedark.server.app:app",
        host=os.environ.get("HOST", "0.0.0.0"),
        port=int(os.environ.get("PORT", "8000")),
        reload=os.environ.get("RELOAD", "false").lower() == "true",
    )


if __name__ == "__main__":
    main()
codedark/server/environment.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CodeDark Environment
3
+
4
+ OpenEnv-compatible environment for multi-turn data analytics tasks.
5
+ Agents analyze CSV data using Python/Pandas tools and submit answers.
6
+ """
7
+
8
+ import json
9
+ import uuid
10
+ from pathlib import Path
11
+ from typing import Any, Dict, List, Optional
12
+
13
+ import pandas as pd
14
+
15
+ from ..models import CodeDarkAction, CodeDarkObservation, CodeDarkState
16
+ from .tools import (
17
+ run_python,
18
+ read_notes,
19
+ save_note,
20
+ clarify,
21
+ submit_answer,
22
+ parse_tool_call,
23
+ )
24
+ from .scoring import compute_reward
25
+
26
+
27
class CodeDarkEnvironment:
    """CodeDark environment for multi-turn data analytics.

    Features:
    - Multi-turn agent evaluation
    - 5 tools: run_python, read_notes, save_note, clarify, submit_answer
    - Shaped rewards: correctness (80%) + efficiency (10%) + token cost (10%)
    - Supports bank and road datasets
    """

    def __init__(
        self,
        data_dir: Optional[str] = None,
        tasks_path: Optional[str] = None,
        max_turns: int = 10,
        max_clarifications: int = 2,
    ):
        """Initialize CodeDark environment.

        Args:
            data_dir: Path to directory containing CSV files
            tasks_path: Path to tasks.jsonl file
            max_turns: Maximum turns per episode (default: 10)
            max_clarifications: Maximum clarifications per episode (default: 2)
        """
        self.max_turns = max_turns
        self.max_clarifications = max_clarifications

        # Resolve paths
        if data_dir:
            self.data_dir = Path(data_dir)
        else:
            # Default to data/ relative to this file's parent package
            self.data_dir = Path(__file__).parent.parent / "data"

        if tasks_path:
            self.tasks_path = Path(tasks_path)
        else:
            self.tasks_path = self.data_dir / "tasks" / "final_25_tasks.jsonl"

        # Load tasks; missing file yields an empty task list (reset() then
        # returns an error observation).
        self.tasks = self._load_tasks()
        self._tasks_by_id = {t["id"]: t for t in self.tasks}
        self._task_index = 0  # cursor for round-robin selection in reset()

        # Current episode state (populated by reset())
        self._state: Optional[CodeDarkState] = None
        self._df: Optional[pd.DataFrame] = None
        self._current_task: Optional[Dict] = None

    def _load_tasks(self) -> List[Dict]:
        """Load tasks from JSONL file.

        Returns:
            List of task dicts, one per non-blank line; empty list when the
            tasks file does not exist.
        """
        if not self.tasks_path.exists():
            return []

        tasks = []
        with open(self.tasks_path) as f:
            for line in f:
                if line.strip():
                    tasks.append(json.loads(line))
        return tasks

    def _load_data_for_task(self, task: Dict) -> Optional[pd.DataFrame]:
        """Load the appropriate CSV for a task.

        Args:
            task: Task dictionary with 'dataset' field

        Returns:
            DataFrame or None if not found
        """
        # Dataset name maps directly to a CSV file, e.g. "bank" -> bank.csv.
        dataset = task.get("dataset", "bank")
        csv_path = self.data_dir / f"{dataset}.csv"

        if csv_path.exists():
            return pd.read_csv(csv_path)
        return None

    @property
    def state(self) -> CodeDarkState:
        """Return current environment state (a default state before reset())."""
        if self._state is None:
            self._state = CodeDarkState()
        return self._state

    def reset(
        self, task_id: Optional[str] = None, seed: Optional[int] = None
    ) -> CodeDarkObservation:
        """Reset environment for a new episode.

        Args:
            task_id: Specific task to load (optional)
            seed: Random seed for task selection (optional)

        Returns:
            Initial observation with task question
        """
        # Select task: explicit task_id wins, then seeded random choice,
        # then round-robin over the loaded task list.
        if task_id and task_id in self._tasks_by_id:
            task = self._tasks_by_id[task_id]
        elif self.tasks:
            if seed is not None:
                import random

                random.seed(seed)
                task = random.choice(self.tasks)
            else:
                # Round-robin through tasks
                task = self.tasks[self._task_index % len(self.tasks)]
                self._task_index += 1
        else:
            # No tasks loaded - return error observation
            return CodeDarkObservation(
                stderr="Error: No tasks loaded",
                exit_code=1,
                done=True,
            )

        self._current_task = task

        # Load data for this task
        self._df = self._load_data_for_task(task)
        if self._df is None:
            return CodeDarkObservation(
                stderr=f"Error: Could not load data for dataset '{task.get('dataset', 'bank')}'",
                exit_code=1,
                done=True,
            )

        # Initialize state. The golden answer and tolerance come from the
        # task's "golden" record and drive scoring at episode end.
        self._state = CodeDarkState(
            episode_id=str(uuid.uuid4()),
            step_count=0,
            task_id=task["id"],
            dataset=task.get("dataset", "bank"),
            notes=[],
            turn_count=0,
            error_count=0,
            clarify_count=0,
            submitted=False,
            submitted_answer=None,
            expected_answer=task["golden"]["answer_value"],
            tolerance=task["golden"].get("tolerance", 0.01),
        )

        # Return initial observation (turn 0; the question is the task goal)
        return CodeDarkObservation(
            stdout=f"Task loaded. DataFrame shape: {self._df.shape}",
            turn=0,
            max_turns=self.max_turns,
            notes=[],
            task_id=task["id"],
            question=task["goal"],
            difficulty=task.get("level", "L5"),
            dataset=task.get("dataset", "bank"),
            done=False,
            submitted=False,
        )

    def step(self, action: CodeDarkAction) -> CodeDarkObservation:
        """Execute an action and return observation.

        Args:
            action: CodeDarkAction with tool name and args

        Returns:
            CodeDarkObservation with results
        """
        if self._state is None or self._current_task is None:
            return CodeDarkObservation(
                stderr="Error: Environment not reset. Call reset() first.",
                exit_code=1,
                done=True,
            )

        # Stepping after submission just replays the terminal observation.
        if self._state.submitted:
            return self._make_final_observation()

        # Increment turn
        self._state.turn_count += 1
        self._state.step_count += 1

        # Check turn limit. NOTE(review): the episode is force-ended by
        # setting submitted=True even though no answer was submitted, so the
        # final observation reports submitted=True here — confirm intended.
        if self._state.turn_count > self.max_turns:
            self._state.submitted = True
            return self._make_final_observation()

        # Parse tool-specific args before dispatch; a parse failure counts
        # as an error turn and returns immediately.
        parsed_content, parse_error = parse_tool_call(action.args, action.tool)

        if parse_error:
            self._state.error_count += 1
            return CodeDarkObservation(
                stderr=f"{action.tool} Error: {parse_error}",
                exit_code=1,
                turn=self._state.turn_count,
                max_turns=self.max_turns,
                notes=self._state.notes.copy(),
                task_id=self._state.task_id,
                question=self._current_task["goal"],
                difficulty=self._current_task.get("level", "L5"),
                dataset=self._state.dataset,
                done=False,
                submitted=False,
            )

        # Execute tool; each helper returns (stdout, stderr, exit_code[, extra]).
        stdout, stderr, exit_code = "", "", 0

        if action.tool == "run_python":
            stdout, stderr, exit_code = run_python(parsed_content, self._df)

        elif action.tool == "read_notes":
            stdout, stderr, exit_code = read_notes(self._state.notes)

        elif action.tool == "save_note":
            # save_note mutates self._state.notes in place (notes persist)
            stdout, stderr, exit_code = save_note(parsed_content, self._state.notes)

        elif action.tool == "clarify":
            stdout, stderr, exit_code, new_count = clarify(
                question=parsed_content,
                clarify_count=self._state.clarify_count,
                max_clarifications=self.max_clarifications,
                ambiguities=self._current_task.get("ambiguities", []),
                answer_type=self._current_task.get("golden", {}).get(
                    "answer_type", "scalar"
                ),
            )
            self._state.clarify_count = new_count

        elif action.tool == "submit_answer":
            stdout, stderr, exit_code, answer = submit_answer(parsed_content)
            if exit_code == 0:
                # A successfully parsed answer ends the episode immediately.
                self._state.submitted = True
                self._state.submitted_answer = answer
                return self._make_final_observation()

        # Track errors from any tool that reported a non-zero exit code
        if exit_code != 0:
            self._state.error_count += 1

        return CodeDarkObservation(
            stdout=stdout,
            stderr=stderr,
            exit_code=exit_code,
            turn=self._state.turn_count,
            max_turns=self.max_turns,
            notes=self._state.notes.copy(),
            task_id=self._state.task_id,
            question=self._current_task["goal"],
            difficulty=self._current_task.get("level", "L5"),
            dataset=self._state.dataset,
            done=False,
            submitted=False,
        )

    def _make_final_observation(self) -> CodeDarkObservation:
        """Create final observation with reward computation.

        Scores the submitted answer against the golden answer and packs the
        reward breakdown plus debugging info into the terminal observation.
        """
        if self._state is None or self._current_task is None:
            return CodeDarkObservation(done=True)

        # Compute reward (correctness/efficiency weighted; see scoring module)
        reward, correctness, efficiency, token_cost = compute_reward(
            submitted=self._state.submitted_answer,
            expected=self._state.expected_answer,
            tolerance=self._state.tolerance,
            turns=self._state.turn_count,
            max_turns=self.max_turns,
        )

        return CodeDarkObservation(
            stdout="[EPISODE COMPLETE]",
            turn=self._state.turn_count,
            max_turns=self.max_turns,
            notes=self._state.notes.copy(),
            task_id=self._state.task_id,
            question=self._current_task["goal"],
            difficulty=self._current_task.get("level", "L5"),
            dataset=self._state.dataset,
            done=True,
            submitted=self._state.submitted,
            reward=reward,
            correctness=correctness,
            efficiency=efficiency,
            metadata={
                "submitted_answer": self._state.submitted_answer,
                "expected_answer": self._state.expected_answer,
                "tolerance": self._state.tolerance,
                "error_count": self._state.error_count,
                "clarify_count": self._state.clarify_count,
                "token_cost_usd": token_cost,
            },
        )
codedark/server/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # CodeDark Server Requirements
2
+ fastapi>=0.115.0
3
+ pydantic>=2.0.0
4
+ uvicorn[standard]>=0.24.0
5
+ pandas>=2.0.0
6
+ numpy>=1.24.0
7
+ requests>=2.31.0
codedark/server/scoring.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CodeDark Scoring System
3
+
4
+ Reward computation with multi-metric scoring:
5
+ - 80% correctness (binary: exact match within tolerance)
6
+ - 10% efficiency (fewer turns = better)
7
+ - 10% token cost (lower usage = better)
8
+ """
9
+
10
+ from typing import Any, Optional, Tuple
11
+ import ast
12
+
13
+
14
def normalize_value(val: Any) -> Any:
    """Normalize a value for comparison.

    Handles:
    - String to float/int conversion
    - String to list/dict parsing
    - Float rounding for precision
    - Percentage stripping
    """
    if val is None:
        return None

    # Numbers: round floats to 4 places, promote ints to float so that
    # 7 and 7.0 compare equal downstream. Containers pass through as-is.
    if isinstance(val, float):
        return round(val, 4)
    if isinstance(val, int):
        return float(val)
    if isinstance(val, (list, dict)):
        return val

    if isinstance(val, str):
        text = val.strip()

        # Literal list/dict syntax takes priority over numeric parsing.
        if text[:1] in ("[", "{"):
            try:
                return ast.literal_eval(text)
            except (ValueError, SyntaxError):
                pass

        # Numeric string, optionally suffixed with '%'.
        try:
            return round(float(text.rstrip("%")), 4)
        except ValueError:
            # Plain text: compare case-insensitively.
            return text.lower()

    # Any other type is returned unchanged.
    return val
55
+
56
+
57
def parse_markdown_table(text: str) -> Optional[list]:
    """Parse markdown table to list of dicts.

    Handles tables like:
    | job | mean | std |
    |-----|------|-----|
    | retired | 40.97 | 9.74 |
    """
    if not isinstance(text, str):
        return None

    # Keep only lines that look like table rows (contain a pipe).
    pipe_rows = [ln for ln in text.strip().split("\n") if "|" in ln]
    # Require header + separator + at least one data row.
    if len(pipe_rows) < 3:
        return None

    headers = [cell.strip().lower() for cell in pipe_rows[0].split("|") if cell.strip()]
    if not headers:
        return None

    # Second line is normally the |---|---| separator; skip it when present.
    has_separator = "---" in pipe_rows[1] or "--|" in pipe_rows[1]
    body = pipe_rows[2:] if has_separator else pipe_rows[1:]

    parsed = []
    for ln in body:
        cells = [c.strip() for c in ln.split("|") if c.strip()]
        # Rows with a mismatched cell count are silently skipped.
        if len(cells) != len(headers):
            continue

        record = {}
        for key, raw in zip(headers, cells):
            # Clean number formatting (thousands separators, currency symbols).
            cleaned = raw.replace(",", "").replace("€", "").replace("$", "").strip()
            try:
                record[key] = round(float(cleaned), 4) if "." in cleaned else int(cleaned)
            except ValueError:
                # Non-numeric cell: keep the original text, lowercased.
                record[key] = raw.lower()
        parsed.append(record)

    return parsed if parsed else None
107
+
108
+
109
def compare_answers(submitted: Any, expected: Any, tolerance: float = 0.01) -> bool:
    """Compare answers with support for structured data and numeric tolerance.

    Handles:
    - Type mismatches (string "33.53" vs float 33.53)
    - Floating point precision (rounds to 4 decimals)
    - Nested structures (lists, dicts)
    - String parsing for lists/dicts
    """
    # Normalize both sides first: numeric strings become rounded floats,
    # "["/"{"-prefixed strings become literals, other strings are lowercased.
    submitted_n = normalize_value(submitted)
    expected_n = normalize_value(expected)

    # Null checks
    if submitted_n is None and expected_n is None:
        return True
    if submitted_n is None or expected_n is None:
        return False

    # Same type comparison after normalization
    if type(submitted_n) == type(expected_n):
        if isinstance(expected_n, list):
            if len(submitted_n) != len(expected_n):
                return False
            # Check if list contains dicts (structured data) - use order-sensitive
            # pairwise recursive comparison.
            if expected_n and isinstance(expected_n[0], dict):
                return all(
                    compare_answers(s, e, tolerance)
                    for s, e in zip(submitted_n, expected_n)
                )
            # Simple values list - order-insensitive (we don't tell models to sort)
            submitted_sorted = sorted([str(x).lower().strip() for x in submitted_n])
            expected_sorted = sorted([str(x).lower().strip() for x in expected_n])
            return submitted_sorted == expected_sorted

        if isinstance(expected_n, dict):
            # Key sets must match exactly; values are compared recursively.
            if set(submitted_n.keys()) != set(expected_n.keys()):
                return False
            return all(
                compare_answers(submitted_n[k], expected_n[k], tolerance)
                for k in expected_n
            )

        if isinstance(expected_n, float):
            # Numeric equality within tolerance (inclusive bound).
            return abs(submitted_n - expected_n) <= tolerance

        # String comparison
        return str(submitted_n) == str(expected_n)

    # Type mismatch after normalization - try numeric comparison
    try:
        sub_f = (
            float(submitted_n) if not isinstance(submitted_n, (list, dict)) else None
        )
        exp_f = float(expected_n) if not isinstance(expected_n, (list, dict)) else None
        if sub_f is not None and exp_f is not None:
            return abs(sub_f - exp_f) <= tolerance
    except (ValueError, TypeError):
        pass

    # Try markdown table parsing if expected is list and submitted is string
    if isinstance(expected_n, list) and isinstance(submitted_n, str):
        parsed = parse_markdown_table(submitted)  # Use original, not normalized
        if parsed is not None:
            return compare_answers(parsed, expected, tolerance)

    # Fallback: case-insensitive string comparison
    return str(submitted_n).lower() == str(expected_n).lower()
177
+
178
+
179
def score_correctness(submitted: Any, expected: Any, tolerance: float = 0.01) -> float:
    """Score the submitted answer correctness. Weight: 0.80

    Scoring:
    - 0.80: Exact match (within tolerance, inclusive)
    - 0.20: Almost there (rounding or 100x scale error)
    - 0.00: Wrong

    Args:
        submitted: Submitted answer
        expected: Expected answer
        tolerance: Numeric tolerance for comparison

    Returns:
        Correctness score (0.0, 0.20, or 0.80)
    """
    if submitted is None:
        return 0.0

    try:
        # Try numeric comparison first
        submitted_f = float(submitted)
        expected_f = float(expected)

        # Exact match. Use <= (inclusive) so a deviation of exactly
        # `tolerance` passes, consistent with compare_answers().
        if abs(submitted_f - expected_f) <= tolerance:
            return 0.80

        if expected_f != 0:
            ratio = submitted_f / expected_f

            # 100x scale error (decimal vs percentage)
            # e.g., 0.0959 vs 9.59 or 9.59 vs 0.0959
            if 0.009 < ratio < 0.011 or 99 < ratio < 101:
                return 0.20

            # Rounding error (within 1% of expected)
            if 0.99 < ratio < 1.01:
                return 0.20

    except (ValueError, TypeError):
        # Non-numeric input: structured data comparison (lists, dicts, tables)
        if compare_answers(submitted, expected, tolerance=tolerance):
            return 0.80

    return 0.0
225
+
226
+
227
def score_efficiency(turns: int, max_turns: int, is_correct: bool) -> float:
    """Score based on turns used (fewer = better). Weight: 0.10

    Only applies if the answer is correct. The score scales linearly with the
    unused fraction of the turn budget: 0 turns scores the full 0.10, and
    using all ``max_turns`` turns (or more) scores 0.0.

    Args:
        turns: Number of turns used
        max_turns: Maximum turns allowed (must be > 0 for a nonzero score)
        is_correct: Whether the answer was correct

    Returns:
        Efficiency score (0.0 to 0.10)
    """
    if not is_correct:
        return 0.0

    # Guard against a zero/negative budget, which would otherwise raise
    # ZeroDivisionError below.
    if max_turns <= 0:
        return 0.0

    # Linear scale on unused budget; clamped at 0 when turns >= max_turns.
    efficiency = max(0.0, 1.0 - (turns / max_turns))
    return 0.10 * efficiency
246
+
247
+
248
def score_token_cost(
    input_tokens: int,
    output_tokens: int,
    is_correct: bool,
    input_price: float = 1.0,
    output_price: float = 5.0,
    target_cost: float = 0.01,
    max_cost: float = 0.10,
) -> Tuple[float, float]:
    """Score based on token cost (lower = better). Weight: 0.10

    Only applies if the answer is correct. Cost at or below ``target_cost``
    earns the full 0.10; cost at or above ``max_cost`` earns 0.0; in between
    the score falls off linearly.

    Args:
        input_tokens: Number of input tokens used
        output_tokens: Number of output tokens used
        is_correct: Whether the answer was correct
        input_price: Price per 1M input tokens (default $1)
        output_price: Price per 1M output tokens (default $5)
        target_cost: Cost for full score (default $0.01)
        max_cost: Cost for zero score (default $0.10)

    Returns:
        Tuple of (token_score, cost_usd)
    """
    if not is_correct:
        return 0.0, 0.0

    # Dollar cost; prices are quoted per 1M tokens.
    cost = (input_tokens * input_price / 1_000_000) + (
        output_tokens * output_price / 1_000_000
    )

    # Map cost onto [0, 1]: full credit below target, none above max.
    if cost <= target_cost:
        fraction = 1.0
    elif cost >= max_cost:
        fraction = 0.0
    else:
        fraction = 1.0 - ((cost - target_cost) / (max_cost - target_cost))

    return 0.10 * fraction, cost
290
+
291
+
292
def compute_reward(
    submitted: Any,
    expected: Any,
    tolerance: float,
    turns: int,
    max_turns: int,
    input_tokens: int = 0,
    output_tokens: int = 0,
) -> Tuple[float, float, float, float]:
    """Combine correctness, turn efficiency, and token cost into one reward.

    Weights: correctness 0.80, efficiency 0.10, token cost 0.10.

    Args:
        submitted: Submitted answer
        expected: Expected answer
        tolerance: Numeric tolerance
        turns: Number of turns used
        max_turns: Maximum turns allowed
        input_tokens: Number of input tokens (optional)
        output_tokens: Number of output tokens (optional)

    Returns:
        Tuple of (total_reward, correctness, efficiency, token_cost_usd)
    """
    # Correctness component (0.80 weight).
    correctness = score_correctness(submitted, expected, tolerance)
    answered_correctly = correctness > 0

    # Efficiency component (0.10 weight).
    efficiency = score_efficiency(turns, max_turns, answered_correctly)

    # Token-cost component (0.10 weight). When the caller did not track
    # tokens, estimate usage from the turn count.
    if input_tokens == 0 and output_tokens == 0:
        input_tokens, output_tokens = turns * 1000, turns * 500

    token_score, cost_usd = score_token_cost(
        input_tokens, output_tokens, answered_correctly
    )

    total = correctness + efficiency + token_score
    return total, correctness, efficiency, cost_usd
codedark/server/tools.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CodeDark Tool Implementations
3
+
4
+ Tools available to agents:
5
+ - run_python: Execute Python/pandas code in sandboxed environment
6
+ - read_notes: Read all saved notes from current episode
7
+ - save_note: Save a note for later recall
8
+ - clarify: Ask clarifying question (max 2 per episode)
9
+ - submit_answer: Submit final answer (ends episode)
10
+ """
11
+
12
+ import re
13
+ import ast
14
+ from typing import Any, Dict, List, Optional, Tuple
15
+
16
+ import pandas as pd
17
+ import numpy as np
18
+
19
+
20
# Safe builtins for sandboxed code execution.
# NOTE(review): this is NOT a hard security boundary. exec() with restricted
# builtins can still be escaped via attribute access, and this table exposes
# getattr/globals/locals/vars directly. Treat executed code as semi-trusted.
SAFE_BUILTINS = {
    "len": len,
    "sum": sum,
    "min": min,
    "max": max,
    "abs": abs,
    "round": round,
    "sorted": sorted,
    "range": range,
    "int": int,
    "float": float,
    "str": str,
    "bool": bool,
    "list": list,
    "dict": dict,
    "set": set,
    "tuple": tuple,
    "enumerate": enumerate,
    "zip": zip,
    "True": True,
    "False": False,
    "None": None,
    "print": print,
    "type": type,
    "isinstance": isinstance,
    "map": map,
    "filter": filter,
    "any": any,
    "all": all,
    "hasattr": hasattr,
    "getattr": getattr,
    "repr": repr,
    "locals": locals,
    "globals": globals,
    "dir": dir,
    "vars": vars,
    "reversed": reversed,
    "slice": slice,
    "format": format,
    "Exception": Exception,
    "ValueError": ValueError,
    "TypeError": TypeError,
    "KeyError": KeyError,
    "IndexError": IndexError,
    "AttributeError": AttributeError,
}


def run_python(
    code: str, df: pd.DataFrame, max_output_chars: int = 200
) -> Tuple[str, str, int]:
    """Execute Python code in a restricted environment.

    The code runs with ``pd``, ``np``, and a copy of ``df`` in scope, and must
    store its output in a variable named ``result``.

    Args:
        code: Python code to execute
        df: DataFrame available as 'df' in execution context
        max_output_chars: Maximum characters for output truncation

    Returns:
        Tuple of (stdout, stderr, exit_code)
    """
    if df is None:
        return "", "Error: No dataframe loaded", 1

    # Use a SINGLE dict as both globals and locals for exec(). With two
    # separate dicts, the executed code gets class-body scoping: list/dict
    # comprehensions and nested functions cannot see names assigned at the
    # top level of the snippet (their lookups go through globals), causing
    # spurious NameErrors for perfectly valid user code.
    namespace: Dict[str, Any] = {
        "__builtins__": SAFE_BUILTINS,
        "pd": pd,
        "np": np,
        "df": df.copy(),  # copy so executed code cannot mutate the original
    }

    try:
        exec(code, namespace)  # intentional; see sandbox NOTE on SAFE_BUILTINS
        result = namespace.get("result")

        if result is None:
            return (
                "",
                "Error: No 'result' variable set. Store your result in 'result'.",
                1,
            )

        # Render a short preview of the result.
        if isinstance(result, pd.DataFrame):
            preview = result.head(3).to_string()
        elif isinstance(result, pd.Series):
            preview = result.head(5).to_string()
        else:
            preview = str(result)

        # Truncate if needed.
        if len(preview) > max_output_chars:
            preview = preview[:max_output_chars] + "..."

        return f"run_python Result:\n{preview}", "", 0

    except Exception as e:
        return "", f"run_python Error: {e}", 1
118
+
119
+
120
def read_notes(notes: List[str]) -> Tuple[str, str, int]:
    """Return a formatted listing of all saved notes.

    Args:
        notes: List of saved notes

    Returns:
        Tuple of (stdout, stderr, exit_code)
    """
    # Nothing saved yet: report that rather than an empty listing.
    if not notes:
        return "No notes saved yet.", "", 0

    formatted = "\n".join("- " + note for note in notes)
    return "Saved notes:\n" + formatted, "", 0
134
+
135
+
136
def save_note(content: str, notes: List[str]) -> Tuple[str, str, int]:
    """Append a note to persistent memory and echo the full listing back.

    Args:
        content: Note content to save (whitespace is trimmed)
        notes: List to append the note to (modified in place)

    Returns:
        Tuple of (stdout, stderr, exit_code)
    """
    trimmed = content.strip()
    if not trimmed:
        # Refuse blank notes so the store stays meaningful.
        return "", "Error: Empty note content", 1

    notes.append(trimmed)
    listing = "\n".join("- " + note for note in notes)
    return "Note saved.\n\nAll notes:\n" + listing, "", 0
153
+
154
+
155
def clarify(
    question: str,
    clarify_count: int,
    max_clarifications: int,
    ambiguities: Optional[List[str]] = None,
    answer_type: str = "scalar",
) -> Tuple[str, str, int, int]:
    """Answer a clarifying question using the task's known ambiguities.

    Matches the question against a fixed set of keyword-keyed canned
    responses derived from the task metadata; the first matching keyword
    (in registration order) wins.

    Args:
        question: The clarifying question
        clarify_count: Current number of clarifications used
        max_clarifications: Maximum allowed clarifications
        ambiguities: List of known ambiguities from task metadata
        answer_type: Expected answer type ("scalar", "list", etc.)

    Returns:
        Tuple of (stdout, stderr, exit_code, new_clarify_count)
    """
    # Budget exhausted: refuse without consuming another clarification.
    if clarify_count >= max_clarifications:
        budget_error = (
            f"Error: Maximum {max_clarifications} clarifications per episode. "
            "Please proceed with your best interpretation."
        )
        return "", budget_error, 1, clarify_count

    question_text = question.lower()

    # Ordered (keyword, response) pairs; first keyword found in the
    # question wins, so registration order matters.
    canned: List[Tuple[str, str]] = []

    for ambiguity in ambiguities or []:
        hint = ambiguity.lower()
        if "percentile" in hint or "inclusive" in hint or "exclusive" in hint:
            canned.append((
                "percentile",
                "Use >= for 'top X%' (inclusive of threshold) and <= for 'bottom X%'.",
            ))
        if "rate" in hint or "percentage" in hint:
            canned.append((
                "rate",
                "Express rates as percentages 0-100, rounded to 2 decimal places.",
            ))
        if (
            "positive" in hint
            or "success" in hint
            or "target" in hint
            or "y=1" in hint
        ):
            canned.append((
                "target",
                "Subscription/success means y=1 in the dataset.",
            ))
        if "boundary" in hint:
            canned.append((
                "boundary",
                "Include boundary values (>=, <=) when filtering.",
            ))

    # Output-format guidance is derived from the expected answer type.
    if answer_type == "scalar":
        canned.append((
            "format",
            "Return a single numeric value, rounded to 2 decimal places.",
        ))
    elif answer_type == "list":
        canned.append((
            "format",
            "Return as a list/DataFrame with the specified columns.",
        ))

    # First registered keyword that appears in the question wins.
    for keyword, reply in canned:
        if keyword in question_text:
            return f"Clarification: {reply}", "", 0, clarify_count + 1

    # No keyword matched: give generic guidance (still consumes budget).
    fallback = (
        "Clarification: Please proceed with your best interpretation "
        "based on standard data analysis conventions."
    )
    return fallback, "", 0, clarify_count + 1
240
+
241
+
242
def submit_answer(answer_str: str) -> Tuple[str, str, int, Any]:
    """Parse and submit the final answer.

    Strips surrounding whitespace and a trailing percent sign, then tries
    structured parsing (Python literal), then numeric parsing, and finally
    falls back to the raw string.

    Args:
        answer_str: Answer string to parse and submit

    Returns:
        Tuple of (stdout, stderr, exit_code, parsed_answer)
    """
    cleaned = answer_str.strip().rstrip("%").strip()

    if not cleaned:
        return "", "Error: Empty answer", 1, None

    # Structured data first (lists/dicts/numbers as Python literals).
    try:
        parsed: Any = ast.literal_eval(cleaned)
    except (ValueError, SyntaxError):
        # Not a literal: try plain numeric, else keep the raw string.
        try:
            parsed = float(cleaned)
        except ValueError:
            parsed = cleaned

    return "[SUBMITTED]", "", 0, parsed
267
+
268
+
269
def parse_tool_call(args: str, tool_name: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse tool-specific arguments from a raw args string.

    Tag-wrapped tools (run_python, clarify, submit_answer) have their payload
    extracted from the corresponding XML-style tag; note tools take the raw
    args verbatim.

    Args:
        args: Raw args string
        tool_name: Name of the tool being called

    Returns:
        Tuple of (parsed_content, error_message)
    """
    # Tools whose payload lives inside <tag>...</tag>, with the error text
    # to emit when the tag is missing.
    tagged_tools = {
        "run_python": (
            "code",
            "No <code> tag found. Use: <code>your_code</code>",
        ),
        "clarify": (
            "question",
            "No <question> tag found. Use: <question>your question</question>",
        ),
        "submit_answer": (
            "answer",
            "No <answer> tag found. Use: <answer>value</answer>",
        ),
    }

    if tool_name in tagged_tools:
        tag, missing_msg = tagged_tools[tool_name]
        found = re.search(rf"<{tag}>(.*?)</{tag}>", args, re.DOTALL)
        if found is None:
            return None, missing_msg
        return found.group(1).strip(), None

    # Note tools take their arguments verbatim.
    if tool_name in ("read_notes", "save_note"):
        return args.strip(), None

    return None, f"Unknown tool: {tool_name}"
codedark/tests/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """CodeDark Tests."""
codedark/tests/test_environment.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for CodeDark environment."""
2
+
3
+ import pytest
4
+ from pathlib import Path
5
+
6
+ from codedark.models import CodeDarkAction, CodeDarkObservation, CodeDarkState
7
+ from codedark.server.environment import CodeDarkEnvironment
8
+ from codedark.server.scoring import score_correctness, score_efficiency, compute_reward
9
+ from codedark.server.tools import run_python, read_notes, save_note, parse_tool_call
10
+
11
+
12
class TestModels:
    """Unit tests for the Pydantic models."""

    def test_action_creation(self):
        # An action carries the tool name plus its raw argument string.
        act = CodeDarkAction(tool="run_python", args="<code>result = 1</code>")
        assert act.tool == "run_python"
        assert "result = 1" in act.args

    def test_observation_defaults(self):
        # A fresh observation is empty, not done, and unscored.
        observation = CodeDarkObservation()
        assert observation.stdout == ""
        assert observation.done is False
        assert observation.reward is None

    def test_state_defaults(self):
        # A fresh state starts with no notes and a zero turn counter.
        fresh_state = CodeDarkState()
        assert fresh_state.notes == []
        assert fresh_state.turn_count == 0
31
+
32
class TestTools:
    """Unit tests for the tool implementations."""

    def test_parse_run_python(self):
        parsed, err = parse_tool_call("<code>result = df.shape</code>", "run_python")
        assert parsed == "result = df.shape"
        assert err is None

    def test_parse_run_python_missing_tag(self):
        # Bare code without the <code> wrapper must be rejected.
        parsed, err = parse_tool_call("result = df.shape", "run_python")
        assert parsed is None
        assert "No <code> tag" in err

    def test_parse_submit_answer(self):
        parsed, err = parse_tool_call("<answer>42.5</answer>", "submit_answer")
        assert parsed == "42.5"
        assert err is None

    def test_read_notes_empty(self):
        out, _err, status = read_notes([])
        assert "No notes saved" in out
        assert status == 0

    def test_read_notes_with_content(self):
        out, _err, status = read_notes(["Note 1", "Note 2"])
        assert "Note 1" in out
        assert "Note 2" in out
        assert status == 0

    def test_save_note(self):
        store = []
        out, _err, _status = save_note("Test note", store)
        assert "Note saved" in out
        assert store == ["Test note"]
68
+
69
+
70
class TestScoring:
    """Unit tests for the scoring functions."""

    def test_correctness_exact_match(self):
        assert score_correctness(42.5, 42.5, tolerance=0.01) == 0.80

    def test_correctness_within_tolerance(self):
        assert score_correctness(42.505, 42.5, tolerance=0.01) == 0.80

    def test_correctness_wrong(self):
        assert score_correctness(100.0, 42.5, tolerance=0.01) == 0.0

    def test_correctness_scale_error(self):
        # 0.425 is the expected 42.5 off by 100x (decimal vs percentage).
        assert score_correctness(0.425, 42.5, tolerance=0.01) == 0.20

    def test_efficiency_correct_answer(self):
        # One turn out of ten should earn nearly the full 0.10.
        assert score_efficiency(turns=1, max_turns=10, is_correct=True) > 0.08

    def test_efficiency_incorrect_answer(self):
        assert score_efficiency(turns=1, max_turns=10, is_correct=False) == 0.0

    def test_compute_reward_correct(self):
        total, correctness, _efficiency, _cost = compute_reward(
            submitted=42.5,
            expected=42.5,
            tolerance=0.01,
            turns=3,
            max_turns=10,
        )
        # Correctness alone contributes 0.80 of the total.
        assert total > 0.8
        assert correctness == 0.80
108
+
109
+
110
class TestEnvironment:
    """Integration tests for the environment class.

    NOTE(review): these tests depend on the on-disk data files
    (data/tasks/final_25_tasks.jsonl and the LFS-tracked CSVs) being present
    and fetched; they are integration tests, not pure unit tests.
    """

    @pytest.fixture
    def env(self):
        """Create environment with test data."""
        # Resolve the data directory relative to this test file's location.
        data_dir = Path(__file__).parent.parent / "data"
        tasks_path = data_dir / "tasks" / "final_25_tasks.jsonl"
        return CodeDarkEnvironment(
            data_dir=str(data_dir),
            tasks_path=str(tasks_path),
            max_turns=10,
        )

    def test_reset_loads_task(self, env):
        # reset() with no id should pick some task and return a live episode.
        obs = env.reset()
        assert obs.task_id != ""
        assert obs.question != ""
        assert obs.done is False

    def test_reset_specific_task(self, env):
        # Requesting a task by id must load exactly that task.
        obs = env.reset(task_id="bank_hard_001")
        assert obs.task_id == "bank_hard_001"
        assert "subscription rate" in obs.question.lower()

    def test_step_run_python(self, env):
        # A valid run_python call should succeed and echo a result preview.
        env.reset(task_id="bank_hard_001")
        action = CodeDarkAction(
            tool="run_python", args="<code>result = df.shape</code>"
        )
        obs = env.step(action)
        assert obs.exit_code == 0
        assert "run_python Result" in obs.stdout

    def test_step_save_note(self, env):
        # Saving a note should confirm and surface it in the observation.
        env.reset(task_id="bank_hard_001")
        action = CodeDarkAction(tool="save_note", args="Test observation")
        obs = env.step(action)
        assert "Note saved" in obs.stdout
        assert len(obs.notes) == 1

    def test_step_read_notes(self, env):
        env.reset(task_id="bank_hard_001")
        # First save a note
        env.step(CodeDarkAction(tool="save_note", args="Important finding"))
        # Then read notes
        obs = env.step(CodeDarkAction(tool="read_notes", args=""))
        assert "Important finding" in obs.stdout

    def test_step_submit_answer(self, env):
        # Submitting ends the episode and attaches scoring to the observation.
        env.reset(task_id="bank_hard_001")
        action = CodeDarkAction(tool="submit_answer", args="<answer>2.44</answer>")
        obs = env.step(action)
        assert obs.done is True
        assert obs.submitted is True
        assert obs.reward is not None
        # This should be correct answer for bank_hard_001
        assert obs.correctness == 0.80

    def test_turn_counting(self, env):
        # Each step must advance the turn counter by exactly one.
        env.reset()
        assert env.state.turn_count == 0

        env.step(CodeDarkAction(tool="run_python", args="<code>result = 1</code>"))
        assert env.state.turn_count == 1

        env.step(CodeDarkAction(tool="run_python", args="<code>result = 2</code>"))
        assert env.state.turn_count == 2
178
+
179
+
180
# Allow running this module directly (python test_environment.py) in
# addition to pytest discovery.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
data/bank.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a071417203c9e1434df5fe794fffae6a55327502b00da9c4e9754d2ab7f7cede
3
+ size 65698328
data/road.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ee5955af18eca0d4b53e13f23bd6436422e40ca84077fb8cdcfa467aa62b68f
3
+ size 37936892
data/tasks/final_25_tasks.jsonl ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"id": "bank_hard_001", "dataset": "bank", "goal": "What's the subscription rate for month='may' AND job='management' AND balance in Q1?", "expected_output_type": "scalar", "level": "L6", "template": "multi_condition_filter", "golden": {"answer_value": 2.44, "answer_type": "scalar", "verification_code": "df['_q'] = pd.qcut(df['balance'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nfiltered = df[(df['month'] == 'may') & (df['job'] == 'management') & (df['_q'] == 'Q1')]\nresult = round((filtered['y'] == 1).mean() * 100, 2) if len(filtered) > 0 else 0.0\ndf.drop('_q', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "null", "missing", "nan", "empty", "na"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "multi_condition_filter", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"col_a": "month", "val_a": "may", "col_b": "job", "val_b": "management", "metric": "balance", "quartile": "Q1", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
2
+ {"id": "bank_hard_005", "dataset": "bank", "goal": "Among customers in top 95% of age AND bottom 10% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.59, "answer_type": "scalar", "verification_code": "p_high = df['age'].quantile(95 / 100)\np_low = df['duration'].quantile(10 / 100)\ncohort = df[(df['age'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "age", "metric_b": "duration", "pct_high": 95, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
3
+ {"id": "bank_hard_012", "dataset": "bank", "goal": "Find the job with lowest average balance. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 8.27, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['balance'].mean()\nextrema_val = group_stats.max() if 'lowest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "balance", "extrema": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
4
+ {"id": "bank_hard_019", "dataset": "bank", "goal": "What's the subscription rate for month='may' AND job='management' AND age in Q4?", "expected_output_type": "scalar", "level": "L6", "template": "multi_condition_filter", "golden": {"answer_value": 7.04, "answer_type": "scalar", "verification_code": "df['_q'] = pd.qcut(df['age'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nfiltered = df[(df['month'] == 'may') & (df['job'] == 'management') & (df['_q'] == 'Q4')]\nresult = round((filtered['y'] == 1).mean() * 100, 2) if len(filtered) > 0 else 0.0\ndf.drop('_q', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "null", "missing", "nan", "empty", "na"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "multi_condition_filter", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"col_a": "month", "val_a": "may", "col_b": "job", "val_b": "management", "metric": "age", "quartile": "Q4", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
5
+ {"id": "bank_hard_020", "dataset": "bank", "goal": "For month with above-average day, which have subscription rate > 25%? Return sorted list.", "expected_output_type": "list", "level": "L5", "template": "top_n_in_segment", "golden": {"answer_value": ["oct"], "answer_type": "list", "verification_code": "avg_metric = df.groupby('month')['day'].mean()\nhigh_metric = avg_metric[avg_metric > avg_metric.mean()].index\nrates = df[df['month'].isin(high_metric)].groupby('month')['y'].apply(\n lambda x: (x == 1).mean() * 100)\nresult = sorted(rates[rates > 25].index.tolist())", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["tie", "ties", "equal", "same", "duplicate"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "top_n_in_segment", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "month", "metric": "day", "threshold": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
6
+ {"id": "bank_hard_021", "dataset": "bank", "goal": "Find the job with highest average balance. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 24.62, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['balance'].mean()\nextrema_val = group_stats.max() if 'highest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "balance", "extrema": "highest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
7
+ {"id": "bank_hard_023", "dataset": "bank", "goal": "Among customers in top 90% of balance AND bottom 10% of day, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 33.53, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(90 / 100)\np_low = df['day'].quantile(10 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['day'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "day", "pct_high": 90, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
8
+ {"id": "bank_hard_026", "dataset": "bank", "goal": "For each job, compute volatility score: std(age) / mean(age). Return top 5 with [group, mean, std, volatility].", "expected_output_type": "dataframe", "level": "L6", "template": "segment_volatility", "golden": {"answer_value": [{"job": "unemployed", "mean": 40.97, "std": 9.74, "volatility": 0.2377}, {"job": "self-employed", "mean": 40.42, "std": 9.46, "volatility": 0.234}, {"job": "admin.", "mean": 39.68, "std": 9.23, "volatility": 0.2326}, {"job": "services", "mean": 38.94, "std": 8.86, "volatility": 0.2275}, {"job": "management", "mean": 40.2, "std": 9.13, "volatility": 0.2271}], "answer_type": "dataframe", "verification_code": "stats = df.groupby('job')['age'].agg(['mean', 'std']).round(2)\nstats['volatility'] = round(stats['std'] / stats['mean'], 4)\nresult = stats.nlargest(5, 'volatility').reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits", "volatility", "cv", "coefficient", "variation"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "segment_volatility", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "age"}}}
9
+ {"id": "bank_hard_028", "dataset": "bank", "goal": "Among customers in top 95% of balance AND bottom 10% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.84, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(95 / 100)\np_low = df['duration'].quantile(10 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 95, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
10
+ {"id": "bank_hard_029", "dataset": "bank", "goal": "Among customers in top 90% of balance AND bottom 25% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.54, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(90 / 100)\np_low = df['duration'].quantile(25 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 90, "pct_low": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
11
+ {"id": "bank_hard_030", "dataset": "bank", "goal": "Rank job by subscription rate. Which bottom-3 have above-median age?", "expected_output_type": "list", "level": "L6", "template": "ranked_anomaly", "golden": {"answer_value": ["blue-collar", "entrepreneur"], "answer_type": "list", "verification_code": "stats = df.groupby('job').agg(\n rate=('y', lambda x: (x == 1).mean()),\n avg_metric=('age', 'mean'))\nstats['rank'] = stats['rate'].rank()\nbottom_3 = stats[stats['rank'] <= 3]\nresult = sorted(bottom_3[bottom_3['avg_metric'] > stats['avg_metric'].median()].index.tolist())", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rank", "order", "sort", "ascending", "descending"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "ranked_anomaly", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "age", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
12
+ {"id": "bank_hard_031", "dataset": "bank", "goal": "For each month, compute volatility score: std(day) / mean(day). Return top 5 with [group, mean, std, volatility].", "expected_output_type": "dataframe", "level": "L6", "template": "segment_volatility", "golden": {"answer_value": [{"month": "feb", "mean": 6.05, "std": 5.46, "volatility": 0.9025}, {"month": "sep", "mean": 11.67, "std": 8.04, "volatility": 0.6889}, {"month": "mar", "mean": 13.45, "std": 9.18, "volatility": 0.6825}, {"month": "jun", "mean": 11.32, "std": 7.33, "volatility": 0.6475}, {"month": "dec", "mean": 14.19, "std": 8.83, "volatility": 0.6223}], "answer_type": "dataframe", "verification_code": "stats = df.groupby('month')['day'].agg(['mean', 'std']).round(2)\nstats['volatility'] = round(stats['std'] / stats['mean'], 4)\nresult = stats.nlargest(5, 'volatility').reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits", "volatility", "cv", "coefficient", "variation"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "segment_volatility", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "month", "metric": "day"}}}
13
+ {"id": "bank_hard_033", "dataset": "bank", "goal": "Show the average balance breakdown by job. Include count and mean balance for each category, sorted by mean descending.", "expected_output_type": "dataframe", "level": "L4", "template": "metric_breakdown", "golden": {"answer_value": [{"job": "retired", "count": 35185, "mean_balance": 1812.07}, {"job": "unknown", "count": 2917, "mean_balance": 1678.96}, {"job": "self-employed", "count": 19020, "mean_balance": 1598.27}, {"job": "student", "count": 11767, "mean_balance": 1577.32}, {"job": "management", "count": 175541, "mean_balance": 1510.39}, {"job": "unemployed", "count": 17634, "mean_balance": 1440.57}, {"job": "entrepreneur", "count": 17718, "mean_balance": 1306.75}, {"job": "housemaid", "count": 15912, "mean_balance": 1281.22}, {"job": "technician", "count": 138107, "mean_balance": 1071.57}, {"job": "admin.", "count": 81492, "mean_balance": 1019.92}, {"job": "blue-collar", "count": 170498, "mean_balance": 977.49}, {"job": "services", "count": 64209, "mean_balance": 834.63}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('job').agg(\n count=('balance', 'size'),\n mean_balance=('balance', lambda x: round(x.mean(), 2))\n).sort_values('mean_balance', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "metric_breakdown", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "balance"}}}
14
+ {"id": "bank_hard_035", "dataset": "bank", "goal": "Find the job with lowest average age. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 34.08, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['age'].mean()\nextrema_val = group_stats.max() if 'lowest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "age", "extrema": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
15
+ {"id": "bank_hard_038", "dataset": "bank", "goal": "Find the month with the lowest subscription rate. Within that group, what is the average day?", "expected_output_type": "scalar", "level": "L5", "template": "nested_extrema", "golden": {"answer_value": 16.09, "answer_type": "scalar", "verification_code": "group_rates = df.groupby('month')['y'].apply(lambda x: (x == 1).mean())\nouter_val = group_rates.max() if 'lowest' == 'highest' else group_rates.min()\nouter_tied = group_rates[group_rates == outer_val].sort_index()\nouter_group = outer_tied.index[0]\nsubset = df[df['month'] == outer_group]\nresult = round(subset['day'].mean(), 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["tie", "ties", "equal", "same", "duplicate", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "nested_extrema", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "month", "metric": "day", "extrema_outer": "lowest", "extrema_inner": "highest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
16
+ {"id": "bank_hard_039", "dataset": "bank", "goal": "Divide customers into 4 day quartiles. What is the subscription percentage (0-100) in the highest (top 25%) (Q4) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "quartile_conversion", "golden": {"answer_value": 11.55, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['day'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q4']\nresult = round((bin_data['y'] == 1).mean() * 100, 2)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "quartile_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "day", "quartile": "Q4", "quartile_desc": "highest (top 25%)", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
17
+ {"id": "bank_hard_040", "dataset": "bank", "goal": "Among customers in top 95% of balance AND bottom 25% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.65, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(95 / 100)\np_low = df['duration'].quantile(25 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 95, "pct_low": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
18
+ {"id": "bank_hard_041", "dataset": "bank", "goal": "Which job categories would have the biggest impact if brought to average subscription rate? Return top 3 by potential gain (count * rate gap), sorted by impact.", "expected_output_type": "list", "level": "L5", "template": "segment_improvement_potential", "golden": {"answer_value": ["blue-collar", "services", "entrepreneur"], "answer_type": "list", "verification_code": "overall_rate = (df['y'] == 1).mean()\ngroup_stats = df.groupby('job').agg(\n rate=('y', lambda x: (x == 1).mean()),\n count=('y', 'size')\n)\ngroup_stats['gap'] = overall_rate - group_stats['rate']\ngroup_stats['potential'] = group_stats['count'] * group_stats['gap']\ntop_potential = group_stats[group_stats['gap'] > 0].nlargest(3, 'potential')\nresult = top_potential.index.tolist()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "segment_improvement_potential", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
19
+ {"id": "bank_hard_044", "dataset": "bank", "goal": "Find the month with the lowest subscription rate. Within that group, what is the average age?", "expected_output_type": "scalar", "level": "L5", "template": "nested_extrema", "golden": {"answer_value": 38.98, "answer_type": "scalar", "verification_code": "group_rates = df.groupby('month')['y'].apply(lambda x: (x == 1).mean())\nouter_val = group_rates.max() if 'lowest' == 'highest' else group_rates.min()\nouter_tied = group_rates[group_rates == outer_val].sort_index()\nouter_group = outer_tied.index[0]\nsubset = df[df['month'] == outer_group]\nresult = round(subset['age'].mean(), 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["tie", "ties", "equal", "same", "duplicate", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "nested_extrema", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "month", "metric": "age", "extrema_outer": "lowest", "extrema_inner": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
20
+ {"id": "road_hard_014", "dataset": "road", "goal": "Which lighting categories have the highest total reported accidents? Show breakdown with [lighting, count, total_num_reported_accidents, avg_num_reported_accidents] sorted by total descending.", "expected_output_type": "dataframe", "level": "L4", "template": "count_segment_total", "golden": {"answer_value": [{"lighting": "dim", "count": 183826, "total_num_reported_accidents": 211283, "avg_num_reported_accidents": 1.15}, {"lighting": "daylight", "count": 178015, "total_num_reported_accidents": 207579, "avg_num_reported_accidents": 1.17}, {"lighting": "night", "count": 155913, "total_num_reported_accidents": 196214, "avg_num_reported_accidents": 1.26}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('lighting').agg(\n count=('num_reported_accidents', 'size'),\n total_num_reported_accidents=('num_reported_accidents', 'sum'),\n avg_num_reported_accidents=('num_reported_accidents', lambda x: round(x.mean(), 2))\n).sort_values('total_num_reported_accidents', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": [], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "count_segment_total", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "lighting", "target_col": "num_reported_accidents", "target_desc": "reported accidents"}}}
21
+ {"id": "road_hard_021", "dataset": "road", "goal": "Which weather categories have the highest average accident risk? Show breakdown with [weather, count, avg_accident_risk] sorted by average descending.", "expected_output_type": "dataframe", "level": "L4", "template": "continuous_segment_breakdown", "golden": {"answer_value": [{"weather": "foggy", "count": 181463, "avg_accident_risk": 0.3863}, {"weather": "rainy", "count": 156985, "avg_accident_risk": 0.3615}, {"weather": "clear", "count": 179306, "avg_accident_risk": 0.3101}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('weather').agg(\n count=('accident_risk', 'size'),\n avg_accident_risk=('accident_risk', lambda x: round(x.mean(), 4))\n).sort_values('avg_accident_risk', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": [], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_segment_breakdown", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "weather", "target_col": "accident_risk", "target_desc": "accident risk"}}}
22
+ {"id": "road_hard_015", "dataset": "road", "goal": "Divide records into 4 speed_limit quartiles. What is the average accident risk in the lower-middle (25-50%) (Q2) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "continuous_quartile_analysis", "golden": {"answer_value": 0.29, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['speed_limit'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q2']\nresult = round(bin_data['accident_risk'].mean(), 4)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_quartile_analysis", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "speed_limit", "quartile": "Q2", "quartile_desc": "lower-middle (25-50%)", "target_col": "accident_risk", "target_desc": "accident risk"}}}
23
+ {"id": "road_hard_002", "dataset": "road", "goal": "Divide records into 4 curvature quartiles. What is the average accident risk in the upper-middle (50-75%) (Q3) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "continuous_quartile_analysis", "golden": {"answer_value": 0.41, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['curvature'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q3']\nresult = round(bin_data['accident_risk'].mean(), 4)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_quartile_analysis", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "curvature", "quartile": "Q3", "quartile_desc": "upper-middle (50-75%)", "target_col": "accident_risk", "target_desc": "accident risk"}}}
24
+ {"id": "road_hard_007", "dataset": "road", "goal": "How much higher is the average accident risk for lighting='daylight' compared to 'night'? Return difference.", "expected_output_type": "scalar", "level": "L5", "template": "continuous_comparison", "golden": {"answer_value": -0.17, "answer_type": "scalar", "verification_code": "avg_a = df[df['lighting'] == 'daylight']['accident_risk'].mean()\navg_b = df[df['lighting'] == 'night']['accident_risk'].mean()\nresult = round(avg_a - avg_b, 4)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "continuous_comparison", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "lighting", "val_a": "daylight", "val_b": "night", "target_col": "accident_risk", "target_desc": "accident risk"}}}
25
+ {"id": "road_hard_004", "dataset": "road", "goal": "How much higher is the average accident risk for road_type='rural' compared to 'urban'? Return difference.", "expected_output_type": "scalar", "level": "L5", "template": "continuous_comparison", "golden": {"answer_value": -0.01, "answer_type": "scalar", "verification_code": "avg_a = df[df['road_type'] == 'rural']['accident_risk'].mean()\navg_b = df[df['road_type'] == 'urban']['accident_risk'].mean()\nresult = round(avg_a - avg_b, 4)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "continuous_comparison", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "road_type", "val_a": "rural", "val_b": "urban", "target_col": "accident_risk", "target_desc": "accident risk"}}}
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi>=0.115.0
2
+ pydantic>=2.0.0
3
+ uvicorn[standard]>=0.24.0
4
+ pandas>=2.0.0
5
+ numpy>=1.24.0
6
+ requests>=2.31.0