Spaces:
Sleeping
Sleeping
Financial Task Environment — code execution with real xlsx
Browse files- .dockerignore +10 -0
- .gitattributes +1 -0
- Dockerfile +33 -0
- README.md +181 -2
- __init__.py +10 -0
- client.py +45 -0
- data/0/0_ref_0.xlsx +3 -0
- data/0/0_src_0.xlsx +3 -0
- data/118/118_src_0.xlsx +3 -0
- data/119/119_src_0.xlsx +3 -0
- data/21/21_ref_0.xlsx +3 -0
- data/21/21_src_0.xlsx +3 -0
- data/24/24_ref_0.xlsx +3 -0
- data/24/24_src_0.xlsx +3 -0
- data/34/34_src_0.xlsx +3 -0
- data/35/35_ref_0.xlsx +3 -0
- data/35/35_src_0.xlsx +3 -0
- data/40/40_ref_0.xlsx +3 -0
- data/40/40_src_0.xlsx +3 -0
- data/60/60_ref_0.xlsx +3 -0
- data/60/60_src_0.xlsx +3 -0
- data/67/67_ref_0.xlsx +3 -0
- data/67/67_src_0.xlsx +3 -0
- graders.py +183 -0
- inference.py +325 -0
- models.py +44 -0
- openenv.yaml +47 -0
- pyproject.toml +35 -0
- server/Dockerfile +39 -0
- server/__init__.py +1 -0
- server/app.py +24 -0
- server/financial_environment.py +297 -0
- tasks.py +284 -0
- uv.lock +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
.git/
|
| 5 |
+
.venv/
|
| 6 |
+
outputs/
|
| 7 |
+
*.egg-info/
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
.pytest_cache/
|
.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*.xlsx filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Root-level copy of the Dockerfile (validation requires one at the repo root; see also server/Dockerfile)
|
| 2 |
+
FROM python:3.11-slim
|
| 3 |
+
|
| 4 |
+
WORKDIR /app/env
|
| 5 |
+
|
| 6 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 7 |
+
git curl \
|
| 8 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 9 |
+
|
| 10 |
+
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 11 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 12 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx
|
| 13 |
+
|
| 14 |
+
COPY pyproject.toml /app/env/
|
| 15 |
+
|
| 16 |
+
RUN uv pip install --system --no-cache-dir \
|
| 17 |
+
"openenv-core>=0.2.0" \
|
| 18 |
+
"fastapi>=0.104.0" \
|
| 19 |
+
"uvicorn>=0.24.0" \
|
| 20 |
+
"pydantic>=2.0.0" \
|
| 21 |
+
"websockets>=12.0" \
|
| 22 |
+
"openpyxl>=3.1.0"
|
| 23 |
+
|
| 24 |
+
COPY . /app/env/
|
| 25 |
+
|
| 26 |
+
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 27 |
+
|
| 28 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
|
| 29 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
|
| 30 |
+
|
| 31 |
+
EXPOSE 8000
|
| 32 |
+
|
| 33 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
CHANGED
|
@@ -1,2 +1,181 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Financial Task Environment
|
| 3 |
+
emoji: 📊
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# Financial Task Environment
|
| 15 |
+
|
| 16 |
+
An [OpenEnv](https://github.com/meta-pytorch/OpenEnv) **code-execution
|
| 17 |
+
environment** for training and evaluating AI agents on **real-world finance
|
| 18 |
+
& accounting spreadsheet tasks**. Agents write Python code (using
|
| 19 |
+
`openpyxl`) to read, analyze, and modify authentic Excel workbooks from
|
| 20 |
+
enterprise workflows.
|
| 21 |
+
|
| 22 |
+
## Motivation
|
| 23 |
+
|
| 24 |
+
Finance professionals spend hundreds of hours on spreadsheet-centric tasks —
|
| 25 |
+
extracting values, computing ratios, auditing formulas, entering data, building
|
| 26 |
+
scenarios, and consolidating reports. This environment provides 10 diverse
|
| 27 |
+
tasks backed by real `.xlsx` files so agents can be trained and evaluated on
|
| 28 |
+
the same kind of work.
|
| 29 |
+
|
| 30 |
+
## How It Works
|
| 31 |
+
|
| 32 |
+
1. **Reset** with a `task_id` → receive task instructions + xlsx file path + a
|
| 33 |
+
summary of the spreadsheet contents.
|
| 34 |
+
2. **Execute code** (`action_type="code"`) → run Python code that reads or
|
| 35 |
+
modifies the xlsx. The environment returns stdout/stderr.
|
| 36 |
+
3. **Submit** a text answer (`action_type="submit"` for QA tasks) or a modified
|
| 37 |
+
file (`action_type="submit_file"` for MODIFY tasks).
|
| 38 |
+
4. The environment **grades** the submission: QA answers are scored by numeric
|
| 39 |
+
matching + keyword overlap; MODIFY tasks are scored by cell-level comparison
|
| 40 |
+
against a reference workbook.
|
| 41 |
+
|
| 42 |
+
## Tasks (10 total)
|
| 43 |
+
|
| 44 |
+
| # | Task ID | Title | Difficulty | Type | Category |
|
| 45 |
+
|---|---------|-------|------------|------|----------|
|
| 46 |
+
| 1 | `task_1` | Count Plants in Spreadsheet | Easy | QA | Calculation |
|
| 47 |
+
| 2 | `task_2` | Retrieve TW EOL Charge | Easy | QA | Cross-sheet Retrieval |
|
| 48 |
+
| 3 | `task_3` | Portfolio Mark-to-Market Change | Easy | QA | Calculation |
|
| 49 |
+
| 4 | `task_4` | Summarize Pipeline Imbalances | Medium | MODIFY | Calculation |
|
| 50 |
+
| 5 | `task_5` | Audit and Correct Formula Errors | Medium | MODIFY | Validation / Review |
|
| 51 |
+
| 6 | `task_6` | Create Table and Apply Filter | Medium | MODIFY | Structuring / Formatting |
|
| 52 |
+
| 7 | `task_7` | Add Weekday Row and Data Entry | Medium | MODIFY | Data Entry / Import |
|
| 53 |
+
| 8 | `task_8` | Balance Sheet Validation & Indicators | Hard | MODIFY | Validation, Calculation |
|
| 54 |
+
| 9 | `task_9` | Create Scenario3 Worksheet | Hard | MODIFY | Financial Modeling |
|
| 55 |
+
| 10 | `task_10` | Consolidate by Type and Area | Hard | MODIFY | Multi-type |
|
| 56 |
+
|
| 57 |
+
### Difficulty Progression
|
| 58 |
+
|
| 59 |
+
- **Easy (3 tasks):** QA — read the spreadsheet and answer a question.
|
| 60 |
+
- **Medium (4 tasks):** MODIFY — edit/augment the workbook (summaries, audits, formatting, data entry).
|
| 61 |
+
- **Hard (3 tasks):** MODIFY — complex multi-sheet operations (validation, new scenario sheets, consolidation).
|
| 62 |
+
|
| 63 |
+
## Action & Observation Spaces
|
| 64 |
+
|
| 65 |
+
### Action — `FinancialAction`
|
| 66 |
+
|
| 67 |
+
| Field | Type | Description |
|
| 68 |
+
|-------|------|-------------|
|
| 69 |
+
| `action_type` | `str` | `"code"` to execute Python, `"submit"` for text answer, `"submit_file"` for xlsx |
|
| 70 |
+
| `content` | `str` | Python code, text answer, or file path |
|
| 71 |
+
|
| 72 |
+
### Observation — `FinancialObservation`
|
| 73 |
+
|
| 74 |
+
| Field | Type | Description |
|
| 75 |
+
|-------|------|-------------|
|
| 76 |
+
| `task_id` | `str` | Current task identifier |
|
| 77 |
+
| `task_description` | `str` | Full task instructions + xlsx summary |
|
| 78 |
+
| `source_file` | `str` | Path to the working xlsx copy |
|
| 79 |
+
| `difficulty` | `str` | `easy`, `medium`, or `hard` |
|
| 80 |
+
| `task_type` | `str` | `QA` or `MODIFY` |
|
| 81 |
+
| `feedback` | `str` | Code output or grading result |
|
| 82 |
+
| `current_step` | `int` | Current step (max 15) |
|
| 83 |
+
| `done` | `bool` | Whether the episode is finished |
|
| 84 |
+
| `reward` | `float` | Reward for this step (0.0–1.0) |
|
| 85 |
+
|
| 86 |
+
## Reward Design
|
| 87 |
+
|
| 88 |
+
| Action | Reward | Signal |
|
| 89 |
+
|--------|--------|--------|
|
| 90 |
+
| `code` | 0.02 | Small reward for active exploration |
|
| 91 |
+
| `submit` / `submit_file` | 0.0–1.0 | Graded against reference |
|
| 92 |
+
| Max steps (15) | Episode ends | |
|
| 93 |
+
|
| 94 |
+
**QA grading:** Numeric extraction with 5% tolerance + keyword overlap.
|
| 95 |
+
**MODIFY grading:** 30% sheet-name match + 70% cell-level comparison (2% numeric tolerance).
|
| 96 |
+
|
| 97 |
+
## Setup & Usage
|
| 98 |
+
|
| 99 |
+
### Prerequisites
|
| 100 |
+
|
| 101 |
+
- Python 3.10+
|
| 102 |
+
- Docker (for containerized deployment)
|
| 103 |
+
- `pip install openenv-core openpyxl`
|
| 104 |
+
|
| 105 |
+
### Local Development
|
| 106 |
+
|
| 107 |
+
```bash
|
| 108 |
+
pip install -e ".[dev]"
|
| 109 |
+
PYTHONPATH=. uvicorn server.app:app --host 0.0.0.0 --port 8000 --reload
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
### Docker
|
| 113 |
+
|
| 114 |
+
```bash
|
| 115 |
+
docker build -t financial-task-env:latest .
|
| 116 |
+
docker run -p 8000:8000 financial-task-env:latest
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
### Baseline Inference
|
| 120 |
+
|
| 121 |
+
```bash
|
| 122 |
+
export API_BASE_URL="https://api.openai.com/v1"
|
| 123 |
+
export MODEL_NAME="gpt-4o-mini"
|
| 124 |
+
export HF_TOKEN="your-api-key"
|
| 125 |
+
export ENV_URL="http://localhost:8000"
|
| 126 |
+
python inference.py
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
## Baseline Scores
|
| 130 |
+
|
| 131 |
+
| Difficulty | Type | Expected Range |
|
| 132 |
+
|------------|------|---------------|
|
| 133 |
+
| Easy | QA | 0.60 – 1.00 |
|
| 134 |
+
| Medium | MODIFY | 0.30 – 0.80 |
|
| 135 |
+
| Hard | MODIFY | 0.10 – 0.60 |
|
| 136 |
+
|
| 137 |
+
## Project Structure
|
| 138 |
+
|
| 139 |
+
```
|
| 140 |
+
financial_task_env/
|
| 141 |
+
├── __init__.py # Module exports
|
| 142 |
+
├── models.py # FinancialAction & FinancialObservation
|
| 143 |
+
├── tasks.py # 10 task definitions + xlsx paths
|
| 144 |
+
├── graders.py # QA grading + xlsx cell comparison
|
| 145 |
+
├── client.py # FinancialTaskEnv (EnvClient)
|
| 146 |
+
├── inference.py # Baseline inference script
|
| 147 |
+
├── openenv.yaml # OpenEnv manifest
|
| 148 |
+
├── pyproject.toml # Dependencies
|
| 149 |
+
├── Dockerfile # Container image
|
| 150 |
+
├── data/ # xlsx source & reference files
|
| 151 |
+
│ ├── 0/ # Balance sheet validation
|
| 152 |
+
│ ├── 21/ # Data entry
|
| 153 |
+
│ ├── 24/ # Scenario modeling
|
| 154 |
+
│ ├── 34/ # Portfolio calculation
|
| 155 |
+
│ ├── 35/ # Pipeline imbalances
|
| 156 |
+
│ ├── 40/ # Formula audit
|
| 157 |
+
│ ├── 60/ # Table formatting
|
| 158 |
+
│ ├── 67/ # Consolidation
|
| 159 |
+
│ ├── 118/ # Value retrieval
|
| 160 |
+
│ └── 119/ # Plant counting
|
| 161 |
+
└── server/
|
| 162 |
+
├── __init__.py
|
| 163 |
+
├── financial_environment.py # Code-execution environment
|
| 164 |
+
├── app.py # FastAPI application
|
| 165 |
+
└── Dockerfile
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
## Environment Description
|
| 169 |
+
|
| 170 |
+
This environment models real financial spreadsheet work:
|
| 171 |
+
|
| 172 |
+
- **Data extraction** — read values from complex multi-sheet workbooks
|
| 173 |
+
- **Calculation** — compute portfolio changes, imbalances, indicators
|
| 174 |
+
- **Validation** — audit and fix formula errors in workbooks
|
| 175 |
+
- **Data entry** — add rows, enter values, format new columns
|
| 176 |
+
- **Structuring** — create tables, apply filters, build new worksheets
|
| 177 |
+
- **Financial modeling** — replicate scenario sheets with new parameters
|
| 178 |
+
- **Consolidation** — aggregate data across sheets into summary views
|
| 179 |
+
|
| 180 |
+
Each task uses a genuine enterprise Excel workbook. MODIFY tasks are graded
|
| 181 |
+
by cell-level comparison against a reference workbook.
|
__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Financial Task Environment — an OpenEnv environment for finance & accounting tasks.
|
| 2 |
+
|
| 3 |
+
Covers real-world enterprise workflows including data extraction,
|
| 4 |
+
ratio analysis, reconciliation, valuation, and consolidation.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from models import FinancialAction, FinancialObservation
|
| 8 |
+
from client import FinancialTaskEnv
|
| 9 |
+
|
| 10 |
+
__all__ = ["FinancialAction", "FinancialObservation", "FinancialTaskEnv"]
|
client.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Financial Task Environment client."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict
|
| 6 |
+
|
| 7 |
+
from openenv.core.env_client import EnvClient
|
| 8 |
+
from openenv.core.client_types import StepResult, StateT
|
| 9 |
+
|
| 10 |
+
from models import FinancialAction, FinancialObservation
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class FinancialTaskEnv(EnvClient["FinancialAction", "FinancialObservation", StateT]):
    """Client for connecting to a Financial Task Environment server.

    Example (async)::

        async with FinancialTaskEnv(base_url="http://localhost:8000") as env:
            result = await env.reset(task_id="task_1")
            print(result.observation.task_description)
            result = await env.step(FinancialAction(action_type="submit", content="42"))
            print(result.reward)

    Example (sync)::

        with FinancialTaskEnv(base_url="http://localhost:8000").sync() as env:
            result = env.reset(task_id="task_1")
            result = env.step(FinancialAction(action_type="submit", content="42"))
    """

    def _step_payload(self, action: FinancialAction) -> Dict[str, Any]:
        """Serialize *action* into the JSON-compatible dict sent to the server."""
        return action.model_dump()

    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[FinancialObservation]:
        """Convert a server response payload into a ``StepResult``.

        Two payload shapes are accepted:

        * an envelope ``{"observation": {...}, "reward": ..., "done": ...}``
          (the shape the baseline inference script unwraps), or
        * a flat dict holding the observation fields directly.

        A non-numeric reward is reported as ``0.0``.
        """
        nested = payload.get("observation")
        if isinstance(nested, dict):
            fields = dict(nested)
            # The envelope carries reward/done next to the observation; copy
            # them in unless the nested dict already provides its own values.
            for key in ("reward", "done"):
                if payload.get(key) is not None:
                    fields.setdefault(key, payload[key])
        else:
            fields = payload

        obs = FinancialObservation(**fields)
        reward = obs.reward if isinstance(obs.reward, (int, float)) else 0.0
        return StepResult(
            observation=obs,
            reward=reward,
            done=obs.done,
        )

    def _parse_state(self, payload: Dict[str, Any]) -> Any:
        """Build the server-side ``State`` object from *payload*."""
        # Imported lazily, as in the original implementation.
        from openenv.core.env_server.types import State

        return State(**payload)
|
data/0/0_ref_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc273e786bfae6a6bb70f9fac0a5663fcb788344ce0faaec4d5c02392bd7d646
|
| 3 |
+
size 80606
|
data/0/0_src_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:40397df4ff9ae47d84f071ca01886edcac90a28c2bc0aeb18556264222de807b
|
| 3 |
+
size 79613
|
data/118/118_src_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa0b8b943ab728e4b3be39178f4e8b05fd71095706b1e0520149e022c1e40c3f
|
| 3 |
+
size 131652
|
data/119/119_src_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3cf63aa836a798cd4f41836ecf4f46c6bd15e5fb3b2025013ed925fea40d47d
|
| 3 |
+
size 40000
|
data/21/21_ref_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01f24e077eda9026d5a144cd3b894881f2c57d508c0e4d3d2c3427e42b55eddc
|
| 3 |
+
size 30038
|
data/21/21_src_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1471369e7905b17e465ee7c64864084af2d8535995880b869a637bb67681aaf5
|
| 3 |
+
size 29119
|
data/24/24_ref_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f7108dce2da312f65995628a7bc047fbc360e07fca93e04b9cc9eb25ebae34ea
|
| 3 |
+
size 76723
|
data/24/24_src_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b35cede39defec37318688269cbbc9b1ee30231ab477fa5731dfe0bfc4cb1df0
|
| 3 |
+
size 52512
|
data/34/34_src_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:839721238c41b2936c6f11f6e200bf715cd159181fc229e2b04a9c0de0f3fc7c
|
| 3 |
+
size 49644
|
data/35/35_ref_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6a343703ee8e5e2c9152552d34f92888060d97943dc3b80a549324d3418a043
|
| 3 |
+
size 275054
|
data/35/35_src_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2b53bc2e870425d6170250882fa4b46304d666e818cd4eac555dff1be9e02e4
|
| 3 |
+
size 273725
|
data/40/40_ref_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ba5ba16d27462a8a073615db567880b90135af68ebde8493767104eb50fceb5
|
| 3 |
+
size 886419
|
data/40/40_src_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ddefa7bc75f270d9e413453b8f5f7ae1365f4eb6fa251a947b3e3642bc7f3a0f
|
| 3 |
+
size 174723
|
data/60/60_ref_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:513c0f30941d2002bd2c0142e318132b7ccf5c2868b1cb8cf4a673ef2514307a
|
| 3 |
+
size 48648
|
data/60/60_src_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf415dfb4b6683eb90fdbe266e69b45c3ec4c6667b28dbe7768aaccaf7cb0274
|
| 3 |
+
size 43324
|
data/67/67_ref_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:567a1d5ea8904ef8e3fbbe4282550b100c883400830d9ed2b9b3fcc6d35b6e7d
|
| 3 |
+
size 549697
|
data/67/67_src_0.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:36336d97a8e2450d722dadf259e434a6dbe93a4d53a2b1e3c6c0bdc4c289e338
|
| 3 |
+
size 537438
|
graders.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Grading functions for the Financial Task Environment.
|
| 2 |
+
|
| 3 |
+
Two grading modes:
|
| 4 |
+
1. QA tasks — compare agent text answer against reference text
|
| 5 |
+
(numeric extraction + keyword matching)
|
| 6 |
+
2. MODIFY tasks — compare agent-produced xlsx against reference xlsx
|
| 7 |
+
(cell-level comparison with tolerance)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import re
|
| 13 |
+
import traceback
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Any, Dict, List, Optional, Sequence
|
| 16 |
+
|
| 17 |
+
import openpyxl
|
| 18 |
+
from openpyxl.utils import get_column_letter
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
# Numeric helpers
|
| 23 |
+
# ---------------------------------------------------------------------------
|
| 24 |
+
|
| 25 |
+
def _extract_numbers(text: str) -> List[float]:
|
| 26 |
+
"""Extract all numeric values from text, handling commas, $, %."""
|
| 27 |
+
cleaned = text.replace("$", "").replace("€", "").replace("£", "")
|
| 28 |
+
pattern = r"-?\d{1,3}(?:,\d{3})*(?:\.\d+)?|-?\d+(?:\.\d+)?"
|
| 29 |
+
raw = re.findall(pattern, cleaned)
|
| 30 |
+
results: List[float] = []
|
| 31 |
+
for r in raw:
|
| 32 |
+
try:
|
| 33 |
+
results.append(float(r.replace(",", "")))
|
| 34 |
+
except ValueError:
|
| 35 |
+
continue
|
| 36 |
+
return results
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _number_close(actual: float, expected: float, rel_tol: float = 0.05) -> bool:
|
| 40 |
+
if expected == 0:
|
| 41 |
+
return abs(actual) < 1e-6
|
| 42 |
+
return abs(actual - expected) / abs(expected) <= rel_tol
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _best_number_match(numbers: List[float], target: float, rel_tol: float = 0.05) -> bool:
|
| 46 |
+
return any(_number_close(n, target, rel_tol) for n in numbers)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
# QA grading (text answer)
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
|
| 53 |
+
def grade_qa(answer: str, reference_answer: str) -> float:
    """Score a free-text *answer* against *reference_answer* (0.0–1.0).

    * A blank answer scores 0.0.
    * If the reference contains numbers, the numeric score is the fraction
      of reference numbers found in the answer (default tolerance of
      ``_best_number_match``). Matching all of them gives 1.0 outright;
      otherwise the result is 0.8 * numeric + 0.2 * keyword overlap.
    * If the reference has no numbers, the keyword overlap alone is used.

    Keywords are lower-cased runs of three or more ASCII letters.
    """
    if not answer.strip():
        return 0.0

    ref_nums = _extract_numbers(reference_answer)
    ans_nums = _extract_numbers(answer)

    # Keyword overlap: share of reference words that also occur in the answer.
    word_pattern = r"[a-zA-Z]{3,}"
    ref_words = set(re.findall(word_pattern, reference_answer.lower()))
    ans_words = set(re.findall(word_pattern, answer.lower()))
    kw_score = len(ref_words & ans_words) / len(ref_words) if ref_words else 0.0

    if not ref_nums:
        return round(kw_score, 4)

    # Numbers matter more than wording for financial answers.
    hits = [ref for ref in ref_nums if _best_number_match(ans_nums, ref)]
    num_score = len(hits) / len(ref_nums)
    if num_score >= 1.0:
        return 1.0
    return round(min(0.8 * num_score + 0.2 * kw_score, 1.0), 4)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
# MODIFY grading (xlsx comparison)
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
|
| 90 |
+
def _load_wb_values(path: str):
    """Load a workbook and collect its non-empty cell values.

    The workbook is opened with ``data_only=True``, so formula cells yield
    the value stored in the file rather than the formula text.

    Args:
        path: Path of the xlsx file to read.

    Returns:
        A ``(cells, sheets)`` tuple. ``cells`` maps
        ``(sheet_name, row, column)`` to the cell value for every cell whose
        value is not None. ``sheets`` is the set of sheet names.
    """
    wb = openpyxl.load_workbook(path, data_only=True)
    cells = {}
    sheets = set()
    try:
        for name in wb.sheetnames:
            sheets.add(name)
            ws = wb[name]
            for row in ws.iter_rows():
                for cell in row:
                    if cell.value is not None:
                        cells[(name, cell.row, cell.column)] = cell.value
    finally:
        # Release the workbook even if reading a sheet raises.
        wb.close()
    return cells, sheets
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def grade_xlsx(output_path: str, reference_path: str) -> float:
    """Score the workbook at *output_path* against *reference_path* (0.0–1.0).

    Weighting:
    - 30% sheet-level: fraction of reference sheet names present in the output.
    - 70% cell-level: fraction of non-empty reference cells whose output
      counterpart matches, either exactly, numerically within 2%, or as
      case-insensitive stripped strings.

    Any failure while loading either workbook yields 0.0.
    """
    try:
        ref_cells, ref_sheets = _load_wb_values(reference_path)
        out_cells, out_sheets = _load_wb_values(output_path)
    except Exception:
        return 0.0

    # --- Sheet score (30%) ---
    sheet_score = len(ref_sheets & out_sheets) / len(ref_sheets) if ref_sheets else 1.0

    # --- Cell score (70%) ---
    if not ref_cells:
        # Nothing to compare: the cell component counts as fully satisfied.
        return round(0.3 * sheet_score + 0.7 * 1.0, 4)

    def _cell_matches(ref_val, out_val) -> bool:
        """Apply the exact, numeric, then string comparison to one cell."""
        if out_val is None:
            return False
        if ref_val == out_val:
            return True
        # Numeric tolerance (also covers numeric-looking strings).
        try:
            rv = float(ref_val)
            ov = float(out_val)
            if _number_close(ov, rv, rel_tol=0.02):
                return True
        except (ValueError, TypeError):
            pass
        # String comparison (case-insensitive, surrounding whitespace ignored).
        try:
            return str(ref_val).strip().lower() == str(out_val).strip().lower()
        except Exception:
            return False

    matched = sum(
        1 for key, ref_val in ref_cells.items() if _cell_matches(ref_val, out_cells.get(key))
    )
    cell_score = matched / len(ref_cells)

    return round(0.3 * sheet_score + 0.7 * cell_score, 4)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# ---------------------------------------------------------------------------
|
| 161 |
+
# Dispatcher
|
| 162 |
+
# ---------------------------------------------------------------------------
|
| 163 |
+
|
| 164 |
+
def grade_task(task: Dict[str, Any], answer: str = "", output_path: str = "") -> float:
    """Dispatch grading by ``task["task_type"]``. Returns 0.0–1.0.

    QA tasks (the default when the key is absent) compare *answer* with
    ``task["reference_answer"]``. MODIFY tasks compare the workbook at
    *output_path* with ``task["reference_file"]`` and score 0.0 when either
    path is empty or does not exist. Any other task type scores 0.0.
    """
    kind = task.get("task_type", "QA")

    if kind == "QA":
        return grade_qa(answer, task.get("reference_answer", ""))

    if kind != "MODIFY":
        return 0.0

    reference_path = task.get("reference_file", "")
    if not (output_path and reference_path):
        return 0.0
    if not (Path(output_path).exists() and Path(reference_path).exists()):
        return 0.0
    return grade_xlsx(output_path, reference_path)
|
inference.py
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Baseline inference script for the Financial Task Environment.
|
| 3 |
+
|
| 4 |
+
Runs an LLM agent against the tasks listed in TASK_IDS (a subset of the 10 available tasks). The agent generates Python code
|
| 5 |
+
to read/modify Excel workbooks, then submits answers or modified files.
|
| 6 |
+
|
| 7 |
+
Uses WebSocket for persistent sessions (HTTP endpoints are stateless).
|
| 8 |
+
|
| 9 |
+
Environment variables
|
| 10 |
+
─────────────────────
|
| 11 |
+
API_BASE_URL LLM API endpoint (default: https://router.huggingface.co/v1)
|
| 12 |
+
MODEL_NAME Model identifier (default: MiniMaxAI/MiniMax-M2.5)
|
| 13 |
+
HF_TOKEN Hugging Face / API key (required; API_KEY is used as a fallback)
|
| 14 |
+
ENV_URL Environment server URL (default: http://localhost:8000)
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import asyncio
|
| 20 |
+
import json
|
| 21 |
+
import os
|
| 22 |
+
import re
|
| 23 |
+
import sys
|
| 24 |
+
import textwrap
|
| 25 |
+
from typing import Any, Dict, List, Optional
|
| 26 |
+
|
| 27 |
+
from openai import OpenAI
|
| 28 |
+
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
# Configuration from environment
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
|
| 33 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 34 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "MiniMaxAI/MiniMax-M2.5")
|
| 35 |
+
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("API_KEY")
|
| 36 |
+
ENV_URL = os.environ.get("ENV_URL", "http://localhost:8000")
|
| 37 |
+
|
| 38 |
+
BENCHMARK = "financial_task_env"
|
| 39 |
+
MAX_STEPS = 10
|
| 40 |
+
TEMPERATURE = 0.0
|
| 41 |
+
MAX_TOKENS = 12000
|
| 42 |
+
|
| 43 |
+
TASK_IDS = [
|
| 44 |
+
"task_1", "task_2", "task_3", # easy (QA)
|
| 45 |
+
"task_5", "task_8", # medium + hard (MODIFY)
|
| 46 |
+
]
|
| 47 |
+
|
| 48 |
+
SYSTEM_PROMPT = textwrap.dedent("""\
|
| 49 |
+
You are an expert financial analyst and Python programmer.
|
| 50 |
+
You are working with a real Excel workbook. The file path is given to you.
|
| 51 |
+
|
| 52 |
+
CRITICAL RULES:
|
| 53 |
+
1. Do NOT call reset(). Just write plain Python code.
|
| 54 |
+
2. Use the EXACT file path provided. Do not guess paths.
|
| 55 |
+
3. Each code block runs in a FRESH subprocess — you must re-import and re-open
|
| 56 |
+
the workbook every time. Variables do NOT persist between steps.
|
| 57 |
+
4. Use print() liberally to see data. Read the output carefully before your next step.
|
| 58 |
+
5. You have limited steps. Be efficient — explore in step 1, solve in step 2-3, submit.
|
| 59 |
+
|
| 60 |
+
RESPONSE FORMAT — use EXACTLY one of:
|
| 61 |
+
|
| 62 |
+
To run Python code:
|
| 63 |
+
```python
|
| 64 |
+
your code here
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
To submit a text answer (QA tasks):
|
| 68 |
+
SUBMIT_ANSWER: your answer here
|
| 69 |
+
|
| 70 |
+
To submit a modified file (MODIFY tasks):
|
| 71 |
+
SUBMIT_FILE: /path/to/saved.xlsx
|
| 72 |
+
|
| 73 |
+
STRATEGY:
|
| 74 |
+
- Step 1: Run code to explore the spreadsheet structure and data
|
| 75 |
+
- Step 2-3: Run code to compute the answer or make modifications
|
| 76 |
+
- Then SUBMIT immediately. Do not waste steps.
|
| 77 |
+
|
| 78 |
+
For MODIFY tasks: load the workbook, make changes, save it back to the SAME path,
|
| 79 |
+
then use SUBMIT_FILE with that path.
|
| 80 |
+
""")
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ---------------------------------------------------------------------------
|
| 84 |
+
# Logging helpers (strict hackathon format)
|
| 85 |
+
# ---------------------------------------------------------------------------
|
| 86 |
+
|
| 87 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Print the single-line ``[START]`` record for a task run.

    Args:
        task: Task identifier being run.
        env: Benchmark / environment name.
        model: Model name used for inference.
    """
    print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print the single-line ``[STEP]`` record for one environment step.

    Args:
        step: 1-based step number.
        action: Description of the action; truncated to 120 characters and
            flattened onto one line.
        reward: Reward for the step, printed with two decimals.
        done: Whether the episode ended on this step.
        error: Error text, or ``None``/empty for no error (printed as ``none``).
    """
    done_val = str(done).lower()
    if error:
        # Flatten line breaks so a multi-line exception message cannot split
        # the record across several lines (the action is flattened the same way).
        error_val = str(error).lower().replace("\r", " ").replace("\n", " ")
    else:
        error_val = "none"
    short_action = action[:120].replace("\n", " ")
    print(
        f"[STEP] step={step} action={short_action} reward={reward:.2f} done={done_val} error={error_val}",
        flush=True,
    )
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the single-line ``[END]`` record that closes a task run.

    Args:
        success: Whether the run counted as successful.
        steps: Number of steps taken.
        score: Final score, printed with three decimals.
        rewards: Per-step rewards, printed comma-separated with two decimals.
    """
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# ---------------------------------------------------------------------------
|
| 107 |
+
# WebSocket environment interaction
|
| 108 |
+
# ---------------------------------------------------------------------------
|
| 109 |
+
|
| 110 |
+
async def ws_send_recv(ws, message: dict) -> dict:
    """Send a message and receive a response over WebSocket.

    Args:
        ws: Connection object exposing awaitable ``send`` and ``recv``.
        message: JSON-serialisable payload to send.

    Returns:
        The decoded JSON response.

    Raises:
        RuntimeError: If the response has ``"type": "error"``.
    """
    await ws.send(json.dumps(message))
    resp = json.loads(await ws.recv())
    if resp.get("type") == "error":
        # "data" may be present but null (or not an object); don't let that
        # mask the server error with an AttributeError.
        detail = resp.get("data") or {}
        reason = detail.get("message", "unknown") if isinstance(detail, dict) else str(detail)
        raise RuntimeError(f"Server error: {reason}")
    return resp
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
async def ws_reset(ws, task_id: str) -> dict:
    """Reset the environment via WebSocket.

    Args:
        ws: Open WebSocket connection.
        task_id: Identifier of the task to load.

    Returns:
        A dict with keys ``observation``, ``reward`` and ``done``. When the
        response has no nested ``observation`` the whole ``data`` payload is
        used as the observation.
    """
    resp = await ws_send_recv(ws, {"type": "reset", "data": {"task_id": task_id}})
    # Treat a missing or null "data" the same way so the lookups below are safe.
    data = resp.get("data") or {}
    obs = data.get("observation", data)
    return {
        "observation": obs,
        "reward": data.get("reward", 0.0),
        "done": data.get("done", False),
    }
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
async def ws_step(ws, action_type: str, content: str) -> dict:
    """Execute a step via WebSocket.

    Args:
        ws: Open WebSocket connection.
        action_type: Action kind sent to the server (this script sends
            ``code``, ``submit`` or ``submit_file``).
        content: Action payload (code, answer text, or file path).

    Returns:
        A dict with keys ``observation``, ``reward`` and ``done``. When the
        response has no nested ``observation`` the whole ``data`` payload is
        used as the observation.
    """
    resp = await ws_send_recv(ws, {
        "type": "step",
        "data": {"action_type": action_type, "content": content},
    })
    # Treat a missing or null "data" the same way so the lookups below are safe.
    data = resp.get("data") or {}
    obs = data.get("observation", data)
    return {
        "observation": obs,
        "reward": data.get("reward", 0.0),
        "done": data.get("done", False),
    }
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# ---------------------------------------------------------------------------
|
| 147 |
+
# LLM interaction
|
| 148 |
+
# ---------------------------------------------------------------------------
|
| 149 |
+
|
| 150 |
+
def get_model_response(client: OpenAI, messages: List[Dict[str, str]]) -> str:
    """Request one non-streaming chat completion and return its text.

    Args:
        client: OpenAI-compatible client.
        messages: Chat history as ``{"role": ..., "content": ...}`` dicts.

    Returns:
        The stripped message content, or ``""`` when the content is empty or
        the request fails. Failures are reported on stdout, not raised, so
        the caller can end the episode cleanly.
    """
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        return (completion.choices[0].message.content or "").strip()
    except Exception as exc:
        # Deliberate best-effort boundary: any failure becomes an empty reply.
        print(f"[DEBUG] Model request failed: {exc}", flush=True)
        return ""
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def extract_action(response: str) -> tuple[str, str]:
    """Parse model response into (action_type, content).

    Checks are applied in this order:

    1. ``SUBMIT_ANSWER:`` -> ``("submit", <everything after the marker>)``.
    2. ``SUBMIT_FILE:`` -> ``("submit_file", <path>)``, where the path is the
       first line after the marker with surrounding quotes/backticks removed.
    3. A fenced ``python`` block -> ``("code", <block body>)``.
    4. An untagged fenced block that mentions ``import``, ``openpyxl`` or
       ``print`` -> ``("code", <block body>)``.
    5. Unfenced text that starts with ``import `` or mentions ``openpyxl``
       -> ``("code", <whole response>)``.
    6. Anything else -> ``("submit", <whole response>)``.
    """
    if "SUBMIT_ANSWER:" in response:
        answer = response.split("SUBMIT_ANSWER:", 1)[1].strip()
        return "submit", answer
    if "SUBMIT_FILE:" in response:
        remainder = response.split("SUBMIT_FILE:", 1)[1].strip()
        # Keep only the first line: models often append commentary after the
        # path, which would otherwise be sent as part of the file name.
        first_line = remainder.splitlines()[0] if remainder else ""
        path = first_line.strip().strip("`'\"").strip()
        return "submit_file", path

    # Extract code block
    m = re.search(r"```python\s*\n(.*?)```", response, re.DOTALL)
    if m:
        return "code", m.group(1).strip()
    m = re.search(r"```\s*\n(.*?)```", response, re.DOTALL)
    if m:
        code = m.group(1).strip()
        if "import" in code or "openpyxl" in code or "print" in code:
            return "code", code

    # Fallback: if it looks like code, treat as code
    if response.strip().startswith("import ") or "openpyxl" in response:
        return "code", response.strip()

    # Otherwise treat as text answer
    return "submit", response.strip()
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
# ---------------------------------------------------------------------------
|
| 193 |
+
# Main loop
|
| 194 |
+
# ---------------------------------------------------------------------------
|
| 195 |
+
|
| 196 |
+
def _to_ws_url(http_url: str) -> str:
    """Convert http(s):// URL to ws(s):// URL.

    Only a leading scheme is rewritten, so a URL that merely contains
    ``http://`` later on (e.g. in a query string) is left intact. Trailing
    slashes are dropped so that appending ``/ws`` does not yield ``//ws``.
    URLs with any other scheme are returned unchanged apart from that
    trimming.
    """
    url = http_url.strip().rstrip("/")
    if url.startswith("https://"):
        return "wss://" + url[len("https://"):]
    if url.startswith("http://"):
        return "ws://" + url[len("http://"):]
    return url
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
async def run_task(client: OpenAI, ws_url: str, task_id: str) -> float:
    """Run one task episode against the environment and return its score.

    Opens a WebSocket to ``{ws_url}/ws``, resets the environment for
    *task_id*, then alternates model calls and environment steps for up to
    ``MAX_STEPS`` steps. The score is the reward of the step on which the
    environment reports ``done``; it stays ``0.0`` if that never happens or
    if an exception is raised.

    Args:
        client: OpenAI-compatible client used for the model calls.
        ws_url: ``ws://`` or ``wss://`` base URL of the environment server.
        task_id: Identifier of the task to run.

    Returns:
        The final score for the task.
    """
    import websockets

    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    rewards: List[float] = []
    steps_taken = 0
    final_score = 0.0
    success = False

    try:
        async with websockets.connect(f"{ws_url}/ws", open_timeout=30, max_size=100 * 1024 * 1024) as ws:
            # Reset
            reset_data = await ws_reset(ws, task_id)
            obs = reset_data["observation"]
            task_desc = obs.get("task_description", "")
            feedback = obs.get("feedback", "")
            source_file = obs.get("source_file", "")
            task_type = obs.get("task_type", "QA")

            messages = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": (
                    f"{task_desc}\n\n"
                    f"Source file path: {source_file}\n"
                    f"Task type: {task_type}\n\n"
                    f"{feedback}"
                )},
            ]

            for step_num in range(1, MAX_STEPS + 1):
                response = get_model_response(client, messages)
                if not response:
                    # Empty reply (including a failed request): stop the episode.
                    break

                action_type, content = extract_action(response)
                messages.append({"role": "assistant", "content": response})

                step_data = await ws_step(ws, action_type, content)
                step_obs = step_data["observation"]
                reward = float(step_data.get("reward") or 0)
                done = step_data.get("done", False)
                step_feedback = step_obs.get("feedback", "")

                rewards.append(reward)
                steps_taken = step_num

                log_step(
                    step=step_num,
                    action=f"[{action_type}] {content[:80]}",
                    reward=reward,
                    done=done,
                    error=None,
                )

                if done:
                    final_score = reward
                    success = final_score >= 0.5
                    break

                # Feed the execution result back to the LLM
                remaining = MAX_STEPS - step_num
                urgency = ""
                if remaining <= 2:
                    urgency = f"\n\n⚠ Only {remaining} step(s) remaining! You MUST submit now."
                    if task_type == "QA":
                        urgency += " Use: SUBMIT_ANSWER: <your answer>"
                    else:
                        urgency += f" Save the file and use: SUBMIT_FILE: {source_file}"

                messages.append({"role": "user", "content": (
                    f"Code execution result (step {step_num}/{MAX_STEPS}):\n"
                    f"{step_feedback}\n\n"
                    f"Source file: {source_file}{urgency}"
                )})

            # Send close
            try:
                await ws.send(json.dumps({"type": "close"}))
            except Exception:
                # Best effort: the connection may already be gone.
                pass

    except Exception as exc:
        print(f"[DEBUG] Task {task_id} error: {exc}", flush=True)
        log_step(step=steps_taken + 1, action="error", reward=0.0, done=True, error=str(exc))

    log_end(success=success, steps=steps_taken, score=final_score, rewards=rewards)
    return final_score
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
async def async_main() -> None:
    """Run every task in ``TASK_IDS`` in order and print a score summary.

    Exits with status 1 when ``API_BASE_URL``, ``MODEL_NAME`` or ``HF_TOKEN``
    is unset/empty.
    """
    if not API_BASE_URL:
        print("ERROR: API_BASE_URL not set.", file=sys.stderr)
        sys.exit(1)
    if not MODEL_NAME:
        print("ERROR: MODEL_NAME not set.", file=sys.stderr)
        sys.exit(1)
    if not HF_TOKEN:
        print("ERROR: HF_TOKEN environment variable not set.", file=sys.stderr)
        sys.exit(1)

    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    ws_url = _to_ws_url(ENV_URL)
    all_scores: List[float] = []

    for task_id in TASK_IDS:
        print(f"\n{'='*60}\nRunning {task_id}...\n{'='*60}", flush=True)
        score = await run_task(client, ws_url, task_id)
        all_scores.append(score)
        print(f" -> {task_id} score: {score:.3f}", flush=True)

    avg = sum(all_scores) / len(all_scores) if all_scores else 0.0
    print(
        f"\n{'='*60}\nOVERALL AVERAGE SCORE: {avg:.3f}\n"
        f"Per-task: {[f'{s:.3f}' for s in all_scores]}\n{'='*60}",
        flush=True,
    )
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
def main() -> None:
    """Synchronous entry point: run ``async_main`` on a new event loop."""
    asyncio.run(async_main())
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
# Run the inference loop only when executed as a script.
if __name__ == "__main__":
    main()
|
models.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed Pydantic models for the Financial Task Environment."""
|
| 2 |
+
|
| 3 |
+
from typing import Any, Dict
|
| 4 |
+
|
| 5 |
+
from pydantic import Field
|
| 6 |
+
|
| 7 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class FinancialAction(Action):
    """Action model for the Financial Task Environment.

    Agents interact by executing Python code to read/modify xlsx files,
    or by submitting a text answer / file path.
    """

    action_type: str = Field(
        description="Action type: 'code' to execute Python, 'submit' for text answer, 'submit_file' for xlsx"
    )
    content: str = Field(
        description="Python code when action_type='code', text answer for 'submit', file path for 'submit_file'"
    )
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class FinancialObservation(Observation):
    """Observation model for the Financial Task Environment.

    Contains the task description, financial data, and feedback from
    the environment after each action.
    """

    task_id: str = Field(default="", description="Current task identifier")
    task_description: str = Field(default="", description="Task instructions")
    financial_data: str = Field(default="", description="Financial data / xlsx summary")
    difficulty: str = Field(default="", description="Task difficulty: easy, medium, or hard")
    feedback: str = Field(default="", description="Feedback on the last action taken")
    current_step: int = Field(default=0, description="Current step number in the episode")
    max_steps: int = Field(default=15, description="Maximum steps allowed per episode")
    task_type: str = Field(default="", description="Type of financial task: QA or MODIFY")
    source_file: str = Field(default="", description="Path to the working xlsx file")
    available_tasks: str = Field(
        default="",
        description="Comma-separated list of available task IDs (shown on reset)",
    )
|
openenv.yaml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: financial_task_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
| 7 |
+
|
| 8 |
+
tasks:
|
| 9 |
+
- id: task_1
|
| 10 |
+
name: Count Plants in Spreadsheet
|
| 11 |
+
difficulty: easy
|
| 12 |
+
max_steps: 15
|
| 13 |
+
grader:
|
| 14 |
+
type: programmatic
|
| 15 |
+
description: "QA grading — extracts numbers from agent answer, compares against reference (85). Score 0.0–1.0 based on numeric match with 5% tolerance."
|
| 16 |
+
|
| 17 |
+
- id: task_2
|
| 18 |
+
name: Retrieve TW EOL Charge
|
| 19 |
+
difficulty: easy
|
| 20 |
+
max_steps: 15
|
| 21 |
+
grader:
|
| 22 |
+
type: programmatic
|
| 23 |
+
description: "QA grading — extracts numbers from agent answer, compares against reference (113291). Score 0.0–1.0 based on numeric match with 5% tolerance."
|
| 24 |
+
|
| 25 |
+
- id: task_3
|
| 26 |
+
name: Portfolio Mark-to-Market Change
|
| 27 |
+
difficulty: easy
|
| 28 |
+
max_steps: 15
|
| 29 |
+
grader:
|
| 30 |
+
type: programmatic
|
| 31 |
+
description: "QA grading — extracts numbers from agent answer, compares against reference values ($1,989,600 and 27.9%). Score 0.0–1.0 based on numeric match + keyword overlap."
|
| 32 |
+
|
| 33 |
+
- id: task_5
|
| 34 |
+
name: Audit and Correct Formula Errors
|
| 35 |
+
difficulty: medium
|
| 36 |
+
max_steps: 15
|
| 37 |
+
grader:
|
| 38 |
+
type: programmatic
|
| 39 |
+
description: "MODIFY grading — compares agent output xlsx cell-by-cell against reference workbook. 30% sheet-name match + 70% cell-level match (2% numeric tolerance). Score 0.0–1.0."
|
| 40 |
+
|
| 41 |
+
- id: task_8
|
| 42 |
+
name: Balance Sheet Validation and Indicators
|
| 43 |
+
difficulty: hard
|
| 44 |
+
max_steps: 15
|
| 45 |
+
grader:
|
| 46 |
+
type: programmatic
|
| 47 |
+
description: "MODIFY grading — compares agent output xlsx cell-by-cell against reference workbook. 30% sheet-name match + 70% cell-level match (2% numeric tolerance). Score 0.0–1.0."
|
pyproject.toml
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "financial-task-env"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "OpenEnv environment for real-world finance & accounting tasks"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
license = {text = "MIT"}
|
| 11 |
+
requires-python = ">=3.10"
|
| 12 |
+
dependencies = [
|
| 13 |
+
"openenv-core>=0.2.0",
|
| 14 |
+
"fastapi>=0.104.0",
|
| 15 |
+
"uvicorn>=0.24.0",
|
| 16 |
+
"pydantic>=2.0.0",
|
| 17 |
+
"websockets>=12.0",
|
| 18 |
+
"openpyxl>=3.1.0",
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
[project.optional-dependencies]
|
| 22 |
+
dev = [
|
| 23 |
+
"pytest",
|
| 24 |
+
"httpx",
|
| 25 |
+
"openai",
|
| 26 |
+
]
|
| 27 |
+
inference = [
|
| 28 |
+
"openai",
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
[project.scripts]
|
| 32 |
+
server = "server.app:main"
|
| 33 |
+
|
| 34 |
+
[tool.hatch.build.targets.wheel]
|
| 35 |
+
packages = ["."]
|
server/Dockerfile
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app/env
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 7 |
+
git curl \
|
| 8 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 9 |
+
|
| 10 |
+
# Install uv for fast dependency management
|
| 11 |
+
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 12 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 13 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx
|
| 14 |
+
|
| 15 |
+
# Copy dependency files first for better caching
|
| 16 |
+
COPY pyproject.toml /app/env/
|
| 17 |
+
|
| 18 |
+
# Install dependencies
|
| 19 |
+
RUN uv pip install --system --no-cache-dir \
|
| 20 |
+
"openenv-core>=0.2.0" \
|
| 21 |
+
"fastapi>=0.104.0" \
|
| 22 |
+
"uvicorn>=0.24.0" \
|
| 23 |
+
"pydantic>=2.0.0" \
|
| 24 |
+
"websockets>=12.0" \
|
| 25 |
+
"openpyxl>=3.1.0"
|
| 26 |
+
|
| 27 |
+
# Copy environment code
|
| 28 |
+
COPY . /app/env/
|
| 29 |
+
|
| 30 |
+
# Set PYTHONPATH so imports work
|
| 31 |
+
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 32 |
+
|
| 33 |
+
# Health check
|
| 34 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
|
| 35 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
|
| 36 |
+
|
| 37 |
+
EXPOSE 8000
|
| 38 |
+
|
| 39 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Financial Task Environment — server-side implementation."""
|
server/app.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application for the Financial Task Environment."""
|
| 2 |
+
|
| 3 |
+
from openenv.core.env_server.http_server import create_app
|
| 4 |
+
|
| 5 |
+
from models import FinancialAction, FinancialObservation
|
| 6 |
+
from server.financial_environment import FinancialEnvironment
|
| 7 |
+
|
| 8 |
+
# ASGI application object; referenced as "server.app:app" by the uvicorn
# command line and by openenv.yaml.
app = create_app(
    FinancialEnvironment,
    FinancialAction,
    FinancialObservation,
    env_name="financial_task_env",
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def main() -> None:
    """Entry point for direct execution.

    Serves ``app`` with uvicorn on all interfaces, port 8000.
    """
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Start the server only when this module is executed as a script.
if __name__ == "__main__":
    main()
|
server/financial_environment.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Financial Task Environment — core environment logic.
|
| 2 |
+
|
| 3 |
+
A code-execution environment where the agent writes Python code (using openpyxl)
|
| 4 |
+
to read, analyze, and modify real Excel workbooks from enterprise finance workflows.
|
| 5 |
+
|
| 6 |
+
For QA tasks: the agent reads the xlsx and submits a text answer.
|
| 7 |
+
For MODIFY tasks: the agent writes code that modifies the xlsx, then the result
|
| 8 |
+
is compared cell-by-cell against a reference workbook.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import io
|
| 14 |
+
import openpyxl
|
| 15 |
+
import os
|
| 16 |
+
import shutil
|
| 17 |
+
import subprocess
|
| 18 |
+
import sys
|
| 19 |
+
import tempfile
|
| 20 |
+
import traceback
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
from typing import Any, Optional
|
| 23 |
+
from uuid import uuid4
|
| 24 |
+
|
| 25 |
+
from openenv.core.env_server.interfaces import Environment
|
| 26 |
+
from openenv.core.env_server.types import Observation, State
|
| 27 |
+
|
| 28 |
+
from models import FinancialAction, FinancialObservation
|
| 29 |
+
from tasks import TASKS, TASK_IDS, get_task
|
| 30 |
+
from graders import grade_task
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class FinancialEnvironment(Environment):
|
| 34 |
+
"""OpenEnv environment for financial spreadsheet tasks with code execution.
|
| 35 |
+
|
| 36 |
+
Episode flow
|
| 37 |
+
────────────
|
| 38 |
+
1. ``reset(task_id="task_1")`` → observation with task info + xlsx summary.
|
| 39 |
+
2. ``step(action_type="code", content="import openpyxl; ...")`` → execute code, get stdout.
|
| 40 |
+
3. ``step(action_type="submit", content="answer text")`` → grade and end episode.
|
| 41 |
+
*or* for MODIFY tasks:
|
| 42 |
+
``step(action_type="submit_file", content="<path>")`` → grade xlsx and end.
|
| 43 |
+
|
| 44 |
+
The episode also ends when *max_steps* is reached.
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
MAX_STEPS = 15
|
| 48 |
+
|
| 49 |
+
def __init__(self) -> None:
|
| 50 |
+
super().__init__()
|
| 51 |
+
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 52 |
+
self._current_task: dict[str, Any] | None = None
|
| 53 |
+
self._done = False
|
| 54 |
+
self._cumulative_reward = 0.0
|
| 55 |
+
self._workdir: str | None = None
|
| 56 |
+
|
| 57 |
+
# ------------------------------------------------------------------
|
| 58 |
+
# reset
|
| 59 |
+
# ------------------------------------------------------------------
|
| 60 |
+
def reset(
|
| 61 |
+
self,
|
| 62 |
+
seed: Optional[int] = None,
|
| 63 |
+
episode_id: Optional[str] = None,
|
| 64 |
+
**kwargs: Any,
|
| 65 |
+
) -> FinancialObservation:
|
| 66 |
+
task_id: str = kwargs.get("task_id", "task_1")
|
| 67 |
+
self._current_task = get_task(task_id)
|
| 68 |
+
self._state = State(
|
| 69 |
+
episode_id=episode_id or str(uuid4()),
|
| 70 |
+
step_count=0,
|
| 71 |
+
)
|
| 72 |
+
self._done = False
|
| 73 |
+
self._cumulative_reward = 0.0
|
| 74 |
+
|
| 75 |
+
# Create a working directory and copy the source xlsx into it
|
| 76 |
+
self._workdir = tempfile.mkdtemp(prefix=f"financial_env_{task_id}_")
|
| 77 |
+
src = self._current_task.get("source_file", "")
|
| 78 |
+
if src and Path(src).exists():
|
| 79 |
+
shutil.copy2(src, self._workdir)
|
| 80 |
+
work_file = str(Path(self._workdir) / Path(src).name)
|
| 81 |
+
else:
|
| 82 |
+
work_file = ""
|
| 83 |
+
|
| 84 |
+
# Generate an xlsx summary to include in the observation
|
| 85 |
+
xlsx_summary = self._summarize_xlsx(work_file) if work_file else "No source file."
|
| 86 |
+
|
| 87 |
+
task = self._current_task
|
| 88 |
+
task_info = (
|
| 89 |
+
f"Task: {task['title']}\n"
|
| 90 |
+
f"Difficulty: {task['difficulty']}\n"
|
| 91 |
+
f"Type: {task['task_type']} ({task['category']})\n\n"
|
| 92 |
+
f"Instruction:\n{task['instruction']}\n"
|
| 93 |
+
)
|
| 94 |
+
if task.get("constraints"):
|
| 95 |
+
task_info += f"\nConstraints:\n{task['constraints']}\n"
|
| 96 |
+
task_info += (
|
| 97 |
+
f"\nSource file: {work_file}\n"
|
| 98 |
+
f"\nSpreadsheet Summary:\n{xlsx_summary}\n\n"
|
| 99 |
+
"Actions:\n"
|
| 100 |
+
" action_type='code' → Execute Python code (openpyxl available).\n"
|
| 101 |
+
" The working file path is in the source_file field.\n"
|
| 102 |
+
" action_type='submit' → Submit a text answer (QA tasks).\n"
|
| 103 |
+
" action_type='submit_file' → Submit a modified xlsx path (MODIFY tasks).\n"
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
return FinancialObservation(
|
| 107 |
+
done=False,
|
| 108 |
+
reward=0.0,
|
| 109 |
+
task_id=task["id"],
|
| 110 |
+
task_description=task_info,
|
| 111 |
+
financial_data=xlsx_summary,
|
| 112 |
+
difficulty=task["difficulty"],
|
| 113 |
+
task_type=task["task_type"],
|
| 114 |
+
feedback="Environment reset. Read the spreadsheet and task instructions carefully.",
|
| 115 |
+
current_step=0,
|
| 116 |
+
max_steps=self.MAX_STEPS,
|
| 117 |
+
available_tasks=",".join(TASK_IDS),
|
| 118 |
+
source_file=work_file,
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
# ------------------------------------------------------------------
|
| 122 |
+
# step
|
| 123 |
+
# ------------------------------------------------------------------
|
| 124 |
+
def step(
|
| 125 |
+
self,
|
| 126 |
+
action: FinancialAction,
|
| 127 |
+
timeout_s: Optional[float] = None,
|
| 128 |
+
**kwargs: Any,
|
| 129 |
+
) -> FinancialObservation:
|
| 130 |
+
self._state.step_count += 1
|
| 131 |
+
|
| 132 |
+
if self._current_task is None:
|
| 133 |
+
return self._obs(feedback="No task loaded. Call reset() first.", reward=0.0, done=True)
|
| 134 |
+
|
| 135 |
+
if self._done:
|
| 136 |
+
return self._obs(feedback="Episode already finished. Call reset().", reward=0.0, done=True)
|
| 137 |
+
|
| 138 |
+
action_type = action.action_type.strip().lower()
|
| 139 |
+
|
| 140 |
+
if action_type == "code":
|
| 141 |
+
return self._handle_code(action.content)
|
| 142 |
+
elif action_type == "submit":
|
| 143 |
+
return self._handle_submit_text(action.content)
|
| 144 |
+
elif action_type == "submit_file":
|
| 145 |
+
return self._handle_submit_file(action.content)
|
| 146 |
+
else:
|
| 147 |
+
return self._obs(
|
| 148 |
+
feedback=f"Unknown action_type '{action.action_type}'. Use 'code', 'submit', or 'submit_file'.",
|
| 149 |
+
reward=0.0, done=False,
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
# ------------------------------------------------------------------
|
| 153 |
+
# state property
|
| 154 |
+
# ------------------------------------------------------------------
|
| 155 |
+
@property
|
| 156 |
+
def state(self) -> State:
|
| 157 |
+
return self._state
|
| 158 |
+
|
| 159 |
+
# ------------------------------------------------------------------
|
| 160 |
+
# Code execution
|
| 161 |
+
# ------------------------------------------------------------------
|
| 162 |
+
def _handle_code(self, code: str) -> FinancialObservation:
|
| 163 |
+
"""Execute Python code in a subprocess and return stdout/stderr."""
|
| 164 |
+
if not self._workdir:
|
| 165 |
+
return self._obs(feedback="No working directory. Call reset() first.", reward=0.0, done=False)
|
| 166 |
+
|
| 167 |
+
# Small reward for taking an action
|
| 168 |
+
reward = 0.02
|
| 169 |
+
self._cumulative_reward += reward
|
| 170 |
+
|
| 171 |
+
try:
|
| 172 |
+
result = subprocess.run(
|
| 173 |
+
[sys.executable, "-c", code],
|
| 174 |
+
capture_output=True,
|
| 175 |
+
text=True,
|
| 176 |
+
timeout=30,
|
| 177 |
+
cwd=self._workdir,
|
| 178 |
+
env={**os.environ, "PYTHONDONTWRITEBYTECODE": "1"},
|
| 179 |
+
)
|
| 180 |
+
stdout = result.stdout[:4000] if result.stdout else ""
|
| 181 |
+
stderr = result.stderr[:2000] if result.stderr else ""
|
| 182 |
+
|
| 183 |
+
if result.returncode == 0:
|
| 184 |
+
feedback = f"Code executed successfully.\n\nSTDOUT:\n{stdout}"
|
| 185 |
+
if stderr:
|
| 186 |
+
feedback += f"\n\nSTDERR:\n{stderr}"
|
| 187 |
+
else:
|
| 188 |
+
feedback = f"Code execution failed (exit code {result.returncode}).\n\nSTDERR:\n{stderr}"
|
| 189 |
+
if stdout:
|
| 190 |
+
feedback += f"\n\nSTDOUT:\n{stdout}"
|
| 191 |
+
except subprocess.TimeoutExpired:
|
| 192 |
+
feedback = "Code execution timed out (30s limit)."
|
| 193 |
+
except Exception as e:
|
| 194 |
+
feedback = f"Code execution error: {e}"
|
| 195 |
+
|
| 196 |
+
at_limit = self._state.step_count >= self.MAX_STEPS
|
| 197 |
+
if at_limit:
|
| 198 |
+
self._done = True
|
| 199 |
+
feedback += "\n\n⚠ Maximum steps reached — episode ending."
|
| 200 |
+
|
| 201 |
+
return self._obs(feedback=feedback, reward=reward, done=at_limit)
|
| 202 |
+
|
| 203 |
+
# ------------------------------------------------------------------
|
| 204 |
+
# Submit handlers
|
| 205 |
+
# ------------------------------------------------------------------
|
| 206 |
+
def _handle_submit_text(self, answer: str) -> FinancialObservation:
|
| 207 |
+
"""Grade a text answer (for QA tasks)."""
|
| 208 |
+
task = self._current_task
|
| 209 |
+
assert task is not None
|
| 210 |
+
|
| 211 |
+
score = grade_task(task, answer=answer)
|
| 212 |
+
self._done = True
|
| 213 |
+
self._cumulative_reward += score
|
| 214 |
+
|
| 215 |
+
quality = "Excellent" if score >= 0.9 else "Good" if score >= 0.7 else "Partial" if score >= 0.4 else "Needs improvement"
|
| 216 |
+
return self._obs(
|
| 217 |
+
feedback=f"Answer graded. Score: {score:.2f}/1.00 — {quality}.\nCumulative reward: {self._cumulative_reward:.2f}",
|
| 218 |
+
reward=score, done=True,
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
def _handle_submit_file(self, file_path: str) -> FinancialObservation:
|
| 222 |
+
"""Grade a modified xlsx file (for MODIFY tasks)."""
|
| 223 |
+
task = self._current_task
|
| 224 |
+
assert task is not None
|
| 225 |
+
|
| 226 |
+
# Resolve relative paths against workdir
|
| 227 |
+
p = Path(file_path)
|
| 228 |
+
if not p.is_absolute() and self._workdir:
|
| 229 |
+
p = Path(self._workdir) / p
|
| 230 |
+
|
| 231 |
+
if not p.exists():
|
| 232 |
+
self._done = True
|
| 233 |
+
return self._obs(
|
| 234 |
+
feedback=f"File not found: {p}. Score: 0.00",
|
| 235 |
+
reward=0.0, done=True,
|
| 236 |
+
)
|
| 237 |
+
|
| 238 |
+
score = grade_task(task, output_path=str(p))
|
| 239 |
+
self._done = True
|
| 240 |
+
self._cumulative_reward += score
|
| 241 |
+
|
| 242 |
+
quality = "Excellent" if score >= 0.9 else "Good" if score >= 0.7 else "Partial" if score >= 0.4 else "Needs improvement"
|
| 243 |
+
return self._obs(
|
| 244 |
+
feedback=f"File graded. Score: {score:.2f}/1.00 — {quality}.\nCumulative reward: {self._cumulative_reward:.2f}",
|
| 245 |
+
reward=score, done=True,
|
| 246 |
+
)
|
| 247 |
+
|
| 248 |
+
# ------------------------------------------------------------------
|
| 249 |
+
# Helpers
|
| 250 |
+
# ------------------------------------------------------------------
|
| 251 |
+
def _summarize_xlsx(self, path: str) -> str:
|
| 252 |
+
"""Return a text summary of an xlsx file (sheet names, dimensions, sample data)."""
|
| 253 |
+
try:
|
| 254 |
+
wb = openpyxl.load_workbook(path, data_only=True, read_only=True)
|
| 255 |
+
lines = [f"Workbook: {Path(path).name}", f"Sheets: {wb.sheetnames}", ""]
|
| 256 |
+
for name in wb.sheetnames[:5]: # Limit to 5 sheets
|
| 257 |
+
ws = wb[name]
|
| 258 |
+
lines.append(f"--- Sheet: {name} (rows≈{ws.max_row}, cols≈{ws.max_column}) ---")
|
| 259 |
+
# Show first 8 rows
|
| 260 |
+
row_count = 0
|
| 261 |
+
for row in ws.iter_rows(max_row=8, values_only=True):
|
| 262 |
+
vals = [str(v)[:30] if v is not None else "" for v in row[:12]]
|
| 263 |
+
lines.append(" " + " | ".join(vals))
|
| 264 |
+
row_count += 1
|
| 265 |
+
if ws.max_row and ws.max_row > 8:
|
| 266 |
+
lines.append(f" ... ({ws.max_row - 8} more rows)")
|
| 267 |
+
lines.append("")
|
| 268 |
+
wb.close()
|
| 269 |
+
return "\n".join(lines)
|
| 270 |
+
except Exception as e:
|
| 271 |
+
return f"Could not read xlsx: {e}"
|
| 272 |
+
|
| 273 |
+
def _obs(self, *, feedback: str, reward: float, done: bool) -> FinancialObservation:
    """Assemble a ``FinancialObservation`` describing the current state.

    Task-related fields are taken from ``self._current_task`` and fall back
    to empty strings when no task is active or a key is absent.

    Args:
        feedback: Message to surface in the observation.
        reward: Reward value to report.
        done: Whether the episode is finished.

    Returns:
        The populated observation.
    """
    active = self._current_task or {}
    original_source = active.get("source_file")

    # The reported source file is the working directory joined with the base
    # name of the task's source file; it stays empty unless both are set.
    if self._workdir and original_source:
        reported_source = str(Path(self._workdir) / Path(original_source).name)
    else:
        reported_source = ""

    return FinancialObservation(
        done=done,
        reward=reward,
        task_id=active.get("id", ""),
        task_description=active.get("instruction", ""),
        financial_data="",
        difficulty=active.get("difficulty", ""),
        task_type=active.get("task_type", ""),
        feedback=feedback,
        current_step=self._state.step_count,
        max_steps=self.MAX_STEPS,
        available_tasks=",".join(TASK_IDS),
        source_file=reported_source,
    )
|
| 292 |
+
|
| 293 |
+
def close(self) -> None:
    """Clean up the temporary working directory.

    Deletes the directory tree at ``self._workdir`` (if one is set) and
    then resets ``self._workdir`` to ``None``. Calling it again, or when
    the directory has already disappeared, is a no-op.
    """
    workdir = self._workdir
    if workdir:
        # ignore_errors=True already tolerates a missing directory, so no
        # separate exists() check is needed (and none can go stale).
        shutil.rmtree(workdir, ignore_errors=True)
    # Always drop the reference so a stale path is never reused.
    self._workdir = None
|
tasks.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task definitions for the Financial Task Environment.
|
| 2 |
+
|
| 3 |
+
Contains 10 tasks backed by real Excel workbooks covering diverse enterprise
|
| 4 |
+
finance & accounting workflows (QA, calculation, validation, data entry,
|
| 5 |
+
formatting, modeling, consolidation). Each task ships a source .xlsx that
|
| 6 |
+
the agent must read or modify via Python code execution.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Any, Dict, List
|
| 14 |
+
|
| 15 |
+
# Base directory where xlsx files live. Each task's files sit in a
# sub-folder named after its "orig_id" (e.g. data/119/119_src_0.xlsx).
# The location can be overridden with the FINANCIAL_ENV_DATA_DIR environment
# variable; otherwise the "data" folder next to this module is used.
DATA_DIR = Path(os.environ.get("FINANCIAL_ENV_DATA_DIR", Path(__file__).parent / "data"))

# Registry of task definitions keyed by task id ("task_1", ...); filled in by
# the assignments further down in this module.
TASKS: Dict[str, Dict[str, Any]] = {}
|
| 19 |
+
|
| 20 |
+
# ---------------------------------------------------------------------------
|
| 21 |
+
# Helper to build source / reference paths
|
| 22 |
+
# ---------------------------------------------------------------------------
|
| 23 |
+
|
| 24 |
+
def _paths(task_id: str, src: str, ref: str | None = None):
    """Build the file-path entries for a task definition.

    Args:
        task_id: Name of the sub-folder of ``DATA_DIR`` holding the files.
        src: File name of the source workbook.
        ref: File name of the reference workbook; omitted when falsy.

    Returns:
        A dict with ``"source_file"`` and, only when ``ref`` is given,
        ``"reference_file"``, both as string paths.
    """
    folder = DATA_DIR / task_id
    resolved: Dict[str, Any] = {"source_file": str(folder / src)}
    if not ref:
        return resolved
    resolved["reference_file"] = str(folder / ref)
    return resolved
|
| 32 |
+
|
| 33 |
+
# ── EASY ──────────────────────────────────────────────────────────────────
# The easy tasks are question-answering ("QA") tasks: each one carries a
# textual "reference_answer" and a source workbook only (no reference file
# is passed to _paths, so no "reference_file" key is produced).

# Task 1 — QA: count rows (Calculation)
TASKS["task_1"] = {
    "id": "task_1",
    "orig_id": "119",  # same id names the data sub-folder passed to _paths
    "title": "Count Plants in Spreadsheet",
    "difficulty": "easy",
    "task_type": "QA",
    "category": "Calculation",
    "instruction": "How many plants are recorded in the spreadsheet?",
    "constraints": "",
    "reference_answer": "85",
    **_paths("119", "119_src_0.xlsx"),
}

# Task 2 — QA: value retrieval (Cross-sheet Retrieval)
TASKS["task_2"] = {
    "id": "task_2",
    "orig_id": "118",
    "title": "Retrieve TW EOL Charge",
    "difficulty": "easy",
    "task_type": "QA",
    "category": "Cross-sheet/file Retrieval",
    "instruction": "What is the TW EOL charge for 2002? Please provide just the amount.",
    "constraints": "",
    "reference_answer": "113291",
    **_paths("118", "118_src_0.xlsx"),
}

# Task 3 — QA: multi-step calculation (Calculation)
# Unlike tasks 1 and 2, the reference answer here is a full sentence holding
# two figures (a dollar total and a percentage split).
TASKS["task_3"] = {
    "id": "task_3",
    "orig_id": "34",
    "title": "Portfolio Mark-to-Market Change",
    "difficulty": "easy",
    "task_type": "QA",
    "category": "Calculation",
    "instruction": (
        "Assume the following changes occur in the Jul\u2013Dec 2002 market: "
        "Flat curve prices increase uniformly by $2/MWh; Peak 6x16 curve prices "
        "increase uniformly by $5/MWh; monthly contract volumes (Flat and Peak "
        "Total MWh) remain unchanged. Based on the 2002 table, calculate: "
        "(1) the total added value (mark-to-market change) for the combined "
        "Flat + Peak portfolio; and (2) what percentage of this added value "
        "comes from the Peak 6x16 contracts rather than the Flat contracts."
    ),
    "constraints": "",
    "reference_answer": (
        "The total added value of the July\u2013December 2002 portfolio is "
        "$1,989,600 (in absolute terms). Of this amount, approximately 27.9% "
        "(about 28%) comes from the Peak 6x16 contracts, with the remaining "
        "~72.1% coming from the Flat contracts."
    ),
    **_paths("34", "34_src_0.xlsx"),
}
|
| 89 |
+
|
| 90 |
+
# ── MEDIUM ────────────────────────────────────────────────────────────────
# The medium tasks are workbook-modification ("MODIFY") tasks: each one ships
# a source workbook plus a reference workbook and has no "reference_answer".
# All of them share the same "constraints" text.

# Task 4 — Modify: summarise imbalances (Calculation + modify)
TASKS["task_4"] = {
    "id": "task_4",
    "orig_id": "35",  # same id names the data sub-folder passed to _paths
    "title": "Summarize Pipeline Imbalances",
    "difficulty": "medium",
    "task_type": "MODIFY",
    "category": "Calculation",
    "instruction": (
        "Summarize the volume and dollar imbalances that exist between the "
        "various pipeline operators (Operators) and Transwestern."
    ),
    "constraints": (
        "You will be given an Excel file as input. Perform all required "
        "operations by modifying the existing workbook. You may add new sheets "
        "if necessary, but you must preserve all original sheets and their "
        "contents. Return the full updated workbook."
    ),
    **_paths("35", "35_src_0.xlsx", "35_ref_0.xlsx"),
}

# Task 5 — Modify: audit & fix formulas (Validation / Review)
TASKS["task_5"] = {
    "id": "task_5",
    "orig_id": "40",
    "title": "Audit and Correct Formula Errors",
    "difficulty": "medium",
    "task_type": "MODIFY",
    "category": "Validation / Review, Calculation",
    "instruction": (
        "Audit the workbook and correct the formula errors in place so numbers "
        "calculate properly."
    ),
    "constraints": (
        "You will be given an Excel file as input. Perform all required "
        "operations by modifying the existing workbook. You may add new sheets "
        "if necessary, but you must preserve all original sheets and their "
        "contents. Return the full updated workbook."
    ),
    **_paths("40", "40_src_0.xlsx", "40_ref_0.xlsx"),
}

# Task 6 — Modify: create table + filter (Structuring / Formatting)
TASKS["task_6"] = {
    "id": "task_6",
    "orig_id": "60",
    "title": "Create Table and Apply Filter",
    "difficulty": "medium",
    "task_type": "MODIFY",
    "category": "Structuring / Formatting",
    "instruction": (
        "On the All Natural Gas sheet, create an Excel table and filter to "
        "show only the COUNTERPARTY entries highlighted in red."
    ),
    "constraints": (
        "You will be given an Excel file as input. Perform all required "
        "operations by modifying the existing workbook. You may add new sheets "
        "if necessary, but you must preserve all original sheets and their "
        "contents. Return the full updated workbook."
    ),
    **_paths("60", "60_src_0.xlsx", "60_ref_0.xlsx"),
}

# Task 7 — Modify: data entry + formatting (Data Entry / Import)
TASKS["task_7"] = {
    "id": "task_7",
    "orig_id": "21",
    "title": "Add Weekday Row and Data Entry",
    "difficulty": "medium",
    "task_type": "MODIFY",
    "category": "Data Entry / Import, Structuring / Formatting",
    "instruction": (
        "Add a weekday line directly below the date headers and update the "
        "12/31/2001 (Mon) column. For that day, there are no \"Receipts\"; "
        "record disbursements of $1,980,800 to Calpine (Power Purchases) and "
        "$100,000 to an unspecified vendor (Gas Purchases). Under Enron Facility "
        "Services, enter $3,101,855 for \"$2.5 per day\" and -$2,081,386 for "
        "\"estimate receipt\"; in Personnel, EES is $584,500; leave all other "
        "items as \"-\"."
    ),
    "constraints": (
        "You will be given an Excel file as input. Perform all required "
        "operations by modifying the existing workbook. You may add new sheets "
        "if necessary, but you must preserve all original sheets and their "
        "contents. Return the full updated workbook."
    ),
    **_paths("21", "21_src_0.xlsx", "21_ref_0.xlsx"),
}
|
| 180 |
+
|
| 181 |
+
# ── HARD ──────────────────────────────────────────────────────────────────
# The hard tasks are also "MODIFY" tasks with a source and a reference
# workbook; their instructions combine several categories of work.

# Task 8 — Modify: balance-sheet validation + indicator calcs
TASKS["task_8"] = {
    "id": "task_8",
    "orig_id": "0",  # same id names the data sub-folder passed to _paths
    "title": "Balance Sheet Validation and Indicators",
    "difficulty": "hard",
    "task_type": "MODIFY",
    "category": "Validation / Review, Calculation, Structuring / Formatting",
    "instruction": (
        "Complete the validation and indicator calculations as follows: on the "
        "Balance Sheet, add a control to ensure TOTAL ASSETS equals TOTAL "
        "LIABILITIES AND EQUITY; on the Income Statement (Revenue & Expenses), "
        "add an Equity Roll Forward Test to reconcile equity movement and "
        "highlight any differences."
    ),
    "constraints": (
        "You will be given an Excel file as input. Perform all required "
        "operations by modifying the existing workbook. You may add new sheets "
        "if necessary, but you must preserve all original sheets and their "
        "contents. Return the full updated workbook."
    ),
    **_paths("0", "0_src_0.xlsx", "0_ref_0.xlsx"),
}

# Task 9 — Modify: add new sheet mirroring structure (Financial Modeling)
TASKS["task_9"] = {
    "id": "task_9",
    "orig_id": "24",
    "title": "Create Scenario3 Worksheet",
    "difficulty": "hard",
    "task_type": "MODIFY",
    "category": "Structuring / Formatting, Financial Modeling",
    "instruction": (
        'Add a new worksheet named "Scenario3" to the same workbook, mirroring '
        "the structure, row/column layout, monthly detail table, and chart area "
        'of "Scenario1". For Scenario3, update the hedging assumptions to a '
        "balanced allocation: 10-Yr 25%, 5-Yr 20%, 1-Yr 15%, May-Sep 20%, "
        "Q3 15%. Keep the note \"Maximum Monthly Average Short Position to "
        'Cover (July Peak) = 30,508 MW" unchanged; only the new sheet should '
        "be added, and formulas may be used within it."
    ),
    "constraints": (
        "You will be given an Excel file as input. Perform all required "
        "operations by modifying the existing workbook. You may add new sheets "
        "if necessary, but you must preserve all original sheets and their "
        "contents. Return the full updated workbook."
    ),
    **_paths("24", "24_src_0.xlsx", "24_ref_0.xlsx"),
}

# Task 10 — Modify: cross-sheet consolidation (multi-type)
TASKS["task_10"] = {
    "id": "task_10",
    "orig_id": "67",
    "title": "Consolidate by Type and Area",
    "difficulty": "hard",
    "task_type": "MODIFY",
    "category": "Structuring / Formatting, Calculation, Validation / Review, Cross-sheet Retrieval",
    "instruction": (
        "Create a new 'by type_area' worksheet based on the Summary and the "
        "other tabs. It should present two separate tables summarized by "
        "Imbal Type; within each table, consolidate by area, include Volume, "
        "Value and Date, and calculate totals. Finally, confirm that the value "
        "and volume totals tie to the totals shown on the Summary."
    ),
    "constraints": (
        "You will be given an Excel file as input. Perform all required "
        "operations by modifying the existing workbook. You may add new sheets "
        "if necessary, but you must preserve all original sheets and their "
        "contents. Return the full updated workbook."
    ),
    **_paths("67", "67_src_0.xlsx", "67_ref_0.xlsx"),
}
|
| 256 |
+
|
| 257 |
+
# ---------------------------------------------------------------------------
|
| 258 |
+
# Helper accessors
|
| 259 |
+
# ---------------------------------------------------------------------------
|
| 260 |
+
|
| 261 |
+
# Task ids ordered by their numeric suffix, so "task_10" comes after "task_9"
# (a plain string sort would put it directly after "task_1").
TASK_IDS: List[str] = sorted(TASKS.keys(), key=lambda x: int(x.split("_")[1]))
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def get_task(task_id: str) -> Dict[str, Any]:
    """Look up a task definition by its id.

    Args:
        task_id: Key into ``TASKS``, e.g. ``"task_1"``.

    Returns:
        The task dict stored in ``TASKS`` (the stored object, not a copy).

    Raises:
        KeyError: If ``task_id`` is not registered; the message lists the
            available ids.
    """
    if task_id in TASKS:
        return TASKS[task_id]
    raise KeyError(
        f"Unknown task_id '{task_id}'. Available: {', '.join(TASK_IDS)}"
    )
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def list_tasks() -> List[Dict[str, str]]:
    """Return one summary dict per task, in ``TASK_IDS`` order.

    Each summary holds only the ``id``, ``title``, ``difficulty``,
    ``task_type`` and ``category`` entries of the task definition.
    """
    summary_keys = ("id", "title", "difficulty", "task_type", "category")
    summaries = []
    for tid in TASK_IDS:
        definition = TASKS[tid]
        summaries.append({key: definition[key] for key in summary_keys})
    return summaries
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|