---
# Manifest for the `financial_task_env` environment: server settings followed
# by the list of tasks and how each one is graded.
spec_version: 1
name: financial_task_env
type: space
runtime: fastapi
# NOTE(review): looks like a `module.path:attribute` reference to the FastAPI
# application object — confirm against the server package.
app: server.app:app
port: 8000

# Task catalogue. Every task allows at most 15 steps and uses a programmatic
# grader. Task ids are not contiguous: task_4, task_6 and task_7 are not
# defined in this file.
tasks:
  # --- QA-style tasks: the grader description says the answer is checked
  # --- against reference number(s).
  - id: task_1
    name: Count Plants in Spreadsheet
    difficulty: easy
    max_steps: 15
    grader:
      type: programmatic
      description: "QA grading — extracts numbers from agent answer, compares against reference (85). Score 0.0–1.0 based on numeric match with 5% tolerance."

  - id: task_2
    name: Retrieve TW EOL Charge
    difficulty: easy
    max_steps: 15
    grader:
      type: programmatic
      description: "QA grading — extracts numbers from agent answer, compares against reference (113291). Score 0.0–1.0 based on numeric match with 5% tolerance."

  - id: task_3
    name: Portfolio Mark-to-Market Change
    difficulty: easy
    max_steps: 15
    grader:
      type: programmatic
      description: "QA grading — extracts numbers from agent answer, compares against reference values ($1,989,600 and 27.9%). Score 0.0–1.0 based on numeric match + keyword overlap."

  # --- MODIFY-style tasks: the grader description says the agent's output
  # --- workbook is compared against a reference workbook.
  - id: task_5
    name: Audit and Correct Formula Errors
    difficulty: medium
    max_steps: 15
    grader:
      type: programmatic
      description: "MODIFY grading — compares agent output xlsx cell-by-cell against reference workbook. 30% sheet-name match + 70% cell-level match (2% numeric tolerance). Score 0.0–1.0."

  - id: task_8
    name: Balance Sheet Validation and Indicators
    difficulty: hard
    max_steps: 15
    grader:
      type: programmatic
      description: "MODIFY grading — compares agent output xlsx cell-by-cell against reference workbook. 30% sheet-name match + 70% cell-level match (2% numeric tolerance). Score 0.0–1.0."