# Spaces: bpHigh/financial-task-env — Running
# Financial Task Environment — code execution with real xlsx
# 7485602 · 1 day ago · 1.68 kB

	spec_version: 1
	name: financial_task_env
	type: space
	runtime: fastapi
	app: server.app:app
	port: 8000

	tasks:
	- id: task_1
	name: Count Plants in Spreadsheet
	difficulty: easy
	max_steps: 15
	grader:
	type: programmatic
	description: "QA grading — extracts numbers from agent answer, compares against reference (85). Score 0.0–1.0 based on numeric match with 5% tolerance."

	- id: task_2
	name: Retrieve TW EOL Charge
	difficulty: easy
	max_steps: 15
	grader:
	type: programmatic
	description: "QA grading — extracts numbers from agent answer, compares against reference (113291). Score 0.0–1.0 based on numeric match with 5% tolerance."

	- id: task_3
	name: Portfolio Mark-to-Market Change
	difficulty: easy
	max_steps: 15
	grader:
	type: programmatic
	description: "QA grading — extracts numbers from agent answer, compares against reference values ($1,989,600 and 27.9%). Score 0.0–1.0 based on numeric match + keyword overlap."

	- id: task_5
	name: Audit and Correct Formula Errors
	difficulty: medium
	max_steps: 15
	grader:
	type: programmatic
	description: "MODIFY grading — compares agent output xlsx cell-by-cell against reference workbook. 30% sheet-name match + 70% cell-level match (2% numeric tolerance). Score 0.0–1.0."

	- id: task_8
	name: Balance Sheet Validation and Indicators
	difficulty: hard
	max_steps: 15
	grader:
	type: programmatic
	description: "MODIFY grading — compares agent output xlsx cell-by-cell against reference workbook. 30% sheet-name match + 70% cell-level match (2% numeric tolerance). Score 0.0–1.0."