File size: 1,680 Bytes
7485602
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# Environment manifest header: names the environment and records the
# runtime, application entry point and port used to serve it.
# NOTE(review): exact field semantics come from the consuming tool's
# schema, which is not visible in this file — confirm against it.
spec_version: 1
name: financial_task_env
type: space
runtime: fastapi
# Quoted defensively because the value contains a colon.
app: 'server.app:app'
port: 8000

# Task catalogue. Each entry carries an id, a display name, a difficulty
# label, a max_steps value and a grader section with a free-text
# description of how the answer is scored.
tasks:
  # task_1: numeric QA answer checked against the reference value 85.
  - id: task_1
    name: Count Plants in Spreadsheet
    difficulty: easy
    max_steps: 15
    grader:
      type: programmatic
      # Folded scalar: the lines below join into one single-line string.
      description: >-
        QA grading — extracts numbers from agent answer, compares against
        reference (85). Score 0.0–1.0 based on numeric match with 5%
        tolerance.

  # task_2: numeric QA answer checked against the reference value 113291.
  - id: task_2
    name: Retrieve TW EOL Charge
    difficulty: easy
    max_steps: 15
    grader:
      type: programmatic
      # Folded scalar: the lines below join into one single-line string.
      description: >-
        QA grading — extracts numbers from agent answer, compares against
        reference (113291). Score 0.0–1.0 based on numeric match with 5%
        tolerance.

  # task_3: QA answer checked against two reference values
  # ($1,989,600 and 27.9%), scored on numeric match plus keyword overlap.
  - id: task_3
    name: Portfolio Mark-to-Market Change
    difficulty: easy
    max_steps: 15
    grader:
      type: programmatic
      # Folded scalar: the lines below join into one single-line string.
      description: >-
        QA grading — extracts numbers from agent answer, compares against
        reference values ($1,989,600 and 27.9%). Score 0.0–1.0 based on
        numeric match + keyword overlap.

  # task_5: workbook-modification task; the agent's xlsx output is compared
  # against a reference workbook (sheet names and cell values).
  - id: task_5
    name: Audit and Correct Formula Errors
    difficulty: medium
    max_steps: 15
    grader:
      type: programmatic
      # Folded scalar: the lines below join into one single-line string.
      description: >-
        MODIFY grading — compares agent output xlsx cell-by-cell against
        reference workbook. 30% sheet-name match + 70% cell-level match
        (2% numeric tolerance). Score 0.0–1.0.

  # task_8: workbook-modification task; the agent's xlsx output is compared
  # against a reference workbook (sheet names and cell values).
  - id: task_8
    name: Balance Sheet Validation and Indicators
    difficulty: hard
    max_steps: 15
    grader:
      type: programmatic
      # Folded scalar: the lines below join into one single-line string.
      description: >-
        MODIFY grading — compares agent output xlsx cell-by-cell against
        reference workbook. 30% sheet-name match + 70% cell-level match
        (2% numeric tolerance). Score 0.0–1.0.