File size: 1,745 Bytes
1c0bef3
f6756c5
 
 
 
5191f15
af6fa71
f6756c5
 
 
6502c1d
f6756c5
 
 
6fb71c6
6502c1d
6fb71c6
 
 
f6756c5
 
6fb71c6
f6756c5
 
6502c1d
f6756c5
 
 
6fb71c6
 
1c0bef3
6fb71c6
f6756c5
 
6fb71c6
f6756c5
 
6502c1d
f6756c5
 
 
6fb71c6
 
 
 
f6756c5
 
6fb71c6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# Environment manifest header: identity of the environment and how it is served.
spec_version: 1
name: 'fin_auditor'
type: 'space'
runtime: 'fastapi'
# `module.path:attribute` reference; presumably the application object handed
# to the runtime above — verify against the server package.
app: 'server.app:app'
port: 7860
# `module.path:ClassName` reference to the environment implementation.
entrypoint: 'server.fin_auditor_environment:FinAuditorEnvironment'

# Graded tasks exposed by this environment, one list item per task.
tasks:
  # Easy tier: flag at least one expired/unreconciled trade.
  - id: anomaly_detection_easy
    name: 'Anomaly Detection (EASY)'
    # Folded scalar: the lines below are joined with single spaces when parsed.
    description: >
      EASY — Detect expired/unreconciled trades. After a reset, trades are
      ingested and immediately aged past the 5-second max window. The agent
      must call /step and receive the anomaly matrix. A valid response
      requires at least one trade flagged (decision=1) in the action sent
      to /step.
    # `module.path:ClassName` reference to the grader for this task.
    grader: 'graders.grader_detection:EasyDetectionGrader'
    difficulty: 'easy'
    max_steps: 5
    # NOTE(review): presumably the minimum score that counts as a pass —
    # confirm against the grader implementation.
    reward_threshold: 0.5
    score_range: [0.0, 1.0]

  # Medium tier: flag the trades whose risk_score exceeds 0.5.
  - id: anomaly_detection_medium
    name: 'Anomaly Detection (MEDIUM)'
    # Folded scalar: the lines below are joined with single spaces when parsed.
    description: >
      MEDIUM — Identify high-risk counterparties. The agent must process the
      anomaly matrix and flag all trades where risk_score > 0.5. Reward is
      based on precision of high-risk flags. Requires the LLM to parse the
      feature array and make discriminating decisions.
    # `module.path:ClassName` reference to the grader for this task.
    grader: 'graders.grader_classification:MediumClassificationGrader'
    difficulty: 'medium'
    max_steps: 10
    # NOTE(review): presumably the minimum score that counts as a pass —
    # confirm against the grader implementation.
    reward_threshold: 0.6
    score_range: [0.0, 1.0]

  # Hard tier: detect a systemic anomaly wave (missing_frequency > 0.3) and
  # flag the whole batch; false positives in safe periods cost points.
  - id: anomaly_detection_hard
    name: 'Anomaly Detection (HARD)'
    # Folded scalar: the lines below are joined with single spaces when parsed.
    description: >
      HARD — Systemic anomaly triage. The agent must detect
      when missing_frequency > 0.3, indicating a systemic anomaly
      wave, and correctly flag ALL trades in the batch. Deducts
      points for false-positive flags during safe periods.
    # `module.path:ClassName` reference to the grader for this task.
    grader: 'graders.grader_fix:HardFixGrader'
    difficulty: 'hard'
    max_steps: 20
    # NOTE(review): presumably the minimum score that counts as a pass —
    # confirm against the grader implementation.
    reward_threshold: 0.7
    score_range: [0.0, 1.0]