---
# Manifest header: identity and launch settings for this environment.
spec_version: 1
name: multi-agent-dev-tools-env
type: environment
# Folded scalar: the line breaks below collapse to single spaces when parsed.
description: >
  A multi-domain RL environment for training AI agents on real-world
  developer and clinical tasks. Covers MCP security auditing, PyTorch
  migration debugging, and clinical workflow chaos recovery. 9 tasks across
  3 domains with graded difficulty (easy/medium/hard).
# NOTE(review): presumably the container runtime and the port the server is
# exposed on — confirm against the deployment tooling.
runtime: docker
port: 7860

# Action and Observation spaces use typed Pydantic models
# See server/models/ for full definitions

tasks:
  # Security task, easy difficulty.
  - id: sec_easy
    name: 'Single vulnerability classification'
    difficulty: easy
    # NOTE(review): presumably a dotted import path to the grading
    # callable — confirm against the loader.
    grader: server.graders.security_grader.grade
    description: >-
      Identify vulnerability type, CVSS score, and severity from a
      tool-call snippet.

  # Security task, medium difficulty.
  - id: sec_medium
    name: 'Vulnerability identification + fix proposal'
    difficulty: medium
    grader: server.graders.security_grader.grade
    description: >-
      Identify the vulnerability and propose a secure code fix.

  # Security task, hard difficulty.
  - id: sec_hard
    name: 'Adversarial patch defense with reviewer feedback'
    difficulty: hard
    grader: server.graders.security_grader.grade
    description: >-
      Identify, fix, and iteratively revise based on reviewer feedback.

  # Dependency task, easy difficulty.
  - id: dep_easy
    name: 'PyTorch 1.x deprecated API detection'
    difficulty: easy
    grader: server.graders.dependency_grader.grade
    description: >-
      Flag outdated packages and deprecated API usage.

  # Dependency task, medium difficulty.
  - id: dep_medium
    name: 'Version conflict chain resolution'
    difficulty: medium
    grader: server.graders.dependency_grader.grade
    description: >-
      Resolve version conflicts using compatibility matrix constraints.

  # Dependency task, hard difficulty.
  - id: dep_hard
    name: 'torch.compile graph-break hunter'
    difficulty: hard
    grader: server.graders.dependency_grader.grade
    description: >-
      Fix torch.compile graph-break patterns in dependency order.

  # Clinical task, easy difficulty.
  - id: cli_easy
    name: 'Single workflow gap detection'
    difficulty: easy
    grader: server.graders.clinical_grader.grade
    description: >-
      Detect missing steps in a clinical workflow and assess risk.

  # Clinical task, medium difficulty.
  - id: cli_medium
    name: 'Multi-gap priority ranking'
    difficulty: medium
    grader: server.graders.clinical_grader.grade
    description: >-
      Detect gaps and rank them by clinical priority.

  # Clinical task, hard difficulty.
  - id: cli_hard
    name: 'Dependency-ordered recovery planning'
    difficulty: hard
    grader: server.graders.clinical_grader.grade
    description: >-
      Plan a dependency-safe recovery sequence for a disrupted clinical
      workflow.