File size: 4,125 Bytes
c33d988
41595ac
c33d988
 
 
d86d6a2
 
dfa9f05
 
 
 
 
 
 
 
70b313b
 
33e9ed5
70b313b
 
 
33e9ed5
7896686
dfa9f05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3f7834
 
 
 
dfa9f05
 
 
 
 
b8dbf99
 
55309da
dfa9f05
 
 
 
55309da
ec1ce67
dfa9f05
7896686
b8dbf99
55309da
dfa9f05
 
 
 
55309da
ec1ce67
dfa9f05
7896686
b8dbf99
55309da
dfa9f05
 
 
 
55309da
ec1ce67
dfa9f05
 
 
 
 
 
 
 
 
 
 
41595ac
 
dfa9f05
 
 
41595ac
dfa9f05
 
 
 
 
 
 
 
 
 
 
 
 
d86d6a2
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Manifest header: identifies the environment and tells the host how to
# launch and reach the server.
spec_version: 1
name: cognitive-load-manager
type: space
runtime: fastapi
# Entry point written as "<module path>:<attribute>"; quoted so the colon can
# never be mistaken for a mapping indicator.
app: "server.app:app"
# Port the server is expected to listen on.
port: 7860

# Human-readable metadata for the environment.
# `>-` folds the wrapped lines below into a single paragraph and strips the
# final line break, so the parsed description does not end in "\n".
description: >-
  Cognitive Load Manager (CLM) — a real-world productivity simulation where an AI agent
  acts as a human task scheduler, managing energy, stress, and fatigue while completing
  heterogeneous work items (emails, meetings, code reviews, reports, calls) under deadlines.
  Features task dependencies, mid-episode interruptions, focus mode, and priority weighting.
# Quoted so the version is always read as a string, never as a number.
version: "2.0.0"
author: "CLM Team"
tags: [openenv, scheduling, productivity, rl, agent-eval]

# Route paths served by the app, keyed by the operation each one implements.
endpoints:
  health: /health
  reset: /reset
  step: /step
  state: /state
  # The "grade" operation lives at /grader (the key and the path differ).
  grade: /grader

# Actions the agent may choose from. An action that carries `requires: task_id`
# targets a specific task (work, focus, switch); `break` and `delay` declare
# no required argument.
action_space:
  type: discrete
  actions:
    # Regular-pace work on one task.
    - name: work
      description: "Work on task_id at normal pace (energy cost varies by task type)"
      requires: task_id
    # Faster progress on one task at a proportionally higher energy cost.
    - name: focus
      description: "Deep-work mode: 2× progress, 2× energy cost; exits on break"
      requires: task_id
    # Recovery action: restores energy and lowers stress.
    - name: break
      description: "Rest: +0.22 energy, -0.18 stress"
    # Change of active task; incurs a context-switch cost.
    - name: switch
      description: "Change active task (small context-switch cost)"
      requires: task_id
    # Idle for one step.
    - name: delay
      description: "Wait one step; slight stress reduction"

# Shape of an observation. The values below are type hints / allowed-value
# lists written as strings; they describe fields, they are not live data.
observation_space:
  # Fields reported for each task.
  tasks:
    - id: string
    - task_type: "email | meeting | code_review | report | call"
    - priority: "critical | high | normal | low"
    - progress: "float [0.0, 1.0]"
    - deadline: "int (step number) or null"
    - depends_on: "task_id or null"
    - is_interrupted: bool
  visible_state:
    # Partial observability: energy/stress are categorical labels, not raw floats.
    # Energy bands, in label order: >0.6 | 0.3-0.6 | <0.3
    - fatigue_level: "low | medium | high"
    # Stress bands, in label order: <0.45 | 0.45-0.75 | >0.75
    - stress_level: "calm | elevated | critical"
    # True when stress > 0.65
    - stress_warning: bool
    - focus_mode: bool
    - upcoming_deadlines: "list[task_id]"
    - blocked_tasks: "list[task_id]"
  time_step: int

# Benchmark scenarios, ordered from easiest to hardest. Each entry names its
# grader as "<module path>:<class>" and records a baseline score.
# Descriptions use `>-`: wrapped lines fold into one paragraph and the final
# line break is stripped, so no description ends in "\n".
tasks:
  - id: easy
    difficulty: easy
    description: >-
      2 tasks (email + report), normal priority, no deadlines.
      Agent must complete both without burning out.
      Tests basic work/break balance.
    max_steps: 50
    grader: "grader.clm_graders:EasyGrader"
    baseline_score: 0.856

  - id: medium
    difficulty: medium
    description: >-
      5 heterogeneous tasks (email/meeting/code_review/report/call) with mixed
      priorities (critical→low) and real deadlines. Agent must triage intelligently.
      Tests priority-aware scheduling and deadline management.
    max_steps: 50
    grader: "grader.clm_graders:MediumGrader"
    baseline_score: 0.523

  - id: hard
    difficulty: hard
    description: >-
      8 tasks with explicit dependencies (task B cannot start until task A completes),
      tight deadlines, and 2 mid-episode urgent email interruptions.
      Tests dependency-aware scheduling under time pressure.
    max_steps: 50
    grader: "grader.clm_graders:HardGrader"
    baseline_score: 0.301

  # The only scenario with a longer step budget (60 instead of 50).
  - id: expert
    difficulty: expert
    description: >-
      10 tasks in a deep dependency chain, 3 mid-episode interruptions,
      mixed critical/high/normal priorities, and very tight deadlines.
      Genuinely challenges frontier LLM agents.
    max_steps: 60
    grader: "grader.clm_graders:ExpertGrader"
    baseline_score: 0.221

# How step rewards and final episode scores are bounded and combined.
scoring:
  # Step rewards (negative preserved for burnout).
  reward_range: [-1.0, 1.0]
  # Final episode scores.
  grader_range: [0.01, 0.99]
  success_threshold: 0.50
  score_formula: deterministic_grader
  # Component weights; the five values add up to 1.00.
  components:
    - weighted_completion: 0.60
    - deadline_adherence: 0.22
    - energy_efficiency: 0.10
    - dependency_bonus: 0.05
    - interruption_bonus: 0.03

# Constants used to shape the per-step reward.
reward_shaping:
  # NOTE(review): presumably the task-progress fractions at which a milestone
  # reward is granted — confirm against the environment code.
  milestone_rewards: [0.25, 0.50, 0.75, 1.00]
  # Equal to the lower bound of scoring.reward_range.
  burnout_penalty: -1.0
  # Cost associated with the `switch` action's context switch.
  context_switch_penalty: -0.07
  # NOTE(review): presumably applied when the agent acts on a task whose
  # dependency is not yet complete — confirm.
  blocked_task_penalty: -0.15
  # Higher than both the stress_warning trigger (0.65) and the start of the
  # "critical" stress band (0.75) documented under observation_space.
  stress_penalty_threshold: 0.80

# Resource limits declared for this environment.
constraints:
  # 1800 s = 30 minutes.
  max_runtime_seconds: 1800
  max_memory_gb: 8
  max_vcpu: 2

# Inference entry script and the environment variables associated with it.
inference:
  script: "inference.py"
  # Variable names only; no values are stored in this file.
  # NOTE(review): HF_TOKEN looks like a credential — make sure it is supplied
  # via the runtime's secret mechanism rather than committed anywhere.
  env_vars:
    - API_BASE_URL
    - MODEL_NAME
    - HF_TOKEN