---
# Python entrypoint module executed for this run (Hydra application).
entrypoint: examples.terminal_bench.entrypoints.main_tbench

# Hydra config groups (+ prefix in CLI)
config_groups:
  terminal_bench_config: terminal_bench

# Terminal bench / agentic environment settings
terminal_bench:
  # trials_dir: Directory for Harbor trial artifacts (derived from experiments_dir if null)
  trials_dir: null

  # Harbor configuration - schema-driven mapping to TrialConfig
  harbor:
    # Agent settings
    name: terminus-2
    # NOTE(review): 999999 looks like an "effectively unlimited" sentinel —
    # confirm Harbor treats it that way rather than as a hard budget.
    max_episodes: 999999
    enable_summarize: false
    store_all_messages: true
    enable_episode_logging: false
    record_terminal_session: false
    enable_pane_logging: false

    # Strict JSON parser
    strict_json_parser: true

    # Interleaved Thinking Settings
    interleaved_thinking: true
    # Passed through to the inference server's chat-template kwargs.
    extra_body:
      chat_template_kwargs:
        enable_thinking: true
    # Long timeout for thinking models
    override_timeout_sec: 1800

    # Environment settings (per-sandbox resource limits)
    override_cpus: 1
    override_memory_mb: 2048
    override_storage_mb: 2048

    # ==========================================================================
    # AUTO SNAPSHOT: Reduce Daytona rate limits with hash-based snapshot caching
    # ==========================================================================
    # When true, automatically creates a snapshot from the Dockerfile on first use,
    # then reuses it for all subsequent sandboxes with the same Dockerfile content.
    # Snapshots are named: harbor__<sha256[:12]>__snapshot
    auto_snapshot: true

    # Verifier settings
    verifier_override_timeout_sec: 120

    # Retry settings (exponential backoff: min_wait * multiplier^attempt, capped at max_wait)
    # NOTE(review): backoff formula presumed from the key names — confirm against
    # Harbor's retry implementation.
    max_retries: 3
    min_wait_sec: 60.0
    max_wait_sec: 600.0
    wait_multiplier: 2.0

    # Exception class names excluded from retry — presumably treated as
    # terminal failures; confirm against Harbor's retry logic.
    exclude_exceptions:
      - VerifierTimeoutError
      - VerifierRuntimeError
      - RewardFileNotFoundError
      - RewardFileEmptyError
      - VerifierOutputParseError

    # Maximum number of trials run concurrently.
    n_concurrent_trials: 280

    # Logging settings
    log_level: INFO

    # Reward shaping (disabled - binary rewards)
    enable_reward_shaping: false

    # RLOO-N error classification
    enable_error_classification: true
    # Infrastructure-flavored errors whose trials are masked out
    # (presumably excluded from the loss; confirm in the trainer).
    mask_exceptions:
      - DaytonaError
      - EnvironmentStartTimeoutError
      - NetworkError
      - ConnectionError
      - RewardFileNotFoundError
      - RewardFileEmptyError
      - AgentEnvironmentTimeoutError
    # Unlisted exceptions fall back to this treatment (zero reward).
    default_error_treatment: zero
    # Agent-attributable errors passed through as normal (zero-reward) outcomes.
    passthrough_exceptions:
      - AgentTimeoutError
      - ContextLengthExceededError

  # Model info for Harbor's hosted_vllm validation
  model_info:
    max_input_tokens: 32768
    max_output_tokens: 4096

  archiving:
    # Enable trial archiving callback
    enabled: false

  # Post-training trace upload to HuggingFace
  trace_upload:
    enabled: true
    # HF organization the trace dataset is pushed to.
    repo_org: DCAgent
    # Which episodes to include — `last` presumably uploads only the final
    # episode of each trial; confirm against the upload callback.
    episodes: last
    dataset_type: SFT
    # Remove local trace artifacts after a successful upload.
    cleanup: true

# Trainer configuration
trainer:
  strategy: fsdp2
  algorithm:
    advantage_estimator: rloo_n
    use_kl_loss: false
    kl_loss_coef: 0.0
    eps_clip_low: 0.2
    eps_clip_high: 0.2
    loss_reduction: token_mean

  # Training loop settings
  epochs: 2
  max_steps: 80
  update_epochs_per_batch: 1

  # Batch sizes
  train_batch_size: 64
  policy_mini_batch_size: 64
  eval_batch_size: 64

  # Micro batch sizes (micro1x4 variant)
  micro_forward_batch_size_per_gpu: 4
  micro_train_batch_size_per_gpu: 1

  # NOTE(review): 999999 reads as an "effectively unlimited" sentinel —
  # confirm the consumer treats it as no-limit rather than a buffer size.
  max_prompt_length: 999999

  # Evaluation and checkpointing
  eval_interval: 999999  # sentinel: in-training evaluation effectively disabled
  eval_before_train: false
  # Resumable checkpointing
  ckpt_interval: 5
  resume_mode: latest
  # HF upload-ready checkpoints
  hf_save_interval: 5
  # HuggingFace Hub upload (set via CLI: trainer.hf_hub_repo_id=org/repo)
  hf_hub_repo_id: null
  hf_hub_private: false
  hf_hub_revision: main

  # Database registration (auto-registers trained model to Supabase)
  # Requires KEYS env var pointing to Supabase credentials file
  enable_db_registration: true

  # Logging
  project_name: OpenThoughts-Agent
  log_level: INFO
  tracker_commit_each_step: true
  logger: console

  # Paths
  run_name: null
  ckpt_path: null
  export_path: null

  policy:
    optimizer_config:
      # Canonical float notation: a bare `3e-5` (no decimal point) is resolved
      # as the *string* "3e-5" by YAML 1.1 parsers such as PyYAML, whose float
      # regex requires a '.'; `3.0e-5` parses as a float under every parser
      # and is the identical value under OmegaConf/Hydra.
      lr: 3.0e-5
      weight_decay: 0.0
      adam_betas: [0.9, 0.999]
      max_grad_norm: 10.0
    fsdp_config:
      cpu_offload: false
      reshard_after_forward: true
      fsdp_size: 4
  ref:
    fsdp_config:
      cpu_offload: false
      reshard_after_forward: true
      fsdp_size: 4

  # GPU placement: policy and ref each get 2 nodes x 4 GPUs (not colocated).
  placement:
    colocate_all: false
    policy_num_nodes: 2
    ref_num_nodes: 2
    policy_num_gpus_per_node: 4
    ref_num_gpus_per_node: 4

  fully_async:
    max_staleness_steps: 16
    num_parallel_generation_workers: 768

# Rollout/generation (inference) configuration
generator:
  backend: vllm
  timeout_multiplier: 1.0
  model_dtype: bfloat16

  inference_engine_tensor_parallel_size: 1
  # 16 inference engines. NOTE(review): the original comment claimed
  # "24 total GPUs: 16 engines + 8 policy/ref shared", but trainer.placement
  # lists 8 policy + 8 ref GPUs with colocate_all: false (16 + 16 = 32) —
  # confirm the intended GPU accounting.
  num_inference_engines: 16

  # Rollouts per prompt (RLOO group size).
  n_samples_per_prompt: 8
  eval_n_samples_per_prompt: 8

  gpu_memory_utilization: 0.75

  # vLLM scheduler limits
  max_num_seqs: 24
  max_num_batched_tokens: 65536

  enable_prefix_caching: true
  enable_chunked_prefill: true

  run_engines_locally: true
  weight_sync_backend: nccl
  async_engine: true
  batched: false
  enable_http_endpoint: true
  enable_ray_prometheus_stats: false
  vllm_stats_interval: 1
  append_eos_token_after_stop_str_in_multi_turn: true
  # NOTE(review): 999999 reads as an "effectively unlimited" sentinel — confirm.
  max_turns: 999999

  sampling_params:
    max_generate_length: 8192
    temperature: 0.7
    top_p: 0.95
    top_k: 20

  # Extra kwargs forwarded to vLLM engine construction.
  engine_init_kwargs:
    max_model_len: 32768
    # Interleaved thinking chat template: preserves <think> blocks on ALL
    # historical assistant turns (stock Qwen3 template strips them).
    custom_chat_template_chat_completion_path: chat_templates/qwen3_thinking_acc.jinja2

# Dataset configuration
data:
  # Empty here — presumably supplied via CLI override or another config
  # group; confirm against the entrypoint.
  train_data: []
  val_data: ["open-thoughts/OpenThoughts-TB-dev"]