entrypoint: examples.terminal_bench.entrypoints.main_tbench

# Hydra config groups (+ prefix in CLI)
config_groups:
  terminal_bench_config: terminal_bench

# Terminal bench / agentic environment settings
terminal_bench:
  # trials_dir: Directory for Harbor trial artifacts (derived from experiments_dir if null)
  trials_dir: null

  # Harbor configuration - schema-driven mapping to TrialConfig
  # NOTE(review): this file was recovered from a whitespace-mangled copy; the
  # nesting depth of a few keys (e.g. n_concurrent_trials, archiving,
  # trace_upload) could not be read off the original — verify against the
  # consuming schema.
  harbor:
    # Agent settings
    name: terminus-2
    max_episodes: 999999
    enable_summarize: false
    store_all_messages: true
    enable_episode_logging: false
    record_terminal_session: false
    enable_pane_logging: false

    # Strict JSON parser
    strict_json_parser: true

    # Interleaved Thinking Settings
    interleaved_thinking: true
    extra_body:
      chat_template_kwargs:
        enable_thinking: true

    # Long timeout for thinking models
    override_timeout_sec: 1800

    # Environment settings
    override_cpus: 1
    override_memory_mb: 2048
    override_storage_mb: 2048

    # ==========================================================================
    # AUTO SNAPSHOT: Reduce Daytona rate limits with hash-based snapshot caching
    # ==========================================================================
    # When true, automatically creates a snapshot from the Dockerfile on first use,
    # then reuses it for all subsequent sandboxes with the same Dockerfile content.
    # Snapshots are named: harbor____snapshot
    auto_snapshot: true

    # Verifier settings
    verifier_override_timeout_sec: 120

    # Retry settings
    max_retries: 3
    min_wait_sec: 60.0
    max_wait_sec: 600.0
    wait_multiplier: 2.0
    exclude_exceptions:
      - VerifierTimeoutError
      - VerifierRuntimeError
      - RewardFileNotFoundError
      - RewardFileEmptyError
      - VerifierOutputParseError

    n_concurrent_trials: 280

    # Logging settings
    log_level: INFO

  # Reward shaping (disabled - binary rewards)
  enable_reward_shaping: false

  # RLOO-N error classification
  enable_error_classification: true
  mask_exceptions:
    - DaytonaError
    - EnvironmentStartTimeoutError
    - NetworkError
    - ConnectionError
    - RewardFileNotFoundError
    - RewardFileEmptyError
    - AgentEnvironmentTimeoutError
  default_error_treatment: zero
  passthrough_exceptions:
    - AgentTimeoutError
    - ContextLengthExceededError

  # Model info for Harbor's hosted_vllm validation
  model_info:
    max_input_tokens: 32768
    max_output_tokens: 4096

  archiving:
    # Enable trial archiving callback
    enabled: false

  # Post-training trace upload to HuggingFace
  trace_upload:
    enabled: true
    repo_org: DCAgent
    episodes: last
    dataset_type: SFT
    cleanup: true

# Trainer configuration
trainer:
  strategy: fsdp2
  algorithm:
    advantage_estimator: rloo_n
    use_kl_loss: false
    kl_loss_coef: 0.0
    eps_clip_low: 0.2
    eps_clip_high: 0.2
    loss_reduction: token_mean

  # Training loop settings
  epochs: 2
  max_steps: 80
  update_epochs_per_batch: 1

  # Batch sizes
  train_batch_size: 64
  policy_mini_batch_size: 64
  eval_batch_size: 64

  # Micro batch sizes (micro1x4 variant)
  micro_forward_batch_size_per_gpu: 4
  micro_train_batch_size_per_gpu: 1
  max_prompt_length: 999999

  # Evaluation and checkpointing
  eval_interval: 999999
  eval_before_train: false

  # Resumable checkpointing
  ckpt_interval: 5
  resume_mode: latest

  # HF upload-ready checkpoints
  hf_save_interval: 5

  # HuggingFace Hub upload (set via CLI: trainer.hf_hub_repo_id=org/repo)
  hf_hub_repo_id: null
  hf_hub_private: false
  hf_hub_revision: main

  # Database registration (auto-registers trained model to Supabase)
  # Requires KEYS env var pointing to Supabase credentials file
  enable_db_registration: true

  # Logging
  project_name: OpenThoughts-Agent
  log_level: INFO
  tracker_commit_each_step: true
  logger: console

  # Paths
  run_name: null
  ckpt_path: null
  export_path: null

  policy:
    optimizer_config:
      lr: 3e-5
      weight_decay: 0.0
      adam_betas: [0.9, 0.999]
      max_grad_norm: 10.0
    fsdp_config:
      cpu_offload: false
      reshard_after_forward: true
      fsdp_size: 4
  ref:
    fsdp_config:
      cpu_offload: false
      reshard_after_forward: true
      fsdp_size: 4
  placement:
    colocate_all: false
    policy_num_nodes: 2
    ref_num_nodes: 2
    policy_num_gpus_per_node: 4
    ref_num_gpus_per_node: 4
  fully_async:
    max_staleness_steps: 16
    num_parallel_generation_workers: 768

generator:
  backend: vllm
  timeout_multiplier: 1.0
  model_dtype: bfloat16
  inference_engine_tensor_parallel_size: 1
  # 16 inference engines (24 total GPUs: 16 engines + 8 policy/ref shared)
  num_inference_engines: 16
  n_samples_per_prompt: 8
  eval_n_samples_per_prompt: 8
  gpu_memory_utilization: 0.75
  max_num_seqs: 24
  max_num_batched_tokens: 65536
  enable_prefix_caching: true
  enable_chunked_prefill: true
  run_engines_locally: true
  weight_sync_backend: nccl
  async_engine: true
  batched: false
  enable_http_endpoint: true
  enable_ray_prometheus_stats: false
  vllm_stats_interval: 1
  append_eos_token_after_stop_str_in_multi_turn: true
  max_turns: 999999
  sampling_params:
    max_generate_length: 8192
    temperature: 0.7
    top_p: 0.95
    top_k: 20
  engine_init_kwargs:
    max_model_len: 32768
  # Interleaved thinking chat template: preserves blocks on ALL
  # historical assistant turns (stock Qwen3 template strips them).
  custom_chat_template_chat_completion_path: chat_templates/qwen3_thinking_acc.jinja2

data:
  train_data: []
  val_data: ["open-thoughts/OpenThoughts-TB-dev"]