entrypoint: examples.terminal_bench.entrypoints.main_tbench

# Hydra config groups (+ prefix in CLI)
config_groups:
  terminal_bench_config: terminal_bench

# Terminal bench / agentic environment settings
terminal_bench:
  # trials_dir: Directory for Harbor trial artifacts (derived from experiments_dir if null)
  trials_dir: null

  # Harbor configuration - schema-driven mapping to TrialConfig
  # NOTE(review): this file was recovered from a whitespace-mangled copy; the
  # nesting depth of a few keys (e.g. n_concurrent_trials, archiving,
  # trace_upload) could not be read off the original — verify against the
  # consuming schema.
  harbor:
    # Agent settings
    name: terminus-2
    max_episodes: 999999
    enable_summarize: false
    store_all_messages: true
    enable_episode_logging: false
    record_terminal_session: false
    enable_pane_logging: false

    # Strict JSON parser
    strict_json_parser: true

    # Interleaved Thinking Settings
    interleaved_thinking: true
    extra_body:
      chat_template_kwargs:
        enable_thinking: true

    # Long timeout for thinking models
    override_timeout_sec: 1800

    # Environment settings
    override_cpus: 1
    override_memory_mb: 2048
    override_storage_mb: 2048

    # ==========================================================================
    # AUTO SNAPSHOT: Reduce Daytona rate limits with hash-based snapshot caching
    # ==========================================================================
    # When true, automatically creates a snapshot from the Dockerfile on first use,
    # then reuses it for all subsequent sandboxes with the same Dockerfile content.
    # Snapshots are named: harbor____snapshot
    auto_snapshot: true

    # Verifier settings
    verifier_override_timeout_sec: 120

    # Retry settings
    max_retries: 3
    min_wait_sec: 60.0
    max_wait_sec: 600.0
    wait_multiplier: 2.0
    exclude_exceptions:
      - VerifierTimeoutError
      - VerifierRuntimeError
      - RewardFileNotFoundError
      - RewardFileEmptyError
      - VerifierOutputParseError

    n_concurrent_trials: 280

    # Logging settings
    log_level: INFO

  # Reward shaping (disabled - binary rewards)
  enable_reward_shaping: false

  # RLOO-N error classification
  enable_error_classification: true
  mask_exceptions:
    - DaytonaError
    - EnvironmentStartTimeoutError
    - NetworkError
    - ConnectionError
    - RewardFileNotFoundError
    - RewardFileEmptyError
    - AgentEnvironmentTimeoutError
  default_error_treatment: zero
  passthrough_exceptions:
    - AgentTimeoutError
    - ContextLengthExceededError

  # Model info for Harbor's hosted_vllm validation
  model_info:
    max_input_tokens: 32768
    max_output_tokens: 4096

  archiving:
    # Enable trial archiving callback
    enabled: false

  # Post-training trace upload to HuggingFace
  trace_upload:
    enabled: true
    repo_org: DCAgent
    episodes: last
    dataset_type: SFT
    cleanup: true

# Trainer configuration
trainer:
  strategy: fsdp2
  algorithm:
    advantage_estimator: rloo_n
    use_kl_loss: false
    kl_loss_coef: 0.0
    eps_clip_low: 0.2
    eps_clip_high: 0.2
    loss_reduction: token_mean

  # Training loop settings
  epochs: 2
  max_steps: 80
  update_epochs_per_batch: 1

  # Batch sizes
  train_batch_size: 64
  policy_mini_batch_size: 64
  eval_batch_size: 64

  # Micro batch sizes (micro1x4 variant)
  micro_forward_batch_size_per_gpu: 4
  micro_train_batch_size_per_gpu: 1
  max_prompt_length: 999999

  # Evaluation and checkpointing
  eval_interval: 999999
  eval_before_train: false

  # Resumable checkpointing
  ckpt_interval: 5
  resume_mode: latest

  # HF upload-ready checkpoints
  hf_save_interval: 5

  # HuggingFace Hub upload (set via CLI: trainer.hf_hub_repo_id=org/repo)
  hf_hub_repo_id: null
  hf_hub_private: false
  hf_hub_revision: main

  # Database registration (auto-registers trained model to Supabase)
  # Requires KEYS env var pointing to Supabase credentials file
  enable_db_registration: true

  # Logging
  project_name: OpenThoughts-Agent
  log_level: INFO
  tracker_commit_each_step: true
  logger: console

  # Paths
  run_name: null
  ckpt_path: null
  export_path: null

  policy:
    optimizer_config:
      lr: 3e-5
      weight_decay: 0.0
      adam_betas: [0.9, 0.999]
      max_grad_norm: 10.0
    fsdp_config:
      cpu_offload: false
      reshard_after_forward: true
      fsdp_size: 4
  ref:
    fsdp_config:
      cpu_offload: false
      reshard_after_forward: true
      fsdp_size: 4
  placement:
    colocate_all: false
    policy_num_nodes: 2
    ref_num_nodes: 2
    policy_num_gpus_per_node: 4
    ref_num_gpus_per_node: 4
  fully_async:
    max_staleness_steps: 16
    num_parallel_generation_workers: 768

generator:
  backend: vllm
  timeout_multiplier: 1.0
  model_dtype: bfloat16
  inference_engine_tensor_parallel_size: 1
  # 16 inference engines (24 total GPUs: 16 engines + 8 policy/ref shared)
  num_inference_engines: 16
  n_samples_per_prompt: 8
  eval_n_samples_per_prompt: 8
  gpu_memory_utilization: 0.75
  max_num_seqs: 24
  max_num_batched_tokens: 65536
  enable_prefix_caching: true
  enable_chunked_prefill: true
  run_engines_locally: true
  weight_sync_backend: nccl
  async_engine: true
  batched: false
  enable_http_endpoint: true
  enable_ray_prometheus_stats: false
  vllm_stats_interval: 1
  append_eos_token_after_stop_str_in_multi_turn: true
  max_turns: 999999
  sampling_params:
    max_generate_length: 8192
    temperature: 0.7
    top_p: 0.95
    top_k: 20
  engine_init_kwargs:
    max_model_len: 32768
  # Interleaved thinking chat template: preserves blocks on ALL
  # historical assistant turns (stock Qwen3 template strips them).
  custom_chat_template_chat_completion_path: chat_templates/qwen3_thinking_acc.jinja2

data:
  train_data: []
  val_data: ["open-thoughts/OpenThoughts-TB-dev"]