Spaces:

openenv-community
/

test-local-nested-envs

Running on T4

Claude committed 2 days ago

Commit

b1685a6

unverified ·

1 Parent(s): 384df8f

Increase training scale: more steps, episodes, and SFT epochs

- num_training_steps: 15 → 30
- episodes_per_candidate: 5 → 8
- sft_epochs: 2 → 3
- eval_episodes: 10 → 15

https://claude.ai/code/session_01DPirJ78YYN4fJUvUFJ5D6V

Files changed (1) hide show

config.yaml +4 -4

config.yaml CHANGED Viewed

@@ -20,13 +20,13 @@ grpo:
   # SFT warm start — prime the model on seed prompts before GRPO
   sft_warm_start: true            # Enable SFT warm start phase
-  sft_epochs: 2                   # Epochs over seed prompts
   sft_lr: 1.0e-4                  # Learning rate for SFT phase
   # GRPO training loop
-  num_training_steps: 15          # Number of policy updates (GRPO iterations)
   num_candidates: 4               # Candidate prompts per step (GRPO group size, min=2)
-  episodes_per_candidate: 5       # Customers each candidate talks to
   learning_rate: 2.0e-5           # Lower LR for stability at scale
   max_prompt_length: 512          # Max tokens for generated system prompt (hard cap during GRPO)
@@ -104,7 +104,7 @@ reward:
 report:
   enabled: true
   output_dir: "/workspace/output/reports"
-  eval_episodes: 10               # Episodes per checkpoint evaluation
   example_customers: 5            # Example conversations in report

   # SFT warm start — prime the model on seed prompts before GRPO
   sft_warm_start: true            # Enable SFT warm start phase
+  sft_epochs: 3                   # Epochs over seed prompts
   sft_lr: 1.0e-4                  # Learning rate for SFT phase
   # GRPO training loop
+  num_training_steps: 30          # Number of policy updates (GRPO iterations)
   num_candidates: 4               # Candidate prompts per step (GRPO group size, min=2)
+  episodes_per_candidate: 8       # Customers each candidate talks to
   learning_rate: 2.0e-5           # Lower LR for stability at scale
   max_prompt_length: 512          # Max tokens for generated system prompt (hard cap during GRPO)
 report:
   enabled: true
   output_dir: "/workspace/output/reports"
+  eval_episodes: 15               # Episodes per checkpoint evaluation
   example_customers: 5            # Example conversations in report