Spaces:
Running on T4
Running on T4
Claude committed on
Align GRPOConfig defaults with CLI: 10 steps, 7 episodes
Browse files
The dataclass defaults were still 50 steps / 10 episodes, causing
deployments that don't pass explicit CLI args to run much longer
than intended.
https://claude.ai/code/session_01DPirJ78YYN4fJUvUFJ5D6V
- layer1/grpo_trainer.py +2 -2
- layer1/train.py +1 -1
layer1/grpo_trainer.py
CHANGED
|
@@ -37,8 +37,8 @@ class GRPOConfig:
|
|
| 37 |
|
| 38 |
# GRPO
|
| 39 |
num_candidates: int = 4 # N candidate prompts per step
|
| 40 |
-
episodes_per_candidate: int = 10
|
| 41 |
-
num_training_steps: int = 50
|
| 42 |
learning_rate: float = 5e-5
|
| 43 |
max_prompt_length: int = 512
|
| 44 |
|
|
|
|
| 37 |
|
| 38 |
# GRPO
|
| 39 |
num_candidates: int = 4 # N candidate prompts per step
|
| 40 |
+
episodes_per_candidate: int = 7 # K episodes to evaluate each candidate
|
| 41 |
+
num_training_steps: int = 10
|
| 42 |
learning_rate: float = 5e-5
|
| 43 |
max_prompt_length: int = 512
|
| 44 |
|
layer1/train.py
CHANGED
|
@@ -3,7 +3,7 @@ Layer 1 — Executable GRPO training script.
|
|
| 3 |
|
| 4 |
Usage:
|
| 5 |
# Full GPU training (requires Colab/GPU + train deps)
|
| 6 |
-
python -m layer1.train --mode train --steps
|
| 7 |
|
| 8 |
# CPU mock optimization (evaluates hand-written prompts)
|
| 9 |
python -m layer1.train --mode mock --episodes 20
|
|
|
|
| 3 |
|
| 4 |
Usage:
|
| 5 |
# Full GPU training (requires Colab/GPU + train deps)
|
| 6 |
+
python -m layer1.train --mode train --steps 10
|
| 7 |
|
| 8 |
# CPU mock optimization (evaluates hand-written prompts)
|
| 9 |
python -m layer1.train --mode mock --episodes 20
|