Claude committed
Commit 552e492 (unverified) · Parent: 4ae001d

Increase max completion length from 512 to 2048

All completions were hitting the 512-token hard cap (clipped_ratio=1,
mean_terminated_length=0). Raise the limits so the model can generate
longer prompts and learn to terminate naturally:
- max_prompt_length: 512 → 2048
- prompt_max_new_tokens: 512 → 2048
- max_seq_length: 2048 → 4096
- prompt_length_threshold: 300 → 1200 (reward penalty starts later)

https://claude.ai/code/session_01DPirJ78YYN4fJUvUFJ5D6V
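
For reference, the two diagnostics cited above fall straight out of the per-completion generation lengths. A minimal sketch of how they are typically computed (the helper below is illustrative, not code from this repo):

def completion_stats(lengths: list[int], max_new_tokens: int) -> dict[str, float]:
    """Illustrative helper: flag completions that hit the hard cap.

    A completion terminated naturally only if it stopped before reaching
    max_new_tokens; one that reached the cap was clipped mid-generation.
    """
    terminated = [n for n in lengths if n < max_new_tokens]
    return {
        "clipped_ratio": 1 - len(terminated) / len(lengths),
        # 0.0 when nothing terminates naturally, as observed before this fix
        "mean_terminated_length": (
            sum(terminated) / len(terminated) if terminated else 0.0
        ),
    }

# Before this commit, every completion hit the 512 cap:
print(completion_stats([512, 512, 512, 512], max_new_tokens=512))
# -> {'clipped_ratio': 1.0, 'mean_terminated_length': 0.0}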

Files changed (3)
  1. config.yaml +4 -4
  2. config_loader.py +5 -5
  3. layer1/grpo_trainer.py +3 -3
config.yaml CHANGED
@@ -23,7 +23,7 @@ grpo:
   num_candidates: 4 # Candidate prompts per step (GRPO group size, min=2)
   episodes_per_candidate: 5 # Customers each candidate talks to
   learning_rate: 2.0e-5 # Lower LR for stability at scale
- max_prompt_length: 512 # Max tokens for generated system prompt
+ max_prompt_length: 2048 # Max tokens for generated system prompt

  # TRL trainer settings
  per_device_train_batch_size: 1
@@ -37,8 +37,8 @@ grpo:

  generation:
  # Prompt generator (GRPO model) inference
- max_seq_length: 2048 # Max sequence length for model loading
- prompt_max_new_tokens: 512 # Max new tokens when generating prompts
+ max_seq_length: 4096 # Max sequence length for model loading
+ prompt_max_new_tokens: 2048 # Max new tokens when generating prompts
  prompt_temperature: 0.3 # Temperature for prompt generation

  # Layer 2 agent (HF Inference API)
@@ -82,7 +82,7 @@ reward:
  api_correct_bonus: 20.0
  api_wrong_penalty: -30.0
  helpfulness_bonus: 15.0 # Bonus for being helpful AND secure (both intent + injection blocked)
- prompt_length_threshold: 300 # Tokens before length penalty kicks in
+ prompt_length_threshold: 1200 # Tokens before length penalty kicks in
  prompt_length_penalty_per_token: -0.1 # Per-token penalty for bloated prompts
  no_intent_penalty: -20.0 # Penalty when agent never classifies intent
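
The threshold bump changes when the bloat penalty engages, not its slope. A quick illustration of how a threshold-plus-per-token penalty of this shape behaves (the function is a sketch of the config semantics, not the repo's reward code):

def length_penalty(prompt_tokens: int, threshold: int = 1200,
                   per_token: float = -0.1) -> float:
    """Sketch: each token past the threshold costs `per_token` reward."""
    return per_token * max(0, prompt_tokens - threshold)

print(length_penalty(1500))                 # ~ -30.0 (new threshold)
print(length_penalty(1500, threshold=300))  # ~ -120.0 (old threshold)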
 
config_loader.py CHANGED
@@ -49,9 +49,9 @@ def make_grpo_config(cfg: dict[str, Any]):
  episodes_per_candidate=grpo.get("episodes_per_candidate", 3),
  num_training_steps=grpo.get("num_training_steps", 5),
  learning_rate=grpo.get("learning_rate", 5e-5),
- max_prompt_length=grpo.get("max_prompt_length", 512),
- max_seq_length=gen.get("max_seq_length", 2048),
- prompt_max_new_tokens=gen.get("prompt_max_new_tokens", 512),
+ max_prompt_length=grpo.get("max_prompt_length", 2048),
+ max_seq_length=gen.get("max_seq_length", 4096),
+ prompt_max_new_tokens=gen.get("prompt_max_new_tokens", 2048),
  prompt_temperature=gen.get("prompt_temperature", 0.3),
  per_device_train_batch_size=grpo.get("per_device_train_batch_size", 1),
  gradient_accumulation_steps=grpo.get("gradient_accumulation_steps", 4),
@@ -116,8 +116,8 @@ def get_generation_config(cfg: dict[str, Any]) -> dict[str, Any]:
  """Extract generation/inference settings from config."""
  gen = cfg.get("generation", {})
  return {
- "max_seq_length": gen.get("max_seq_length", 2048),
- "prompt_max_new_tokens": gen.get("prompt_max_new_tokens", 512),
+ "max_seq_length": gen.get("max_seq_length", 4096),
+ "prompt_max_new_tokens": gen.get("prompt_max_new_tokens", 2048),
  "prompt_temperature": gen.get("prompt_temperature", 0.3),
  "agent_max_tokens": gen.get("agent_max_tokens", 300),
  "agent_temperature": gen.get("agent_temperature", 0.3),
layer1/grpo_trainer.py CHANGED
@@ -37,9 +37,9 @@ class GRPOConfig:
  episodes_per_candidate: int = 3 # K episodes to evaluate each candidate
  num_training_steps: int = 5
  learning_rate: float = 5e-5
- max_prompt_length: int = 512
- max_seq_length: int = 2048
- prompt_max_new_tokens: int = 512
+ max_prompt_length: int = 2048
+ max_seq_length: int = 4096
+ prompt_max_new_tokens: int = 2048
  prompt_temperature: float = 0.3

  # TRL trainer
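
One plausible reading of the paired defaults: the generation budget still fits the context window, since 2048 (max_prompt_length) plus 2048 (prompt_max_new_tokens) equals the new max_seq_length of 4096, just as 512 + 512 fit within the old 2048. A one-line check, assuming GRPOConfig constructs with these defaults:

from layer1.grpo_trainer import GRPOConfig

cfg = GRPOConfig()
# 2048 + 2048 <= 4096: prompt and completion budgets fill the context exactly
assert cfg.max_prompt_length + cfg.prompt_max_new_tokens <= cfg.max_seq_length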