Claude committed on
Increase max completion length from 512 to 2048
All completions were hitting the 512 hard cap (clipped_ratio=1,
mean_terminated_length=0). Raise the limits so the model can generate
longer prompts and learn to terminate naturally:
- max_prompt_length: 512 → 2048
- prompt_max_new_tokens: 512 → 2048
- max_seq_length: 2048 → 4096
- prompt_length_threshold: 300 → 1200 (reward penalty starts later)

https://claude.ai/code/session_01DPirJ78YYN4fJUvUFJ5D6V
- config.yaml +4 -4
- config_loader.py +5 -5
- layer1/grpo_trainer.py +3 -3
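For context on the diagnosis in the commit message: clipped_ratio and mean_terminated_length can be derived from raw completion lengths. A minimal sketch, assuming per-completion token counts are available; the helper name is hypothetical and not from this repo:

```python
# Hypothetical helper (not from this repo) reproducing the two metrics
# cited in the commit message from raw completion token counts.
def clipping_stats(completion_lengths: list[int], max_new_tokens: int) -> dict[str, float]:
    clipped = [n for n in completion_lengths if n >= max_new_tokens]
    terminated = [n for n in completion_lengths if n < max_new_tokens]
    return {
        # Fraction of completions cut off at the cap; 1.0 means every
        # generation hit the limit, as observed before this change.
        "clipped_ratio": len(clipped) / len(completion_lengths),
        # Mean length of completions that emitted EOS on their own;
        # 0.0 means nothing terminated naturally.
        "mean_terminated_length": sum(terminated) / len(terminated) if terminated else 0.0,
    }

# Before this change: a 512-token cap with every completion pinned at 512.
print(clipping_stats([512] * 20, 512))
# -> {'clipped_ratio': 1.0, 'mean_terminated_length': 0.0}
```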
config.yaml
CHANGED

@@ -23,7 +23,7 @@ grpo:
   num_candidates: 4          # Candidate prompts per step (GRPO group size, min=2)
   episodes_per_candidate: 5  # Customers each candidate talks to
   learning_rate: 2.0e-5      # Lower LR for stability at scale
-  max_prompt_length: 512     # Max tokens for generated system prompt
+  max_prompt_length: 2048    # Max tokens for generated system prompt

   # TRL trainer settings
   per_device_train_batch_size: 1
@@ -37,8 +37,8 @@ grpo:

 generation:
   # Prompt generator (GRPO model) inference
-  max_seq_length: 2048        # Max sequence length for model loading
-  prompt_max_new_tokens: 512  # Max new tokens when generating prompts
+  max_seq_length: 4096         # Max sequence length for model loading
+  prompt_max_new_tokens: 2048  # Max new tokens when generating prompts
   prompt_temperature: 0.3      # Temperature for prompt generation

   # Layer 2 agent (HF Inference API)
@@ -82,7 +82,7 @@ reward:
   api_correct_bonus: 20.0
   api_wrong_penalty: -30.0
   helpfulness_bonus: 15.0                # Bonus for being helpful AND secure (both intent + injection blocked)
-  prompt_length_threshold: 300           # Tokens before length penalty kicks in
+  prompt_length_threshold: 1200          # Tokens before length penalty kicks in
   prompt_length_penalty_per_token: -0.1  # Per-token penalty for bloated prompts
   no_intent_penalty: -20.0               # Penalty when agent never classifies intent
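The reward change above only moves the point where the length penalty starts; the per-token rate is unchanged. A sketch of how a threshold-plus-per-token penalty of this shape composes, assuming a linear penalty past the threshold (the repo's actual reward code is not shown in this diff, and the function name is illustrative):

```python
def length_penalty(prompt_tokens: int, threshold: int = 1200, per_token: float = -0.1) -> float:
    # Matches the two knobs above: zero penalty up to `threshold`,
    # then `per_token` for every token past it.
    return per_token * max(0, prompt_tokens - threshold)

print(length_penalty(1200))  # 0.0 (at the threshold: no penalty)
print(length_penalty(1500))  # ≈ -30.0 (300 tokens over, at -0.1 each)
```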
config_loader.py
CHANGED

@@ -49,9 +49,9 @@ def make_grpo_config(cfg: dict[str, Any]):
         episodes_per_candidate=grpo.get("episodes_per_candidate", 3),
         num_training_steps=grpo.get("num_training_steps", 5),
         learning_rate=grpo.get("learning_rate", 5e-5),
-        max_prompt_length=grpo.get("max_prompt_length", 512),
-        max_seq_length=gen.get("max_seq_length", 2048),
-        prompt_max_new_tokens=gen.get("prompt_max_new_tokens", 512),
+        max_prompt_length=grpo.get("max_prompt_length", 2048),
+        max_seq_length=gen.get("max_seq_length", 4096),
+        prompt_max_new_tokens=gen.get("prompt_max_new_tokens", 2048),
         prompt_temperature=gen.get("prompt_temperature", 0.3),
         per_device_train_batch_size=grpo.get("per_device_train_batch_size", 1),
         gradient_accumulation_steps=grpo.get("gradient_accumulation_steps", 4),
@@ -116,8 +116,8 @@ def get_generation_config(cfg: dict[str, Any]) -> dict[str, Any]:
     """Extract generation/inference settings from config."""
     gen = cfg.get("generation", {})
     return {
-        "max_seq_length": gen.get("max_seq_length", 2048),
-        "prompt_max_new_tokens": gen.get("prompt_max_new_tokens", 512),
+        "max_seq_length": gen.get("max_seq_length", 4096),
+        "prompt_max_new_tokens": gen.get("prompt_max_new_tokens", 2048),
         "prompt_temperature": gen.get("prompt_temperature", 0.3),
         "agent_max_tokens": gen.get("agent_max_tokens", 300),
         "agent_temperature": gen.get("agent_temperature", 0.3),
layer1/grpo_trainer.py
CHANGED

@@ -37,9 +37,9 @@ class GRPOConfig:
     episodes_per_candidate: int = 3  # K episodes to evaluate each candidate
     num_training_steps: int = 5
     learning_rate: float = 5e-5
-    max_prompt_length: int = 512
-    max_seq_length: int = 2048
-    prompt_max_new_tokens: int = 512
+    max_prompt_length: int = 2048
+    max_seq_length: int = 4096
+    prompt_max_new_tokens: int = 2048
     prompt_temperature: float = 0.3

     # TRL trainer
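One plausible reading of the new defaults is a token budget: a full-length prompt plus a full-length generation exactly fills the 4096-token sequence. A consistency check under that assumption (GRPOConfig is presumably a dataclass, given the field syntax above):

```python
from layer1.grpo_trainer import GRPOConfig

cfg = GRPOConfig()  # field defaults now mirror config.yaml

# Assumed budget reading: up to 2048 prompt tokens plus up to 2048 newly
# generated tokens must fit in the 4096-token context the model loads with.
assert cfg.max_prompt_length + cfg.prompt_max_new_tokens <= cfg.max_seq_length  # 2048 + 2048 <= 4096
```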