# File: arithmetic-grpo/verl/trainer/config/algorithm/rollout_correction.yaml
# Last commit: 1faccd4 "initial clean commit" (LeTue09)
# Rollout correction: settings for correcting off-policy distribution shift.
# Abbreviations used below: IS = importance sampling, RS = rejection sampling.
# Full documentation: docs/algo/rollout_corr.md
# Ready-made presets exist in code, e.g.
# RolloutCorrectionConfig.decoupled_seq_is() and
# RolloutCorrectionConfig.bypass_pg_is().
# As shipped here, both rollout_is and rollout_rs are null (disabled).
#
# Aggregation level for IS weights:
# - null: IS weighting disabled
# - "token": one weight per token
# - "sequence": one weight per sequence
rollout_is: null
# Upper bound at which IS weights are truncated (typical range: 2.0-5.0).
# NOTE(review): presumably only has an effect when rollout_is is not null;
# confirm against the consuming code.
rollout_is_threshold: 2.0
# Aggregation level for RS: null disables it; otherwise a mode name such as
# "token_k1", "seq_sum_k1" or "seq_mean_k3" (the examples given here; see the
# documentation for the full set of accepted values).
rollout_rs: null
# Threshold used by rejection sampling. Accepts a string or a float; the
# accepted formats are described in the code documentation.
# NOTE(review): presumably must be set when rollout_rs is enabled; confirm.
rollout_rs_threshold: null
# Operating mode:
# - false: Decoupled mode (3 policies)
# - true: Bypass mode (2 policies)
bypass_mode: false
# Loss type used in bypass mode (bypass_mode=true):
# - "ppo_clip": PPO clipped objective; IS is handled by the ratio (default)
# - "reinforce": REINFORCE with explicit IS weights, without PPO clipping
# NOTE(review): presumably ignored when bypass_mode is false; confirm.
loss_type: ppo_clip
# Batch normalization of IS weights:
# - false: use the raw weights
# - true: normalize the weights so that their mean is 1.0
rollout_is_batch_normalize: false