| # Rollout Correction: settings for correcting off-policy distribution shifts | |
| # Full documentation: docs/algo/rollout_corr.md | |
| # Presets are available: RolloutCorrectionConfig.decoupled_seq_is(), .bypass_pg_is(), etc. | |
| # rollout_is: importance-sampling (IS) aggregation level: null (disabled), "token" (per-token), "sequence" (per-sequence) | |
| rollout_is: null | |
| # rollout_is_threshold: upper threshold at which IS weights are truncated (typical: 2.0-5.0) | |
| rollout_is_threshold: 2.0 | |
| # rollout_rs: rejection-sampling (RS) aggregation level: null (disabled) or a level name such as "token_k1", "seq_sum_k1", "seq_mean_k3" | |
| rollout_rs: null | |
| # rollout_rs_threshold: threshold for rejection sampling; may be a string or a float (format described in the code docs) | |
| rollout_rs_threshold: null | |
| # bypass_mode: operating mode switch: false = Decoupled (3 policies), true = Bypass (2 policies) | |
| bypass_mode: false | |
| # loss_type: loss used when bypass mode is enabled (bypass_mode=true): | |
| # - "ppo_clip": PPO clipped objective; IS is handled by the ratio (default) | |
| # - "reinforce": REINFORCE with explicit IS weights and no PPO clipping | |
| loss_type: ppo_clip | |
| # rollout_is_batch_normalize: batch-normalize IS weights: false = use raw weights, true = normalize so the mean is 1.0 | |
| rollout_is_batch_normalize: false | |