| # Phase 7 — full v4 dataset, lr=1e-5, K=16, NO data reuse. | |
| # | |
| # Pool = 473 prompts (v4, balanced trigger probes + rewritten swap_check). | |
| # 16 prompts × 29 cycles = 464 prompts drawn, each used once (no reuse; 9 of the 473 prompts unused). | |
| run_name: drgrpo_p7_v4_lr1e5_b16k16 | |
| wandb_project: lora-oracles-posttrain | |
| output_dir: checkpoints/drgrpo_p7_v4_lr1e5_b16k16 | |
| seed: 42 | |
| sft_checkpoint_repo: ceselder/loracle-pretrain-v7-sweep-A-oneq-final-step3120 | |
| base_model: /workspace/models/Qwen3-14B | |
| prompts_parquet: data/posttrain_rl_v4/qa.parquet | |
| holdout_ids_path: data/posttrain_rl_v4/holdout_ids.json | |
| tokens_dir: data/posttrain_v3_tokens/direction_tokens_svd_fixed_k16_mag7_rankfirst | |
| n_prompts_per_cycle: 16 | |
| k_rollouts: 16 | |
| temperature: 1.0 | |
| max_new_tokens: 300 | |
| n_cycles: 29 | |
| lr: 1.0e-5 | |
| eps_low: 0.2 | |
| eps_high: 0.28 | |
| max_grad_norm: 1.0 | |
| max_length: 5500 | |
| filter_min_max: 0.0 | |
| filter_min_std: 0.0 | |
| unbiased_advantages: true | |
| use_system_prompt: false | |
| prefix_mode: rank_tagged | |
| top_k: 16 | |
| n_direction_tokens: 4480 | |
| judge_provider: openrouter | |
| judge_model: anthropic/claude-sonnet-4.6 | |
| judge_workers: 32 | |
| judge_max_retries: 4 | |
| judge_request_timeout_s: 60 | |
| save_every: 8 | |
| log_every: 1 | |
| eval_at_step_0: true | |
| eval_every_cycles: 8 # evals run at cycles 0 (via eval_at_step_0)/8/16/24; no post-eval | |
| mid_train_eval_sets: | |
| - configs/eval_sets/auditbench.yaml | |
| - configs/eval_sets/trigger_recovery_heldout_ia.yaml | |
| - configs/eval_sets/ood_models_v3.yaml | |
| post_eval: false | |
| eval_sets: | |
| - configs/eval_sets/auditbench.yaml | |
| - configs/eval_sets/ood_models_v3.yaml | |
| - configs/eval_sets/trigger_recovery_heldout_ia.yaml | |