# Phase 7 — full v4 dataset, lr=1e-5, K=16, NO data reuse.
#
# Pool = 473 prompts (v4, balanced trigger probes + rewritten swap_check).
# 16 prompts × 29 cycles = 464 samples (no reuse, 9 prompts unused).

# Run identity and output location.
run_name: drgrpo_p7_v4_lr1e5_b16k16
wandb_project: lora-oracles-posttrain
output_dir: checkpoints/drgrpo_p7_v4_lr1e5_b16k16
seed: 42

# Starting checkpoint and base model.
sft_checkpoint_repo: ceselder/loracle-pretrain-v7-sweep-A-oneq-final-step3120
base_model: /workspace/models/Qwen3-14B

# Training data.
prompts_parquet: data/posttrain_rl_v4/qa.parquet
holdout_ids_path: data/posttrain_rl_v4/holdout_ids.json
tokens_dir: data/posttrain_v3_tokens/direction_tokens_svd_fixed_k16_mag7_rankfirst

# Cycle sizing: n_prompts_per_cycle × n_cycles = 16 × 29 = 464 prompts drawn
# from the 473-prompt pool described in the header.
n_prompts_per_cycle: 16
k_rollouts: 16
temperature: 1.0
max_new_tokens: 300
n_cycles: 29

# Optimisation settings.
# NOTE(review): eps_low / eps_high look like asymmetric clipping bounds —
# confirm against the trainer before relying on that reading.
lr: 1.0e-5
eps_low: 0.2
eps_high: 0.28
max_grad_norm: 1.0
max_length: 5500
filter_min_max: 0.0
filter_min_std: 0.0
unbiased_advantages: true

# Prompt / direction-token layout.
use_system_prompt: false
prefix_mode: rank_tagged
top_k: 16
n_direction_tokens: 4480

# Judge used for scoring.
judge_provider: openrouter
judge_model: anthropic/claude-sonnet-4.6
judge_workers: 32
judge_max_retries: 4
judge_request_timeout_s: 60

# Checkpointing, logging and evaluation cadence.
# NOTE(review): with save_every = 8 and n_cycles = 29 the last periodic save
# would land on cycle 24 — confirm the trainer also saves at the end of the run.
save_every: 8
log_every: 1
eval_at_step_0: true
eval_every_cycles: 8  # 0/8/16/24, no post-eval
mid_train_eval_sets:
  - configs/eval_sets/auditbench.yaml
  - configs/eval_sets/trigger_recovery_heldout_ia.yaml
  - configs/eval_sets/ood_models_v3.yaml

# NOTE(review): post_eval is false, so eval_sets is presumably unused for this
# run — verify against the consumer.
post_eval: false
eval_sets:
  - configs/eval_sets/auditbench.yaml
  - configs/eval_sets/ood_models_v3.yaml
  - configs/eval_sets/trigger_recovery_heldout_ia.yaml