| { |
| "experiment": "EXP-TRAIN-CDG", |
| "total_iterations": 300, |
| "training_time_hours": 19.159403832289907, |
| "eval_interval": 100, |
| "rolling_checkpoint_interval": 50, |
| "pending_evaluations": 3, |
| "eval_queue_file": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/metrics/pending_eval_checkpoints.jsonl", |
| "best_model": { |
| "iteration": 226, |
| "training_reward": 0.9533333333333334, |
| "path": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/best_model", |
| "note": "Best by training reward; run EuroEval for validation score" |
| }, |
| "final_model": { |
| "path": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/final_model" |
| }, |
| "cdg_metrics": { |
| "total_episodes": 3200, |
| "wrong_to_right": 723, |
| "right_to_wrong": 317, |
| "right_to_right": 1140, |
| "wrong_to_wrong": 1020, |
| "filtered_episodes": 1494, |
| "avg_episode_quality": 0.78125 |
| }, |
| "eval_history": [], |
| "config": { |
| "cdg_config": { |
| "enabled": true, |
| "max_turns": 2, |
| "helpful_critic_ratio": 0.5, |
| "prover_correct_bonus": 0.3, |
| "prover_weight": 1.0, |
| "helpful_critic_weight": 0.5, |
| "misleading_critic_weight": 0.3, |
| "min_episode_quality": 0.2, |
| "filter_trivial_episodes": true, |
| "critique_max_tokens": 128, |
| "revision_max_tokens": 256, |
| "prover_temperature": 0.7, |
| "critic_temperature": 0.9, |
| "log_role_rewards": true, |
| "log_state_transitions": true, |
| "rest_filtering": { |
| "enabled": true, |
| "top_k_ratio": 0.8, |
| "min_episodes_per_batch": 4 |
| } |
| }, |
| "rl_config": { |
| "output_dir": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg", |
| "iterations": 300, |
| "batch_size": 8, |
| "learning_rate": 5e-06, |
| "gradient_accumulation_steps": 4, |
| "max_length": 512, |
| "grpo_group_size": 4, |
| "ppo_epochs": 4, |
| "adap_kl_ctrl": true, |
| "init_kl_coef": 0.1, |
| "target": 0.1, |
| "horizon": 10000, |
| "kl_coef_min": 0.05, |
| "kl_coef_max": 0.5, |
| "advantage_clip": 3.0, |
| "advantage_epsilon": 1e-06, |
| "kl_decay": 0.95, |
| "kl_growth": 1.2, |
| "ref_update_interval": 1, |
| "reward_weights": { |
| "accuracy": 1.0, |
| "fluency": 0.0, |
| "reconstruction": 0.0, |
| "novelty": 0.0, |
| "grammar": 0.0, |
| "grammar_enabled": false |
| }, |
| "adaptive_weights": true, |
| "weight_update_interval": 100, |
| "kl_penalty_coef": 0.08, |
| "kl_penalty_warmup_steps": 500, |
| "max_kl_divergence": 5.0, |
| "entropy_penalty_weight": 0.01, |
| "min_response_tokens": 10, |
| "qa_min_response_tokens": 3, |
| "qa_short_response_advantage_margin": -0.5, |
| "skip_reference_kl": false, |
| "pretraining_weight": 0.3, |
| "pretraining_samples_per_batch": 2, |
| "reward_clip_min": -1.0, |
| "reward_clip_max": 1.0, |
| "reward_norm_momentum": 0.1, |
| "reward_downscale": 0.5, |
| "generation": { |
| "max_new_tokens": 256, |
| "min_new_tokens": 5, |
| "temperature": 0.7, |
| "top_p": 0.9, |
| "top_k": 50, |
| "do_sample": true, |
| "repetition_penalty": 1.1 |
| } |
| } |
| }, |
| "post_training_eval_required": true, |
| "storage_note": "Only best_model and final_model saved (storage-efficient)" |
| } |