{ "experiment": "EXP-TRAIN-CDG", "total_iterations": 300, "training_time_hours": 19.159403832289907, "eval_interval": 100, "rolling_checkpoint_interval": 50, "pending_evaluations": 3, "eval_queue_file": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/metrics/pending_eval_checkpoints.jsonl", "best_model": { "iteration": 226, "training_reward": 0.9533333333333334, "path": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/best_model", "note": "Best by training reward; run EuroEval for validation score" }, "final_model": { "path": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/final_model" }, "cdg_metrics": { "total_episodes": 3200, "wrong_to_right": 723, "right_to_wrong": 317, "right_to_right": 1140, "wrong_to_wrong": 1020, "filtered_episodes": 1494, "avg_episode_quality": 0.78125 }, "eval_history": [], "config": { "cdg_config": { "enabled": true, "max_turns": 2, "helpful_critic_ratio": 0.5, "prover_correct_bonus": 0.3, "prover_weight": 1.0, "helpful_critic_weight": 0.5, "misleading_critic_weight": 0.3, "min_episode_quality": 0.2, "filter_trivial_episodes": true, "critique_max_tokens": 128, "revision_max_tokens": 256, "prover_temperature": 0.7, "critic_temperature": 0.9, "log_role_rewards": true, "log_state_transitions": true, "rest_filtering": { "enabled": true, "top_k_ratio": 0.8, "min_episodes_per_batch": 4 } }, "rl_config": { "output_dir": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg", "iterations": 300, "batch_size": 8, "learning_rate": 5e-06, "gradient_accumulation_steps": 4, "max_length": 512, "grpo_group_size": 4, "ppo_epochs": 4, "adap_kl_ctrl": true, "init_kl_coef": 0.1, "target": 0.1, "horizon": 10000, "kl_coef_min": 0.05, "kl_coef_max": 0.5, "advantage_clip": 3.0, "advantage_epsilon": 1e-06, "kl_decay": 0.95, "kl_growth": 1.2, "ref_update_interval": 1, "reward_weights": { "accuracy": 1.0, "fluency": 0.0, "reconstruction": 0.0, "novelty": 0.0, "grammar": 0.0, "grammar_enabled": false }, "adaptive_weights": true, "weight_update_interval": 100, "kl_penalty_coef": 0.08, "kl_penalty_warmup_steps": 500, "max_kl_divergence": 5.0, "entropy_penalty_weight": 0.01, "min_response_tokens": 10, "qa_min_response_tokens": 3, "qa_short_response_advantage_margin": -0.5, "skip_reference_kl": false, "pretraining_weight": 0.3, "pretraining_samples_per_batch": 2, "reward_clip_min": -1.0, "reward_clip_max": 1.0, "reward_norm_momentum": 0.1, "reward_downscale": 0.5, "generation": { "max_new_tokens": 256, "min_new_tokens": 5, "temperature": 0.7, "top_p": 0.9, "top_k": 50, "do_sample": true, "repetition_penalty": 1.1 } } }, "post_training_eval_required": true, "storage_note": "Only best_model and final_model saved (storage-efficient)" }