exp-train-cdg / training_summary.json
acbueff's picture
Upload experiment exp_train_cdg
3e81abb verified
{
"experiment": "EXP-TRAIN-CDG",
"total_iterations": 300,
"training_time_hours": 19.159403832289907,
"eval_interval": 100,
"rolling_checkpoint_interval": 50,
"pending_evaluations": 3,
"eval_queue_file": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/metrics/pending_eval_checkpoints.jsonl",
"best_model": {
"iteration": 226,
"training_reward": 0.9533333333333334,
"path": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/best_model",
"note": "Best by training reward; run EuroEval for validation score"
},
"final_model": {
"path": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/final_model"
},
"cdg_metrics": {
"total_episodes": 3200,
"wrong_to_right": 723,
"right_to_wrong": 317,
"right_to_right": 1140,
"wrong_to_wrong": 1020,
"filtered_episodes": 1494,
"avg_episode_quality": 0.78125
},
"eval_history": [],
"config": {
"cdg_config": {
"enabled": true,
"max_turns": 2,
"helpful_critic_ratio": 0.5,
"prover_correct_bonus": 0.3,
"prover_weight": 1.0,
"helpful_critic_weight": 0.5,
"misleading_critic_weight": 0.3,
"min_episode_quality": 0.2,
"filter_trivial_episodes": true,
"critique_max_tokens": 128,
"revision_max_tokens": 256,
"prover_temperature": 0.7,
"critic_temperature": 0.9,
"log_role_rewards": true,
"log_state_transitions": true,
"rest_filtering": {
"enabled": true,
"top_k_ratio": 0.8,
"min_episodes_per_batch": 4
}
},
"rl_config": {
"output_dir": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg",
"iterations": 300,
"batch_size": 8,
"learning_rate": 5e-06,
"gradient_accumulation_steps": 4,
"max_length": 512,
"grpo_group_size": 4,
"ppo_epochs": 4,
"adap_kl_ctrl": true,
"init_kl_coef": 0.1,
"target": 0.1,
"horizon": 10000,
"kl_coef_min": 0.05,
"kl_coef_max": 0.5,
"advantage_clip": 3.0,
"advantage_epsilon": 1e-06,
"kl_decay": 0.95,
"kl_growth": 1.2,
"ref_update_interval": 1,
"reward_weights": {
"accuracy": 1.0,
"fluency": 0.0,
"reconstruction": 0.0,
"novelty": 0.0,
"grammar": 0.0,
"grammar_enabled": false
},
"adaptive_weights": true,
"weight_update_interval": 100,
"kl_penalty_coef": 0.08,
"kl_penalty_warmup_steps": 500,
"max_kl_divergence": 5.0,
"entropy_penalty_weight": 0.01,
"min_response_tokens": 10,
"qa_min_response_tokens": 3,
"qa_short_response_advantage_margin": -0.5,
"skip_reference_kl": false,
"pretraining_weight": 0.3,
"pretraining_samples_per_batch": 2,
"reward_clip_min": -1.0,
"reward_clip_max": 1.0,
"reward_norm_momentum": 0.1,
"reward_downscale": 0.5,
"generation": {
"max_new_tokens": 256,
"min_new_tokens": 5,
"temperature": 0.7,
"top_p": 0.9,
"top_k": 50,
"do_sample": true,
"repetition_penalty": 1.1
}
}
},
"post_training_eval_required": true,
"storage_note": "Only best_model and final_model saved (storage-efficient)"
}