acbueff
/

exp-train-cdg

Model card Files Files and versions

exp-train-cdg / training_summary.json

acbueff's picture

Upload experiment exp_train_cdg

3e81abb verified 10 days ago

history blame contribute delete

3.37 kB

	{
	"experiment": "EXP-TRAIN-CDG",
	"total_iterations": 300,
	"training_time_hours": 19.159403832289907,
	"eval_interval": 100,
	"rolling_checkpoint_interval": 50,
	"pending_evaluations": 3,
	"eval_queue_file": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/metrics/pending_eval_checkpoints.jsonl",
	"best_model": {
	"iteration": 226,
	"training_reward": 0.9533333333333334,
	"path": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/best_model",
	"note": "Best by training reward; run EuroEval for validation score"
	},
	"final_model": {
	"path": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/final_model"
	},
	"cdg_metrics": {
	"total_episodes": 3200,
	"wrong_to_right": 723,
	"right_to_wrong": 317,
	"right_to_right": 1140,
	"wrong_to_wrong": 1020,
	"filtered_episodes": 1494,
	"avg_episode_quality": 0.78125
	},
	"eval_history": [],
	"config": {
	"cdg_config": {
	"enabled": true,
	"max_turns": 2,
	"helpful_critic_ratio": 0.5,
	"prover_correct_bonus": 0.3,
	"prover_weight": 1.0,
	"helpful_critic_weight": 0.5,
	"misleading_critic_weight": 0.3,
	"min_episode_quality": 0.2,
	"filter_trivial_episodes": true,
	"critique_max_tokens": 128,
	"revision_max_tokens": 256,
	"prover_temperature": 0.7,
	"critic_temperature": 0.9,
	"log_role_rewards": true,
	"log_state_transitions": true,
	"rest_filtering": {
	"enabled": true,
	"top_k_ratio": 0.8,
	"min_episodes_per_batch": 4
	}
	},
	"rl_config": {
	"output_dir": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg",
	"iterations": 300,
	"batch_size": 8,
	"learning_rate": 5e-06,
	"gradient_accumulation_steps": 4,
	"max_length": 512,
	"grpo_group_size": 4,
	"ppo_epochs": 4,
	"adap_kl_ctrl": true,
	"init_kl_coef": 0.1,
	"target": 0.1,
	"horizon": 10000,
	"kl_coef_min": 0.05,
	"kl_coef_max": 0.5,
	"advantage_clip": 3.0,
	"advantage_epsilon": 1e-06,
	"kl_decay": 0.95,
	"kl_growth": 1.2,
	"ref_update_interval": 1,
	"reward_weights": {
	"accuracy": 1.0,
	"fluency": 0.0,
	"reconstruction": 0.0,
	"novelty": 0.0,
	"grammar": 0.0,
	"grammar_enabled": false
	},
	"adaptive_weights": true,
	"weight_update_interval": 100,
	"kl_penalty_coef": 0.08,
	"kl_penalty_warmup_steps": 500,
	"max_kl_divergence": 5.0,
	"entropy_penalty_weight": 0.01,
	"min_response_tokens": 10,
	"qa_min_response_tokens": 3,
	"qa_short_response_advantage_margin": -0.5,
	"skip_reference_kl": false,
	"pretraining_weight": 0.3,
	"pretraining_samples_per_batch": 2,
	"reward_clip_min": -1.0,
	"reward_clip_max": 1.0,
	"reward_norm_momentum": 0.1,
	"reward_downscale": 0.5,
	"generation": {
	"max_new_tokens": 256,
	"min_new_tokens": 5,
	"temperature": 0.7,
	"top_p": 0.9,
	"top_k": 50,
	"do_sample": true,
	"repetition_penalty": 1.1
	}
	}
	},
	"post_training_eval_required": true,
	"storage_note": "Only best_model and final_model saved (storage-efficient)"
	}