Qwen-2.5-7B-base-Simple-RL / trainer_state.json
kkish's picture
Model save
434c1cb verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9893390191897654,
"eval_steps": 100,
"global_step": 58,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 442.65068969726565,
"epoch": 0.08528784648187633,
"grad_norm": 6.35717248916626,
"kl": 0.0004929542541503906,
"learning_rate": 2.5e-06,
"loss": 0.0,
"reward": 0.41696430593729017,
"reward_std": 0.4544539041817188,
"rewards/accuracy_reward": 0.12834822051227093,
"rewards/format_reward": 0.28861608617007734,
"step": 5
},
{
"completion_length": 304.9187644958496,
"epoch": 0.17057569296375266,
"grad_norm": 2.1602344512939453,
"kl": 0.026971435546875,
"learning_rate": 2.956412726139078e-06,
"loss": 0.0011,
"reward": 0.986160759627819,
"reward_std": 0.3372926861047745,
"rewards/accuracy_reward": 0.11272321967408061,
"rewards/format_reward": 0.873437537252903,
"step": 10
},
{
"completion_length": 365.3803741455078,
"epoch": 0.255863539445629,
"grad_norm": 0.33471256494522095,
"kl": 0.03214263916015625,
"learning_rate": 2.7836719084521715e-06,
"loss": 0.0013,
"reward": 1.298883980512619,
"reward_std": 0.41679972484707833,
"rewards/accuracy_reward": 0.37522323429584503,
"rewards/format_reward": 0.923660759627819,
"step": 15
},
{
"completion_length": 378.46050758361815,
"epoch": 0.3411513859275053,
"grad_norm": 1.4841564893722534,
"kl": 0.0420074462890625,
"learning_rate": 2.4946839873611927e-06,
"loss": 0.0017,
"reward": 1.4825893670320511,
"reward_std": 0.4033116206526756,
"rewards/accuracy_reward": 0.5584821693599225,
"rewards/format_reward": 0.9241071850061416,
"step": 20
},
{
"completion_length": 394.2165367126465,
"epoch": 0.42643923240938164,
"grad_norm": 0.28560909628868103,
"kl": 0.0350738525390625,
"learning_rate": 2.1156192081791355e-06,
"loss": 0.0014,
"reward": 1.512500062584877,
"reward_std": 0.3465679492801428,
"rewards/accuracy_reward": 0.5779018126428127,
"rewards/format_reward": 0.9345982566475868,
"step": 25
},
{
"completion_length": 401.2672065734863,
"epoch": 0.511727078891258,
"grad_norm": 0.5306402444839478,
"kl": 0.034088134765625,
"learning_rate": 1.6808050203829845e-06,
"loss": 0.0014,
"reward": 1.5180804193019868,
"reward_std": 0.3442438319325447,
"rewards/accuracy_reward": 0.5779018104076385,
"rewards/format_reward": 0.940178607404232,
"step": 30
},
{
"completion_length": 393.07434768676757,
"epoch": 0.5970149253731343,
"grad_norm": 0.2967917323112488,
"kl": 0.0385345458984375,
"learning_rate": 1.2296174432791415e-06,
"loss": 0.0015,
"reward": 1.5225447058677672,
"reward_std": 0.34006611779332163,
"rewards/accuracy_reward": 0.5747768118977546,
"rewards/format_reward": 0.9477679014205933,
"step": 35
},
{
"completion_length": 390.0587242126465,
"epoch": 0.6823027718550106,
"grad_norm": 0.358806312084198,
"kl": 0.0369537353515625,
"learning_rate": 8.029152419343472e-07,
"loss": 0.0015,
"reward": 1.572321492433548,
"reward_std": 0.313805010356009,
"rewards/accuracy_reward": 0.6136160999536514,
"rewards/format_reward": 0.9587054014205932,
"step": 40
},
{
"completion_length": 418.40091094970705,
"epoch": 0.767590618336887,
"grad_norm": 0.2598155736923218,
"kl": 0.03328857421875,
"learning_rate": 4.3933982822017883e-07,
"loss": 0.0013,
"reward": 1.5649554222822188,
"reward_std": 0.3114229992032051,
"rewards/accuracy_reward": 0.6140625312924385,
"rewards/format_reward": 0.9508929029107094,
"step": 45
},
{
"completion_length": 422.549129486084,
"epoch": 0.8528784648187633,
"grad_norm": 0.2509433925151825,
"kl": 0.034161376953125,
"learning_rate": 1.718159615201853e-07,
"loss": 0.0014,
"reward": 1.5645089954137803,
"reward_std": 0.3242972683161497,
"rewards/accuracy_reward": 0.6129464544355869,
"rewards/format_reward": 0.9515625432133674,
"step": 50
},
{
"completion_length": 419.257829284668,
"epoch": 0.9381663113006397,
"grad_norm": 0.3123314082622528,
"kl": 0.0333709716796875,
"learning_rate": 2.4570139579284723e-08,
"loss": 0.0013,
"reward": 1.5895090013742448,
"reward_std": 0.326204876601696,
"rewards/accuracy_reward": 0.6330357410013676,
"rewards/format_reward": 0.9564732536673546,
"step": 55
},
{
"completion_length": 415.0018781026204,
"epoch": 0.9893390191897654,
"kl": 0.03472900390625,
"reward": 1.5885417411724727,
"reward_std": 0.29818206280469894,
"rewards/accuracy_reward": 0.6279762176175913,
"rewards/format_reward": 0.9605655198295912,
"step": 58,
"total_flos": 0.0,
"train_loss": 0.0013394545973730192,
"train_runtime": 6585.1769,
"train_samples_per_second": 1.139,
"train_steps_per_second": 0.009
}
],
"logging_steps": 5,
"max_steps": 58,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}