GRPO_Reward_Model / trainer_state.json
SaitejaJate's picture
Upload 14 files
f6b8251 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.380952380952381,
"eval_steps": 500,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 51.85,
"epoch": 0.07936507936507936,
"grad_norm": 2.2403488159179688,
"kl": 0.0027098871301859616,
"learning_rate": 2.6666666666666667e-07,
"loss": 0.0001,
"reward": 0.16500000339001417,
"reward_std": 0.11435296162962913,
"rewards/cbt_content_reward": 0.16500000339001417,
"rewards/check_cbt_structure": 0.0,
"step": 5
},
{
"completion_length": 58.675,
"epoch": 0.15873015873015872,
"grad_norm": 3.0842247009277344,
"kl": 0.002581492770696059,
"learning_rate": 6e-07,
"loss": 0.0001,
"reward": 0.18250000290572643,
"reward_std": 0.14434738419950008,
"rewards/cbt_content_reward": 0.18250000439584255,
"rewards/check_cbt_structure": 0.0,
"step": 10
},
{
"completion_length": 55.7,
"epoch": 0.23809523809523808,
"grad_norm": 1.792191982269287,
"kl": 0.0019858392188325524,
"learning_rate": 9.333333333333333e-07,
"loss": 0.0001,
"reward": 0.1250000039115548,
"reward_std": 0.1089518491178751,
"rewards/cbt_content_reward": 0.1250000039115548,
"rewards/check_cbt_structure": 0.0,
"step": 15
},
{
"completion_length": 73.025,
"epoch": 0.31746031746031744,
"grad_norm": 2.7672274112701416,
"kl": 0.0017400937096681446,
"learning_rate": 9.703703703703704e-07,
"loss": 0.0001,
"reward": 0.15250000283122062,
"reward_std": 0.1529246997088194,
"rewards/cbt_content_reward": 0.15250000432133676,
"rewards/check_cbt_structure": 0.0,
"step": 20
},
{
"completion_length": 52.825,
"epoch": 0.3968253968253968,
"grad_norm": 1.1795265674591064,
"kl": 0.0027943541877903043,
"learning_rate": 9.333333333333333e-07,
"loss": 0.0001,
"reward": 0.14500000327825546,
"reward_std": 0.10991083942353726,
"rewards/cbt_content_reward": 0.14500000178813935,
"rewards/check_cbt_structure": 0.0,
"step": 25
},
{
"completion_length": 53.3375,
"epoch": 0.47619047619047616,
"grad_norm": 2.038141965866089,
"kl": 0.002141835092334077,
"learning_rate": 8.962962962962963e-07,
"loss": 0.0001,
"reward": 0.1750000048428774,
"reward_std": 0.11487694047391414,
"rewards/cbt_content_reward": 0.17500000335276128,
"rewards/check_cbt_structure": 0.0,
"step": 30
},
{
"completion_length": 66.55,
"epoch": 0.5555555555555556,
"grad_norm": 2.512485980987549,
"kl": 0.0019801797869149597,
"learning_rate": 8.592592592592592e-07,
"loss": 0.0001,
"reward": 0.1575000027194619,
"reward_std": 0.1333638343960047,
"rewards/cbt_content_reward": 0.1575000027194619,
"rewards/check_cbt_structure": 0.0,
"step": 35
},
{
"completion_length": 56.8,
"epoch": 0.6349206349206349,
"grad_norm": 4.515315055847168,
"kl": 0.002533014601795003,
"learning_rate": 8.222222222222221e-07,
"loss": 0.0001,
"reward": 0.18250000663101673,
"reward_std": 0.1475393757224083,
"rewards/cbt_content_reward": 0.18250000514090062,
"rewards/check_cbt_structure": 0.0,
"step": 40
},
{
"completion_length": 64.275,
"epoch": 0.7142857142857143,
"grad_norm": 2.102088212966919,
"kl": 0.002205433923518285,
"learning_rate": 7.851851851851852e-07,
"loss": 0.0001,
"reward": 0.180000002682209,
"reward_std": 0.13480928540229797,
"rewards/cbt_content_reward": 0.180000002682209,
"rewards/check_cbt_structure": 0.0,
"step": 45
},
{
"completion_length": 47.3,
"epoch": 0.7936507936507936,
"grad_norm": 3.474039077758789,
"kl": 0.0022507158428197727,
"learning_rate": 7.481481481481481e-07,
"loss": 0.0001,
"reward": 0.15000000335276126,
"reward_std": 0.1302133210003376,
"rewards/cbt_content_reward": 0.15000000335276126,
"rewards/check_cbt_structure": 0.0,
"step": 50
},
{
"completion_length": 64.8125,
"epoch": 0.873015873015873,
"grad_norm": 2.440229654312134,
"kl": 0.00192810871230904,
"learning_rate": 7.111111111111111e-07,
"loss": 0.0001,
"reward": 0.16500000488013028,
"reward_std": 0.0976612851023674,
"rewards/cbt_content_reward": 0.16500000488013028,
"rewards/check_cbt_structure": 0.0,
"step": 55
},
{
"completion_length": 59.7375,
"epoch": 0.9523809523809523,
"grad_norm": 1.8490701913833618,
"kl": 0.002136132796294987,
"learning_rate": 6.74074074074074e-07,
"loss": 0.0001,
"reward": 0.17750000581145287,
"reward_std": 0.14892779104411602,
"rewards/cbt_content_reward": 0.17750000432133675,
"rewards/check_cbt_structure": 0.0,
"step": 60
},
{
"completion_length": 54.7125,
"epoch": 1.0317460317460316,
"grad_norm": 2.5725176334381104,
"kl": 0.002353719263919629,
"learning_rate": 6.37037037037037e-07,
"loss": 0.0001,
"reward": 0.1800000037997961,
"reward_std": 0.12799057997763158,
"rewards/cbt_content_reward": 0.18000000230968,
"rewards/check_cbt_structure": 0.0,
"step": 65
},
{
"completion_length": 66.5625,
"epoch": 1.1111111111111112,
"grad_norm": 1.7574554681777954,
"kl": 0.00198215174023062,
"learning_rate": 6e-07,
"loss": 0.0001,
"reward": 0.22500000521540642,
"reward_std": 0.12019186988472938,
"rewards/cbt_content_reward": 0.2250000037252903,
"rewards/check_cbt_structure": 0.0,
"step": 70
},
{
"completion_length": 58.825,
"epoch": 1.1904761904761905,
"grad_norm": 2.7693963050842285,
"kl": 0.002443282786407508,
"learning_rate": 5.62962962962963e-07,
"loss": 0.0001,
"reward": 0.1500000048428774,
"reward_std": 0.13859827741980552,
"rewards/cbt_content_reward": 0.15000000186264514,
"rewards/check_cbt_structure": 0.0,
"step": 75
},
{
"completion_length": 63.875,
"epoch": 1.2698412698412698,
"grad_norm": 2.8838014602661133,
"kl": 0.002126309886807576,
"learning_rate": 5.259259259259258e-07,
"loss": 0.0001,
"reward": 0.1775000037625432,
"reward_std": 0.11867224015295505,
"rewards/cbt_content_reward": 0.1775000037625432,
"rewards/check_cbt_structure": 0.0,
"step": 80
},
{
"completion_length": 53.85,
"epoch": 1.3492063492063493,
"grad_norm": 3.608692169189453,
"kl": 0.002317077317275107,
"learning_rate": 4.888888888888889e-07,
"loss": 0.0001,
"reward": 0.18250000216066836,
"reward_std": 0.1294781118631363,
"rewards/cbt_content_reward": 0.18250000514090062,
"rewards/check_cbt_structure": 0.0,
"step": 85
},
{
"completion_length": 56.3375,
"epoch": 1.4285714285714286,
"grad_norm": 2.851497173309326,
"kl": 0.0024501425621565433,
"learning_rate": 4.5185185185185183e-07,
"loss": 0.0001,
"reward": 0.1675000036135316,
"reward_std": 0.11531616114079953,
"rewards/cbt_content_reward": 0.1675000036135316,
"rewards/check_cbt_structure": 0.0,
"step": 90
},
{
"completion_length": 54.6625,
"epoch": 1.507936507936508,
"grad_norm": 2.257418632507324,
"kl": 0.002075143059482798,
"learning_rate": 4.1481481481481476e-07,
"loss": 0.0001,
"reward": 0.14500000271946192,
"reward_std": 0.12950329035520552,
"rewards/cbt_content_reward": 0.14500000271946192,
"rewards/check_cbt_structure": 0.0,
"step": 95
},
{
"completion_length": 60.6,
"epoch": 1.5873015873015874,
"grad_norm": 2.6931891441345215,
"kl": 0.002077191596617922,
"learning_rate": 3.7777777777777775e-07,
"loss": 0.0001,
"reward": 0.13250000309199095,
"reward_std": 0.13716318383812903,
"rewards/cbt_content_reward": 0.13250000458210706,
"rewards/check_cbt_structure": 0.0,
"step": 100
},
{
"completion_length": 78.9,
"epoch": 1.6666666666666665,
"grad_norm": 2.117309808731079,
"kl": 0.0018815583549439906,
"learning_rate": 3.407407407407407e-07,
"loss": 0.0001,
"reward": 0.21000000461935997,
"reward_std": 0.14268166311085223,
"rewards/cbt_content_reward": 0.21000000461935997,
"rewards/check_cbt_structure": 0.0,
"step": 105
},
{
"completion_length": 54.3125,
"epoch": 1.746031746031746,
"grad_norm": 2.469907760620117,
"kl": 0.002604751317994669,
"learning_rate": 3.037037037037037e-07,
"loss": 0.0001,
"reward": 0.21750000603497027,
"reward_std": 0.1493647824972868,
"rewards/cbt_content_reward": 0.21750000603497027,
"rewards/check_cbt_structure": 0.0,
"step": 110
},
{
"completion_length": 56.85,
"epoch": 1.8253968253968254,
"grad_norm": 2.3624932765960693,
"kl": 0.002011225459864363,
"learning_rate": 2.6666666666666667e-07,
"loss": 0.0001,
"reward": 0.1475000012665987,
"reward_std": 0.12692904993891715,
"rewards/cbt_content_reward": 0.14750000275671482,
"rewards/check_cbt_structure": 0.0,
"step": 115
},
{
"completion_length": 55.625,
"epoch": 1.9047619047619047,
"grad_norm": 3.7563695907592773,
"kl": 0.002532219042768702,
"learning_rate": 2.296296296296296e-07,
"loss": 0.0001,
"reward": 0.16250000335276127,
"reward_std": 0.11915707401931286,
"rewards/cbt_content_reward": 0.16250000037252904,
"rewards/check_cbt_structure": 0.0,
"step": 120
},
{
"completion_length": 57.4625,
"epoch": 1.9841269841269842,
"grad_norm": 2.7810513973236084,
"kl": 0.002170709293568507,
"learning_rate": 1.9259259259259257e-07,
"loss": 0.0001,
"reward": 0.15750000439584255,
"reward_std": 0.1208796363323927,
"rewards/cbt_content_reward": 0.15750000439584255,
"rewards/check_cbt_structure": 0.0,
"step": 125
},
{
"completion_length": 49.35,
"epoch": 2.0634920634920633,
"grad_norm": 2.166604518890381,
"kl": 0.0023496907786466183,
"learning_rate": 1.5555555555555556e-07,
"loss": 0.0001,
"reward": 0.1425000036135316,
"reward_std": 0.14228889718651772,
"rewards/cbt_content_reward": 0.1425000036135316,
"rewards/check_cbt_structure": 0.0,
"step": 130
},
{
"completion_length": 63.175,
"epoch": 2.142857142857143,
"grad_norm": 3.5904221534729004,
"kl": 0.002583104814402759,
"learning_rate": 1.1851851851851851e-07,
"loss": 0.0001,
"reward": 0.20750000271946192,
"reward_std": 0.12724252939224243,
"rewards/cbt_content_reward": 0.20750000420957804,
"rewards/check_cbt_structure": 0.0,
"step": 135
},
{
"completion_length": 57.625,
"epoch": 2.2222222222222223,
"grad_norm": 2.712351083755493,
"kl": 0.0022835541458334774,
"learning_rate": 8.148148148148149e-08,
"loss": 0.0001,
"reward": 0.1375000037252903,
"reward_std": 0.118898082152009,
"rewards/cbt_content_reward": 0.13750000223517417,
"rewards/check_cbt_structure": 0.0,
"step": 140
},
{
"completion_length": 52.4375,
"epoch": 2.3015873015873014,
"grad_norm": 3.027590751647949,
"kl": 0.0020462898130062966,
"learning_rate": 4.444444444444444e-08,
"loss": 0.0001,
"reward": 0.1750000027939677,
"reward_std": 0.11545386202633381,
"rewards/cbt_content_reward": 0.1750000027939677,
"rewards/check_cbt_structure": 0.0,
"step": 145
},
{
"completion_length": 70.325,
"epoch": 2.380952380952381,
"grad_norm": 3.1476638317108154,
"kl": 0.0017971335852053016,
"learning_rate": 7.407407407407407e-09,
"loss": 0.0001,
"reward": 0.19500000346451998,
"reward_std": 0.12282740026712417,
"rewards/cbt_content_reward": 0.19500000346451998,
"rewards/check_cbt_structure": 0.0,
"step": 150
}
],
"logging_steps": 5,
"max_steps": 150,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}