math_base_spg_mix / checkpoint-200 /trainer_state.json
kaiyuanzh's picture
Training in progress, step 200, checkpoint
3d706de verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.10666666666666667,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 253.59375,
"epoch": 0.0005333333333333334,
"grad_norm": 0.1841306984424591,
"learning_rate": 3e-06,
"loss": 0.0686,
"policy/loss": -0.013315461575984955,
"reward": 1.3489583730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.640625,
"rewards/correctness_reward_func_math": 0.7083333283662796,
"step": 1
},
{
"epoch": 0.0010666666666666667,
"grad_norm": 0.21774785220623016,
"learning_rate": 3e-06,
"loss": -0.0802,
"policy/loss": 0.0014063939452171326,
"step": 2
},
{
"epoch": 0.0016,
"grad_norm": 0.18103685975074768,
"learning_rate": 3e-06,
"loss": 0.0684,
"policy/loss": -0.012406323105096817,
"step": 3
},
{
"epoch": 0.0021333333333333334,
"grad_norm": 0.2169843465089798,
"learning_rate": 3e-06,
"loss": -0.0799,
"policy/loss": 0.00319121778011322,
"step": 4
},
{
"completion_length": 232.2604217529297,
"epoch": 0.0026666666666666666,
"grad_norm": 0.14396080374717712,
"learning_rate": 3e-06,
"loss": -0.0081,
"policy/loss": 0.18358719907701015,
"reward": 1.2916666865348816,
"rewards/boxed_and_answer_tags_format_reward": 0.6458333432674408,
"rewards/correctness_reward_func_math": 0.6458333432674408,
"step": 5
},
{
"epoch": 0.0032,
"grad_norm": 0.1440899670124054,
"learning_rate": 3e-06,
"loss": -0.1113,
"policy/loss": -0.014110475778579712,
"step": 6
},
{
"epoch": 0.0037333333333333333,
"grad_norm": 0.1543431133031845,
"learning_rate": 3e-06,
"loss": -0.009,
"policy/loss": 0.18367187306284904,
"step": 7
},
{
"epoch": 0.004266666666666667,
"grad_norm": 0.18800058960914612,
"learning_rate": 3e-06,
"loss": -0.1117,
"policy/loss": -0.01380608044564724,
"step": 8
},
{
"completion_length": 242.7291717529297,
"epoch": 0.0048,
"grad_norm": 0.23804515600204468,
"learning_rate": 3e-06,
"loss": -0.0155,
"policy/loss": -0.08487206892361598,
"reward": 1.057291716337204,
"rewards/boxed_and_answer_tags_format_reward": 0.5572916567325592,
"rewards/correctness_reward_func_math": 0.4999999850988388,
"step": 9
},
{
"epoch": 0.005333333333333333,
"grad_norm": 0.17962512373924255,
"learning_rate": 3e-06,
"loss": 0.0122,
"policy/loss": 0.1790632456280008,
"step": 10
},
{
"epoch": 0.005866666666666667,
"grad_norm": 0.16098381578922272,
"learning_rate": 3e-06,
"loss": -0.016,
"policy/loss": -0.08530520830174737,
"step": 11
},
{
"epoch": 0.0064,
"grad_norm": 0.19069895148277283,
"learning_rate": 3e-06,
"loss": 0.0123,
"policy/loss": 0.1812864543667274,
"step": 12
},
{
"completion_length": 232.53125,
"epoch": 0.006933333333333333,
"grad_norm": 0.17127424478530884,
"learning_rate": 3e-06,
"loss": 0.0568,
"policy/loss": -0.03765931725502014,
"reward": 1.0520833730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.6354166567325592,
"rewards/correctness_reward_func_math": 0.4166666716337204,
"step": 13
},
{
"epoch": 0.007466666666666667,
"grad_norm": 0.18777285516262054,
"learning_rate": 3e-06,
"loss": 0.2118,
"policy/loss": -0.09204106219112873,
"step": 14
},
{
"epoch": 0.008,
"grad_norm": 0.18267256021499634,
"learning_rate": 3e-06,
"loss": 0.057,
"policy/loss": -0.03848084807395935,
"step": 15
},
{
"epoch": 0.008533333333333334,
"grad_norm": 0.1676584780216217,
"learning_rate": 3e-06,
"loss": 0.2136,
"policy/loss": -0.09104587137699127,
"step": 16
},
{
"completion_length": 249.9375,
"epoch": 0.009066666666666667,
"grad_norm": 0.15089137852191925,
"learning_rate": 3e-06,
"loss": 0.0501,
"policy/loss": -0.1889983732253313,
"reward": 1.2135416865348816,
"rewards/boxed_and_answer_tags_format_reward": 0.5885416865348816,
"rewards/correctness_reward_func_math": 0.6250000149011612,
"step": 17
},
{
"epoch": 0.0096,
"grad_norm": 0.16881856322288513,
"learning_rate": 3e-06,
"loss": 0.0436,
"policy/loss": 0.27584413066506386,
"step": 18
},
{
"epoch": 0.010133333333333333,
"grad_norm": 0.14319613575935364,
"learning_rate": 3e-06,
"loss": 0.0499,
"policy/loss": -0.18909327313303947,
"step": 19
},
{
"epoch": 0.010666666666666666,
"grad_norm": 0.2733864188194275,
"learning_rate": 3e-06,
"loss": 0.0433,
"policy/loss": 0.27555806189775467,
"step": 20
},
{
"completion_length": 245.7604217529297,
"epoch": 0.0112,
"grad_norm": 0.3413054347038269,
"learning_rate": 3e-06,
"loss": 0.0222,
"policy/loss": 0.36454475536381636,
"reward": 1.1666667461395264,
"rewards/boxed_and_answer_tags_format_reward": 0.6041666865348816,
"rewards/correctness_reward_func_math": 0.5625,
"step": 21
},
{
"epoch": 0.011733333333333333,
"grad_norm": 0.21312959492206573,
"learning_rate": 3e-06,
"loss": 0.0843,
"policy/loss": 0.15956711052950823,
"step": 22
},
{
"epoch": 0.012266666666666667,
"grad_norm": 0.35879793763160706,
"learning_rate": 3e-06,
"loss": 0.0218,
"policy/loss": 0.3650218606836688,
"step": 23
},
{
"epoch": 0.0128,
"grad_norm": 0.18096353113651276,
"learning_rate": 3e-06,
"loss": 0.0843,
"policy/loss": 0.1569992530010822,
"step": 24
},
{
"completion_length": 242.43750762939453,
"epoch": 0.013333333333333334,
"grad_norm": 0.16766154766082764,
"learning_rate": 3e-06,
"loss": 0.0118,
"policy/loss": 0.48417001962661743,
"reward": 1.0781250596046448,
"rewards/boxed_and_answer_tags_format_reward": 0.578125,
"rewards/correctness_reward_func_math": 0.5,
"step": 25
},
{
"epoch": 0.013866666666666666,
"grad_norm": 0.2944582402706146,
"learning_rate": 3e-06,
"loss": 0.1917,
"policy/loss": 0.01044890284538269,
"step": 26
},
{
"epoch": 0.0144,
"grad_norm": 0.2324495017528534,
"learning_rate": 3e-06,
"loss": 0.012,
"policy/loss": 0.48227669298648834,
"step": 27
},
{
"epoch": 0.014933333333333333,
"grad_norm": 0.2566571533679962,
"learning_rate": 3e-06,
"loss": 0.1918,
"policy/loss": 0.010879706591367722,
"step": 28
},
{
"completion_length": 244.61458587646484,
"epoch": 0.015466666666666667,
"grad_norm": 0.21410726010799408,
"learning_rate": 3e-06,
"loss": -0.026,
"policy/loss": 0.1202235990203917,
"reward": 1.0520833730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.59375,
"rewards/correctness_reward_func_math": 0.4583333283662796,
"step": 29
},
{
"epoch": 0.016,
"grad_norm": 0.1583758145570755,
"learning_rate": 3e-06,
"loss": -0.0577,
"policy/loss": -0.05267565557733178,
"step": 30
},
{
"epoch": 0.016533333333333334,
"grad_norm": 0.22393964231014252,
"learning_rate": 3e-06,
"loss": -0.0266,
"policy/loss": 0.11891656881198287,
"step": 31
},
{
"epoch": 0.017066666666666667,
"grad_norm": 0.16789793968200684,
"learning_rate": 3e-06,
"loss": -0.057,
"policy/loss": -0.05271946266293526,
"step": 32
},
{
"completion_length": 245.89583587646484,
"epoch": 0.0176,
"grad_norm": 0.16854967176914215,
"learning_rate": 3e-06,
"loss": -0.0832,
"policy/loss": -0.1320323795080185,
"reward": 0.8854166865348816,
"rewards/boxed_and_answer_tags_format_reward": 0.59375,
"rewards/correctness_reward_func_math": 0.2916666716337204,
"step": 33
},
{
"epoch": 0.018133333333333335,
"grad_norm": 0.3621562421321869,
"learning_rate": 3e-06,
"loss": -0.129,
"policy/loss": -0.5422025052830577,
"step": 34
},
{
"epoch": 0.018666666666666668,
"grad_norm": 0.14944235980510712,
"learning_rate": 3e-06,
"loss": -0.0835,
"policy/loss": -0.13186319917440414,
"step": 35
},
{
"epoch": 0.0192,
"grad_norm": 0.4199303984642029,
"learning_rate": 3e-06,
"loss": -0.1291,
"policy/loss": -0.5477359890937805,
"step": 36
},
{
"completion_length": 246.28125762939453,
"epoch": 0.019733333333333332,
"grad_norm": 0.14028505980968475,
"learning_rate": 3e-06,
"loss": 0.0285,
"policy/loss": 0.24223446287214756,
"reward": 0.9479166865348816,
"rewards/boxed_and_answer_tags_format_reward": 0.65625,
"rewards/correctness_reward_func_math": 0.2916666567325592,
"step": 37
},
{
"epoch": 0.020266666666666665,
"grad_norm": 0.1633375883102417,
"learning_rate": 3e-06,
"loss": 0.027,
"policy/loss": -0.2900683730840683,
"step": 38
},
{
"epoch": 0.0208,
"grad_norm": 0.14287146925926208,
"learning_rate": 3e-06,
"loss": 0.0284,
"policy/loss": 0.24034919310361147,
"step": 39
},
{
"epoch": 0.021333333333333333,
"grad_norm": 0.16466538608074188,
"learning_rate": 3e-06,
"loss": 0.0254,
"policy/loss": -0.29295632243156433,
"step": 40
},
{
"completion_length": 239.96875762939453,
"epoch": 0.021866666666666666,
"grad_norm": 0.15317575633525848,
"learning_rate": 3e-06,
"loss": -0.0366,
"policy/loss": 0.10384738075332933,
"reward": 1.0468750596046448,
"rewards/boxed_and_answer_tags_format_reward": 0.609375,
"rewards/correctness_reward_func_math": 0.4375000149011612,
"step": 41
},
{
"epoch": 0.0224,
"grad_norm": 0.19731628894805908,
"learning_rate": 3e-06,
"loss": -0.1193,
"policy/loss": 0.00456411417428626,
"step": 42
},
{
"epoch": 0.022933333333333333,
"grad_norm": 0.14448165893554688,
"learning_rate": 3e-06,
"loss": -0.0383,
"policy/loss": 0.10382469364986235,
"step": 43
},
{
"epoch": 0.023466666666666667,
"grad_norm": 0.1933155655860901,
"learning_rate": 3e-06,
"loss": -0.1191,
"policy/loss": 0.005274432977692811,
"step": 44
},
{
"completion_length": 244.89583587646484,
"epoch": 0.024,
"grad_norm": 0.24117735028266907,
"learning_rate": 3e-06,
"loss": 0.0938,
"policy/loss": -0.2796844388358295,
"reward": 1.1666666865348816,
"rewards/boxed_and_answer_tags_format_reward": 0.625,
"rewards/correctness_reward_func_math": 0.5416666716337204,
"step": 45
},
{
"epoch": 0.024533333333333334,
"grad_norm": 0.24326607584953308,
"learning_rate": 3e-06,
"loss": -0.1093,
"policy/loss": -0.23608368262648582,
"step": 46
},
{
"epoch": 0.025066666666666668,
"grad_norm": 0.2464676797389984,
"learning_rate": 3e-06,
"loss": 0.094,
"policy/loss": -0.28131480794399977,
"step": 47
},
{
"epoch": 0.0256,
"grad_norm": 0.15732921659946442,
"learning_rate": 3e-06,
"loss": -0.1101,
"policy/loss": -0.2395353652536869,
"step": 48
},
{
"completion_length": 249.1979217529297,
"epoch": 0.026133333333333335,
"grad_norm": 0.41586366295814514,
"learning_rate": 3e-06,
"loss": 0.0274,
"policy/loss": -0.047684632387245074,
"reward": 1.0937500298023224,
"rewards/boxed_and_answer_tags_format_reward": 0.6145833134651184,
"rewards/correctness_reward_func_math": 0.4791666641831398,
"step": 49
},
{
"epoch": 0.02666666666666667,
"grad_norm": 0.17614451050758362,
"learning_rate": 3e-06,
"loss": -0.0918,
"policy/loss": -0.13479230320081115,
"step": 50
},
{
"epoch": 0.0272,
"grad_norm": 0.1597583144903183,
"learning_rate": 3e-06,
"loss": 0.027,
"policy/loss": -0.05036866711452603,
"step": 51
},
{
"epoch": 0.027733333333333332,
"grad_norm": 0.1461249440908432,
"learning_rate": 3e-06,
"loss": -0.0923,
"policy/loss": -0.13541921973228455,
"step": 52
},
{
"completion_length": 244.4479217529297,
"epoch": 0.028266666666666666,
"grad_norm": 0.1617969125509262,
"learning_rate": 3e-06,
"loss": -0.0326,
"policy/loss": -0.05200636462233632,
"reward": 1.3802083730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.5885416567325592,
"rewards/correctness_reward_func_math": 0.7916666567325592,
"step": 53
},
{
"epoch": 0.0288,
"grad_norm": 0.14798980951309204,
"learning_rate": 3e-06,
"loss": -0.0111,
"policy/loss": 0.018518115793725087,
"step": 54
},
{
"epoch": 0.029333333333333333,
"grad_norm": 0.1704786717891693,
"learning_rate": 3e-06,
"loss": -0.0323,
"policy/loss": -0.05232445967863697,
"step": 55
},
{
"epoch": 0.029866666666666666,
"grad_norm": 0.15073394775390625,
"learning_rate": 3e-06,
"loss": -0.0116,
"policy/loss": 0.01774279534570411,
"step": 56
},
{
"completion_length": 240.375,
"epoch": 0.0304,
"grad_norm": 0.18698211014270782,
"learning_rate": 3e-06,
"loss": -0.0838,
"policy/loss": -0.04460075659737939,
"reward": 1.4322916865348816,
"rewards/boxed_and_answer_tags_format_reward": 0.6197916567325592,
"rewards/correctness_reward_func_math": 0.8125,
"step": 57
},
{
"epoch": 0.030933333333333334,
"grad_norm": 0.1560385823249817,
"learning_rate": 3e-06,
"loss": -0.0019,
"policy/loss": 0.032065877014169075,
"step": 58
},
{
"epoch": 0.031466666666666664,
"grad_norm": 0.17436009645462036,
"learning_rate": 3e-06,
"loss": -0.0834,
"policy/loss": -0.044650425946258565,
"step": 59
},
{
"epoch": 0.032,
"grad_norm": 0.14935211837291718,
"learning_rate": 3e-06,
"loss": -0.0019,
"policy/loss": 0.032336668403949886,
"step": 60
},
{
"completion_length": 252.83333587646484,
"epoch": 0.03253333333333333,
"grad_norm": 0.1839117556810379,
"learning_rate": 3e-06,
"loss": -0.1101,
"policy/loss": -0.058528343215584755,
"reward": 0.9895833730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.59375,
"rewards/correctness_reward_func_math": 0.3958333283662796,
"step": 61
},
{
"epoch": 0.03306666666666667,
"grad_norm": 0.16935284435749054,
"learning_rate": 3e-06,
"loss": -0.0523,
"policy/loss": -0.06420497968792915,
"step": 62
},
{
"epoch": 0.0336,
"grad_norm": 0.1868937611579895,
"learning_rate": 3e-06,
"loss": -0.1096,
"policy/loss": -0.05889650620520115,
"step": 63
},
{
"epoch": 0.034133333333333335,
"grad_norm": 0.16210873425006866,
"learning_rate": 3e-06,
"loss": -0.0527,
"policy/loss": -0.06419926509261131,
"step": 64
},
{
"completion_length": 245.67708587646484,
"epoch": 0.034666666666666665,
"grad_norm": 0.19509585201740265,
"learning_rate": 3e-06,
"loss": -0.0881,
"policy/loss": 0.055254802107810974,
"reward": 1.234375,
"rewards/boxed_and_answer_tags_format_reward": 0.6302083432674408,
"rewards/correctness_reward_func_math": 0.6041666567325592,
"step": 65
},
{
"epoch": 0.0352,
"grad_norm": 0.21583299338817596,
"learning_rate": 3e-06,
"loss": 0.0043,
"policy/loss": 0.3691144287586212,
"step": 66
},
{
"epoch": 0.03573333333333333,
"grad_norm": 0.21014481782913208,
"learning_rate": 3e-06,
"loss": -0.0881,
"policy/loss": 0.05858026444911957,
"step": 67
},
{
"epoch": 0.03626666666666667,
"grad_norm": 0.17624783515930176,
"learning_rate": 3e-06,
"loss": 0.0041,
"policy/loss": 0.3663817197084427,
"step": 68
},
{
"completion_length": 247.3541717529297,
"epoch": 0.0368,
"grad_norm": 0.1945660412311554,
"learning_rate": 3e-06,
"loss": 0.0264,
"policy/loss": -0.11664605140686035,
"reward": 0.8750000298023224,
"rewards/boxed_and_answer_tags_format_reward": 0.6458333432674408,
"rewards/correctness_reward_func_math": 0.2291666641831398,
"step": 69
},
{
"epoch": 0.037333333333333336,
"grad_norm": 0.17174845933914185,
"learning_rate": 3e-06,
"loss": -0.0919,
"policy/loss": -0.1406829059123993,
"step": 70
},
{
"epoch": 0.037866666666666667,
"grad_norm": 0.19668424129486084,
"learning_rate": 3e-06,
"loss": 0.0257,
"policy/loss": -0.11868753097951412,
"step": 71
},
{
"epoch": 0.0384,
"grad_norm": 0.17255908250808716,
"learning_rate": 3e-06,
"loss": -0.0928,
"policy/loss": -0.141671571880579,
"step": 72
},
{
"completion_length": 244.55209350585938,
"epoch": 0.038933333333333334,
"grad_norm": 0.17894263565540314,
"learning_rate": 3e-06,
"loss": 0.0772,
"policy/loss": 0.026979694513691754,
"reward": 1.0885416865348816,
"rewards/boxed_and_answer_tags_format_reward": 0.609375,
"rewards/correctness_reward_func_math": 0.4791666716337204,
"step": 73
},
{
"epoch": 0.039466666666666664,
"grad_norm": 0.20189912617206573,
"learning_rate": 3e-06,
"loss": 0.0181,
"policy/loss": -0.01205286930909466,
"step": 74
},
{
"epoch": 0.04,
"grad_norm": 0.18249736726284027,
"learning_rate": 3e-06,
"loss": 0.0771,
"policy/loss": 0.02613500727184004,
"step": 75
},
{
"epoch": 0.04053333333333333,
"grad_norm": 0.19343672692775726,
"learning_rate": 3e-06,
"loss": 0.0186,
"policy/loss": -0.01260100852024948,
"step": 76
},
{
"completion_length": 242.0729217529297,
"epoch": 0.04106666666666667,
"grad_norm": 0.20817507803440094,
"learning_rate": 3e-06,
"loss": 0.0947,
"policy/loss": -0.011811529185905556,
"reward": 1.0208333432674408,
"rewards/boxed_and_answer_tags_format_reward": 0.6041666567325592,
"rewards/correctness_reward_func_math": 0.4166666641831398,
"step": 77
},
{
"epoch": 0.0416,
"grad_norm": 0.209181547164917,
"learning_rate": 3e-06,
"loss": 0.0026,
"policy/loss": -0.21592581128098232,
"step": 78
},
{
"epoch": 0.042133333333333335,
"grad_norm": 0.21479324996471405,
"learning_rate": 3e-06,
"loss": 0.0944,
"policy/loss": -0.012446357799618113,
"step": 79
},
{
"epoch": 0.042666666666666665,
"grad_norm": 0.20790192484855652,
"learning_rate": 3e-06,
"loss": 0.0026,
"policy/loss": -0.21514567594918077,
"step": 80
},
{
"completion_length": 237.7916717529297,
"epoch": 0.0432,
"grad_norm": 0.326062947511673,
"learning_rate": 3e-06,
"loss": -0.0501,
"policy/loss": -0.08871376924216179,
"reward": 1.0208333432674408,
"rewards/boxed_and_answer_tags_format_reward": 0.5416666567325592,
"rewards/correctness_reward_func_math": 0.4791666716337204,
"step": 81
},
{
"epoch": 0.04373333333333333,
"grad_norm": 0.17182475328445435,
"learning_rate": 3e-06,
"loss": -0.0181,
"policy/loss": -0.11642095722509538,
"step": 82
},
{
"epoch": 0.04426666666666667,
"grad_norm": 0.2907707691192627,
"learning_rate": 3e-06,
"loss": -0.0516,
"policy/loss": -0.08941211211223532,
"step": 83
},
{
"epoch": 0.0448,
"grad_norm": 0.16859214007854462,
"learning_rate": 3e-06,
"loss": -0.0187,
"policy/loss": -0.11713782228317626,
"step": 84
},
{
"completion_length": 244.17708587646484,
"epoch": 0.04533333333333334,
"grad_norm": 0.20400136709213257,
"learning_rate": 3e-06,
"loss": 0.025,
"policy/loss": -0.09599835332483053,
"reward": 1.0989583730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.5572916567325592,
"rewards/correctness_reward_func_math": 0.5416666716337204,
"step": 85
},
{
"epoch": 0.04586666666666667,
"grad_norm": 0.23238201439380646,
"learning_rate": 3e-06,
"loss": -0.0728,
"policy/loss": -0.65560332685709,
"step": 86
},
{
"epoch": 0.0464,
"grad_norm": 0.2232753336429596,
"learning_rate": 3e-06,
"loss": 0.024,
"policy/loss": -0.09737076377496123,
"step": 87
},
{
"epoch": 0.046933333333333334,
"grad_norm": 0.21358439326286316,
"learning_rate": 3e-06,
"loss": -0.0712,
"policy/loss": -0.6477146595716476,
"step": 88
},
{
"completion_length": 250.21875762939453,
"epoch": 0.047466666666666664,
"grad_norm": 0.21840573847293854,
"learning_rate": 3e-06,
"loss": 0.0227,
"policy/loss": -0.2187749519944191,
"reward": 1.4687500596046448,
"rewards/boxed_and_answer_tags_format_reward": 0.6354166865348816,
"rewards/correctness_reward_func_math": 0.8333333432674408,
"step": 89
},
{
"epoch": 0.048,
"grad_norm": 0.17804275453090668,
"learning_rate": 3e-06,
"loss": 0.0304,
"policy/loss": 0.25106509774923325,
"step": 90
},
{
"epoch": 0.04853333333333333,
"grad_norm": 0.17775166034698486,
"learning_rate": 3e-06,
"loss": 0.0218,
"policy/loss": -0.2161092460155487,
"step": 91
},
{
"epoch": 0.04906666666666667,
"grad_norm": 0.1654275357723236,
"learning_rate": 3e-06,
"loss": 0.0286,
"policy/loss": 0.2492801770567894,
"step": 92
},
{
"completion_length": 230.4791717529297,
"epoch": 0.0496,
"grad_norm": 0.20215706527233124,
"learning_rate": 3e-06,
"loss": 0.0607,
"policy/loss": 0.07679092884063721,
"reward": 1.4375,
"rewards/boxed_and_answer_tags_format_reward": 0.625,
"rewards/correctness_reward_func_math": 0.8125000149011612,
"step": 93
},
{
"epoch": 0.050133333333333335,
"grad_norm": 0.2815592288970947,
"learning_rate": 3e-06,
"loss": 0.0601,
"policy/loss": 0.34143297374248505,
"step": 94
},
{
"epoch": 0.050666666666666665,
"grad_norm": 0.21156412363052368,
"learning_rate": 3e-06,
"loss": 0.0597,
"policy/loss": 0.07551443576812744,
"step": 95
},
{
"epoch": 0.0512,
"grad_norm": 0.19008544087409973,
"learning_rate": 3e-06,
"loss": 0.0585,
"policy/loss": 0.33709482848644257,
"step": 96
},
{
"completion_length": 236.77083587646484,
"epoch": 0.05173333333333333,
"grad_norm": 0.159907728433609,
"learning_rate": 3e-06,
"loss": 0.0825,
"policy/loss": -0.06487638503313065,
"reward": 1.265625,
"rewards/boxed_and_answer_tags_format_reward": 0.6822916865348816,
"rewards/correctness_reward_func_math": 0.5833333283662796,
"step": 97
},
{
"epoch": 0.05226666666666667,
"grad_norm": 0.1606937199831009,
"learning_rate": 3e-06,
"loss": 0.2219,
"policy/loss": 0.06859804317355156,
"step": 98
},
{
"epoch": 0.0528,
"grad_norm": 0.15820233523845673,
"learning_rate": 3e-06,
"loss": 0.082,
"policy/loss": -0.06637740135192871,
"step": 99
},
{
"epoch": 0.05333333333333334,
"grad_norm": 0.16980242729187012,
"learning_rate": 3e-06,
"loss": 0.2217,
"policy/loss": 0.06899168714880943,
"step": 100
},
{
"completion_length": 242.0729217529297,
"epoch": 0.05386666666666667,
"grad_norm": 0.1724499613046646,
"learning_rate": 3e-06,
"loss": 0.077,
"policy/loss": 0.010863131593367825,
"reward": 1.2656250596046448,
"rewards/boxed_and_answer_tags_format_reward": 0.6822916567325592,
"rewards/correctness_reward_func_math": 0.5833333432674408,
"step": 101
},
{
"epoch": 0.0544,
"grad_norm": 0.1928744912147522,
"learning_rate": 3e-06,
"loss": -0.1418,
"policy/loss": -0.14429278626924358,
"step": 102
},
{
"epoch": 0.054933333333333334,
"grad_norm": 0.17666271328926086,
"learning_rate": 3e-06,
"loss": 0.0754,
"policy/loss": 0.009962649615893326,
"step": 103
},
{
"epoch": 0.055466666666666664,
"grad_norm": 0.20106588304042816,
"learning_rate": 3e-06,
"loss": -0.142,
"policy/loss": -0.14376884679922242,
"step": 104
},
{
"completion_length": 248.12500762939453,
"epoch": 0.056,
"grad_norm": 0.21009132266044617,
"learning_rate": 3e-06,
"loss": -0.0962,
"policy/loss": 0.013725795336409163,
"reward": 1.3125000298023224,
"rewards/boxed_and_answer_tags_format_reward": 0.6458333134651184,
"rewards/correctness_reward_func_math": 0.6666666679084301,
"step": 105
},
{
"epoch": 0.05653333333333333,
"grad_norm": 0.11821702867746353,
"learning_rate": 3e-06,
"loss": -0.0624,
"policy/loss": -0.0343314218800046,
"step": 106
},
{
"epoch": 0.05706666666666667,
"grad_norm": 0.2041134238243103,
"learning_rate": 3e-06,
"loss": -0.0959,
"policy/loss": 0.013820057307459166,
"step": 107
},
{
"epoch": 0.0576,
"grad_norm": 0.12859182059764862,
"learning_rate": 3e-06,
"loss": -0.0633,
"policy/loss": -0.03427921939243461,
"step": 108
},
{
"completion_length": 246.3541717529297,
"epoch": 0.058133333333333335,
"grad_norm": 0.14615735411643982,
"learning_rate": 3e-06,
"loss": 0.1392,
"policy/loss": 0.040021819022378224,
"reward": 1.2083333730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.6041666567325592,
"rewards/correctness_reward_func_math": 0.6041666567325592,
"step": 109
},
{
"epoch": 0.058666666666666666,
"grad_norm": 0.1528160125017166,
"learning_rate": 3e-06,
"loss": -0.0175,
"policy/loss": 0.02410803958116503,
"step": 110
},
{
"epoch": 0.0592,
"grad_norm": 0.26312804222106934,
"learning_rate": 3e-06,
"loss": 0.1394,
"policy/loss": 0.040499807353633344,
"step": 111
},
{
"epoch": 0.05973333333333333,
"grad_norm": 0.14612212777137756,
"learning_rate": 3e-06,
"loss": -0.0185,
"policy/loss": 0.024410473563328594,
"step": 112
},
{
"completion_length": 245.18750762939453,
"epoch": 0.06026666666666667,
"grad_norm": 0.19870316982269287,
"learning_rate": 3e-06,
"loss": 0.0525,
"policy/loss": 0.0026417523622512817,
"reward": 1.1718750596046448,
"rewards/boxed_and_answer_tags_format_reward": 0.6927083432674408,
"rewards/correctness_reward_func_math": 0.4791666716337204,
"step": 113
},
{
"epoch": 0.0608,
"grad_norm": 0.27426677942276,
"learning_rate": 3e-06,
"loss": 0.0535,
"policy/loss": 0.16809340938925743,
"step": 114
},
{
"epoch": 0.06133333333333333,
"grad_norm": 0.17909908294677734,
"learning_rate": 3e-06,
"loss": 0.0514,
"policy/loss": 0.003060735762119293,
"step": 115
},
{
"epoch": 0.06186666666666667,
"grad_norm": 0.1752181202173233,
"learning_rate": 3e-06,
"loss": 0.0529,
"policy/loss": 0.17004671320319176,
"step": 116
},
{
"completion_length": 252.14584350585938,
"epoch": 0.0624,
"grad_norm": 0.15613406896591187,
"learning_rate": 3e-06,
"loss": 0.0615,
"policy/loss": -0.2717489246279001,
"reward": 1.4166666865348816,
"rewards/boxed_and_answer_tags_format_reward": 0.6666666865348816,
"rewards/correctness_reward_func_math": 0.75,
"step": 117
},
{
"epoch": 0.06293333333333333,
"grad_norm": 0.16029293835163116,
"learning_rate": 3e-06,
"loss": 0.0304,
"policy/loss": 0.05623655021190643,
"step": 118
},
{
"epoch": 0.06346666666666667,
"grad_norm": 0.15869830548763275,
"learning_rate": 3e-06,
"loss": 0.0607,
"policy/loss": -0.27060971036553383,
"step": 119
},
{
"epoch": 0.064,
"grad_norm": 0.15622366964817047,
"learning_rate": 3e-06,
"loss": 0.0299,
"policy/loss": 0.059750813990831375,
"step": 120
},
{
"completion_length": 248.34375762939453,
"epoch": 0.06453333333333333,
"grad_norm": 0.18005700409412384,
"learning_rate": 3e-06,
"loss": 0.029,
"policy/loss": 1.3900643658359968e-07,
"reward": 0.984375,
"rewards/boxed_and_answer_tags_format_reward": 0.609375,
"rewards/correctness_reward_func_math": 0.375,
"step": 121
},
{
"epoch": 0.06506666666666666,
"grad_norm": 0.18394261598587036,
"learning_rate": 3e-06,
"loss": -0.0416,
"policy/loss": 1.3915559726740412e-07,
"step": 122
},
{
"epoch": 0.0656,
"grad_norm": 0.18433783948421478,
"learning_rate": 3e-06,
"loss": 0.0288,
"policy/loss": 1.3890329775279042e-07,
"step": 123
},
{
"epoch": 0.06613333333333334,
"grad_norm": 0.20658805966377258,
"learning_rate": 3e-06,
"loss": -0.0406,
"policy/loss": 1.3907800244794544e-07,
"step": 124
},
{
"completion_length": 251.9791717529297,
"epoch": 0.06666666666666667,
"grad_norm": 0.19564618170261383,
"learning_rate": 3e-06,
"loss": 0.0077,
"policy/loss": 0.2248321995139122,
"reward": 1.4531250596046448,
"rewards/boxed_and_answer_tags_format_reward": 0.6614583432674408,
"rewards/correctness_reward_func_math": 0.7916666865348816,
"step": 125
},
{
"epoch": 0.0672,
"grad_norm": 0.20839302241802216,
"learning_rate": 3e-06,
"loss": 0.0157,
"policy/loss": 0.5651952549815178,
"step": 126
},
{
"epoch": 0.06773333333333334,
"grad_norm": 0.1958751529455185,
"learning_rate": 3e-06,
"loss": 0.0068,
"policy/loss": 0.21976368129253387,
"step": 127
},
{
"epoch": 0.06826666666666667,
"grad_norm": 0.1874540001153946,
"learning_rate": 3e-06,
"loss": 0.0128,
"policy/loss": 0.5569901391863823,
"step": 128
},
{
"completion_length": 243.86458587646484,
"epoch": 0.0688,
"grad_norm": 0.16888441145420074,
"learning_rate": 3e-06,
"loss": 0.0046,
"policy/loss": 0.29372987684747187,
"reward": 0.9583333730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.7083333134651184,
"rewards/correctness_reward_func_math": 0.2500000074505806,
"step": 129
},
{
"epoch": 0.06933333333333333,
"grad_norm": 0.147910475730896,
"learning_rate": 3e-06,
"loss": 0.0363,
"policy/loss": -0.0594180128145787,
"step": 130
},
{
"epoch": 0.06986666666666666,
"grad_norm": 0.1847718209028244,
"learning_rate": 3e-06,
"loss": 0.0031,
"policy/loss": 0.2919089549587426,
"step": 131
},
{
"epoch": 0.0704,
"grad_norm": 0.14381037652492523,
"learning_rate": 3e-06,
"loss": 0.035,
"policy/loss": -0.06040795295599466,
"step": 132
},
{
"completion_length": 237.17709350585938,
"epoch": 0.07093333333333333,
"grad_norm": 0.1769952028989792,
"learning_rate": 3e-06,
"loss": -0.0769,
"policy/loss": -0.269472052808851,
"reward": 1.3958333432674408,
"rewards/boxed_and_answer_tags_format_reward": 0.6041666567325592,
"rewards/correctness_reward_func_math": 0.7916666716337204,
"step": 133
},
{
"epoch": 0.07146666666666666,
"grad_norm": 0.23748241364955902,
"learning_rate": 3e-06,
"loss": 0.0414,
"policy/loss": -0.08278632164001465,
"step": 134
},
{
"epoch": 0.072,
"grad_norm": 0.18565919995307922,
"learning_rate": 3e-06,
"loss": -0.0788,
"policy/loss": -0.279860089183785,
"step": 135
},
{
"epoch": 0.07253333333333334,
"grad_norm": 0.16724295914173126,
"learning_rate": 3e-06,
"loss": 0.0396,
"policy/loss": -0.08615577220916748,
"step": 136
},
{
"completion_length": 240.92708587646484,
"epoch": 0.07306666666666667,
"grad_norm": 0.20110689103603363,
"learning_rate": 3e-06,
"loss": -0.0048,
"policy/loss": -0.15881963817272293,
"reward": 1.2083333730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.6458333134651184,
"rewards/correctness_reward_func_math": 0.5625,
"step": 137
},
{
"epoch": 0.0736,
"grad_norm": 0.18458232283592224,
"learning_rate": 3e-06,
"loss": 0.1303,
"policy/loss": -0.1697537311212045,
"step": 138
},
{
"epoch": 0.07413333333333333,
"grad_norm": 0.1825876086950302,
"learning_rate": 3e-06,
"loss": -0.0063,
"policy/loss": -0.16175133734067515,
"step": 139
},
{
"epoch": 0.07466666666666667,
"grad_norm": 0.21164043247699738,
"learning_rate": 3e-06,
"loss": 0.1297,
"policy/loss": -0.16999738006476406,
"step": 140
},
{
"completion_length": 240.92709350585938,
"epoch": 0.0752,
"grad_norm": 0.16944681107997894,
"learning_rate": 3e-06,
"loss": -0.2597,
"policy/loss": -0.4086807461266684,
"reward": 1.21875,
"rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408,
"rewards/correctness_reward_func_math": 0.6041666567325592,
"step": 141
},
{
"epoch": 0.07573333333333333,
"grad_norm": 0.22837960720062256,
"learning_rate": 3e-06,
"loss": 0.1325,
"policy/loss": 0.27677876760697373,
"step": 142
},
{
"epoch": 0.07626666666666666,
"grad_norm": 0.1661621481180191,
"learning_rate": 3e-06,
"loss": -0.2593,
"policy/loss": -0.4062321575056007,
"step": 143
},
{
"epoch": 0.0768,
"grad_norm": 0.18849802017211914,
"learning_rate": 3e-06,
"loss": 0.1322,
"policy/loss": 0.2781364421500241,
"step": 144
},
{
"completion_length": 248.17708587646484,
"epoch": 0.07733333333333334,
"grad_norm": 0.21365630626678467,
"learning_rate": 3e-06,
"loss": 0.1392,
"policy/loss": 0.3016326804974794,
"reward": 1.0781250596046448,
"rewards/boxed_and_answer_tags_format_reward": 0.640625,
"rewards/correctness_reward_func_math": 0.4375,
"step": 145
},
{
"epoch": 0.07786666666666667,
"grad_norm": 0.22841136157512665,
"learning_rate": 3e-06,
"loss": -0.1061,
"policy/loss": -0.028553878638526875,
"step": 146
},
{
"epoch": 0.0784,
"grad_norm": 0.21557925641536713,
"learning_rate": 3e-06,
"loss": 0.1371,
"policy/loss": 0.30307797390917823,
"step": 147
},
{
"epoch": 0.07893333333333333,
"grad_norm": 0.22633995115756989,
"learning_rate": 3e-06,
"loss": -0.1068,
"policy/loss": -0.027433033795552397,
"step": 148
},
{
"completion_length": 246.15625762939453,
"epoch": 0.07946666666666667,
"grad_norm": 0.17080900073051453,
"learning_rate": 3e-06,
"loss": 0.106,
"policy/loss": -0.2952568163817375,
"reward": 1.2135416865348816,
"rewards/boxed_and_answer_tags_format_reward": 0.6510416567325592,
"rewards/correctness_reward_func_math": 0.5624999850988388,
"step": 149
},
{
"epoch": 0.08,
"grad_norm": 0.18366633355617523,
"learning_rate": 3e-06,
"loss": -0.0337,
"policy/loss": 0.06641959956277788,
"step": 150
},
{
"epoch": 0.08053333333333333,
"grad_norm": 0.2946370244026184,
"learning_rate": 3e-06,
"loss": 0.1046,
"policy/loss": -0.2993208696474099,
"step": 151
},
{
"epoch": 0.08106666666666666,
"grad_norm": 0.18177594244480133,
"learning_rate": 3e-06,
"loss": -0.0337,
"policy/loss": 0.06611955726583574,
"step": 152
},
{
"completion_length": 241.34375,
"epoch": 0.0816,
"grad_norm": 0.2595878839492798,
"learning_rate": 3e-06,
"loss": -0.0129,
"policy/loss": -0.029897108674049377,
"reward": 1.3645833730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.65625,
"rewards/correctness_reward_func_math": 0.7083333283662796,
"step": 153
},
{
"epoch": 0.08213333333333334,
"grad_norm": 0.2282121330499649,
"learning_rate": 3e-06,
"loss": -0.1403,
"policy/loss": -0.7962741553783417,
"step": 154
},
{
"epoch": 0.08266666666666667,
"grad_norm": 0.2732269763946533,
"learning_rate": 3e-06,
"loss": -0.0142,
"policy/loss": -0.0346699059009552,
"step": 155
},
{
"epoch": 0.0832,
"grad_norm": 0.19453194737434387,
"learning_rate": 3e-06,
"loss": -0.1409,
"policy/loss": -0.7987985908985138,
"step": 156
},
{
"completion_length": 246.7291717529297,
"epoch": 0.08373333333333334,
"grad_norm": 0.15822243690490723,
"learning_rate": 3e-06,
"loss": -0.0368,
"policy/loss": 0.1923005077087261,
"reward": 1.0989583730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.6822916567325592,
"rewards/correctness_reward_func_math": 0.4166666716337204,
"step": 157
},
{
"epoch": 0.08426666666666667,
"grad_norm": 0.20775403082370758,
"learning_rate": 3e-06,
"loss": -0.1298,
"policy/loss": -0.008804838902069179,
"step": 158
},
{
"epoch": 0.0848,
"grad_norm": 0.1534760296344757,
"learning_rate": 3e-06,
"loss": -0.0376,
"policy/loss": 0.19086543647436827,
"step": 159
},
{
"epoch": 0.08533333333333333,
"grad_norm": 0.1879645437002182,
"learning_rate": 3e-06,
"loss": -0.1308,
"policy/loss": -0.010181248919947095,
"step": 160
},
{
"completion_length": 247.52083587646484,
"epoch": 0.08586666666666666,
"grad_norm": 0.2048528641462326,
"learning_rate": 3e-06,
"loss": -0.1292,
"policy/loss": -0.25954964756965637,
"reward": 1.5625000596046448,
"rewards/boxed_and_answer_tags_format_reward": 0.7291666865348816,
"rewards/correctness_reward_func_math": 0.8333333134651184,
"step": 161
},
{
"epoch": 0.0864,
"grad_norm": 0.26088085770606995,
"learning_rate": 3e-06,
"loss": -0.0211,
"policy/loss": -0.09772485494613647,
"step": 162
},
{
"epoch": 0.08693333333333333,
"grad_norm": 0.17860932648181915,
"learning_rate": 3e-06,
"loss": -0.1294,
"policy/loss": -0.2615499645471573,
"step": 163
},
{
"epoch": 0.08746666666666666,
"grad_norm": 0.19965782761573792,
"learning_rate": 3e-06,
"loss": -0.0201,
"policy/loss": -0.09446658566594124,
"step": 164
},
{
"completion_length": 248.52083587646484,
"epoch": 0.088,
"grad_norm": 0.24648579955101013,
"learning_rate": 3e-06,
"loss": -0.0068,
"policy/loss": -0.1897476138547063,
"reward": 1.3072917461395264,
"rewards/boxed_and_answer_tags_format_reward": 0.703125,
"rewards/correctness_reward_func_math": 0.6041666567325592,
"step": 165
},
{
"epoch": 0.08853333333333334,
"grad_norm": 0.18784964084625244,
"learning_rate": 3e-06,
"loss": 0.0468,
"policy/loss": -0.35483095049858093,
"step": 166
},
{
"epoch": 0.08906666666666667,
"grad_norm": 0.23281453549861908,
"learning_rate": 3e-06,
"loss": -0.0079,
"policy/loss": -0.19379627890884876,
"step": 167
},
{
"epoch": 0.0896,
"grad_norm": 0.16716426610946655,
"learning_rate": 3e-06,
"loss": 0.0464,
"policy/loss": -0.3517995774745941,
"step": 168
},
{
"completion_length": 237.05209350585938,
"epoch": 0.09013333333333333,
"grad_norm": 0.15929904580116272,
"learning_rate": 3e-06,
"loss": -0.1312,
"policy/loss": -0.45294931530952454,
"reward": 1.2395833730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.6770833134651184,
"rewards/correctness_reward_func_math": 0.5625000149011612,
"step": 169
},
{
"epoch": 0.09066666666666667,
"grad_norm": 0.1787302941083908,
"learning_rate": 3e-06,
"loss": 0.0216,
"policy/loss": 0.26260387897491455,
"step": 170
},
{
"epoch": 0.0912,
"grad_norm": 0.17395518720149994,
"learning_rate": 3e-06,
"loss": -0.1311,
"policy/loss": -0.45233495021238923,
"step": 171
},
{
"epoch": 0.09173333333333333,
"grad_norm": 0.18037588894367218,
"learning_rate": 3e-06,
"loss": 0.0218,
"policy/loss": 0.2613874822854996,
"step": 172
},
{
"completion_length": 242.8229217529297,
"epoch": 0.09226666666666666,
"grad_norm": 0.31133604049682617,
"learning_rate": 3e-06,
"loss": -0.0216,
"policy/loss": 0.22989650322091393,
"reward": 1.5625,
"rewards/boxed_and_answer_tags_format_reward": 0.75,
"rewards/correctness_reward_func_math": 0.8125,
"step": 173
},
{
"epoch": 0.0928,
"grad_norm": 0.1914536952972412,
"learning_rate": 3e-06,
"loss": 0.1394,
"policy/loss": 0.026128504063084534,
"step": 174
},
{
"epoch": 0.09333333333333334,
"grad_norm": 0.19691255688667297,
"learning_rate": 3e-06,
"loss": -0.0219,
"policy/loss": 0.23154904199277482,
"step": 175
},
{
"epoch": 0.09386666666666667,
"grad_norm": 0.18783937394618988,
"learning_rate": 3e-06,
"loss": 0.139,
"policy/loss": 0.026339715032900557,
"step": 176
},
{
"completion_length": 248.09375762939453,
"epoch": 0.0944,
"grad_norm": 0.21956992149353027,
"learning_rate": 3e-06,
"loss": -0.0418,
"policy/loss": 0.33423537268861736,
"reward": 0.890625,
"rewards/boxed_and_answer_tags_format_reward": 0.5156249850988388,
"rewards/correctness_reward_func_math": 0.3750000074505806,
"step": 177
},
{
"epoch": 0.09493333333333333,
"grad_norm": 0.2890993356704712,
"learning_rate": 3e-06,
"loss": -0.0196,
"policy/loss": 0.0917874399780203,
"step": 178
},
{
"epoch": 0.09546666666666667,
"grad_norm": 0.21964141726493835,
"learning_rate": 3e-06,
"loss": -0.0423,
"policy/loss": 0.33307433389631314,
"step": 179
},
{
"epoch": 0.096,
"grad_norm": 0.2810533344745636,
"learning_rate": 3e-06,
"loss": -0.0195,
"policy/loss": 0.09134699882917374,
"step": 180
},
{
"completion_length": 220.5,
"epoch": 0.09653333333333333,
"grad_norm": 0.16769170761108398,
"learning_rate": 3e-06,
"loss": -0.0963,
"policy/loss": 0.2489219456911087,
"reward": 1.2968750596046448,
"rewards/boxed_and_answer_tags_format_reward": 0.6510416865348816,
"rewards/correctness_reward_func_math": 0.6458333432674408,
"step": 181
},
{
"epoch": 0.09706666666666666,
"grad_norm": 0.1784435361623764,
"learning_rate": 3e-06,
"loss": 0.0834,
"policy/loss": 0.19268840551376343,
"step": 182
},
{
"epoch": 0.0976,
"grad_norm": 0.1695830374956131,
"learning_rate": 3e-06,
"loss": -0.0967,
"policy/loss": 0.25148695707321167,
"step": 183
},
{
"epoch": 0.09813333333333334,
"grad_norm": 0.1726071685552597,
"learning_rate": 3e-06,
"loss": 0.083,
"policy/loss": 0.19391090795397758,
"step": 184
},
{
"completion_length": 234.03125762939453,
"epoch": 0.09866666666666667,
"grad_norm": 0.22702626883983612,
"learning_rate": 3e-06,
"loss": -0.0245,
"policy/loss": 1.0809221606677966e-07,
"reward": 1.3645833730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408,
"rewards/correctness_reward_func_math": 0.6249999776482582,
"step": 185
},
{
"epoch": 0.0992,
"grad_norm": 0.19707860052585602,
"learning_rate": 3e-06,
"loss": -0.1058,
"policy/loss": 1.4286691296661047e-07,
"step": 186
},
{
"epoch": 0.09973333333333333,
"grad_norm": 0.24218805134296417,
"learning_rate": 3e-06,
"loss": -0.0252,
"policy/loss": 1.0810558492835298e-07,
"step": 187
},
{
"epoch": 0.10026666666666667,
"grad_norm": 0.24096260964870453,
"learning_rate": 3e-06,
"loss": -0.1073,
"policy/loss": 1.4284267280118002e-07,
"step": 188
},
{
"completion_length": 242.15625762939453,
"epoch": 0.1008,
"grad_norm": 0.1797930747270584,
"learning_rate": 3e-06,
"loss": 0.0615,
"policy/loss": 0.7325967848300934,
"reward": 0.947916716337204,
"rewards/boxed_and_answer_tags_format_reward": 0.59375,
"rewards/correctness_reward_func_math": 0.354166679084301,
"step": 189
},
{
"epoch": 0.10133333333333333,
"grad_norm": 0.18300126492977142,
"learning_rate": 3e-06,
"loss": 0.0809,
"policy/loss": 0.47127315402030945,
"step": 190
},
{
"epoch": 0.10186666666666666,
"grad_norm": 0.17413219809532166,
"learning_rate": 3e-06,
"loss": 0.0613,
"policy/loss": 0.7307329177856445,
"step": 191
},
{
"epoch": 0.1024,
"grad_norm": 0.19463799893856049,
"learning_rate": 3e-06,
"loss": 0.08,
"policy/loss": 0.46520596742630005,
"step": 192
},
{
"completion_length": 240.96875762939453,
"epoch": 0.10293333333333334,
"grad_norm": 0.17509786784648895,
"learning_rate": 3e-06,
"loss": 0.0398,
"policy/loss": 0.3508252985775471,
"reward": 1.2552083730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.6927083134651184,
"rewards/correctness_reward_func_math": 0.5624999850988388,
"step": 193
},
{
"epoch": 0.10346666666666667,
"grad_norm": 0.1478498876094818,
"learning_rate": 3e-06,
"loss": -0.0543,
"policy/loss": -0.027727939188480377,
"step": 194
},
{
"epoch": 0.104,
"grad_norm": 0.16833505034446716,
"learning_rate": 3e-06,
"loss": 0.0387,
"policy/loss": 0.34517115354537964,
"step": 195
},
{
"epoch": 0.10453333333333334,
"grad_norm": 0.1531871259212494,
"learning_rate": 3e-06,
"loss": -0.0547,
"policy/loss": -0.03184395655989647,
"step": 196
},
{
"completion_length": 240.81250762939453,
"epoch": 0.10506666666666667,
"grad_norm": 0.2604769766330719,
"learning_rate": 3e-06,
"loss": 0.014,
"policy/loss": 0.19213315472006798,
"reward": 1.1250000596046448,
"rewards/boxed_and_answer_tags_format_reward": 0.6041666716337204,
"rewards/correctness_reward_func_math": 0.5208333283662796,
"step": 197
},
{
"epoch": 0.1056,
"grad_norm": 0.20661188662052155,
"learning_rate": 3e-06,
"loss": 0.0395,
"policy/loss": 0.12942768074572086,
"step": 198
},
{
"epoch": 0.10613333333333333,
"grad_norm": 0.22845624387264252,
"learning_rate": 3e-06,
"loss": 0.0132,
"policy/loss": 0.18689145147800446,
"step": 199
},
{
"epoch": 0.10666666666666667,
"grad_norm": 0.21520274877548218,
"learning_rate": 3e-06,
"loss": 0.039,
"policy/loss": 0.12739743292331696,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 5625,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}