| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.10666666666666667, |
| "eval_steps": 500, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 253.59375, |
| "epoch": 0.0005333333333333334, |
| "grad_norm": 0.1841306984424591, |
| "learning_rate": 3e-06, |
| "loss": 0.0686, |
| "policy/loss": -0.013315461575984955, |
| "reward": 1.3489583730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.640625, |
| "rewards/correctness_reward_func_math": 0.7083333283662796, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0010666666666666667, |
| "grad_norm": 0.21774785220623016, |
| "learning_rate": 3e-06, |
| "loss": -0.0802, |
| "policy/loss": 0.0014063939452171326, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0016, |
| "grad_norm": 0.18103685975074768, |
| "learning_rate": 3e-06, |
| "loss": 0.0684, |
| "policy/loss": -0.012406323105096817, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0021333333333333334, |
| "grad_norm": 0.2169843465089798, |
| "learning_rate": 3e-06, |
| "loss": -0.0799, |
| "policy/loss": 0.00319121778011322, |
| "step": 4 |
| }, |
| { |
| "completion_length": 232.2604217529297, |
| "epoch": 0.0026666666666666666, |
| "grad_norm": 0.14396080374717712, |
| "learning_rate": 3e-06, |
| "loss": -0.0081, |
| "policy/loss": 0.18358719907701015, |
| "reward": 1.2916666865348816, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6458333432674408, |
| "rewards/correctness_reward_func_math": 0.6458333432674408, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0032, |
| "grad_norm": 0.1440899670124054, |
| "learning_rate": 3e-06, |
| "loss": -0.1113, |
| "policy/loss": -0.014110475778579712, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.0037333333333333333, |
| "grad_norm": 0.1543431133031845, |
| "learning_rate": 3e-06, |
| "loss": -0.009, |
| "policy/loss": 0.18367187306284904, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.004266666666666667, |
| "grad_norm": 0.18800058960914612, |
| "learning_rate": 3e-06, |
| "loss": -0.1117, |
| "policy/loss": -0.01380608044564724, |
| "step": 8 |
| }, |
| { |
| "completion_length": 242.7291717529297, |
| "epoch": 0.0048, |
| "grad_norm": 0.23804515600204468, |
| "learning_rate": 3e-06, |
| "loss": -0.0155, |
| "policy/loss": -0.08487206892361598, |
| "reward": 1.057291716337204, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5572916567325592, |
| "rewards/correctness_reward_func_math": 0.4999999850988388, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.005333333333333333, |
| "grad_norm": 0.17962512373924255, |
| "learning_rate": 3e-06, |
| "loss": 0.0122, |
| "policy/loss": 0.1790632456280008, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.005866666666666667, |
| "grad_norm": 0.16098381578922272, |
| "learning_rate": 3e-06, |
| "loss": -0.016, |
| "policy/loss": -0.08530520830174737, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0064, |
| "grad_norm": 0.19069895148277283, |
| "learning_rate": 3e-06, |
| "loss": 0.0123, |
| "policy/loss": 0.1812864543667274, |
| "step": 12 |
| }, |
| { |
| "completion_length": 232.53125, |
| "epoch": 0.006933333333333333, |
| "grad_norm": 0.17127424478530884, |
| "learning_rate": 3e-06, |
| "loss": 0.0568, |
| "policy/loss": -0.03765931725502014, |
| "reward": 1.0520833730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6354166567325592, |
| "rewards/correctness_reward_func_math": 0.4166666716337204, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.007466666666666667, |
| "grad_norm": 0.18777285516262054, |
| "learning_rate": 3e-06, |
| "loss": 0.2118, |
| "policy/loss": -0.09204106219112873, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 0.18267256021499634, |
| "learning_rate": 3e-06, |
| "loss": 0.057, |
| "policy/loss": -0.03848084807395935, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.008533333333333334, |
| "grad_norm": 0.1676584780216217, |
| "learning_rate": 3e-06, |
| "loss": 0.2136, |
| "policy/loss": -0.09104587137699127, |
| "step": 16 |
| }, |
| { |
| "completion_length": 249.9375, |
| "epoch": 0.009066666666666667, |
| "grad_norm": 0.15089137852191925, |
| "learning_rate": 3e-06, |
| "loss": 0.0501, |
| "policy/loss": -0.1889983732253313, |
| "reward": 1.2135416865348816, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5885416865348816, |
| "rewards/correctness_reward_func_math": 0.6250000149011612, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.0096, |
| "grad_norm": 0.16881856322288513, |
| "learning_rate": 3e-06, |
| "loss": 0.0436, |
| "policy/loss": 0.27584413066506386, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.010133333333333333, |
| "grad_norm": 0.14319613575935364, |
| "learning_rate": 3e-06, |
| "loss": 0.0499, |
| "policy/loss": -0.18909327313303947, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.010666666666666666, |
| "grad_norm": 0.2733864188194275, |
| "learning_rate": 3e-06, |
| "loss": 0.0433, |
| "policy/loss": 0.27555806189775467, |
| "step": 20 |
| }, |
| { |
| "completion_length": 245.7604217529297, |
| "epoch": 0.0112, |
| "grad_norm": 0.3413054347038269, |
| "learning_rate": 3e-06, |
| "loss": 0.0222, |
| "policy/loss": 0.36454475536381636, |
| "reward": 1.1666667461395264, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6041666865348816, |
| "rewards/correctness_reward_func_math": 0.5625, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.011733333333333333, |
| "grad_norm": 0.21312959492206573, |
| "learning_rate": 3e-06, |
| "loss": 0.0843, |
| "policy/loss": 0.15956711052950823, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.012266666666666667, |
| "grad_norm": 0.35879793763160706, |
| "learning_rate": 3e-06, |
| "loss": 0.0218, |
| "policy/loss": 0.3650218606836688, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.0128, |
| "grad_norm": 0.18096353113651276, |
| "learning_rate": 3e-06, |
| "loss": 0.0843, |
| "policy/loss": 0.1569992530010822, |
| "step": 24 |
| }, |
| { |
| "completion_length": 242.43750762939453, |
| "epoch": 0.013333333333333334, |
| "grad_norm": 0.16766154766082764, |
| "learning_rate": 3e-06, |
| "loss": 0.0118, |
| "policy/loss": 0.48417001962661743, |
| "reward": 1.0781250596046448, |
| "rewards/boxed_and_answer_tags_format_reward": 0.578125, |
| "rewards/correctness_reward_func_math": 0.5, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.013866666666666666, |
| "grad_norm": 0.2944582402706146, |
| "learning_rate": 3e-06, |
| "loss": 0.1917, |
| "policy/loss": 0.01044890284538269, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.0144, |
| "grad_norm": 0.2324495017528534, |
| "learning_rate": 3e-06, |
| "loss": 0.012, |
| "policy/loss": 0.48227669298648834, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.014933333333333333, |
| "grad_norm": 0.2566571533679962, |
| "learning_rate": 3e-06, |
| "loss": 0.1918, |
| "policy/loss": 0.010879706591367722, |
| "step": 28 |
| }, |
| { |
| "completion_length": 244.61458587646484, |
| "epoch": 0.015466666666666667, |
| "grad_norm": 0.21410726010799408, |
| "learning_rate": 3e-06, |
| "loss": -0.026, |
| "policy/loss": 0.1202235990203917, |
| "reward": 1.0520833730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.59375, |
| "rewards/correctness_reward_func_math": 0.4583333283662796, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 0.1583758145570755, |
| "learning_rate": 3e-06, |
| "loss": -0.0577, |
| "policy/loss": -0.05267565557733178, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.016533333333333334, |
| "grad_norm": 0.22393964231014252, |
| "learning_rate": 3e-06, |
| "loss": -0.0266, |
| "policy/loss": 0.11891656881198287, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.017066666666666667, |
| "grad_norm": 0.16789793968200684, |
| "learning_rate": 3e-06, |
| "loss": -0.057, |
| "policy/loss": -0.05271946266293526, |
| "step": 32 |
| }, |
| { |
| "completion_length": 245.89583587646484, |
| "epoch": 0.0176, |
| "grad_norm": 0.16854967176914215, |
| "learning_rate": 3e-06, |
| "loss": -0.0832, |
| "policy/loss": -0.1320323795080185, |
| "reward": 0.8854166865348816, |
| "rewards/boxed_and_answer_tags_format_reward": 0.59375, |
| "rewards/correctness_reward_func_math": 0.2916666716337204, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.018133333333333335, |
| "grad_norm": 0.3621562421321869, |
| "learning_rate": 3e-06, |
| "loss": -0.129, |
| "policy/loss": -0.5422025052830577, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.018666666666666668, |
| "grad_norm": 0.14944235980510712, |
| "learning_rate": 3e-06, |
| "loss": -0.0835, |
| "policy/loss": -0.13186319917440414, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.0192, |
| "grad_norm": 0.4199303984642029, |
| "learning_rate": 3e-06, |
| "loss": -0.1291, |
| "policy/loss": -0.5477359890937805, |
| "step": 36 |
| }, |
| { |
| "completion_length": 246.28125762939453, |
| "epoch": 0.019733333333333332, |
| "grad_norm": 0.14028505980968475, |
| "learning_rate": 3e-06, |
| "loss": 0.0285, |
| "policy/loss": 0.24223446287214756, |
| "reward": 0.9479166865348816, |
| "rewards/boxed_and_answer_tags_format_reward": 0.65625, |
| "rewards/correctness_reward_func_math": 0.2916666567325592, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.020266666666666665, |
| "grad_norm": 0.1633375883102417, |
| "learning_rate": 3e-06, |
| "loss": 0.027, |
| "policy/loss": -0.2900683730840683, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.0208, |
| "grad_norm": 0.14287146925926208, |
| "learning_rate": 3e-06, |
| "loss": 0.0284, |
| "policy/loss": 0.24034919310361147, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.021333333333333333, |
| "grad_norm": 0.16466538608074188, |
| "learning_rate": 3e-06, |
| "loss": 0.0254, |
| "policy/loss": -0.29295632243156433, |
| "step": 40 |
| }, |
| { |
| "completion_length": 239.96875762939453, |
| "epoch": 0.021866666666666666, |
| "grad_norm": 0.15317575633525848, |
| "learning_rate": 3e-06, |
| "loss": -0.0366, |
| "policy/loss": 0.10384738075332933, |
| "reward": 1.0468750596046448, |
| "rewards/boxed_and_answer_tags_format_reward": 0.609375, |
| "rewards/correctness_reward_func_math": 0.4375000149011612, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.0224, |
| "grad_norm": 0.19731628894805908, |
| "learning_rate": 3e-06, |
| "loss": -0.1193, |
| "policy/loss": 0.00456411417428626, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.022933333333333333, |
| "grad_norm": 0.14448165893554688, |
| "learning_rate": 3e-06, |
| "loss": -0.0383, |
| "policy/loss": 0.10382469364986235, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.023466666666666667, |
| "grad_norm": 0.1933155655860901, |
| "learning_rate": 3e-06, |
| "loss": -0.1191, |
| "policy/loss": 0.005274432977692811, |
| "step": 44 |
| }, |
| { |
| "completion_length": 244.89583587646484, |
| "epoch": 0.024, |
| "grad_norm": 0.24117735028266907, |
| "learning_rate": 3e-06, |
| "loss": 0.0938, |
| "policy/loss": -0.2796844388358295, |
| "reward": 1.1666666865348816, |
| "rewards/boxed_and_answer_tags_format_reward": 0.625, |
| "rewards/correctness_reward_func_math": 0.5416666716337204, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.024533333333333334, |
| "grad_norm": 0.24326607584953308, |
| "learning_rate": 3e-06, |
| "loss": -0.1093, |
| "policy/loss": -0.23608368262648582, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.025066666666666668, |
| "grad_norm": 0.2464676797389984, |
| "learning_rate": 3e-06, |
| "loss": 0.094, |
| "policy/loss": -0.28131480794399977, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.0256, |
| "grad_norm": 0.15732921659946442, |
| "learning_rate": 3e-06, |
| "loss": -0.1101, |
| "policy/loss": -0.2395353652536869, |
| "step": 48 |
| }, |
| { |
| "completion_length": 249.1979217529297, |
| "epoch": 0.026133333333333335, |
| "grad_norm": 0.41586366295814514, |
| "learning_rate": 3e-06, |
| "loss": 0.0274, |
| "policy/loss": -0.047684632387245074, |
| "reward": 1.0937500298023224, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6145833134651184, |
| "rewards/correctness_reward_func_math": 0.4791666641831398, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.02666666666666667, |
| "grad_norm": 0.17614451050758362, |
| "learning_rate": 3e-06, |
| "loss": -0.0918, |
| "policy/loss": -0.13479230320081115, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0272, |
| "grad_norm": 0.1597583144903183, |
| "learning_rate": 3e-06, |
| "loss": 0.027, |
| "policy/loss": -0.05036866711452603, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.027733333333333332, |
| "grad_norm": 0.1461249440908432, |
| "learning_rate": 3e-06, |
| "loss": -0.0923, |
| "policy/loss": -0.13541921973228455, |
| "step": 52 |
| }, |
| { |
| "completion_length": 244.4479217529297, |
| "epoch": 0.028266666666666666, |
| "grad_norm": 0.1617969125509262, |
| "learning_rate": 3e-06, |
| "loss": -0.0326, |
| "policy/loss": -0.05200636462233632, |
| "reward": 1.3802083730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5885416567325592, |
| "rewards/correctness_reward_func_math": 0.7916666567325592, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.0288, |
| "grad_norm": 0.14798980951309204, |
| "learning_rate": 3e-06, |
| "loss": -0.0111, |
| "policy/loss": 0.018518115793725087, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.029333333333333333, |
| "grad_norm": 0.1704786717891693, |
| "learning_rate": 3e-06, |
| "loss": -0.0323, |
| "policy/loss": -0.05232445967863697, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.029866666666666666, |
| "grad_norm": 0.15073394775390625, |
| "learning_rate": 3e-06, |
| "loss": -0.0116, |
| "policy/loss": 0.01774279534570411, |
| "step": 56 |
| }, |
| { |
| "completion_length": 240.375, |
| "epoch": 0.0304, |
| "grad_norm": 0.18698211014270782, |
| "learning_rate": 3e-06, |
| "loss": -0.0838, |
| "policy/loss": -0.04460075659737939, |
| "reward": 1.4322916865348816, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6197916567325592, |
| "rewards/correctness_reward_func_math": 0.8125, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.030933333333333334, |
| "grad_norm": 0.1560385823249817, |
| "learning_rate": 3e-06, |
| "loss": -0.0019, |
| "policy/loss": 0.032065877014169075, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.031466666666666664, |
| "grad_norm": 0.17436009645462036, |
| "learning_rate": 3e-06, |
| "loss": -0.0834, |
| "policy/loss": -0.044650425946258565, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 0.14935211837291718, |
| "learning_rate": 3e-06, |
| "loss": -0.0019, |
| "policy/loss": 0.032336668403949886, |
| "step": 60 |
| }, |
| { |
| "completion_length": 252.83333587646484, |
| "epoch": 0.03253333333333333, |
| "grad_norm": 0.1839117556810379, |
| "learning_rate": 3e-06, |
| "loss": -0.1101, |
| "policy/loss": -0.058528343215584755, |
| "reward": 0.9895833730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.59375, |
| "rewards/correctness_reward_func_math": 0.3958333283662796, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.03306666666666667, |
| "grad_norm": 0.16935284435749054, |
| "learning_rate": 3e-06, |
| "loss": -0.0523, |
| "policy/loss": -0.06420497968792915, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.0336, |
| "grad_norm": 0.1868937611579895, |
| "learning_rate": 3e-06, |
| "loss": -0.1096, |
| "policy/loss": -0.05889650620520115, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.034133333333333335, |
| "grad_norm": 0.16210873425006866, |
| "learning_rate": 3e-06, |
| "loss": -0.0527, |
| "policy/loss": -0.06419926509261131, |
| "step": 64 |
| }, |
| { |
| "completion_length": 245.67708587646484, |
| "epoch": 0.034666666666666665, |
| "grad_norm": 0.19509585201740265, |
| "learning_rate": 3e-06, |
| "loss": -0.0881, |
| "policy/loss": 0.055254802107810974, |
| "reward": 1.234375, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6302083432674408, |
| "rewards/correctness_reward_func_math": 0.6041666567325592, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.0352, |
| "grad_norm": 0.21583299338817596, |
| "learning_rate": 3e-06, |
| "loss": 0.0043, |
| "policy/loss": 0.3691144287586212, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.03573333333333333, |
| "grad_norm": 0.21014481782913208, |
| "learning_rate": 3e-06, |
| "loss": -0.0881, |
| "policy/loss": 0.05858026444911957, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.03626666666666667, |
| "grad_norm": 0.17624783515930176, |
| "learning_rate": 3e-06, |
| "loss": 0.0041, |
| "policy/loss": 0.3663817197084427, |
| "step": 68 |
| }, |
| { |
| "completion_length": 247.3541717529297, |
| "epoch": 0.0368, |
| "grad_norm": 0.1945660412311554, |
| "learning_rate": 3e-06, |
| "loss": 0.0264, |
| "policy/loss": -0.11664605140686035, |
| "reward": 0.8750000298023224, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6458333432674408, |
| "rewards/correctness_reward_func_math": 0.2291666641831398, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.037333333333333336, |
| "grad_norm": 0.17174845933914185, |
| "learning_rate": 3e-06, |
| "loss": -0.0919, |
| "policy/loss": -0.1406829059123993, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.037866666666666667, |
| "grad_norm": 0.19668424129486084, |
| "learning_rate": 3e-06, |
| "loss": 0.0257, |
| "policy/loss": -0.11868753097951412, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.0384, |
| "grad_norm": 0.17255908250808716, |
| "learning_rate": 3e-06, |
| "loss": -0.0928, |
| "policy/loss": -0.141671571880579, |
| "step": 72 |
| }, |
| { |
| "completion_length": 244.55209350585938, |
| "epoch": 0.038933333333333334, |
| "grad_norm": 0.17894263565540314, |
| "learning_rate": 3e-06, |
| "loss": 0.0772, |
| "policy/loss": 0.026979694513691754, |
| "reward": 1.0885416865348816, |
| "rewards/boxed_and_answer_tags_format_reward": 0.609375, |
| "rewards/correctness_reward_func_math": 0.4791666716337204, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.039466666666666664, |
| "grad_norm": 0.20189912617206573, |
| "learning_rate": 3e-06, |
| "loss": 0.0181, |
| "policy/loss": -0.01205286930909466, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.18249736726284027, |
| "learning_rate": 3e-06, |
| "loss": 0.0771, |
| "policy/loss": 0.02613500727184004, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.04053333333333333, |
| "grad_norm": 0.19343672692775726, |
| "learning_rate": 3e-06, |
| "loss": 0.0186, |
| "policy/loss": -0.01260100852024948, |
| "step": 76 |
| }, |
| { |
| "completion_length": 242.0729217529297, |
| "epoch": 0.04106666666666667, |
| "grad_norm": 0.20817507803440094, |
| "learning_rate": 3e-06, |
| "loss": 0.0947, |
| "policy/loss": -0.011811529185905556, |
| "reward": 1.0208333432674408, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6041666567325592, |
| "rewards/correctness_reward_func_math": 0.4166666641831398, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.0416, |
| "grad_norm": 0.209181547164917, |
| "learning_rate": 3e-06, |
| "loss": 0.0026, |
| "policy/loss": -0.21592581128098232, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.042133333333333335, |
| "grad_norm": 0.21479324996471405, |
| "learning_rate": 3e-06, |
| "loss": 0.0944, |
| "policy/loss": -0.012446357799618113, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.042666666666666665, |
| "grad_norm": 0.20790192484855652, |
| "learning_rate": 3e-06, |
| "loss": 0.0026, |
| "policy/loss": -0.21514567594918077, |
| "step": 80 |
| }, |
| { |
| "completion_length": 237.7916717529297, |
| "epoch": 0.0432, |
| "grad_norm": 0.326062947511673, |
| "learning_rate": 3e-06, |
| "loss": -0.0501, |
| "policy/loss": -0.08871376924216179, |
| "reward": 1.0208333432674408, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5416666567325592, |
| "rewards/correctness_reward_func_math": 0.4791666716337204, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.04373333333333333, |
| "grad_norm": 0.17182475328445435, |
| "learning_rate": 3e-06, |
| "loss": -0.0181, |
| "policy/loss": -0.11642095722509538, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.04426666666666667, |
| "grad_norm": 0.2907707691192627, |
| "learning_rate": 3e-06, |
| "loss": -0.0516, |
| "policy/loss": -0.08941211211223532, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.0448, |
| "grad_norm": 0.16859214007854462, |
| "learning_rate": 3e-06, |
| "loss": -0.0187, |
| "policy/loss": -0.11713782228317626, |
| "step": 84 |
| }, |
| { |
| "completion_length": 244.17708587646484, |
| "epoch": 0.04533333333333334, |
| "grad_norm": 0.20400136709213257, |
| "learning_rate": 3e-06, |
| "loss": 0.025, |
| "policy/loss": -0.09599835332483053, |
| "reward": 1.0989583730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5572916567325592, |
| "rewards/correctness_reward_func_math": 0.5416666716337204, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.04586666666666667, |
| "grad_norm": 0.23238201439380646, |
| "learning_rate": 3e-06, |
| "loss": -0.0728, |
| "policy/loss": -0.65560332685709, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.0464, |
| "grad_norm": 0.2232753336429596, |
| "learning_rate": 3e-06, |
| "loss": 0.024, |
| "policy/loss": -0.09737076377496123, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.046933333333333334, |
| "grad_norm": 0.21358439326286316, |
| "learning_rate": 3e-06, |
| "loss": -0.0712, |
| "policy/loss": -0.6477146595716476, |
| "step": 88 |
| }, |
| { |
| "completion_length": 250.21875762939453, |
| "epoch": 0.047466666666666664, |
| "grad_norm": 0.21840573847293854, |
| "learning_rate": 3e-06, |
| "loss": 0.0227, |
| "policy/loss": -0.2187749519944191, |
| "reward": 1.4687500596046448, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6354166865348816, |
| "rewards/correctness_reward_func_math": 0.8333333432674408, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 0.17804275453090668, |
| "learning_rate": 3e-06, |
| "loss": 0.0304, |
| "policy/loss": 0.25106509774923325, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.04853333333333333, |
| "grad_norm": 0.17775166034698486, |
| "learning_rate": 3e-06, |
| "loss": 0.0218, |
| "policy/loss": -0.2161092460155487, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.04906666666666667, |
| "grad_norm": 0.1654275357723236, |
| "learning_rate": 3e-06, |
| "loss": 0.0286, |
| "policy/loss": 0.2492801770567894, |
| "step": 92 |
| }, |
| { |
| "completion_length": 230.4791717529297, |
| "epoch": 0.0496, |
| "grad_norm": 0.20215706527233124, |
| "learning_rate": 3e-06, |
| "loss": 0.0607, |
| "policy/loss": 0.07679092884063721, |
| "reward": 1.4375, |
| "rewards/boxed_and_answer_tags_format_reward": 0.625, |
| "rewards/correctness_reward_func_math": 0.8125000149011612, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.050133333333333335, |
| "grad_norm": 0.2815592288970947, |
| "learning_rate": 3e-06, |
| "loss": 0.0601, |
| "policy/loss": 0.34143297374248505, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.050666666666666665, |
| "grad_norm": 0.21156412363052368, |
| "learning_rate": 3e-06, |
| "loss": 0.0597, |
| "policy/loss": 0.07551443576812744, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.0512, |
| "grad_norm": 0.19008544087409973, |
| "learning_rate": 3e-06, |
| "loss": 0.0585, |
| "policy/loss": 0.33709482848644257, |
| "step": 96 |
| }, |
| { |
| "completion_length": 236.77083587646484, |
| "epoch": 0.05173333333333333, |
| "grad_norm": 0.159907728433609, |
| "learning_rate": 3e-06, |
| "loss": 0.0825, |
| "policy/loss": -0.06487638503313065, |
| "reward": 1.265625, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6822916865348816, |
| "rewards/correctness_reward_func_math": 0.5833333283662796, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.05226666666666667, |
| "grad_norm": 0.1606937199831009, |
| "learning_rate": 3e-06, |
| "loss": 0.2219, |
| "policy/loss": 0.06859804317355156, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.0528, |
| "grad_norm": 0.15820233523845673, |
| "learning_rate": 3e-06, |
| "loss": 0.082, |
| "policy/loss": -0.06637740135192871, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.05333333333333334, |
| "grad_norm": 0.16980242729187012, |
| "learning_rate": 3e-06, |
| "loss": 0.2217, |
| "policy/loss": 0.06899168714880943, |
| "step": 100 |
| }, |
| { |
| "completion_length": 242.0729217529297, |
| "epoch": 0.05386666666666667, |
| "grad_norm": 0.1724499613046646, |
| "learning_rate": 3e-06, |
| "loss": 0.077, |
| "policy/loss": 0.010863131593367825, |
| "reward": 1.2656250596046448, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6822916567325592, |
| "rewards/correctness_reward_func_math": 0.5833333432674408, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.0544, |
| "grad_norm": 0.1928744912147522, |
| "learning_rate": 3e-06, |
| "loss": -0.1418, |
| "policy/loss": -0.14429278626924358, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.054933333333333334, |
| "grad_norm": 0.17666271328926086, |
| "learning_rate": 3e-06, |
| "loss": 0.0754, |
| "policy/loss": 0.009962649615893326, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.055466666666666664, |
| "grad_norm": 0.20106588304042816, |
| "learning_rate": 3e-06, |
| "loss": -0.142, |
| "policy/loss": -0.14376884679922242, |
| "step": 104 |
| }, |
| { |
| "completion_length": 248.12500762939453, |
| "epoch": 0.056, |
| "grad_norm": 0.21009132266044617, |
| "learning_rate": 3e-06, |
| "loss": -0.0962, |
| "policy/loss": 0.013725795336409163, |
| "reward": 1.3125000298023224, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6458333134651184, |
| "rewards/correctness_reward_func_math": 0.6666666679084301, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.05653333333333333, |
| "grad_norm": 0.11821702867746353, |
| "learning_rate": 3e-06, |
| "loss": -0.0624, |
| "policy/loss": -0.0343314218800046, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.05706666666666667, |
| "grad_norm": 0.2041134238243103, |
| "learning_rate": 3e-06, |
| "loss": -0.0959, |
| "policy/loss": 0.013820057307459166, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.0576, |
| "grad_norm": 0.12859182059764862, |
| "learning_rate": 3e-06, |
| "loss": -0.0633, |
| "policy/loss": -0.03427921939243461, |
| "step": 108 |
| }, |
| { |
| "completion_length": 246.3541717529297, |
| "epoch": 0.058133333333333335, |
| "grad_norm": 0.14615735411643982, |
| "learning_rate": 3e-06, |
| "loss": 0.1392, |
| "policy/loss": 0.040021819022378224, |
| "reward": 1.2083333730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6041666567325592, |
| "rewards/correctness_reward_func_math": 0.6041666567325592, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.058666666666666666, |
| "grad_norm": 0.1528160125017166, |
| "learning_rate": 3e-06, |
| "loss": -0.0175, |
| "policy/loss": 0.02410803958116503, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.0592, |
| "grad_norm": 0.26312804222106934, |
| "learning_rate": 3e-06, |
| "loss": 0.1394, |
| "policy/loss": 0.040499807353633344, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.05973333333333333, |
| "grad_norm": 0.14612212777137756, |
| "learning_rate": 3e-06, |
| "loss": -0.0185, |
| "policy/loss": 0.024410473563328594, |
| "step": 112 |
| }, |
| { |
| "completion_length": 245.18750762939453, |
| "epoch": 0.06026666666666667, |
| "grad_norm": 0.19870316982269287, |
| "learning_rate": 3e-06, |
| "loss": 0.0525, |
| "policy/loss": 0.0026417523622512817, |
| "reward": 1.1718750596046448, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6927083432674408, |
| "rewards/correctness_reward_func_math": 0.4791666716337204, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.0608, |
| "grad_norm": 0.27426677942276, |
| "learning_rate": 3e-06, |
| "loss": 0.0535, |
| "policy/loss": 0.16809340938925743, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.06133333333333333, |
| "grad_norm": 0.17909908294677734, |
| "learning_rate": 3e-06, |
| "loss": 0.0514, |
| "policy/loss": 0.003060735762119293, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.06186666666666667, |
| "grad_norm": 0.1752181202173233, |
| "learning_rate": 3e-06, |
| "loss": 0.0529, |
| "policy/loss": 0.17004671320319176, |
| "step": 116 |
| }, |
| { |
| "completion_length": 252.14584350585938, |
| "epoch": 0.0624, |
| "grad_norm": 0.15613406896591187, |
| "learning_rate": 3e-06, |
| "loss": 0.0615, |
| "policy/loss": -0.2717489246279001, |
| "reward": 1.4166666865348816, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6666666865348816, |
| "rewards/correctness_reward_func_math": 0.75, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.06293333333333333, |
| "grad_norm": 0.16029293835163116, |
| "learning_rate": 3e-06, |
| "loss": 0.0304, |
| "policy/loss": 0.05623655021190643, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.06346666666666667, |
| "grad_norm": 0.15869830548763275, |
| "learning_rate": 3e-06, |
| "loss": 0.0607, |
| "policy/loss": -0.27060971036553383, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 0.15622366964817047, |
| "learning_rate": 3e-06, |
| "loss": 0.0299, |
| "policy/loss": 0.059750813990831375, |
| "step": 120 |
| }, |
| { |
| "completion_length": 248.34375762939453, |
| "epoch": 0.06453333333333333, |
| "grad_norm": 0.18005700409412384, |
| "learning_rate": 3e-06, |
| "loss": 0.029, |
| "policy/loss": 1.3900643658359968e-07, |
| "reward": 0.984375, |
| "rewards/boxed_and_answer_tags_format_reward": 0.609375, |
| "rewards/correctness_reward_func_math": 0.375, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.06506666666666666, |
| "grad_norm": 0.18394261598587036, |
| "learning_rate": 3e-06, |
| "loss": -0.0416, |
| "policy/loss": 1.3915559726740412e-07, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.0656, |
| "grad_norm": 0.18433783948421478, |
| "learning_rate": 3e-06, |
| "loss": 0.0288, |
| "policy/loss": 1.3890329775279042e-07, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.06613333333333334, |
| "grad_norm": 0.20658805966377258, |
| "learning_rate": 3e-06, |
| "loss": -0.0406, |
| "policy/loss": 1.3907800244794544e-07, |
| "step": 124 |
| }, |
| { |
| "completion_length": 251.9791717529297, |
| "epoch": 0.06666666666666667, |
| "grad_norm": 0.19564618170261383, |
| "learning_rate": 3e-06, |
| "loss": 0.0077, |
| "policy/loss": 0.2248321995139122, |
| "reward": 1.4531250596046448, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6614583432674408, |
| "rewards/correctness_reward_func_math": 0.7916666865348816, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.0672, |
| "grad_norm": 0.20839302241802216, |
| "learning_rate": 3e-06, |
| "loss": 0.0157, |
| "policy/loss": 0.5651952549815178, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.06773333333333334, |
| "grad_norm": 0.1958751529455185, |
| "learning_rate": 3e-06, |
| "loss": 0.0068, |
| "policy/loss": 0.21976368129253387, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.06826666666666667, |
| "grad_norm": 0.1874540001153946, |
| "learning_rate": 3e-06, |
| "loss": 0.0128, |
| "policy/loss": 0.5569901391863823, |
| "step": 128 |
| }, |
| { |
| "completion_length": 243.86458587646484, |
| "epoch": 0.0688, |
| "grad_norm": 0.16888441145420074, |
| "learning_rate": 3e-06, |
| "loss": 0.0046, |
| "policy/loss": 0.29372987684747187, |
| "reward": 0.9583333730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.7083333134651184, |
| "rewards/correctness_reward_func_math": 0.2500000074505806, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.06933333333333333, |
| "grad_norm": 0.147910475730896, |
| "learning_rate": 3e-06, |
| "loss": 0.0363, |
| "policy/loss": -0.0594180128145787, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.06986666666666666, |
| "grad_norm": 0.1847718209028244, |
| "learning_rate": 3e-06, |
| "loss": 0.0031, |
| "policy/loss": 0.2919089549587426, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.0704, |
| "grad_norm": 0.14381037652492523, |
| "learning_rate": 3e-06, |
| "loss": 0.035, |
| "policy/loss": -0.06040795295599466, |
| "step": 132 |
| }, |
| { |
| "completion_length": 237.17709350585938, |
| "epoch": 0.07093333333333333, |
| "grad_norm": 0.1769952028989792, |
| "learning_rate": 3e-06, |
| "loss": -0.0769, |
| "policy/loss": -0.269472052808851, |
| "reward": 1.3958333432674408, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6041666567325592, |
| "rewards/correctness_reward_func_math": 0.7916666716337204, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.07146666666666666, |
| "grad_norm": 0.23748241364955902, |
| "learning_rate": 3e-06, |
| "loss": 0.0414, |
| "policy/loss": -0.08278632164001465, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 0.18565919995307922, |
| "learning_rate": 3e-06, |
| "loss": -0.0788, |
| "policy/loss": -0.279860089183785, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.07253333333333334, |
| "grad_norm": 0.16724295914173126, |
| "learning_rate": 3e-06, |
| "loss": 0.0396, |
| "policy/loss": -0.08615577220916748, |
| "step": 136 |
| }, |
| { |
| "completion_length": 240.92708587646484, |
| "epoch": 0.07306666666666667, |
| "grad_norm": 0.20110689103603363, |
| "learning_rate": 3e-06, |
| "loss": -0.0048, |
| "policy/loss": -0.15881963817272293, |
| "reward": 1.2083333730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6458333134651184, |
| "rewards/correctness_reward_func_math": 0.5625, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.0736, |
| "grad_norm": 0.18458232283592224, |
| "learning_rate": 3e-06, |
| "loss": 0.1303, |
| "policy/loss": -0.1697537311212045, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.07413333333333333, |
| "grad_norm": 0.1825876086950302, |
| "learning_rate": 3e-06, |
| "loss": -0.0063, |
| "policy/loss": -0.16175133734067515, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.07466666666666667, |
| "grad_norm": 0.21164043247699738, |
| "learning_rate": 3e-06, |
| "loss": 0.1297, |
| "policy/loss": -0.16999738006476406, |
| "step": 140 |
| }, |
| { |
| "completion_length": 240.92709350585938, |
| "epoch": 0.0752, |
| "grad_norm": 0.16944681107997894, |
| "learning_rate": 3e-06, |
| "loss": -0.2597, |
| "policy/loss": -0.4086807461266684, |
| "reward": 1.21875, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, |
| "rewards/correctness_reward_func_math": 0.6041666567325592, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.07573333333333333, |
| "grad_norm": 0.22837960720062256, |
| "learning_rate": 3e-06, |
| "loss": 0.1325, |
| "policy/loss": 0.27677876760697373, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.07626666666666666, |
| "grad_norm": 0.1661621481180191, |
| "learning_rate": 3e-06, |
| "loss": -0.2593, |
| "policy/loss": -0.4062321575056007, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.0768, |
| "grad_norm": 0.18849802017211914, |
| "learning_rate": 3e-06, |
| "loss": 0.1322, |
| "policy/loss": 0.2781364421500241, |
| "step": 144 |
| }, |
| { |
| "completion_length": 248.17708587646484, |
| "epoch": 0.07733333333333334, |
| "grad_norm": 0.21365630626678467, |
| "learning_rate": 3e-06, |
| "loss": 0.1392, |
| "policy/loss": 0.3016326804974794, |
| "reward": 1.0781250596046448, |
| "rewards/boxed_and_answer_tags_format_reward": 0.640625, |
| "rewards/correctness_reward_func_math": 0.4375, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.07786666666666667, |
| "grad_norm": 0.22841136157512665, |
| "learning_rate": 3e-06, |
| "loss": -0.1061, |
| "policy/loss": -0.028553878638526875, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.0784, |
| "grad_norm": 0.21557925641536713, |
| "learning_rate": 3e-06, |
| "loss": 0.1371, |
| "policy/loss": 0.30307797390917823, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.07893333333333333, |
| "grad_norm": 0.22633995115756989, |
| "learning_rate": 3e-06, |
| "loss": -0.1068, |
| "policy/loss": -0.027433033795552397, |
| "step": 148 |
| }, |
| { |
| "completion_length": 246.15625762939453, |
| "epoch": 0.07946666666666667, |
| "grad_norm": 0.17080900073051453, |
| "learning_rate": 3e-06, |
| "loss": 0.106, |
| "policy/loss": -0.2952568163817375, |
| "reward": 1.2135416865348816, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6510416567325592, |
| "rewards/correctness_reward_func_math": 0.5624999850988388, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.18366633355617523, |
| "learning_rate": 3e-06, |
| "loss": -0.0337, |
| "policy/loss": 0.06641959956277788, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.08053333333333333, |
| "grad_norm": 0.2946370244026184, |
| "learning_rate": 3e-06, |
| "loss": 0.1046, |
| "policy/loss": -0.2993208696474099, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.08106666666666666, |
| "grad_norm": 0.18177594244480133, |
| "learning_rate": 3e-06, |
| "loss": -0.0337, |
| "policy/loss": 0.06611955726583574, |
| "step": 152 |
| }, |
| { |
| "completion_length": 241.34375, |
| "epoch": 0.0816, |
| "grad_norm": 0.2595878839492798, |
| "learning_rate": 3e-06, |
| "loss": -0.0129, |
| "policy/loss": -0.029897108674049377, |
| "reward": 1.3645833730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.65625, |
| "rewards/correctness_reward_func_math": 0.7083333283662796, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.08213333333333334, |
| "grad_norm": 0.2282121330499649, |
| "learning_rate": 3e-06, |
| "loss": -0.1403, |
| "policy/loss": -0.7962741553783417, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.08266666666666667, |
| "grad_norm": 0.2732269763946533, |
| "learning_rate": 3e-06, |
| "loss": -0.0142, |
| "policy/loss": -0.0346699059009552, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.0832, |
| "grad_norm": 0.19453194737434387, |
| "learning_rate": 3e-06, |
| "loss": -0.1409, |
| "policy/loss": -0.7987985908985138, |
| "step": 156 |
| }, |
| { |
| "completion_length": 246.7291717529297, |
| "epoch": 0.08373333333333334, |
| "grad_norm": 0.15822243690490723, |
| "learning_rate": 3e-06, |
| "loss": -0.0368, |
| "policy/loss": 0.1923005077087261, |
| "reward": 1.0989583730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6822916567325592, |
| "rewards/correctness_reward_func_math": 0.4166666716337204, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.08426666666666667, |
| "grad_norm": 0.20775403082370758, |
| "learning_rate": 3e-06, |
| "loss": -0.1298, |
| "policy/loss": -0.008804838902069179, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.0848, |
| "grad_norm": 0.1534760296344757, |
| "learning_rate": 3e-06, |
| "loss": -0.0376, |
| "policy/loss": 0.19086543647436827, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.08533333333333333, |
| "grad_norm": 0.1879645437002182, |
| "learning_rate": 3e-06, |
| "loss": -0.1308, |
| "policy/loss": -0.010181248919947095, |
| "step": 160 |
| }, |
| { |
| "completion_length": 247.52083587646484, |
| "epoch": 0.08586666666666666, |
| "grad_norm": 0.2048528641462326, |
| "learning_rate": 3e-06, |
| "loss": -0.1292, |
| "policy/loss": -0.25954964756965637, |
| "reward": 1.5625000596046448, |
| "rewards/boxed_and_answer_tags_format_reward": 0.7291666865348816, |
| "rewards/correctness_reward_func_math": 0.8333333134651184, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.0864, |
| "grad_norm": 0.26088085770606995, |
| "learning_rate": 3e-06, |
| "loss": -0.0211, |
| "policy/loss": -0.09772485494613647, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.08693333333333333, |
| "grad_norm": 0.17860932648181915, |
| "learning_rate": 3e-06, |
| "loss": -0.1294, |
| "policy/loss": -0.2615499645471573, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.08746666666666666, |
| "grad_norm": 0.19965782761573792, |
| "learning_rate": 3e-06, |
| "loss": -0.0201, |
| "policy/loss": -0.09446658566594124, |
| "step": 164 |
| }, |
| { |
| "completion_length": 248.52083587646484, |
| "epoch": 0.088, |
| "grad_norm": 0.24648579955101013, |
| "learning_rate": 3e-06, |
| "loss": -0.0068, |
| "policy/loss": -0.1897476138547063, |
| "reward": 1.3072917461395264, |
| "rewards/boxed_and_answer_tags_format_reward": 0.703125, |
| "rewards/correctness_reward_func_math": 0.6041666567325592, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.08853333333333334, |
| "grad_norm": 0.18784964084625244, |
| "learning_rate": 3e-06, |
| "loss": 0.0468, |
| "policy/loss": -0.35483095049858093, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.08906666666666667, |
| "grad_norm": 0.23281453549861908, |
| "learning_rate": 3e-06, |
| "loss": -0.0079, |
| "policy/loss": -0.19379627890884876, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.0896, |
| "grad_norm": 0.16716426610946655, |
| "learning_rate": 3e-06, |
| "loss": 0.0464, |
| "policy/loss": -0.3517995774745941, |
| "step": 168 |
| }, |
| { |
| "completion_length": 237.05209350585938, |
| "epoch": 0.09013333333333333, |
| "grad_norm": 0.15929904580116272, |
| "learning_rate": 3e-06, |
| "loss": -0.1312, |
| "policy/loss": -0.45294931530952454, |
| "reward": 1.2395833730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6770833134651184, |
| "rewards/correctness_reward_func_math": 0.5625000149011612, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.09066666666666667, |
| "grad_norm": 0.1787302941083908, |
| "learning_rate": 3e-06, |
| "loss": 0.0216, |
| "policy/loss": 0.26260387897491455, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0912, |
| "grad_norm": 0.17395518720149994, |
| "learning_rate": 3e-06, |
| "loss": -0.1311, |
| "policy/loss": -0.45233495021238923, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.09173333333333333, |
| "grad_norm": 0.18037588894367218, |
| "learning_rate": 3e-06, |
| "loss": 0.0218, |
| "policy/loss": 0.2613874822854996, |
| "step": 172 |
| }, |
| { |
| "completion_length": 242.8229217529297, |
| "epoch": 0.09226666666666666, |
| "grad_norm": 0.31133604049682617, |
| "learning_rate": 3e-06, |
| "loss": -0.0216, |
| "policy/loss": 0.22989650322091393, |
| "reward": 1.5625, |
| "rewards/boxed_and_answer_tags_format_reward": 0.75, |
| "rewards/correctness_reward_func_math": 0.8125, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.0928, |
| "grad_norm": 0.1914536952972412, |
| "learning_rate": 3e-06, |
| "loss": 0.1394, |
| "policy/loss": 0.026128504063084534, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.09333333333333334, |
| "grad_norm": 0.19691255688667297, |
| "learning_rate": 3e-06, |
| "loss": -0.0219, |
| "policy/loss": 0.23154904199277482, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.09386666666666667, |
| "grad_norm": 0.18783937394618988, |
| "learning_rate": 3e-06, |
| "loss": 0.139, |
| "policy/loss": 0.026339715032900557, |
| "step": 176 |
| }, |
| { |
| "completion_length": 248.09375762939453, |
| "epoch": 0.0944, |
| "grad_norm": 0.21956992149353027, |
| "learning_rate": 3e-06, |
| "loss": -0.0418, |
| "policy/loss": 0.33423537268861736, |
| "reward": 0.890625, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5156249850988388, |
| "rewards/correctness_reward_func_math": 0.3750000074505806, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.09493333333333333, |
| "grad_norm": 0.2890993356704712, |
| "learning_rate": 3e-06, |
| "loss": -0.0196, |
| "policy/loss": 0.0917874399780203, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.09546666666666667, |
| "grad_norm": 0.21964141726493835, |
| "learning_rate": 3e-06, |
| "loss": -0.0423, |
| "policy/loss": 0.33307433389631314, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 0.2810533344745636, |
| "learning_rate": 3e-06, |
| "loss": -0.0195, |
| "policy/loss": 0.09134699882917374, |
| "step": 180 |
| }, |
| { |
| "completion_length": 220.5, |
| "epoch": 0.09653333333333333, |
| "grad_norm": 0.16769170761108398, |
| "learning_rate": 3e-06, |
| "loss": -0.0963, |
| "policy/loss": 0.2489219456911087, |
| "reward": 1.2968750596046448, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6510416865348816, |
| "rewards/correctness_reward_func_math": 0.6458333432674408, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.09706666666666666, |
| "grad_norm": 0.1784435361623764, |
| "learning_rate": 3e-06, |
| "loss": 0.0834, |
| "policy/loss": 0.19268840551376343, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.0976, |
| "grad_norm": 0.1695830374956131, |
| "learning_rate": 3e-06, |
| "loss": -0.0967, |
| "policy/loss": 0.25148695707321167, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.09813333333333334, |
| "grad_norm": 0.1726071685552597, |
| "learning_rate": 3e-06, |
| "loss": 0.083, |
| "policy/loss": 0.19391090795397758, |
| "step": 184 |
| }, |
| { |
| "completion_length": 234.03125762939453, |
| "epoch": 0.09866666666666667, |
| "grad_norm": 0.22702626883983612, |
| "learning_rate": 3e-06, |
| "loss": -0.0245, |
| "policy/loss": 1.0809221606677966e-07, |
| "reward": 1.3645833730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, |
| "rewards/correctness_reward_func_math": 0.6249999776482582, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.0992, |
| "grad_norm": 0.19707860052585602, |
| "learning_rate": 3e-06, |
| "loss": -0.1058, |
| "policy/loss": 1.4286691296661047e-07, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.09973333333333333, |
| "grad_norm": 0.24218805134296417, |
| "learning_rate": 3e-06, |
| "loss": -0.0252, |
| "policy/loss": 1.0810558492835298e-07, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.10026666666666667, |
| "grad_norm": 0.24096260964870453, |
| "learning_rate": 3e-06, |
| "loss": -0.1073, |
| "policy/loss": 1.4284267280118002e-07, |
| "step": 188 |
| }, |
| { |
| "completion_length": 242.15625762939453, |
| "epoch": 0.1008, |
| "grad_norm": 0.1797930747270584, |
| "learning_rate": 3e-06, |
| "loss": 0.0615, |
| "policy/loss": 0.7325967848300934, |
| "reward": 0.947916716337204, |
| "rewards/boxed_and_answer_tags_format_reward": 0.59375, |
| "rewards/correctness_reward_func_math": 0.354166679084301, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.10133333333333333, |
| "grad_norm": 0.18300126492977142, |
| "learning_rate": 3e-06, |
| "loss": 0.0809, |
| "policy/loss": 0.47127315402030945, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.10186666666666666, |
| "grad_norm": 0.17413219809532166, |
| "learning_rate": 3e-06, |
| "loss": 0.0613, |
| "policy/loss": 0.7307329177856445, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.1024, |
| "grad_norm": 0.19463799893856049, |
| "learning_rate": 3e-06, |
| "loss": 0.08, |
| "policy/loss": 0.46520596742630005, |
| "step": 192 |
| }, |
| { |
| "completion_length": 240.96875762939453, |
| "epoch": 0.10293333333333334, |
| "grad_norm": 0.17509786784648895, |
| "learning_rate": 3e-06, |
| "loss": 0.0398, |
| "policy/loss": 0.3508252985775471, |
| "reward": 1.2552083730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6927083134651184, |
| "rewards/correctness_reward_func_math": 0.5624999850988388, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.10346666666666667, |
| "grad_norm": 0.1478498876094818, |
| "learning_rate": 3e-06, |
| "loss": -0.0543, |
| "policy/loss": -0.027727939188480377, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 0.16833505034446716, |
| "learning_rate": 3e-06, |
| "loss": 0.0387, |
| "policy/loss": 0.34517115354537964, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.10453333333333334, |
| "grad_norm": 0.1531871259212494, |
| "learning_rate": 3e-06, |
| "loss": -0.0547, |
| "policy/loss": -0.03184395655989647, |
| "step": 196 |
| }, |
| { |
| "completion_length": 240.81250762939453, |
| "epoch": 0.10506666666666667, |
| "grad_norm": 0.2604769766330719, |
| "learning_rate": 3e-06, |
| "loss": 0.014, |
| "policy/loss": 0.19213315472006798, |
| "reward": 1.1250000596046448, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6041666716337204, |
| "rewards/correctness_reward_func_math": 0.5208333283662796, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.1056, |
| "grad_norm": 0.20661188662052155, |
| "learning_rate": 3e-06, |
| "loss": 0.0395, |
| "policy/loss": 0.12942768074572086, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.10613333333333333, |
| "grad_norm": 0.22845624387264252, |
| "learning_rate": 3e-06, |
| "loss": 0.0132, |
| "policy/loss": 0.18689145147800446, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.10666666666666667, |
| "grad_norm": 0.21520274877548218, |
| "learning_rate": 3e-06, |
| "loss": 0.039, |
| "policy/loss": 0.12739743292331696, |
| "step": 200 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 5625, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|