{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03968647683301915, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 221.46875, "epoch": 0.0003968647683301915, "grad_norm": 0.1240234375, "kl": 0.0, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.875, "reward_std": 0.125, "rewards/correctness_reward_func": 0.875, "step": 1 }, { "completion_length": 181.71875, "epoch": 0.000793729536660383, "grad_norm": 0.00052642822265625, "kl": 0.00021356809156714007, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.125, "reward_std": 0.0, "rewards/correctness_reward_func": 1.125, "step": 2 }, { "completion_length": 279.875, "epoch": 0.0011905943049905744, "grad_norm": 0.380859375, "kl": 0.0001963738031918183, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.40625, "reward_std": 0.13466878235340118, "rewards/correctness_reward_func": 1.40625, "step": 3 }, { "completion_length": 231.625, "epoch": 0.001587459073320766, "grad_norm": 0.068359375, "kl": 0.00031982854125089943, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.53125, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.53125, "step": 4 }, { "completion_length": 271.140625, "epoch": 0.0019843238416509573, "grad_norm": 0.00013446807861328125, "kl": 0.00010319267312297598, "learning_rate": 1e-05, "loss": 0.0, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "step": 5 }, { "completion_length": 294.65625, "epoch": 0.0023811886099811487, "grad_norm": 0.04248046875, "kl": 0.0001881300595414359, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.34375, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.34375, "step": 6 }, { "completion_length": 279.25, "epoch": 0.0027780533783113405, "grad_norm": 0.000713348388671875, "kl": 0.0002071384915325325, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.125, "reward_std": 0.0, "rewards/correctness_reward_func": 1.125, "step": 7 }, { "completion_length": 287.125, "epoch": 0.003174918146641532, "grad_norm": 0.0003032684326171875, "kl": 0.0001682171678112354, "learning_rate": 1e-05, "loss": 0.0, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "step": 8 }, { "completion_length": 259.0625, "epoch": 0.0035717829149717233, "grad_norm": 0.1591796875, "kl": 0.0002609610164654441, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.34375, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.34375, "step": 9 }, { "completion_length": 380.171875, "epoch": 0.003968647683301915, "grad_norm": 0.0654296875, "kl": 0.0002176924463128671, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.34375, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.34375, "step": 10 }, { "completion_length": 489.640625, "epoch": 0.004365512451632106, "grad_norm": 0.032958984375, "kl": 0.00015466208969883155, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.21875, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.21875, "step": 11 }, { "completion_length": 332.609375, "epoch": 0.0047623772199622974, "grad_norm": 0.056640625, "kl": 0.00017187761113746092, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.90625, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.90625, "step": 12 }, { "completion_length": 366.828125, "epoch": 0.00515924198829249, "grad_norm": 0.00021648406982421875, "kl": 0.00014664832269772887, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/correctness_reward_func": 1.75, "step": 13 }, { "completion_length": 232.5, "epoch": 0.005556106756622681, "grad_norm": 0.0005340576171875, "kl": 0.000275460712146014, "learning_rate": 1e-05, "loss": 0.0, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "step": 14 }, { "completion_length": 294.359375, "epoch": 0.0059529715249528724, "grad_norm": 0.087890625, "kl": 0.00027155900897923857, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.0, "reward_std": 0.19716878235340118, "rewards/correctness_reward_func": 1.0, "step": 15 }, { "completion_length": 384.578125, "epoch": 0.006349836293283064, "grad_norm": 0.0634765625, "kl": 0.00022811047529103234, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.53125, "reward_std": 0.13466878235340118, "rewards/correctness_reward_func": 1.53125, "step": 16 }, { "completion_length": 334.09375, "epoch": 0.006746701061613255, "grad_norm": 0.057861328125, "kl": 0.00027415682779974304, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.53125, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.53125, "step": 17 }, { "completion_length": 375.921875, "epoch": 0.007143565829943447, "grad_norm": 0.08642578125, "kl": 0.0001596472029632423, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.78125, "reward_std": 0.13466878235340118, "rewards/correctness_reward_func": 1.78125, "step": 18 }, { "completion_length": 345.203125, "epoch": 0.007540430598273638, "grad_norm": 0.000461578369140625, "kl": 0.00024132111502694897, "learning_rate": 1e-05, "loss": 0.0, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "step": 19 }, { "completion_length": 418.296875, "epoch": 0.00793729536660383, "grad_norm": 0.251953125, "kl": 0.0002591940647107549, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.0625, "reward_std": 0.25, "rewards/correctness_reward_func": 1.0625, "step": 20 }, { "completion_length": 441.0, "epoch": 0.00833416013493402, "grad_norm": 0.240234375, "kl": 0.00021814405772602186, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.25, "reward_std": 0.3221687823534012, "rewards/correctness_reward_func": 1.25, "step": 21 }, { "completion_length": 349.15625, "epoch": 0.008731024903264212, "grad_norm": 0.0947265625, "kl": 0.00023298784799408168, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.4375, "reward_std": 0.125, "rewards/correctness_reward_func": 1.4375, "step": 22 }, { "completion_length": 375.90625, "epoch": 0.009127889671594403, "grad_norm": 0.064453125, "kl": 0.0001741187043080572, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.375, "reward_std": 0.19716878235340118, "rewards/correctness_reward_func": 1.375, "step": 23 }, { "completion_length": 383.203125, "epoch": 0.009524754439924595, "grad_norm": 0.0732421875, "kl": 0.00014529807594954036, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.9375, "reward_std": 0.125, "rewards/correctness_reward_func": 1.9375, "step": 24 }, { "completion_length": 334.6875, "epoch": 0.009921619208254788, "grad_norm": 0.06005859375, "kl": 0.00016799916193122044, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.9375, "reward_std": 0.125, "rewards/correctness_reward_func": 1.9375, "step": 25 }, { "completion_length": 405.140625, "epoch": 0.01031848397658498, "grad_norm": 0.0245361328125, "kl": 0.00020157176550128497, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.53125, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.53125, "step": 26 }, { "completion_length": 374.28125, "epoch": 0.01071534874491517, "grad_norm": 0.1796875, "kl": 0.00029869095305912197, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.9375, "reward_std": 0.19716878235340118, "rewards/correctness_reward_func": 0.9375, "step": 27 }, { "completion_length": 426.484375, "epoch": 0.011112213513245362, "grad_norm": 0.43359375, "kl": 0.0002508295456209453, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.9375, "reward_std": 0.4471687823534012, "rewards/correctness_reward_func": 0.9375, "step": 28 }, { "completion_length": 432.21875, "epoch": 0.011509078281575554, "grad_norm": 0.2578125, "kl": 0.00018439985433360562, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.03125, "reward_std": 0.1875, "rewards/correctness_reward_func": 1.03125, "step": 29 }, { "completion_length": 610.609375, "epoch": 0.011905943049905745, "grad_norm": 0.162109375, "kl": 9.109115489991382e-05, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.40625, "reward_std": 0.0625, "rewards/correctness_reward_func": 0.40625, "step": 30 }, { "completion_length": 411.140625, "epoch": 0.012302807818235936, "grad_norm": 0.12890625, "kl": 0.00014656526946055237, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.8125, "reward_std": 0.32216876745224, "rewards/correctness_reward_func": 0.8125, "step": 31 }, { "completion_length": 371.359375, "epoch": 0.012699672586566128, "grad_norm": 0.053955078125, "kl": 0.0001413824884366477, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.71875, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.71875, "step": 32 }, { "completion_length": 360.6875, "epoch": 0.013096537354896319, "grad_norm": 0.09375, "kl": 0.00014583250776922796, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.96875, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.96875, "step": 33 }, { "completion_length": 256.140625, "epoch": 0.01349340212322651, "grad_norm": 0.0003681182861328125, "kl": 0.00021255770843708888, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.875, "reward_std": 0.0, "rewards/correctness_reward_func": 1.875, "step": 34 }, { "completion_length": 298.71875, "epoch": 0.013890266891556702, "grad_norm": 0.05712890625, "kl": 0.00023778366448823363, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.84375, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.84375, "step": 35 }, { "completion_length": 397.203125, "epoch": 0.014287131659886893, "grad_norm": 0.0693359375, "kl": 0.0001875234411272686, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.96875, "reward_std": 0.0625, "rewards/correctness_reward_func": 0.96875, "step": 36 }, { "completion_length": 476.59375, "epoch": 0.014683996428217085, "grad_norm": 0.1982421875, "kl": 0.0002962429862236604, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.15625, "reward_std": 0.45683756470680237, "rewards/correctness_reward_func": 1.15625, "step": 37 }, { "completion_length": 380.046875, "epoch": 0.015080861196547276, "grad_norm": 0.2177734375, "kl": 0.00021642427236656658, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.25, "reward_std": 0.3221687823534012, "rewards/correctness_reward_func": 1.25, "step": 38 }, { "completion_length": 319.671875, "epoch": 0.015477725964877467, "grad_norm": 0.1787109375, "kl": 0.0002720155207498465, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.0, "reward_std": 0.19716878235340118, "rewards/correctness_reward_func": 1.0, "step": 39 }, { "completion_length": 527.625, "epoch": 0.01587459073320766, "grad_norm": 0.11474609375, "kl": 0.0001258815245819278, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.625, "reward_std": 0.125, "rewards/correctness_reward_func": 0.625, "step": 40 }, { "completion_length": 326.09375, "epoch": 0.01627145550153785, "grad_norm": 0.0634765625, "kl": 0.00018829756663762964, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.78125, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.78125, "step": 41 }, { "completion_length": 362.1875, "epoch": 0.01666832026986804, "grad_norm": 0.042724609375, "kl": 0.00019165025150869042, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.71875, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.71875, "step": 42 }, { "completion_length": 297.5625, "epoch": 0.017065185038198233, "grad_norm": 0.000431060791015625, "kl": 0.00027088051137980074, "learning_rate": 1e-05, "loss": 0.0, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "step": 43 }, { "completion_length": 361.09375, "epoch": 0.017462049806528424, "grad_norm": 0.0712890625, "kl": 0.00019505245654727332, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.96875, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.96875, "step": 44 }, { "completion_length": 349.140625, "epoch": 0.017858914574858616, "grad_norm": 0.08984375, "kl": 0.000281722481304314, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.0, "reward_std": 0.125, "rewards/correctness_reward_func": 1.0, "step": 45 }, { "completion_length": 454.375, "epoch": 0.018255779343188807, "grad_norm": 0.203125, "kl": 0.0002504781477910001, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.15625, "reward_std": 0.33183756470680237, "rewards/correctness_reward_func": 1.15625, "step": 46 }, { "completion_length": 417.984375, "epoch": 0.018652644111519, "grad_norm": 0.57421875, "kl": 0.0007036441420495976, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.96875, "reward_std": 0.33183756470680237, "rewards/correctness_reward_func": 0.96875, "step": 47 }, { "completion_length": 403.5625, "epoch": 0.01904950887984919, "grad_norm": 0.2109375, "kl": 0.00022189919036463834, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.28125, "reward_std": 0.2596687823534012, "rewards/correctness_reward_func": 1.28125, "step": 48 }, { "completion_length": 578.09375, "epoch": 0.019446373648179385, "grad_norm": 0.193359375, "kl": 0.00016415365462307818, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/correctness_reward_func": 0.5, "step": 49 }, { "completion_length": 388.46875, "epoch": 0.019843238416509576, "grad_norm": 0.2041015625, "kl": 0.00029624684611917473, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/correctness_reward_func": 0.625, "step": 50 }, { "completion_length": 393.546875, "epoch": 0.020240103184839767, "grad_norm": 0.06201171875, "kl": 0.000217016367969336, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.4375, "reward_std": 0.125, "rewards/correctness_reward_func": 1.4375, "step": 51 }, { "completion_length": 483.203125, "epoch": 0.02063696795316996, "grad_norm": 0.12109375, "kl": 0.00021450840722536668, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.46875, "reward_std": 0.1875, "rewards/correctness_reward_func": 1.46875, "step": 52 }, { "completion_length": 333.375, "epoch": 0.02103383272150015, "grad_norm": 0.0003528594970703125, "kl": 0.0002801330847432837, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.625, "reward_std": 0.0, "rewards/correctness_reward_func": 1.625, "step": 53 }, { "completion_length": 327.609375, "epoch": 0.02143069748983034, "grad_norm": 0.05908203125, "kl": 0.0002553274571255315, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.5625, "reward_std": 0.125, "rewards/correctness_reward_func": 1.5625, "step": 54 }, { "completion_length": 313.484375, "epoch": 0.021827562258160533, "grad_norm": 0.05126953125, "kl": 0.00040918682134361006, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.4375, "reward_std": 0.07216878235340118, "rewards/correctness_reward_func": 1.4375, "step": 55 }, { "completion_length": 587.828125, "epoch": 0.022224427026490724, "grad_norm": 0.2421875, "kl": 0.00024972553364932537, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.59375, "reward_std": 0.5290063470602036, "rewards/correctness_reward_func": 0.59375, "step": 56 }, { "completion_length": 397.625, "epoch": 0.022621291794820916, "grad_norm": 0.1962890625, "kl": 0.0003616545727709308, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.125, "reward_std": 0.26933756470680237, "rewards/correctness_reward_func": 1.125, "step": 57 }, { "completion_length": 493.171875, "epoch": 0.023018156563151107, "grad_norm": 0.146484375, "kl": 0.00021447856852319092, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.1875, "reward_std": 0.34150634706020355, "rewards/correctness_reward_func": 1.1875, "step": 58 }, { "completion_length": 505.140625, "epoch": 0.0234150213314813, "grad_norm": 0.173828125, "kl": 0.00036706905302708037, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/correctness_reward_func": 0.6875, "step": 59 }, { "completion_length": 565.90625, "epoch": 0.02381188609981149, "grad_norm": 0.10107421875, "kl": 0.0002694212962524034, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.5, "reward_std": 0.125, "rewards/correctness_reward_func": 0.5, "step": 60 }, { "completion_length": 554.953125, "epoch": 0.02420875086814168, "grad_norm": 0.0400390625, "kl": 0.00021707677660742775, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.28125, "reward_std": 0.0625, "rewards/correctness_reward_func": 0.28125, "step": 61 }, { "completion_length": 297.0625, "epoch": 0.024605615636471873, "grad_norm": 0.0004024505615234375, "kl": 0.000280580159596866, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.5, "reward_std": 0.0, "rewards/correctness_reward_func": 1.5, "step": 62 }, { "completion_length": 412.125, "epoch": 0.025002480404802064, "grad_norm": 0.08984375, "kl": 0.0002535973835620098, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.90625, "reward_std": 0.1875, "rewards/correctness_reward_func": 1.90625, "step": 63 }, { "completion_length": 416.046875, "epoch": 0.025399345173132255, "grad_norm": 0.087890625, "kl": 0.00024055263565969653, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.71875, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.71875, "step": 64 }, { "completion_length": 267.140625, "epoch": 0.025796209941462447, "grad_norm": 0.000850677490234375, "kl": 0.0006130204565124586, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/correctness_reward_func": 1.75, "step": 65 }, { "completion_length": 402.5625, "epoch": 0.026193074709792638, "grad_norm": 0.0986328125, "kl": 0.0002838912623701617, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.3125, "reward_std": 0.19716878235340118, "rewards/correctness_reward_func": 1.3125, "step": 66 }, { "completion_length": 466.6875, "epoch": 0.02658993947812283, "grad_norm": 0.08349609375, "kl": 0.0002692481575650163, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.0, "reward_std": 0.25, "rewards/correctness_reward_func": 1.0, "step": 67 }, { "completion_length": 445.171875, "epoch": 0.02698680424645302, "grad_norm": 0.177734375, "kl": 0.0005885176433366723, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.28125, "reward_std": 0.3125, "rewards/correctness_reward_func": 1.28125, "step": 68 }, { "completion_length": 530.484375, "epoch": 0.027383669014783212, "grad_norm": 0.1162109375, "kl": 0.0002777522022370249, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.9375, "reward_std": 0.25, "rewards/correctness_reward_func": 0.9375, "step": 69 }, { "completion_length": 360.640625, "epoch": 0.027780533783113404, "grad_norm": 0.1171875, "kl": 0.000590253112022765, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.6875, "reward_std": 0.125, "rewards/correctness_reward_func": 1.6875, "step": 70 }, { "completion_length": 481.71875, "epoch": 0.028177398551443595, "grad_norm": 0.061767578125, "kl": 0.0002587015369499568, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.71875, "reward_std": 0.0625, "rewards/correctness_reward_func": 0.71875, "step": 71 }, { "completion_length": 539.5, "epoch": 0.028574263319773786, "grad_norm": 0.11767578125, "kl": 0.00022344068929669447, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.125, "reward_std": 0.39433756470680237, "rewards/correctness_reward_func": 1.125, "step": 72 }, { "completion_length": 331.703125, "epoch": 0.028971128088103978, "grad_norm": 0.000308990478515625, "kl": 0.0002874392084777355, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/correctness_reward_func": 1.75, "step": 73 }, { "completion_length": 439.34375, "epoch": 0.02936799285643417, "grad_norm": 0.07177734375, "kl": 0.00026139178953599185, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.5625, "reward_std": 0.125, "rewards/correctness_reward_func": 1.5625, "step": 74 }, { "completion_length": 448.328125, "epoch": 0.02976485762476436, "grad_norm": 0.1455078125, "kl": 0.00037807323678862303, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.625, "reward_std": 0.26933756470680237, "rewards/correctness_reward_func": 1.625, "step": 75 }, { "completion_length": 271.0625, "epoch": 0.030161722393094552, "grad_norm": 0.000728607177734375, "kl": 0.0005262854028842412, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.875, "reward_std": 0.0, "rewards/correctness_reward_func": 1.875, "step": 76 }, { "completion_length": 348.4375, "epoch": 0.030558587161424743, "grad_norm": 0.11279296875, "kl": 0.0004181276017334312, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.71875, "reward_std": 0.1875, "rewards/correctness_reward_func": 1.71875, "step": 77 }, { "completion_length": 526.40625, "epoch": 0.030955451929754935, "grad_norm": 0.1318359375, "kl": 0.0003622628501034342, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.5625, "reward_std": 0.41367512941360474, "rewards/correctness_reward_func": 0.5625, "step": 78 }, { "completion_length": 308.1875, "epoch": 0.03135231669808513, "grad_norm": 0.171875, "kl": 0.00046880038280505687, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.46875, "reward_std": 0.13466878235340118, "rewards/correctness_reward_func": 1.46875, "step": 79 }, { "completion_length": 473.03125, "epoch": 0.03174918146641532, "grad_norm": 0.279296875, "kl": 0.0005240375467110425, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.0625, "reward_std": 0.5721687823534012, "rewards/correctness_reward_func": 1.0625, "step": 80 }, { "completion_length": 415.5, "epoch": 0.03214604623474551, "grad_norm": 0.1865234375, "kl": 0.000653728457109537, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.25, "reward_std": 0.4471687823534012, "rewards/correctness_reward_func": 1.25, "step": 81 }, { "completion_length": 438.625, "epoch": 0.0325429110030757, "grad_norm": 0.11572265625, "kl": 0.0006323889683699235, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.21875, "reward_std": 0.33183756470680237, "rewards/correctness_reward_func": 1.21875, "step": 82 }, { "completion_length": 600.796875, "epoch": 0.032939775771405895, "grad_norm": 0.19921875, "kl": 0.00043308708700351417, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.375, "reward_std": 0.25, "rewards/correctness_reward_func": 0.375, "step": 83 }, { "completion_length": 414.65625, "epoch": 0.03333664053973608, "grad_norm": 0.080078125, "kl": 0.0003923129916074686, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.34375, "reward_std": 0.2596687823534012, "rewards/correctness_reward_func": 1.34375, "step": 84 }, { "completion_length": 409.6875, "epoch": 0.03373350530806628, "grad_norm": 0.17578125, "kl": 0.0005655539498548023, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.625, "reward_std": 0.3221687823534012, "rewards/correctness_reward_func": 1.625, "step": 85 }, { "completion_length": 388.9375, "epoch": 0.034130370076396466, "grad_norm": 0.07666015625, "kl": 0.00030672067805426195, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.4375, "reward_std": 0.07216878235340118, "rewards/correctness_reward_func": 1.4375, "step": 86 }, { "completion_length": 377.5625, "epoch": 0.03452723484472666, "grad_norm": 0.00054931640625, "kl": 0.0005226589782978408, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.875, "reward_std": 0.0, "rewards/correctness_reward_func": 1.875, "step": 87 }, { "completion_length": 403.59375, "epoch": 0.03492409961305685, "grad_norm": 0.09521484375, "kl": 0.0003179389132128563, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.25, "reward_std": 0.14433756470680237, "rewards/correctness_reward_func": 1.25, "step": 88 }, { "completion_length": 433.5, "epoch": 0.03532096438138704, "grad_norm": 0.1513671875, "kl": 0.0005926354933762923, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.46875, "reward_std": 0.0625, "rewards/correctness_reward_func": 1.46875, "step": 89 }, { "completion_length": 581.0625, "epoch": 0.03571782914971723, "grad_norm": 0.18359375, "kl": 0.00041846034582704306, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.84375, "reward_std": 0.3846687823534012, "rewards/correctness_reward_func": 0.84375, "step": 90 }, { "completion_length": 545.015625, "epoch": 0.036114693918047426, "grad_norm": 0.236328125, "kl": 0.0004266584510332905, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.96875, "reward_std": 0.5096687823534012, "rewards/correctness_reward_func": 0.96875, "step": 91 }, { "completion_length": 454.59375, "epoch": 0.036511558686377614, "grad_norm": 0.16015625, "kl": 0.00039287004256038927, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.96875, "reward_std": 0.3846687823534012, "rewards/correctness_reward_func": 0.96875, "step": 92 }, { "completion_length": 450.28125, "epoch": 0.03690842345470781, "grad_norm": 0.1318359375, "kl": 0.0003956604123231955, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.25, "reward_std": 0.25, "rewards/correctness_reward_func": 1.25, "step": 93 }, { "completion_length": 606.96875, "epoch": 0.037305288223038, "grad_norm": 0.1650390625, "kl": 0.00025138623459497467, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.3125, "reward_std": 0.39433756470680237, "rewards/correctness_reward_func": 0.3125, "step": 94 }, { "completion_length": 486.15625, "epoch": 0.03770215299136819, "grad_norm": 0.1552734375, "kl": 0.00036627210283768363, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/correctness_reward_func": 0.8125, "step": 95 }, { "completion_length": 442.8125, "epoch": 0.03809901775969838, "grad_norm": 0.09521484375, "kl": 0.0002713065696298145, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.59375, "reward_std": 0.1875, "rewards/correctness_reward_func": 1.59375, "step": 96 }, { "completion_length": 511.015625, "epoch": 0.038495882528028574, "grad_norm": 0.10205078125, "kl": 0.0002741093012446072, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.53125, "reward_std": 0.3125, "rewards/correctness_reward_func": 1.53125, "step": 97 }, { "completion_length": 510.390625, "epoch": 0.03889274729635877, "grad_norm": 0.09814453125, "kl": 0.00027243237127549946, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.8125, "reward_std": 0.26933756470680237, "rewards/correctness_reward_func": 1.8125, "step": 98 }, { "completion_length": 354.9375, "epoch": 0.03928961206468896, "grad_norm": 0.0927734375, "kl": 0.0004462298093130812, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.65625, "reward_std": 0.1875, "rewards/correctness_reward_func": 1.65625, "step": 99 }, { "completion_length": 450.296875, "epoch": 0.03968647683301915, "grad_norm": 0.06298828125, "kl": 0.0003486628302198369, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.53125, "reward_std": 0.1875, "rewards/correctness_reward_func": 1.53125, "step": 100 } ], "logging_steps": 1, "max_steps": 503800, "num_input_tokens_seen": 0, "num_train_epochs": 200, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }