| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.03968647683301915, | |
| "eval_steps": 500, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 221.46875, | |
| "epoch": 0.0003968647683301915, | |
| "grad_norm": 0.1240234375, | |
| "kl": 0.0, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.875, | |
| "reward_std": 0.125, | |
| "rewards/correctness_reward_func": 0.875, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 181.71875, | |
| "epoch": 0.000793729536660383, | |
| "grad_norm": 0.00052642822265625, | |
| "kl": 0.00021356809156714007, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.125, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 1.125, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 279.875, | |
| "epoch": 0.0011905943049905744, | |
| "grad_norm": 0.380859375, | |
| "kl": 0.0001963738031918183, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.40625, | |
| "reward_std": 0.13466878235340118, | |
| "rewards/correctness_reward_func": 1.40625, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 231.625, | |
| "epoch": 0.001587459073320766, | |
| "grad_norm": 0.068359375, | |
| "kl": 0.00031982854125089943, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.53125, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.53125, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 271.140625, | |
| "epoch": 0.0019843238416509573, | |
| "grad_norm": 0.00013446807861328125, | |
| "kl": 0.00010319267312297598, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 294.65625, | |
| "epoch": 0.0023811886099811487, | |
| "grad_norm": 0.04248046875, | |
| "kl": 0.0001881300595414359, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.34375, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.34375, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 279.25, | |
| "epoch": 0.0027780533783113405, | |
| "grad_norm": 0.000713348388671875, | |
| "kl": 0.0002071384915325325, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.125, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 1.125, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 287.125, | |
| "epoch": 0.003174918146641532, | |
| "grad_norm": 0.0003032684326171875, | |
| "kl": 0.0001682171678112354, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 259.0625, | |
| "epoch": 0.0035717829149717233, | |
| "grad_norm": 0.1591796875, | |
| "kl": 0.0002609610164654441, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.34375, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.34375, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 380.171875, | |
| "epoch": 0.003968647683301915, | |
| "grad_norm": 0.0654296875, | |
| "kl": 0.0002176924463128671, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.34375, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.34375, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 489.640625, | |
| "epoch": 0.004365512451632106, | |
| "grad_norm": 0.032958984375, | |
| "kl": 0.00015466208969883155, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.21875, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.21875, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 332.609375, | |
| "epoch": 0.0047623772199622974, | |
| "grad_norm": 0.056640625, | |
| "kl": 0.00017187761113746092, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.90625, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.90625, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 366.828125, | |
| "epoch": 0.00515924198829249, | |
| "grad_norm": 0.00021648406982421875, | |
| "kl": 0.00014664832269772887, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 1.75, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 232.5, | |
| "epoch": 0.005556106756622681, | |
| "grad_norm": 0.0005340576171875, | |
| "kl": 0.000275460712146014, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 294.359375, | |
| "epoch": 0.0059529715249528724, | |
| "grad_norm": 0.087890625, | |
| "kl": 0.00027155900897923857, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_std": 0.19716878235340118, | |
| "rewards/correctness_reward_func": 1.0, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 384.578125, | |
| "epoch": 0.006349836293283064, | |
| "grad_norm": 0.0634765625, | |
| "kl": 0.00022811047529103234, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.53125, | |
| "reward_std": 0.13466878235340118, | |
| "rewards/correctness_reward_func": 1.53125, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 334.09375, | |
| "epoch": 0.006746701061613255, | |
| "grad_norm": 0.057861328125, | |
| "kl": 0.00027415682779974304, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.53125, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.53125, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 375.921875, | |
| "epoch": 0.007143565829943447, | |
| "grad_norm": 0.08642578125, | |
| "kl": 0.0001596472029632423, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.78125, | |
| "reward_std": 0.13466878235340118, | |
| "rewards/correctness_reward_func": 1.78125, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 345.203125, | |
| "epoch": 0.007540430598273638, | |
| "grad_norm": 0.000461578369140625, | |
| "kl": 0.00024132111502694897, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 418.296875, | |
| "epoch": 0.00793729536660383, | |
| "grad_norm": 0.251953125, | |
| "kl": 0.0002591940647107549, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.0625, | |
| "reward_std": 0.25, | |
| "rewards/correctness_reward_func": 1.0625, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 441.0, | |
| "epoch": 0.00833416013493402, | |
| "grad_norm": 0.240234375, | |
| "kl": 0.00021814405772602186, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_std": 0.3221687823534012, | |
| "rewards/correctness_reward_func": 1.25, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 349.15625, | |
| "epoch": 0.008731024903264212, | |
| "grad_norm": 0.0947265625, | |
| "kl": 0.00023298784799408168, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.4375, | |
| "reward_std": 0.125, | |
| "rewards/correctness_reward_func": 1.4375, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 375.90625, | |
| "epoch": 0.009127889671594403, | |
| "grad_norm": 0.064453125, | |
| "kl": 0.0001741187043080572, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.375, | |
| "reward_std": 0.19716878235340118, | |
| "rewards/correctness_reward_func": 1.375, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 383.203125, | |
| "epoch": 0.009524754439924595, | |
| "grad_norm": 0.0732421875, | |
| "kl": 0.00014529807594954036, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.9375, | |
| "reward_std": 0.125, | |
| "rewards/correctness_reward_func": 1.9375, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 334.6875, | |
| "epoch": 0.009921619208254788, | |
| "grad_norm": 0.06005859375, | |
| "kl": 0.00016799916193122044, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.9375, | |
| "reward_std": 0.125, | |
| "rewards/correctness_reward_func": 1.9375, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 405.140625, | |
| "epoch": 0.01031848397658498, | |
| "grad_norm": 0.0245361328125, | |
| "kl": 0.00020157176550128497, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.53125, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.53125, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 374.28125, | |
| "epoch": 0.01071534874491517, | |
| "grad_norm": 0.1796875, | |
| "kl": 0.00029869095305912197, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.19716878235340118, | |
| "rewards/correctness_reward_func": 0.9375, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 426.484375, | |
| "epoch": 0.011112213513245362, | |
| "grad_norm": 0.43359375, | |
| "kl": 0.0002508295456209453, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.4471687823534012, | |
| "rewards/correctness_reward_func": 0.9375, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 432.21875, | |
| "epoch": 0.011509078281575554, | |
| "grad_norm": 0.2578125, | |
| "kl": 0.00018439985433360562, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.03125, | |
| "reward_std": 0.1875, | |
| "rewards/correctness_reward_func": 1.03125, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 610.609375, | |
| "epoch": 0.011905943049905745, | |
| "grad_norm": 0.162109375, | |
| "kl": 9.109115489991382e-05, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.40625, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 0.40625, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 411.140625, | |
| "epoch": 0.012302807818235936, | |
| "grad_norm": 0.12890625, | |
| "kl": 0.00014656526946055237, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.32216876745224, | |
| "rewards/correctness_reward_func": 0.8125, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 371.359375, | |
| "epoch": 0.012699672586566128, | |
| "grad_norm": 0.053955078125, | |
| "kl": 0.0001413824884366477, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.71875, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.71875, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 360.6875, | |
| "epoch": 0.013096537354896319, | |
| "grad_norm": 0.09375, | |
| "kl": 0.00014583250776922796, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.96875, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.96875, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 256.140625, | |
| "epoch": 0.01349340212322651, | |
| "grad_norm": 0.0003681182861328125, | |
| "kl": 0.00021255770843708888, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.875, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 1.875, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 298.71875, | |
| "epoch": 0.013890266891556702, | |
| "grad_norm": 0.05712890625, | |
| "kl": 0.00023778366448823363, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.84375, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.84375, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 397.203125, | |
| "epoch": 0.014287131659886893, | |
| "grad_norm": 0.0693359375, | |
| "kl": 0.0001875234411272686, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 0.96875, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 476.59375, | |
| "epoch": 0.014683996428217085, | |
| "grad_norm": 0.1982421875, | |
| "kl": 0.0002962429862236604, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.15625, | |
| "reward_std": 0.45683756470680237, | |
| "rewards/correctness_reward_func": 1.15625, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 380.046875, | |
| "epoch": 0.015080861196547276, | |
| "grad_norm": 0.2177734375, | |
| "kl": 0.00021642427236656658, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_std": 0.3221687823534012, | |
| "rewards/correctness_reward_func": 1.25, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 319.671875, | |
| "epoch": 0.015477725964877467, | |
| "grad_norm": 0.1787109375, | |
| "kl": 0.0002720155207498465, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_std": 0.19716878235340118, | |
| "rewards/correctness_reward_func": 1.0, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 527.625, | |
| "epoch": 0.01587459073320766, | |
| "grad_norm": 0.11474609375, | |
| "kl": 0.0001258815245819278, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.625, | |
| "reward_std": 0.125, | |
| "rewards/correctness_reward_func": 0.625, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 326.09375, | |
| "epoch": 0.01627145550153785, | |
| "grad_norm": 0.0634765625, | |
| "kl": 0.00018829756663762964, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.78125, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.78125, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 362.1875, | |
| "epoch": 0.01666832026986804, | |
| "grad_norm": 0.042724609375, | |
| "kl": 0.00019165025150869042, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.71875, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.71875, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 297.5625, | |
| "epoch": 0.017065185038198233, | |
| "grad_norm": 0.000431060791015625, | |
| "kl": 0.00027088051137980074, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 361.09375, | |
| "epoch": 0.017462049806528424, | |
| "grad_norm": 0.0712890625, | |
| "kl": 0.00019505245654727332, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.96875, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.96875, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 349.140625, | |
| "epoch": 0.017858914574858616, | |
| "grad_norm": 0.08984375, | |
| "kl": 0.000281722481304314, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_std": 0.125, | |
| "rewards/correctness_reward_func": 1.0, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 454.375, | |
| "epoch": 0.018255779343188807, | |
| "grad_norm": 0.203125, | |
| "kl": 0.0002504781477910001, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.15625, | |
| "reward_std": 0.33183756470680237, | |
| "rewards/correctness_reward_func": 1.15625, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 417.984375, | |
| "epoch": 0.018652644111519, | |
| "grad_norm": 0.57421875, | |
| "kl": 0.0007036441420495976, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.33183756470680237, | |
| "rewards/correctness_reward_func": 0.96875, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 403.5625, | |
| "epoch": 0.01904950887984919, | |
| "grad_norm": 0.2109375, | |
| "kl": 0.00022189919036463834, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.28125, | |
| "reward_std": 0.2596687823534012, | |
| "rewards/correctness_reward_func": 1.28125, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 578.09375, | |
| "epoch": 0.019446373648179385, | |
| "grad_norm": 0.193359375, | |
| "kl": 0.00016415365462307818, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.5, | |
| "reward_std": 0.3221687823534012, | |
| "rewards/correctness_reward_func": 0.5, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 388.46875, | |
| "epoch": 0.019843238416509576, | |
| "grad_norm": 0.2041015625, | |
| "kl": 0.00029624684611917473, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.625, | |
| "reward_std": 0.19716878235340118, | |
| "rewards/correctness_reward_func": 0.625, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 393.546875, | |
| "epoch": 0.020240103184839767, | |
| "grad_norm": 0.06201171875, | |
| "kl": 0.000217016367969336, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.4375, | |
| "reward_std": 0.125, | |
| "rewards/correctness_reward_func": 1.4375, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 483.203125, | |
| "epoch": 0.02063696795316996, | |
| "grad_norm": 0.12109375, | |
| "kl": 0.00021450840722536668, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.46875, | |
| "reward_std": 0.1875, | |
| "rewards/correctness_reward_func": 1.46875, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 333.375, | |
| "epoch": 0.02103383272150015, | |
| "grad_norm": 0.0003528594970703125, | |
| "kl": 0.0002801330847432837, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.625, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 1.625, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 327.609375, | |
| "epoch": 0.02143069748983034, | |
| "grad_norm": 0.05908203125, | |
| "kl": 0.0002553274571255315, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.5625, | |
| "reward_std": 0.125, | |
| "rewards/correctness_reward_func": 1.5625, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 313.484375, | |
| "epoch": 0.021827562258160533, | |
| "grad_norm": 0.05126953125, | |
| "kl": 0.00040918682134361006, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.4375, | |
| "reward_std": 0.07216878235340118, | |
| "rewards/correctness_reward_func": 1.4375, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 587.828125, | |
| "epoch": 0.022224427026490724, | |
| "grad_norm": 0.2421875, | |
| "kl": 0.00024972553364932537, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.59375, | |
| "reward_std": 0.5290063470602036, | |
| "rewards/correctness_reward_func": 0.59375, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 397.625, | |
| "epoch": 0.022621291794820916, | |
| "grad_norm": 0.1962890625, | |
| "kl": 0.0003616545727709308, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.125, | |
| "reward_std": 0.26933756470680237, | |
| "rewards/correctness_reward_func": 1.125, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 493.171875, | |
| "epoch": 0.023018156563151107, | |
| "grad_norm": 0.146484375, | |
| "kl": 0.00021447856852319092, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.1875, | |
| "reward_std": 0.34150634706020355, | |
| "rewards/correctness_reward_func": 1.1875, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 505.140625, | |
| "epoch": 0.0234150213314813, | |
| "grad_norm": 0.173828125, | |
| "kl": 0.00036706905302708037, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.6875, | |
| "reward_std": 0.3221687823534012, | |
| "rewards/correctness_reward_func": 0.6875, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 565.90625, | |
| "epoch": 0.02381188609981149, | |
| "grad_norm": 0.10107421875, | |
| "kl": 0.0002694212962524034, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.5, | |
| "reward_std": 0.125, | |
| "rewards/correctness_reward_func": 0.5, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 554.953125, | |
| "epoch": 0.02420875086814168, | |
| "grad_norm": 0.0400390625, | |
| "kl": 0.00021707677660742775, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.28125, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 0.28125, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 297.0625, | |
| "epoch": 0.024605615636471873, | |
| "grad_norm": 0.0004024505615234375, | |
| "kl": 0.000280580159596866, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 1.5, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 412.125, | |
| "epoch": 0.025002480404802064, | |
| "grad_norm": 0.08984375, | |
| "kl": 0.0002535973835620098, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.90625, | |
| "reward_std": 0.1875, | |
| "rewards/correctness_reward_func": 1.90625, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 416.046875, | |
| "epoch": 0.025399345173132255, | |
| "grad_norm": 0.087890625, | |
| "kl": 0.00024055263565969653, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.71875, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.71875, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 267.140625, | |
| "epoch": 0.025796209941462447, | |
| "grad_norm": 0.000850677490234375, | |
| "kl": 0.0006130204565124586, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 1.75, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 402.5625, | |
| "epoch": 0.026193074709792638, | |
| "grad_norm": 0.0986328125, | |
| "kl": 0.0002838912623701617, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.3125, | |
| "reward_std": 0.19716878235340118, | |
| "rewards/correctness_reward_func": 1.3125, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 466.6875, | |
| "epoch": 0.02658993947812283, | |
| "grad_norm": 0.08349609375, | |
| "kl": 0.0002692481575650163, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_std": 0.25, | |
| "rewards/correctness_reward_func": 1.0, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 445.171875, | |
| "epoch": 0.02698680424645302, | |
| "grad_norm": 0.177734375, | |
| "kl": 0.0005885176433366723, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.28125, | |
| "reward_std": 0.3125, | |
| "rewards/correctness_reward_func": 1.28125, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 530.484375, | |
| "epoch": 0.027383669014783212, | |
| "grad_norm": 0.1162109375, | |
| "kl": 0.0002777522022370249, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.25, | |
| "rewards/correctness_reward_func": 0.9375, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 360.640625, | |
| "epoch": 0.027780533783113404, | |
| "grad_norm": 0.1171875, | |
| "kl": 0.000590253112022765, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.6875, | |
| "reward_std": 0.125, | |
| "rewards/correctness_reward_func": 1.6875, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 481.71875, | |
| "epoch": 0.028177398551443595, | |
| "grad_norm": 0.061767578125, | |
| "kl": 0.0002587015369499568, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 0.71875, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 539.5, | |
| "epoch": 0.028574263319773786, | |
| "grad_norm": 0.11767578125, | |
| "kl": 0.00022344068929669447, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.125, | |
| "reward_std": 0.39433756470680237, | |
| "rewards/correctness_reward_func": 1.125, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 331.703125, | |
| "epoch": 0.028971128088103978, | |
| "grad_norm": 0.000308990478515625, | |
| "kl": 0.0002874392084777355, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 1.75, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 439.34375, | |
| "epoch": 0.02936799285643417, | |
| "grad_norm": 0.07177734375, | |
| "kl": 0.00026139178953599185, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.5625, | |
| "reward_std": 0.125, | |
| "rewards/correctness_reward_func": 1.5625, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 448.328125, | |
| "epoch": 0.02976485762476436, | |
| "grad_norm": 0.1455078125, | |
| "kl": 0.00037807323678862303, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.625, | |
| "reward_std": 0.26933756470680237, | |
| "rewards/correctness_reward_func": 1.625, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 271.0625, | |
| "epoch": 0.030161722393094552, | |
| "grad_norm": 0.000728607177734375, | |
| "kl": 0.0005262854028842412, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.875, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 1.875, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 348.4375, | |
| "epoch": 0.030558587161424743, | |
| "grad_norm": 0.11279296875, | |
| "kl": 0.0004181276017334312, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.71875, | |
| "reward_std": 0.1875, | |
| "rewards/correctness_reward_func": 1.71875, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 526.40625, | |
| "epoch": 0.030955451929754935, | |
| "grad_norm": 0.1318359375, | |
| "kl": 0.0003622628501034342, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.5625, | |
| "reward_std": 0.41367512941360474, | |
| "rewards/correctness_reward_func": 0.5625, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 308.1875, | |
| "epoch": 0.03135231669808513, | |
| "grad_norm": 0.171875, | |
| "kl": 0.00046880038280505687, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.46875, | |
| "reward_std": 0.13466878235340118, | |
| "rewards/correctness_reward_func": 1.46875, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 473.03125, | |
| "epoch": 0.03174918146641532, | |
| "grad_norm": 0.279296875, | |
| "kl": 0.0005240375467110425, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.0625, | |
| "reward_std": 0.5721687823534012, | |
| "rewards/correctness_reward_func": 1.0625, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 415.5, | |
| "epoch": 0.03214604623474551, | |
| "grad_norm": 0.1865234375, | |
| "kl": 0.000653728457109537, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_std": 0.4471687823534012, | |
| "rewards/correctness_reward_func": 1.25, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 438.625, | |
| "epoch": 0.0325429110030757, | |
| "grad_norm": 0.11572265625, | |
| "kl": 0.0006323889683699235, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.21875, | |
| "reward_std": 0.33183756470680237, | |
| "rewards/correctness_reward_func": 1.21875, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 600.796875, | |
| "epoch": 0.032939775771405895, | |
| "grad_norm": 0.19921875, | |
| "kl": 0.00043308708700351417, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.375, | |
| "reward_std": 0.25, | |
| "rewards/correctness_reward_func": 0.375, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 414.65625, | |
| "epoch": 0.03333664053973608, | |
| "grad_norm": 0.080078125, | |
| "kl": 0.0003923129916074686, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.34375, | |
| "reward_std": 0.2596687823534012, | |
| "rewards/correctness_reward_func": 1.34375, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 409.6875, | |
| "epoch": 0.03373350530806628, | |
| "grad_norm": 0.17578125, | |
| "kl": 0.0005655539498548023, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.625, | |
| "reward_std": 0.3221687823534012, | |
| "rewards/correctness_reward_func": 1.625, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 388.9375, | |
| "epoch": 0.034130370076396466, | |
| "grad_norm": 0.07666015625, | |
| "kl": 0.00030672067805426195, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.4375, | |
| "reward_std": 0.07216878235340118, | |
| "rewards/correctness_reward_func": 1.4375, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 377.5625, | |
| "epoch": 0.03452723484472666, | |
| "grad_norm": 0.00054931640625, | |
| "kl": 0.0005226589782978408, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.875, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 1.875, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 403.59375, | |
| "epoch": 0.03492409961305685, | |
| "grad_norm": 0.09521484375, | |
| "kl": 0.0003179389132128563, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_std": 0.14433756470680237, | |
| "rewards/correctness_reward_func": 1.25, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 433.5, | |
| "epoch": 0.03532096438138704, | |
| "grad_norm": 0.1513671875, | |
| "kl": 0.0005926354933762923, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.46875, | |
| "reward_std": 0.0625, | |
| "rewards/correctness_reward_func": 1.46875, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 581.0625, | |
| "epoch": 0.03571782914971723, | |
| "grad_norm": 0.18359375, | |
| "kl": 0.00041846034582704306, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.3846687823534012, | |
| "rewards/correctness_reward_func": 0.84375, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 545.015625, | |
| "epoch": 0.036114693918047426, | |
| "grad_norm": 0.236328125, | |
| "kl": 0.0004266584510332905, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.5096687823534012, | |
| "rewards/correctness_reward_func": 0.96875, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 454.59375, | |
| "epoch": 0.036511558686377614, | |
| "grad_norm": 0.16015625, | |
| "kl": 0.00039287004256038927, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.3846687823534012, | |
| "rewards/correctness_reward_func": 0.96875, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 450.28125, | |
| "epoch": 0.03690842345470781, | |
| "grad_norm": 0.1318359375, | |
| "kl": 0.0003956604123231955, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_std": 0.25, | |
| "rewards/correctness_reward_func": 1.25, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 606.96875, | |
| "epoch": 0.037305288223038, | |
| "grad_norm": 0.1650390625, | |
| "kl": 0.00025138623459497467, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.3125, | |
| "reward_std": 0.39433756470680237, | |
| "rewards/correctness_reward_func": 0.3125, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 486.15625, | |
| "epoch": 0.03770215299136819, | |
| "grad_norm": 0.1552734375, | |
| "kl": 0.00036627210283768363, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.19716878235340118, | |
| "rewards/correctness_reward_func": 0.8125, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 442.8125, | |
| "epoch": 0.03809901775969838, | |
| "grad_norm": 0.09521484375, | |
| "kl": 0.0002713065696298145, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.59375, | |
| "reward_std": 0.1875, | |
| "rewards/correctness_reward_func": 1.59375, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 511.015625, | |
| "epoch": 0.038495882528028574, | |
| "grad_norm": 0.10205078125, | |
| "kl": 0.0002741093012446072, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.53125, | |
| "reward_std": 0.3125, | |
| "rewards/correctness_reward_func": 1.53125, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 510.390625, | |
| "epoch": 0.03889274729635877, | |
| "grad_norm": 0.09814453125, | |
| "kl": 0.00027243237127549946, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.8125, | |
| "reward_std": 0.26933756470680237, | |
| "rewards/correctness_reward_func": 1.8125, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 354.9375, | |
| "epoch": 0.03928961206468896, | |
| "grad_norm": 0.0927734375, | |
| "kl": 0.0004462298093130812, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.65625, | |
| "reward_std": 0.1875, | |
| "rewards/correctness_reward_func": 1.65625, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 450.296875, | |
| "epoch": 0.03968647683301915, | |
| "grad_norm": 0.06298828125, | |
| "kl": 0.0003486628302198369, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.53125, | |
| "reward_std": 0.1875, | |
| "rewards/correctness_reward_func": 1.53125, | |
| "step": 100 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 503800, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 200, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |