| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.16, | |
| "eval_steps": 500, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 130.62890625, | |
| "epoch": 0.001587459073320766, | |
| "grad_norm": 0.95703125, | |
| "kl": 0.0, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 1.0234375, | |
| "reward_std": 0.620339147746563, | |
| "rewards/correctness_reward_func": 1.0234375, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 150.61328125, | |
| "epoch": 0.003174918146641532, | |
| "grad_norm": 0.62109375, | |
| "kl": 0.061338113620877266, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 1.2734375, | |
| "reward_std": 0.3329593911767006, | |
| "rewards/correctness_reward_func": 1.2734375, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 171.515625, | |
| "epoch": 0.0047623772199622974, | |
| "grad_norm": 0.609375, | |
| "kl": 0.08757861610502005, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 1.25, | |
| "reward_std": 0.20200317353010178, | |
| "rewards/correctness_reward_func": 1.25, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 179.25390625, | |
| "epoch": 0.006349836293283064, | |
| "grad_norm": 14.625, | |
| "kl": 0.6993596660904586, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0035, | |
| "reward": 1.265625, | |
| "reward_std": 0.2416265867650509, | |
| "rewards/correctness_reward_func": 1.265625, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 189.1015625, | |
| "epoch": 0.00793729536660383, | |
| "grad_norm": 0.484375, | |
| "kl": 0.053785258904099464, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 1.0703125, | |
| "reward_std": 0.2572515867650509, | |
| "rewards/correctness_reward_func": 1.0703125, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 234.6875, | |
| "epoch": 0.009524754439924595, | |
| "grad_norm": 0.59765625, | |
| "kl": 0.041797632817178965, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0002, | |
| "reward": 0.984375, | |
| "reward_std": 0.3895031735301018, | |
| "rewards/correctness_reward_func": 0.984375, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 190.66015625, | |
| "epoch": 0.011112213513245362, | |
| "grad_norm": 0.8359375, | |
| "kl": 0.05031784961465746, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 1.0, | |
| "reward_std": 0.4339609779417515, | |
| "rewards/correctness_reward_func": 1.0, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 203.046875, | |
| "epoch": 0.012699672586566128, | |
| "grad_norm": 0.52734375, | |
| "kl": 0.05922417342662811, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 1.0625, | |
| "reward_std": 0.34987976029515266, | |
| "rewards/correctness_reward_func": 1.0625, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 231.3203125, | |
| "epoch": 0.014287131659886893, | |
| "grad_norm": 0.412109375, | |
| "kl": 0.038438764633610845, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0002, | |
| "reward": 1.1015625, | |
| "reward_std": 0.3245859779417515, | |
| "rewards/correctness_reward_func": 1.1015625, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 228.765625, | |
| "epoch": 0.01587459073320766, | |
| "grad_norm": 0.466796875, | |
| "kl": 0.06533525174017996, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 1.0, | |
| "reward_std": 0.2777109779417515, | |
| "rewards/correctness_reward_func": 1.0, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 217.0546875, | |
| "epoch": 0.017462049806528424, | |
| "grad_norm": 0.388671875, | |
| "kl": 0.04172722063958645, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0002, | |
| "reward": 1.1796875, | |
| "reward_std": 0.2885015867650509, | |
| "rewards/correctness_reward_func": 1.1796875, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 279.97265625, | |
| "epoch": 0.01904950887984919, | |
| "grad_norm": 0.578125, | |
| "kl": 0.05215206788852811, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.984375, | |
| "reward_std": 0.36308756470680237, | |
| "rewards/correctness_reward_func": 0.984375, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 288.18359375, | |
| "epoch": 0.02063696795316996, | |
| "grad_norm": 0.419921875, | |
| "kl": 0.04899335908703506, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9140625, | |
| "reward_std": 0.32942036911845207, | |
| "rewards/correctness_reward_func": 0.9140625, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 280.36328125, | |
| "epoch": 0.022224427026490724, | |
| "grad_norm": 0.400390625, | |
| "kl": 0.0435270715970546, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8984375, | |
| "reward_std": 0.2572515830397606, | |
| "rewards/correctness_reward_func": 0.8984375, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 252.9453125, | |
| "epoch": 0.02381188609981149, | |
| "grad_norm": 0.99609375, | |
| "kl": 0.07505812356248498, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 1.0234375, | |
| "reward_std": 0.30300476029515266, | |
| "rewards/correctness_reward_func": 1.0234375, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 311.01171875, | |
| "epoch": 0.025399345173132255, | |
| "grad_norm": 0.4375, | |
| "kl": 0.09632595372386277, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0005, | |
| "reward": 0.9921875, | |
| "reward_std": 0.32104695588350296, | |
| "rewards/correctness_reward_func": 0.9921875, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 278.70703125, | |
| "epoch": 0.02698680424645302, | |
| "grad_norm": 0.478515625, | |
| "kl": 0.04982293304055929, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8671875, | |
| "reward_std": 0.34746256470680237, | |
| "rewards/correctness_reward_func": 0.8671875, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 276.03515625, | |
| "epoch": 0.028574263319773786, | |
| "grad_norm": 0.48046875, | |
| "kl": 0.058841129299253225, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.9140625, | |
| "reward_std": 0.34746256470680237, | |
| "rewards/correctness_reward_func": 0.9140625, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 299.88671875, | |
| "epoch": 0.030161722393094552, | |
| "grad_norm": 0.43359375, | |
| "kl": 0.05037166504189372, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.96875, | |
| "reward_std": 0.31862976029515266, | |
| "rewards/correctness_reward_func": 0.96875, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 302.046875, | |
| "epoch": 0.03174918146641532, | |
| "grad_norm": 0.4140625, | |
| "kl": 0.051571789546869695, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8515625, | |
| "reward_std": 0.37033915519714355, | |
| "rewards/correctness_reward_func": 0.8515625, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 384.6328125, | |
| "epoch": 0.03333664053973608, | |
| "grad_norm": 0.31640625, | |
| "kl": 0.04484600038267672, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6875, | |
| "reward_std": 0.2416265867650509, | |
| "rewards/correctness_reward_func": 0.6875, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 326.0078125, | |
| "epoch": 0.03492409961305685, | |
| "grad_norm": 0.4296875, | |
| "kl": 0.04701656638644636, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8984375, | |
| "reward_std": 0.4183359779417515, | |
| "rewards/correctness_reward_func": 0.8984375, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 298.75390625, | |
| "epoch": 0.036511558686377614, | |
| "grad_norm": 0.384765625, | |
| "kl": 0.05655176565051079, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.90625, | |
| "reward_std": 0.3270031735301018, | |
| "rewards/correctness_reward_func": 0.90625, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 371.8984375, | |
| "epoch": 0.03809901775969838, | |
| "grad_norm": 0.380859375, | |
| "kl": 0.06988459394779056, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8203125, | |
| "reward_std": 0.30783915147185326, | |
| "rewards/correctness_reward_func": 0.8203125, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 367.21484375, | |
| "epoch": 0.03968647683301915, | |
| "grad_norm": 0.400390625, | |
| "kl": 0.061523064388893545, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8203125, | |
| "reward_std": 0.2572515867650509, | |
| "rewards/correctness_reward_func": 0.8203125, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 317.98828125, | |
| "epoch": 0.04127393590633992, | |
| "grad_norm": 0.419921875, | |
| "kl": 0.05494794622063637, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 1.078125, | |
| "reward_std": 0.2909187823534012, | |
| "rewards/correctness_reward_func": 1.078125, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 359.47265625, | |
| "epoch": 0.04286139497966068, | |
| "grad_norm": 0.337890625, | |
| "kl": 0.052700593834742904, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.6953125, | |
| "reward_std": 0.3113781735301018, | |
| "rewards/correctness_reward_func": 0.6953125, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 359.359375, | |
| "epoch": 0.04444885405298145, | |
| "grad_norm": 0.392578125, | |
| "kl": 0.06072323571424931, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.6796875, | |
| "reward_std": 0.2620859779417515, | |
| "rewards/correctness_reward_func": 0.6796875, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 309.65625, | |
| "epoch": 0.046036313126302214, | |
| "grad_norm": 0.392578125, | |
| "kl": 0.06208631256595254, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7578125, | |
| "reward_std": 0.34746256470680237, | |
| "rewards/correctness_reward_func": 0.7578125, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 324.359375, | |
| "epoch": 0.04762377219962298, | |
| "grad_norm": 0.419921875, | |
| "kl": 0.054743261309340596, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7421875, | |
| "reward_std": 0.2752937823534012, | |
| "rewards/correctness_reward_func": 0.7421875, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 336.78125, | |
| "epoch": 0.049211231272943745, | |
| "grad_norm": 0.62890625, | |
| "kl": 0.09580778249073774, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0005, | |
| "reward": 0.9140625, | |
| "reward_std": 0.3113781735301018, | |
| "rewards/correctness_reward_func": 0.9140625, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 382.821875, | |
| "epoch": 1.032, | |
| "grad_norm": 0.498046875, | |
| "kl": 0.05092890365049243, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.5125, | |
| "reward_std": 0.3471687823534012, | |
| "rewards/correctness_reward_func": 0.5125, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 293.07421875, | |
| "epoch": 1.064, | |
| "grad_norm": 0.5546875, | |
| "kl": 0.10762864979915321, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0005, | |
| "reward": 0.875, | |
| "reward_std": 0.28254536911845207, | |
| "rewards/correctness_reward_func": 0.875, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 202.1875, | |
| "epoch": 1.096, | |
| "grad_norm": 2.03125, | |
| "kl": 0.1804919212590903, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0009, | |
| "reward": 0.953125, | |
| "reward_std": 0.43042195588350296, | |
| "rewards/correctness_reward_func": 0.953125, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 298.62890625, | |
| "epoch": 1.1280000000000001, | |
| "grad_norm": 0.85546875, | |
| "kl": 0.14085538033396006, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0007, | |
| "reward": 0.640625, | |
| "reward_std": 0.36792195588350296, | |
| "rewards/correctness_reward_func": 0.640625, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 274.4140625, | |
| "epoch": 1.16, | |
| "grad_norm": 0.59765625, | |
| "kl": 0.07066078553907573, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.859375, | |
| "reward_std": 0.4339609779417515, | |
| "rewards/correctness_reward_func": 0.859375, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 285.171875, | |
| "epoch": 1.192, | |
| "grad_norm": 0.58203125, | |
| "kl": 0.06347297015599906, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.796875, | |
| "reward_std": 0.4423343911767006, | |
| "rewards/correctness_reward_func": 0.796875, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 263.57421875, | |
| "epoch": 1.224, | |
| "grad_norm": 0.51953125, | |
| "kl": 0.0734753671567887, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.75, | |
| "reward_std": 0.4207531735301018, | |
| "rewards/correctness_reward_func": 0.75, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 348.9296875, | |
| "epoch": 1.256, | |
| "grad_norm": 0.625, | |
| "kl": 0.08903565420769155, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.65625, | |
| "reward_std": 0.3798343911767006, | |
| "rewards/correctness_reward_func": 0.65625, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 352.82421875, | |
| "epoch": 1.288, | |
| "grad_norm": 0.5390625, | |
| "kl": 0.06843947665765882, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.6484375, | |
| "reward_std": 0.4051281735301018, | |
| "rewards/correctness_reward_func": 0.6484375, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 329.1328125, | |
| "epoch": 1.32, | |
| "grad_norm": 0.46875, | |
| "kl": 0.057105657644569874, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.671875, | |
| "reward_std": 0.4520031735301018, | |
| "rewards/correctness_reward_func": 0.671875, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 322.95703125, | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 0.53515625, | |
| "kl": 0.05887855147011578, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.578125, | |
| "reward_std": 0.43042195588350296, | |
| "rewards/correctness_reward_func": 0.578125, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 373.6796875, | |
| "epoch": 1.384, | |
| "grad_norm": 0.375, | |
| "kl": 0.05490247346460819, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7265625, | |
| "reward_std": 0.3426281735301018, | |
| "rewards/correctness_reward_func": 0.7265625, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 338.2109375, | |
| "epoch": 1.416, | |
| "grad_norm": 0.474609375, | |
| "kl": 0.05778517574071884, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.5234375, | |
| "reward_std": 0.41479695960879326, | |
| "rewards/correctness_reward_func": 0.5234375, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 379.49609375, | |
| "epoch": 1.448, | |
| "grad_norm": 0.443359375, | |
| "kl": 0.052183745545335114, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.4609375, | |
| "reward_std": 0.3377937823534012, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 453.609375, | |
| "epoch": 1.48, | |
| "grad_norm": 0.37890625, | |
| "kl": 0.05021012085489929, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.484375, | |
| "reward_std": 0.39433756470680237, | |
| "rewards/correctness_reward_func": 0.484375, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 425.3515625, | |
| "epoch": 1.512, | |
| "grad_norm": 0.423828125, | |
| "kl": 0.0551034901291132, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.46875, | |
| "reward_std": 0.2777109779417515, | |
| "rewards/correctness_reward_func": 0.46875, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 436.1796875, | |
| "epoch": 1.544, | |
| "grad_norm": 0.373046875, | |
| "kl": 0.05335203884169459, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.4921875, | |
| "reward_std": 0.3245859779417515, | |
| "rewards/correctness_reward_func": 0.4921875, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 552.60546875, | |
| "epoch": 1.576, | |
| "grad_norm": 0.43359375, | |
| "kl": 0.05089649045839906, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.4765625, | |
| "reward_std": 0.3426281735301018, | |
| "rewards/correctness_reward_func": 0.4765625, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 509.71484375, | |
| "epoch": 1.608, | |
| "grad_norm": 0.474609375, | |
| "kl": 0.05733613413758576, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.4375, | |
| "reward_std": 0.43525634706020355, | |
| "rewards/correctness_reward_func": 0.4375, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 480.94921875, | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 0.375, | |
| "kl": 0.06065811449661851, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.34375, | |
| "reward_std": 0.3270031735301018, | |
| "rewards/correctness_reward_func": 0.34375, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 491.1640625, | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 0.390625, | |
| "kl": 0.060098053654655814, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.4375, | |
| "reward_std": 0.3846687786281109, | |
| "rewards/correctness_reward_func": 0.4375, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 464.2265625, | |
| "epoch": 1.704, | |
| "grad_norm": 0.435546875, | |
| "kl": 0.062199660344049335, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.4765625, | |
| "reward_std": 0.35713134706020355, | |
| "rewards/correctness_reward_func": 0.4765625, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 429.90625, | |
| "epoch": 1.736, | |
| "grad_norm": 12.625, | |
| "kl": 0.22791615827009082, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.734375, | |
| "reward_std": 0.3402109779417515, | |
| "rewards/correctness_reward_func": 0.734375, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 512.62109375, | |
| "epoch": 1.768, | |
| "grad_norm": 0.388671875, | |
| "kl": 0.059834773652255535, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7265625, | |
| "reward_std": 0.24404378235340118, | |
| "rewards/correctness_reward_func": 0.7265625, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 471.5859375, | |
| "epoch": 1.8, | |
| "grad_norm": 0.388671875, | |
| "kl": 0.07468604180030525, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.859375, | |
| "reward_std": 0.2055421955883503, | |
| "rewards/correctness_reward_func": 0.859375, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 446.046875, | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": 0.443359375, | |
| "kl": 0.06021555629558861, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.890625, | |
| "reward_std": 0.3221687823534012, | |
| "rewards/correctness_reward_func": 0.890625, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 536.68359375, | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": 0.333984375, | |
| "kl": 0.060477497056126595, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.609375, | |
| "reward_std": 0.24646097794175148, | |
| "rewards/correctness_reward_func": 0.609375, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 411.69140625, | |
| "epoch": 1.896, | |
| "grad_norm": 0.453125, | |
| "kl": 0.0779561479575932, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.9609375, | |
| "reward_std": 0.2620859779417515, | |
| "rewards/correctness_reward_func": 0.9609375, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 544.3671875, | |
| "epoch": 1.928, | |
| "grad_norm": 0.81640625, | |
| "kl": 0.10888050403445959, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0005, | |
| "reward": 0.5859375, | |
| "reward_std": 0.24404378235340118, | |
| "rewards/correctness_reward_func": 0.5859375, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 397.546875, | |
| "epoch": 1.96, | |
| "grad_norm": 0.6953125, | |
| "kl": 0.10245927749201655, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0005, | |
| "reward": 0.9375, | |
| "reward_std": 0.30058756470680237, | |
| "rewards/correctness_reward_func": 0.9375, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 533.109375, | |
| "epoch": 1.992, | |
| "grad_norm": 0.65234375, | |
| "kl": 0.09279383928515017, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0005, | |
| "reward": 0.640625, | |
| "reward_std": 0.28254536911845207, | |
| "rewards/correctness_reward_func": 0.640625, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 607.078125, | |
| "epoch": 2.0, | |
| "grad_norm": 0.2421875, | |
| "kl": 0.059856025502085686, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0001, | |
| "reward": 0.6875, | |
| "reward_std": 0.4471687823534012, | |
| "rewards/correctness_reward_func": 0.6875, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 706.515625, | |
| "epoch": 2.032, | |
| "grad_norm": 2.9375, | |
| "kl": 0.23451336496509612, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.21875, | |
| "reward_std": 0.19716878235340118, | |
| "rewards/correctness_reward_func": 0.21875, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 661.4453125, | |
| "epoch": 2.064, | |
| "grad_norm": 0.455078125, | |
| "kl": 0.08936465694569051, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.1796875, | |
| "reward_std": 0.0961671955883503, | |
| "rewards/correctness_reward_func": 0.1796875, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 644.8046875, | |
| "epoch": 2.096, | |
| "grad_norm": 0.287109375, | |
| "kl": 0.0694590169005096, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.2109375, | |
| "reward_std": 0.1454593911767006, | |
| "rewards/correctness_reward_func": 0.2109375, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 331.421875, | |
| "epoch": 2.128, | |
| "grad_norm": 0.5625, | |
| "kl": 0.10819920105859637, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0005, | |
| "reward": 0.7109375, | |
| "reward_std": 0.39192036911845207, | |
| "rewards/correctness_reward_func": 0.7109375, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 454.83984375, | |
| "epoch": 2.16, | |
| "grad_norm": 1.171875, | |
| "kl": 0.14896808750927448, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0007, | |
| "reward": 0.4453125, | |
| "reward_std": 0.2885015867650509, | |
| "rewards/correctness_reward_func": 0.4453125, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 474.54296875, | |
| "epoch": 2.192, | |
| "grad_norm": 0.56640625, | |
| "kl": 0.06995820580050349, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.6015625, | |
| "reward_std": 0.39192036911845207, | |
| "rewards/correctness_reward_func": 0.6015625, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 440.05859375, | |
| "epoch": 2.224, | |
| "grad_norm": 0.50390625, | |
| "kl": 0.0910127570386976, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0005, | |
| "reward": 0.46875, | |
| "reward_std": 0.39433756470680237, | |
| "rewards/correctness_reward_func": 0.46875, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 478.2890625, | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 0.419921875, | |
| "kl": 0.07039178418926895, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.515625, | |
| "reward_std": 0.3534187823534012, | |
| "rewards/correctness_reward_func": 0.515625, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 539.33203125, | |
| "epoch": 2.288, | |
| "grad_norm": 0.35546875, | |
| "kl": 0.0686434528324753, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.4453125, | |
| "reward_std": 0.2933359779417515, | |
| "rewards/correctness_reward_func": 0.4453125, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 480.2578125, | |
| "epoch": 2.32, | |
| "grad_norm": 0.326171875, | |
| "kl": 0.06706948089413345, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.40625, | |
| "reward_std": 0.3582531735301018, | |
| "rewards/correctness_reward_func": 0.40625, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 497.828125, | |
| "epoch": 2.352, | |
| "grad_norm": 0.486328125, | |
| "kl": 0.07771567534655333, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.4609375, | |
| "reward_std": 0.45925475656986237, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 540.3671875, | |
| "epoch": 2.384, | |
| "grad_norm": 0.330078125, | |
| "kl": 0.07498506712727249, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.359375, | |
| "reward_std": 0.2909187823534012, | |
| "rewards/correctness_reward_func": 0.359375, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 472.80859375, | |
| "epoch": 2.416, | |
| "grad_norm": 0.431640625, | |
| "kl": 0.06656011822633445, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.5, | |
| "reward_std": 0.3534187823534012, | |
| "rewards/correctness_reward_func": 0.5, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 517.7421875, | |
| "epoch": 2.448, | |
| "grad_norm": 0.64453125, | |
| "kl": 0.07896088412962854, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.375, | |
| "reward_std": 0.34504536911845207, | |
| "rewards/correctness_reward_func": 0.375, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 589.140625, | |
| "epoch": 2.48, | |
| "grad_norm": 1.1875, | |
| "kl": 0.12798133888281882, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0006, | |
| "reward": 0.3203125, | |
| "reward_std": 0.34746256470680237, | |
| "rewards/correctness_reward_func": 0.3203125, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 506.98828125, | |
| "epoch": 2.512, | |
| "grad_norm": 0.302734375, | |
| "kl": 0.06648093881085515, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0003, | |
| "reward": 0.2890625, | |
| "reward_std": 0.29817036911845207, | |
| "rewards/correctness_reward_func": 0.2890625, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 634.9140625, | |
| "epoch": 2.544, | |
| "grad_norm": 0.296875, | |
| "kl": 0.08733222424052656, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.265625, | |
| "reward_std": 0.2596687823534012, | |
| "rewards/correctness_reward_func": 0.265625, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 561.2734375, | |
| "epoch": 2.576, | |
| "grad_norm": 0.451171875, | |
| "kl": 0.08239166042767465, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.5078125, | |
| "reward_std": 0.45442036911845207, | |
| "rewards/correctness_reward_func": 0.5078125, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 535.79296875, | |
| "epoch": 2.608, | |
| "grad_norm": 0.99609375, | |
| "kl": 0.22772114095278084, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.3359375, | |
| "reward_std": 0.36550476029515266, | |
| "rewards/correctness_reward_func": 0.3359375, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 516.10546875, | |
| "epoch": 2.64, | |
| "grad_norm": 0.4765625, | |
| "kl": 0.12832795921713114, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0006, | |
| "reward": 0.390625, | |
| "reward_std": 0.2728765867650509, | |
| "rewards/correctness_reward_func": 0.390625, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 486.69140625, | |
| "epoch": 2.672, | |
| "grad_norm": 0.453125, | |
| "kl": 0.08598195551894605, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.4296875, | |
| "reward_std": 0.3377937823534012, | |
| "rewards/correctness_reward_func": 0.4296875, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 595.859375, | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 0.337890625, | |
| "kl": 0.08345729601569474, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.390625, | |
| "reward_std": 0.3089609779417515, | |
| "rewards/correctness_reward_func": 0.390625, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 613.12109375, | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 0.291015625, | |
| "kl": 0.0912326027173549, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0005, | |
| "reward": 0.2890625, | |
| "reward_std": 0.19958597794175148, | |
| "rewards/correctness_reward_func": 0.2890625, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 586.3984375, | |
| "epoch": 2.768, | |
| "grad_norm": 0.361328125, | |
| "kl": 0.08986557251773775, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.4609375, | |
| "reward_std": 0.3245859779417515, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 580.55859375, | |
| "epoch": 2.8, | |
| "grad_norm": 0.4140625, | |
| "kl": 0.08739295578561723, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.3671875, | |
| "reward_std": 0.2933359779417515, | |
| "rewards/correctness_reward_func": 0.3671875, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 545.07421875, | |
| "epoch": 2.832, | |
| "grad_norm": 0.30078125, | |
| "kl": 0.0942155783995986, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0005, | |
| "reward": 0.328125, | |
| "reward_std": 0.2512953653931618, | |
| "rewards/correctness_reward_func": 0.328125, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 507.109375, | |
| "epoch": 2.864, | |
| "grad_norm": 0.37109375, | |
| "kl": 0.08285854570567608, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.4140625, | |
| "reward_std": 0.31621256098151207, | |
| "rewards/correctness_reward_func": 0.4140625, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 548.953125, | |
| "epoch": 2.896, | |
| "grad_norm": 1.2109375, | |
| "kl": 0.17426068475469947, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0009, | |
| "reward": 0.5390625, | |
| "reward_std": 0.2752937823534012, | |
| "rewards/correctness_reward_func": 0.5390625, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 487.5078125, | |
| "epoch": 2.928, | |
| "grad_norm": 0.3671875, | |
| "kl": 0.08832010440528393, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.75, | |
| "reward_std": 0.3270031735301018, | |
| "rewards/correctness_reward_func": 0.75, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 464.93359375, | |
| "epoch": 2.96, | |
| "grad_norm": 0.625, | |
| "kl": 0.12189564970321953, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0006, | |
| "reward": 0.5625, | |
| "reward_std": 0.2645031735301018, | |
| "rewards/correctness_reward_func": 0.5625, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 474.10546875, | |
| "epoch": 2.992, | |
| "grad_norm": 0.30078125, | |
| "kl": 0.08207035693340003, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.75, | |
| "reward_std": 0.2596687823534012, | |
| "rewards/correctness_reward_func": 0.75, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 495.15625, | |
| "epoch": 3.0, | |
| "grad_norm": 0.107421875, | |
| "kl": 0.09875499829649925, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0001, | |
| "reward": 0.5, | |
| "reward_std": 0.25, | |
| "rewards/correctness_reward_func": 0.5, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 693.6640625, | |
| "epoch": 3.032, | |
| "grad_norm": 0.287109375, | |
| "kl": 0.13659441424533725, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0007, | |
| "reward": 0.1953125, | |
| "reward_std": 0.21279378235340118, | |
| "rewards/correctness_reward_func": 0.1953125, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 681.03125, | |
| "epoch": 3.064, | |
| "grad_norm": 0.255859375, | |
| "kl": 0.13894613785669208, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0007, | |
| "reward": 0.2578125, | |
| "reward_std": 0.1947515867650509, | |
| "rewards/correctness_reward_func": 0.2578125, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 722.29296875, | |
| "epoch": 3.096, | |
| "grad_norm": 0.330078125, | |
| "kl": 0.1400570566765964, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0007, | |
| "reward": 0.2109375, | |
| "reward_std": 0.2079593911767006, | |
| "rewards/correctness_reward_func": 0.2109375, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 634.5546875, | |
| "epoch": 3.128, | |
| "grad_norm": 0.287109375, | |
| "kl": 0.13448539143428206, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0007, | |
| "reward": 0.203125, | |
| "reward_std": 0.1923343911767006, | |
| "rewards/correctness_reward_func": 0.203125, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 406.96875, | |
| "epoch": 3.16, | |
| "grad_norm": 0.7109375, | |
| "kl": 0.13877984089776874, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0007, | |
| "reward": 0.4609375, | |
| "reward_std": 0.3558359779417515, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "step": 100 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 125800, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 200, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |