{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.16, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 130.62890625, "epoch": 0.001587459073320766, "grad_norm": 0.95703125, "kl": 0.0, "learning_rate": 1e-05, "loss": 0.0, "reward": 1.0234375, "reward_std": 0.620339147746563, "rewards/correctness_reward_func": 1.0234375, "step": 1 }, { "completion_length": 150.61328125, "epoch": 0.003174918146641532, "grad_norm": 0.62109375, "kl": 0.061338113620877266, "learning_rate": 1e-05, "loss": 0.0003, "reward": 1.2734375, "reward_std": 0.3329593911767006, "rewards/correctness_reward_func": 1.2734375, "step": 2 }, { "completion_length": 171.515625, "epoch": 0.0047623772199622974, "grad_norm": 0.609375, "kl": 0.08757861610502005, "learning_rate": 1e-05, "loss": 0.0004, "reward": 1.25, "reward_std": 0.20200317353010178, "rewards/correctness_reward_func": 1.25, "step": 3 }, { "completion_length": 179.25390625, "epoch": 0.006349836293283064, "grad_norm": 14.625, "kl": 0.6993596660904586, "learning_rate": 1e-05, "loss": 0.0035, "reward": 1.265625, "reward_std": 0.2416265867650509, "rewards/correctness_reward_func": 1.265625, "step": 4 }, { "completion_length": 189.1015625, "epoch": 0.00793729536660383, "grad_norm": 0.484375, "kl": 0.053785258904099464, "learning_rate": 1e-05, "loss": 0.0003, "reward": 1.0703125, "reward_std": 0.2572515867650509, "rewards/correctness_reward_func": 1.0703125, "step": 5 }, { "completion_length": 234.6875, "epoch": 0.009524754439924595, "grad_norm": 0.59765625, "kl": 0.041797632817178965, "learning_rate": 1e-05, "loss": 0.0002, "reward": 0.984375, "reward_std": 0.3895031735301018, "rewards/correctness_reward_func": 0.984375, "step": 6 }, { "completion_length": 190.66015625, "epoch": 0.011112213513245362, "grad_norm": 0.8359375, "kl": 0.05031784961465746, "learning_rate": 1e-05, "loss": 0.0003, "reward": 1.0, "reward_std": 0.4339609779417515, "rewards/correctness_reward_func": 1.0, "step": 7 }, { "completion_length": 203.046875, "epoch": 0.012699672586566128, "grad_norm": 0.52734375, "kl": 0.05922417342662811, "learning_rate": 1e-05, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.34987976029515266, "rewards/correctness_reward_func": 1.0625, "step": 8 }, { "completion_length": 231.3203125, "epoch": 0.014287131659886893, "grad_norm": 0.412109375, "kl": 0.038438764633610845, "learning_rate": 1e-05, "loss": 0.0002, "reward": 1.1015625, "reward_std": 0.3245859779417515, "rewards/correctness_reward_func": 1.1015625, "step": 9 }, { "completion_length": 228.765625, "epoch": 0.01587459073320766, "grad_norm": 0.466796875, "kl": 0.06533525174017996, "learning_rate": 1e-05, "loss": 0.0003, "reward": 1.0, "reward_std": 0.2777109779417515, "rewards/correctness_reward_func": 1.0, "step": 10 }, { "completion_length": 217.0546875, "epoch": 0.017462049806528424, "grad_norm": 0.388671875, "kl": 0.04172722063958645, "learning_rate": 1e-05, "loss": 0.0002, "reward": 1.1796875, "reward_std": 0.2885015867650509, "rewards/correctness_reward_func": 1.1796875, "step": 11 }, { "completion_length": 279.97265625, "epoch": 0.01904950887984919, "grad_norm": 0.578125, "kl": 0.05215206788852811, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.984375, "reward_std": 0.36308756470680237, "rewards/correctness_reward_func": 0.984375, "step": 12 }, { "completion_length": 288.18359375, "epoch": 0.02063696795316996, "grad_norm": 0.419921875, "kl": 0.04899335908703506, "learning_rate": 1e-05, "loss": 0.0002, "reward": 0.9140625, "reward_std": 0.32942036911845207, "rewards/correctness_reward_func": 0.9140625, "step": 13 }, { "completion_length": 280.36328125, "epoch": 0.022224427026490724, "grad_norm": 0.400390625, "kl": 0.0435270715970546, "learning_rate": 1e-05, "loss": 0.0002, "reward": 0.8984375, "reward_std": 0.2572515830397606, "rewards/correctness_reward_func": 0.8984375, "step": 14 }, { "completion_length": 252.9453125, "epoch": 0.02381188609981149, "grad_norm": 0.99609375, "kl": 0.07505812356248498, "learning_rate": 1e-05, "loss": 0.0004, "reward": 1.0234375, "reward_std": 0.30300476029515266, "rewards/correctness_reward_func": 1.0234375, "step": 15 }, { "completion_length": 311.01171875, "epoch": 0.025399345173132255, "grad_norm": 0.4375, "kl": 0.09632595372386277, "learning_rate": 1e-05, "loss": 0.0005, "reward": 0.9921875, "reward_std": 0.32104695588350296, "rewards/correctness_reward_func": 0.9921875, "step": 16 }, { "completion_length": 278.70703125, "epoch": 0.02698680424645302, "grad_norm": 0.478515625, "kl": 0.04982293304055929, "learning_rate": 1e-05, "loss": 0.0002, "reward": 0.8671875, "reward_std": 0.34746256470680237, "rewards/correctness_reward_func": 0.8671875, "step": 17 }, { "completion_length": 276.03515625, "epoch": 0.028574263319773786, "grad_norm": 0.48046875, "kl": 0.058841129299253225, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.9140625, "reward_std": 0.34746256470680237, "rewards/correctness_reward_func": 0.9140625, "step": 18 }, { "completion_length": 299.88671875, "epoch": 0.030161722393094552, "grad_norm": 0.43359375, "kl": 0.05037166504189372, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.31862976029515266, "rewards/correctness_reward_func": 0.96875, "step": 19 }, { "completion_length": 302.046875, "epoch": 0.03174918146641532, "grad_norm": 0.4140625, "kl": 0.051571789546869695, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.8515625, "reward_std": 0.37033915519714355, "rewards/correctness_reward_func": 0.8515625, "step": 20 }, { "completion_length": 384.6328125, "epoch": 0.03333664053973608, "grad_norm": 0.31640625, "kl": 0.04484600038267672, "learning_rate": 1e-05, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.2416265867650509, "rewards/correctness_reward_func": 0.6875, "step": 21 }, { "completion_length": 326.0078125, "epoch": 0.03492409961305685, "grad_norm": 0.4296875, "kl": 0.04701656638644636, "learning_rate": 1e-05, "loss": 0.0002, "reward": 0.8984375, "reward_std": 0.4183359779417515, "rewards/correctness_reward_func": 0.8984375, "step": 22 }, { "completion_length": 298.75390625, "epoch": 0.036511558686377614, "grad_norm": 0.384765625, "kl": 0.05655176565051079, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.3270031735301018, "rewards/correctness_reward_func": 0.90625, "step": 23 }, { "completion_length": 371.8984375, "epoch": 0.03809901775969838, "grad_norm": 0.380859375, "kl": 0.06988459394779056, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.8203125, "reward_std": 0.30783915147185326, "rewards/correctness_reward_func": 0.8203125, "step": 24 }, { "completion_length": 367.21484375, "epoch": 0.03968647683301915, "grad_norm": 0.400390625, "kl": 0.061523064388893545, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.8203125, "reward_std": 0.2572515867650509, "rewards/correctness_reward_func": 0.8203125, "step": 25 }, { "completion_length": 317.98828125, "epoch": 0.04127393590633992, "grad_norm": 0.419921875, "kl": 0.05494794622063637, "learning_rate": 1e-05, "loss": 0.0003, "reward": 1.078125, "reward_std": 0.2909187823534012, "rewards/correctness_reward_func": 1.078125, "step": 26 }, { "completion_length": 359.47265625, "epoch": 0.04286139497966068, "grad_norm": 0.337890625, "kl": 0.052700593834742904, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.6953125, "reward_std": 0.3113781735301018, "rewards/correctness_reward_func": 0.6953125, "step": 27 }, { "completion_length": 359.359375, "epoch": 0.04444885405298145, "grad_norm": 0.392578125, "kl": 0.06072323571424931, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.6796875, "reward_std": 0.2620859779417515, "rewards/correctness_reward_func": 0.6796875, "step": 28 }, { "completion_length": 309.65625, "epoch": 0.046036313126302214, "grad_norm": 0.392578125, "kl": 0.06208631256595254, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.7578125, "reward_std": 0.34746256470680237, "rewards/correctness_reward_func": 0.7578125, "step": 29 }, { "completion_length": 324.359375, "epoch": 0.04762377219962298, "grad_norm": 0.419921875, "kl": 0.054743261309340596, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.7421875, "reward_std": 0.2752937823534012, "rewards/correctness_reward_func": 0.7421875, "step": 30 }, { "completion_length": 336.78125, "epoch": 0.049211231272943745, "grad_norm": 0.62890625, "kl": 0.09580778249073774, "learning_rate": 1e-05, "loss": 0.0005, "reward": 0.9140625, "reward_std": 0.3113781735301018, "rewards/correctness_reward_func": 0.9140625, "step": 31 }, { "completion_length": 382.821875, "epoch": 1.032, "grad_norm": 0.498046875, "kl": 0.05092890365049243, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.5125, "reward_std": 0.3471687823534012, "rewards/correctness_reward_func": 0.5125, "step": 32 }, { "completion_length": 293.07421875, "epoch": 1.064, "grad_norm": 0.5546875, "kl": 0.10762864979915321, "learning_rate": 1e-05, "loss": 0.0005, "reward": 0.875, "reward_std": 0.28254536911845207, "rewards/correctness_reward_func": 0.875, "step": 33 }, { "completion_length": 202.1875, "epoch": 1.096, "grad_norm": 2.03125, "kl": 0.1804919212590903, "learning_rate": 1e-05, "loss": 0.0009, "reward": 0.953125, "reward_std": 0.43042195588350296, "rewards/correctness_reward_func": 0.953125, "step": 34 }, { "completion_length": 298.62890625, "epoch": 1.1280000000000001, "grad_norm": 0.85546875, "kl": 0.14085538033396006, "learning_rate": 1e-05, "loss": 0.0007, "reward": 0.640625, "reward_std": 0.36792195588350296, "rewards/correctness_reward_func": 0.640625, "step": 35 }, { "completion_length": 274.4140625, "epoch": 1.16, "grad_norm": 0.59765625, "kl": 0.07066078553907573, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.859375, "reward_std": 0.4339609779417515, "rewards/correctness_reward_func": 0.859375, "step": 36 }, { "completion_length": 285.171875, "epoch": 1.192, "grad_norm": 0.58203125, "kl": 0.06347297015599906, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.796875, "reward_std": 0.4423343911767006, "rewards/correctness_reward_func": 0.796875, "step": 37 }, { "completion_length": 263.57421875, "epoch": 1.224, "grad_norm": 0.51953125, "kl": 0.0734753671567887, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.75, "reward_std": 0.4207531735301018, "rewards/correctness_reward_func": 0.75, "step": 38 }, { "completion_length": 348.9296875, "epoch": 1.256, "grad_norm": 0.625, "kl": 0.08903565420769155, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.3798343911767006, "rewards/correctness_reward_func": 0.65625, "step": 39 }, { "completion_length": 352.82421875, "epoch": 1.288, "grad_norm": 0.5390625, "kl": 0.06843947665765882, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.6484375, "reward_std": 0.4051281735301018, "rewards/correctness_reward_func": 0.6484375, "step": 40 }, { "completion_length": 329.1328125, "epoch": 1.32, "grad_norm": 0.46875, "kl": 0.057105657644569874, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.671875, "reward_std": 0.4520031735301018, "rewards/correctness_reward_func": 0.671875, "step": 41 }, { "completion_length": 322.95703125, "epoch": 1.3519999999999999, "grad_norm": 0.53515625, "kl": 0.05887855147011578, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.578125, "reward_std": 0.43042195588350296, "rewards/correctness_reward_func": 0.578125, "step": 42 }, { "completion_length": 373.6796875, "epoch": 1.384, "grad_norm": 0.375, "kl": 0.05490247346460819, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.7265625, "reward_std": 0.3426281735301018, "rewards/correctness_reward_func": 0.7265625, "step": 43 }, { "completion_length": 338.2109375, "epoch": 1.416, "grad_norm": 0.474609375, "kl": 0.05778517574071884, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.5234375, "reward_std": 0.41479695960879326, "rewards/correctness_reward_func": 0.5234375, "step": 44 }, { "completion_length": 379.49609375, "epoch": 1.448, "grad_norm": 0.443359375, "kl": 0.052183745545335114, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.4609375, "reward_std": 0.3377937823534012, "rewards/correctness_reward_func": 0.4609375, "step": 45 }, { "completion_length": 453.609375, "epoch": 1.48, "grad_norm": 0.37890625, "kl": 0.05021012085489929, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.484375, "reward_std": 0.39433756470680237, "rewards/correctness_reward_func": 0.484375, "step": 46 }, { "completion_length": 425.3515625, "epoch": 1.512, "grad_norm": 0.423828125, "kl": 0.0551034901291132, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.46875, "reward_std": 0.2777109779417515, "rewards/correctness_reward_func": 0.46875, "step": 47 }, { "completion_length": 436.1796875, "epoch": 1.544, "grad_norm": 0.373046875, "kl": 0.05335203884169459, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.4921875, "reward_std": 0.3245859779417515, "rewards/correctness_reward_func": 0.4921875, "step": 48 }, { "completion_length": 552.60546875, "epoch": 1.576, "grad_norm": 0.43359375, "kl": 0.05089649045839906, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.4765625, "reward_std": 0.3426281735301018, "rewards/correctness_reward_func": 0.4765625, "step": 49 }, { "completion_length": 509.71484375, "epoch": 1.608, "grad_norm": 0.474609375, "kl": 0.05733613413758576, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.4375, "reward_std": 0.43525634706020355, "rewards/correctness_reward_func": 0.4375, "step": 50 }, { "completion_length": 480.94921875, "epoch": 1.6400000000000001, "grad_norm": 0.375, "kl": 0.06065811449661851, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.34375, "reward_std": 0.3270031735301018, "rewards/correctness_reward_func": 0.34375, "step": 51 }, { "completion_length": 491.1640625, "epoch": 1.6720000000000002, "grad_norm": 0.390625, "kl": 0.060098053654655814, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.4375, "reward_std": 0.3846687786281109, "rewards/correctness_reward_func": 0.4375, "step": 52 }, { "completion_length": 464.2265625, "epoch": 1.704, "grad_norm": 0.435546875, "kl": 0.062199660344049335, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.4765625, "reward_std": 0.35713134706020355, "rewards/correctness_reward_func": 0.4765625, "step": 53 }, { "completion_length": 429.90625, "epoch": 1.736, "grad_norm": 12.625, "kl": 0.22791615827009082, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.734375, "reward_std": 0.3402109779417515, "rewards/correctness_reward_func": 0.734375, "step": 54 }, { "completion_length": 512.62109375, "epoch": 1.768, "grad_norm": 0.388671875, "kl": 0.059834773652255535, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.7265625, "reward_std": 0.24404378235340118, "rewards/correctness_reward_func": 0.7265625, "step": 55 }, { "completion_length": 471.5859375, "epoch": 1.8, "grad_norm": 0.388671875, "kl": 0.07468604180030525, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.859375, "reward_std": 0.2055421955883503, "rewards/correctness_reward_func": 0.859375, "step": 56 }, { "completion_length": 446.046875, "epoch": 1.8319999999999999, "grad_norm": 0.443359375, "kl": 0.06021555629558861, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.890625, "reward_std": 0.3221687823534012, "rewards/correctness_reward_func": 0.890625, "step": 57 }, { "completion_length": 536.68359375, "epoch": 1.8639999999999999, "grad_norm": 0.333984375, "kl": 0.060477497056126595, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.609375, "reward_std": 0.24646097794175148, "rewards/correctness_reward_func": 0.609375, "step": 58 }, { "completion_length": 411.69140625, "epoch": 1.896, "grad_norm": 0.453125, "kl": 0.0779561479575932, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.9609375, "reward_std": 0.2620859779417515, "rewards/correctness_reward_func": 0.9609375, "step": 59 }, { "completion_length": 544.3671875, "epoch": 1.928, "grad_norm": 0.81640625, "kl": 0.10888050403445959, "learning_rate": 1e-05, "loss": 0.0005, "reward": 0.5859375, "reward_std": 0.24404378235340118, "rewards/correctness_reward_func": 0.5859375, "step": 60 }, { "completion_length": 397.546875, "epoch": 1.96, "grad_norm": 0.6953125, "kl": 0.10245927749201655, "learning_rate": 1e-05, "loss": 0.0005, "reward": 0.9375, "reward_std": 0.30058756470680237, "rewards/correctness_reward_func": 0.9375, "step": 61 }, { "completion_length": 533.109375, "epoch": 1.992, "grad_norm": 0.65234375, "kl": 0.09279383928515017, "learning_rate": 1e-05, "loss": 0.0005, "reward": 0.640625, "reward_std": 0.28254536911845207, "rewards/correctness_reward_func": 0.640625, "step": 62 }, { "completion_length": 607.078125, "epoch": 2.0, "grad_norm": 0.2421875, "kl": 0.059856025502085686, "learning_rate": 1e-05, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.4471687823534012, "rewards/correctness_reward_func": 0.6875, "step": 63 }, { "completion_length": 706.515625, "epoch": 2.032, "grad_norm": 2.9375, "kl": 0.23451336496509612, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.21875, "reward_std": 0.19716878235340118, "rewards/correctness_reward_func": 0.21875, "step": 64 }, { "completion_length": 661.4453125, "epoch": 2.064, "grad_norm": 0.455078125, "kl": 0.08936465694569051, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.1796875, "reward_std": 0.0961671955883503, "rewards/correctness_reward_func": 0.1796875, "step": 65 }, { "completion_length": 644.8046875, "epoch": 2.096, "grad_norm": 0.287109375, "kl": 0.0694590169005096, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.2109375, "reward_std": 0.1454593911767006, "rewards/correctness_reward_func": 0.2109375, "step": 66 }, { "completion_length": 331.421875, "epoch": 2.128, "grad_norm": 0.5625, "kl": 0.10819920105859637, "learning_rate": 1e-05, "loss": 0.0005, "reward": 0.7109375, "reward_std": 0.39192036911845207, "rewards/correctness_reward_func": 0.7109375, "step": 67 }, { "completion_length": 454.83984375, "epoch": 2.16, "grad_norm": 1.171875, "kl": 0.14896808750927448, "learning_rate": 1e-05, "loss": 0.0007, "reward": 0.4453125, "reward_std": 0.2885015867650509, "rewards/correctness_reward_func": 0.4453125, "step": 68 }, { "completion_length": 474.54296875, "epoch": 2.192, "grad_norm": 0.56640625, "kl": 0.06995820580050349, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.6015625, "reward_std": 0.39192036911845207, "rewards/correctness_reward_func": 0.6015625, "step": 69 }, { "completion_length": 440.05859375, "epoch": 2.224, "grad_norm": 0.50390625, "kl": 0.0910127570386976, "learning_rate": 1e-05, "loss": 0.0005, "reward": 0.46875, "reward_std": 0.39433756470680237, "rewards/correctness_reward_func": 0.46875, "step": 70 }, { "completion_length": 478.2890625, "epoch": 2.2560000000000002, "grad_norm": 0.419921875, "kl": 0.07039178418926895, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.515625, "reward_std": 0.3534187823534012, "rewards/correctness_reward_func": 0.515625, "step": 71 }, { "completion_length": 539.33203125, "epoch": 2.288, "grad_norm": 0.35546875, "kl": 0.0686434528324753, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.4453125, "reward_std": 0.2933359779417515, "rewards/correctness_reward_func": 0.4453125, "step": 72 }, { "completion_length": 480.2578125, "epoch": 2.32, "grad_norm": 0.326171875, "kl": 0.06706948089413345, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.40625, "reward_std": 0.3582531735301018, "rewards/correctness_reward_func": 0.40625, "step": 73 }, { "completion_length": 497.828125, "epoch": 2.352, "grad_norm": 0.486328125, "kl": 0.07771567534655333, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.4609375, "reward_std": 0.45925475656986237, "rewards/correctness_reward_func": 0.4609375, "step": 74 }, { "completion_length": 540.3671875, "epoch": 2.384, "grad_norm": 0.330078125, "kl": 0.07498506712727249, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.359375, "reward_std": 0.2909187823534012, "rewards/correctness_reward_func": 0.359375, "step": 75 }, { "completion_length": 472.80859375, "epoch": 2.416, "grad_norm": 0.431640625, "kl": 0.06656011822633445, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.5, "reward_std": 0.3534187823534012, "rewards/correctness_reward_func": 0.5, "step": 76 }, { "completion_length": 517.7421875, "epoch": 2.448, "grad_norm": 0.64453125, "kl": 0.07896088412962854, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.375, "reward_std": 0.34504536911845207, "rewards/correctness_reward_func": 0.375, "step": 77 }, { "completion_length": 589.140625, "epoch": 2.48, "grad_norm": 1.1875, "kl": 0.12798133888281882, "learning_rate": 1e-05, "loss": 0.0006, "reward": 0.3203125, "reward_std": 0.34746256470680237, "rewards/correctness_reward_func": 0.3203125, "step": 78 }, { "completion_length": 506.98828125, "epoch": 2.512, "grad_norm": 0.302734375, "kl": 0.06648093881085515, "learning_rate": 1e-05, "loss": 0.0003, "reward": 0.2890625, "reward_std": 0.29817036911845207, "rewards/correctness_reward_func": 0.2890625, "step": 79 }, { "completion_length": 634.9140625, "epoch": 2.544, "grad_norm": 0.296875, "kl": 0.08733222424052656, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.265625, "reward_std": 0.2596687823534012, "rewards/correctness_reward_func": 0.265625, "step": 80 }, { "completion_length": 561.2734375, "epoch": 2.576, "grad_norm": 0.451171875, "kl": 0.08239166042767465, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.5078125, "reward_std": 0.45442036911845207, "rewards/correctness_reward_func": 0.5078125, "step": 81 }, { "completion_length": 535.79296875, "epoch": 2.608, "grad_norm": 0.99609375, "kl": 0.22772114095278084, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.3359375, "reward_std": 0.36550476029515266, "rewards/correctness_reward_func": 0.3359375, "step": 82 }, { "completion_length": 516.10546875, "epoch": 2.64, "grad_norm": 0.4765625, "kl": 0.12832795921713114, "learning_rate": 1e-05, "loss": 0.0006, "reward": 0.390625, "reward_std": 0.2728765867650509, "rewards/correctness_reward_func": 0.390625, "step": 83 }, { "completion_length": 486.69140625, "epoch": 2.672, "grad_norm": 0.453125, "kl": 0.08598195551894605, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.4296875, "reward_std": 0.3377937823534012, "rewards/correctness_reward_func": 0.4296875, "step": 84 }, { "completion_length": 595.859375, "epoch": 2.7039999999999997, "grad_norm": 0.337890625, "kl": 0.08345729601569474, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.390625, "reward_std": 0.3089609779417515, "rewards/correctness_reward_func": 0.390625, "step": 85 }, { "completion_length": 613.12109375, "epoch": 2.7359999999999998, "grad_norm": 0.291015625, "kl": 0.0912326027173549, "learning_rate": 1e-05, "loss": 0.0005, "reward": 0.2890625, "reward_std": 0.19958597794175148, "rewards/correctness_reward_func": 0.2890625, "step": 86 }, { "completion_length": 586.3984375, "epoch": 2.768, "grad_norm": 0.361328125, "kl": 0.08986557251773775, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.4609375, "reward_std": 0.3245859779417515, "rewards/correctness_reward_func": 0.4609375, "step": 87 }, { "completion_length": 580.55859375, "epoch": 2.8, "grad_norm": 0.4140625, "kl": 0.08739295578561723, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.3671875, "reward_std": 0.2933359779417515, "rewards/correctness_reward_func": 0.3671875, "step": 88 }, { "completion_length": 545.07421875, "epoch": 2.832, "grad_norm": 0.30078125, "kl": 0.0942155783995986, "learning_rate": 1e-05, "loss": 0.0005, "reward": 0.328125, "reward_std": 0.2512953653931618, "rewards/correctness_reward_func": 0.328125, "step": 89 }, { "completion_length": 507.109375, "epoch": 2.864, "grad_norm": 0.37109375, "kl": 0.08285854570567608, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.4140625, "reward_std": 0.31621256098151207, "rewards/correctness_reward_func": 0.4140625, "step": 90 }, { "completion_length": 548.953125, "epoch": 2.896, "grad_norm": 1.2109375, "kl": 0.17426068475469947, "learning_rate": 1e-05, "loss": 0.0009, "reward": 0.5390625, "reward_std": 0.2752937823534012, "rewards/correctness_reward_func": 0.5390625, "step": 91 }, { "completion_length": 487.5078125, "epoch": 2.928, "grad_norm": 0.3671875, "kl": 0.08832010440528393, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.75, "reward_std": 0.3270031735301018, "rewards/correctness_reward_func": 0.75, "step": 92 }, { "completion_length": 464.93359375, "epoch": 2.96, "grad_norm": 0.625, "kl": 0.12189564970321953, "learning_rate": 1e-05, "loss": 0.0006, "reward": 0.5625, "reward_std": 0.2645031735301018, "rewards/correctness_reward_func": 0.5625, "step": 93 }, { "completion_length": 474.10546875, "epoch": 2.992, "grad_norm": 0.30078125, "kl": 0.08207035693340003, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.75, "reward_std": 0.2596687823534012, "rewards/correctness_reward_func": 0.75, "step": 94 }, { "completion_length": 495.15625, "epoch": 3.0, "grad_norm": 0.107421875, "kl": 0.09875499829649925, "learning_rate": 1e-05, "loss": 0.0001, "reward": 0.5, "reward_std": 0.25, "rewards/correctness_reward_func": 0.5, "step": 95 }, { "completion_length": 693.6640625, "epoch": 3.032, "grad_norm": 0.287109375, "kl": 0.13659441424533725, "learning_rate": 1e-05, "loss": 0.0007, "reward": 0.1953125, "reward_std": 0.21279378235340118, "rewards/correctness_reward_func": 0.1953125, "step": 96 }, { "completion_length": 681.03125, "epoch": 3.064, "grad_norm": 0.255859375, "kl": 0.13894613785669208, "learning_rate": 1e-05, "loss": 0.0007, "reward": 0.2578125, "reward_std": 0.1947515867650509, "rewards/correctness_reward_func": 0.2578125, "step": 97 }, { "completion_length": 722.29296875, "epoch": 3.096, "grad_norm": 0.330078125, "kl": 0.1400570566765964, "learning_rate": 1e-05, "loss": 0.0007, "reward": 0.2109375, "reward_std": 0.2079593911767006, "rewards/correctness_reward_func": 0.2109375, "step": 98 }, { "completion_length": 634.5546875, "epoch": 3.128, "grad_norm": 0.287109375, "kl": 0.13448539143428206, "learning_rate": 1e-05, "loss": 0.0007, "reward": 0.203125, "reward_std": 0.1923343911767006, "rewards/correctness_reward_func": 0.203125, "step": 99 }, { "completion_length": 406.96875, "epoch": 3.16, "grad_norm": 0.7109375, "kl": 0.13877984089776874, "learning_rate": 1e-05, "loss": 0.0007, "reward": 0.4609375, "reward_std": 0.3558359779417515, "rewards/correctness_reward_func": 0.4609375, "step": 100 } ], "logging_steps": 1, "max_steps": 125800, "num_input_tokens_seen": 0, "num_train_epochs": 200, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }