| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5714285714285714, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 2334.9792137145996, |
| "epoch": 0.001142857142857143, |
| "grad_norm": 0.04168504849076271, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.0343, |
| "reward": 0.5176793523132801, |
| "reward_std": 0.7803475465625525, |
| "rewards/cosine_scaled_reward": -0.011993663385510445, |
| "rewards/format_reward": 0.5416666772216558, |
| "step": 1 |
| }, |
| { |
| "completion_length": 2554.7708892822266, |
| "epoch": 0.002285714285714286, |
| "grad_norm": 0.0329468809068203, |
| "kl": 0.0, |
| "learning_rate": 2e-08, |
| "loss": 0.048, |
| "reward": 0.544984623324126, |
| "reward_std": 0.8168745078146458, |
| "rewards/cosine_scaled_reward": 0.05374230569577776, |
| "rewards/format_reward": 0.43750000186264515, |
| "step": 2 |
| }, |
| { |
| "completion_length": 2859.1875076293945, |
| "epoch": 0.0034285714285714284, |
| "grad_norm": 0.023289382457733154, |
| "kl": 5.08427619934082e-05, |
| "learning_rate": 4e-08, |
| "loss": 0.0051, |
| "reward": -0.02553212083876133, |
| "reward_std": 0.48618486896157265, |
| "rewards/cosine_scaled_reward": -0.1481827348470688, |
| "rewards/format_reward": 0.27083333395421505, |
| "step": 3 |
| }, |
| { |
| "completion_length": 1444.7500267028809, |
| "epoch": 0.004571428571428572, |
| "grad_norm": 0.03743300959467888, |
| "kl": 4.610419273376465e-05, |
| "learning_rate": 6e-08, |
| "loss": -0.0001, |
| "reward": 0.7746477648615837, |
| "reward_std": 0.7952391989529133, |
| "rewards/cosine_scaled_reward": -0.03975944733247161, |
| "rewards/format_reward": 0.8541666716337204, |
| "step": 4 |
| }, |
| { |
| "completion_length": 3115.9791870117188, |
| "epoch": 0.005714285714285714, |
| "grad_norm": 0.03854230046272278, |
| "kl": 5.6996941566467285e-05, |
| "learning_rate": 8e-08, |
| "loss": 0.0264, |
| "reward": -0.07823497732169926, |
| "reward_std": 0.7049717996269464, |
| "rewards/cosine_scaled_reward": -0.18495082901790738, |
| "rewards/format_reward": 0.29166666977107525, |
| "step": 5 |
| }, |
| { |
| "completion_length": 2771.9166946411133, |
| "epoch": 0.006857142857142857, |
| "grad_norm": 0.035347286611795425, |
| "kl": 5.025416612625122e-05, |
| "learning_rate": 1e-07, |
| "loss": 0.0071, |
| "reward": 0.07868877053260803, |
| "reward_std": 0.4701222777366638, |
| "rewards/cosine_scaled_reward": -0.17940562218427658, |
| "rewards/format_reward": 0.4375, |
| "step": 6 |
| }, |
| { |
| "completion_length": 2687.6458740234375, |
| "epoch": 0.008, |
| "grad_norm": 0.027172546833753586, |
| "kl": 3.434717655181885e-05, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0422, |
| "reward": 0.45066535100340843, |
| "reward_std": 0.5497629679739475, |
| "rewards/cosine_scaled_reward": -0.055917332880198956, |
| "rewards/format_reward": 0.5625000018626451, |
| "step": 7 |
| }, |
| { |
| "completion_length": 2470.6458587646484, |
| "epoch": 0.009142857142857144, |
| "grad_norm": 0.039594072848558426, |
| "kl": 4.009902477264404e-05, |
| "learning_rate": 1.4e-07, |
| "loss": -0.0246, |
| "reward": 0.9416400007903576, |
| "reward_std": 0.8407168909907341, |
| "rewards/cosine_scaled_reward": 0.1583199892193079, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 8 |
| }, |
| { |
| "completion_length": 2876.791717529297, |
| "epoch": 0.010285714285714285, |
| "grad_norm": 0.043148480355739594, |
| "kl": 5.060434341430664e-05, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0566, |
| "reward": 0.06330016255378723, |
| "reward_std": 0.8118741400539875, |
| "rewards/cosine_scaled_reward": -0.13501658383756876, |
| "rewards/format_reward": 0.33333333767950535, |
| "step": 9 |
| }, |
| { |
| "completion_length": 2594.583381652832, |
| "epoch": 0.011428571428571429, |
| "grad_norm": 0.038916755467653275, |
| "kl": 3.3721327781677246e-05, |
| "learning_rate": 1.8e-07, |
| "loss": 0.0571, |
| "reward": 0.6445693820714951, |
| "reward_std": 0.703135633841157, |
| "rewards/cosine_scaled_reward": 0.11395137198269367, |
| "rewards/format_reward": 0.4166666679084301, |
| "step": 10 |
| }, |
| { |
| "completion_length": 3401.9166870117188, |
| "epoch": 0.012571428571428572, |
| "grad_norm": 0.034626927226781845, |
| "kl": 4.112720489501953e-05, |
| "learning_rate": 2e-07, |
| "loss": 0.0237, |
| "reward": -0.26008480228483677, |
| "reward_std": 0.52357386238873, |
| "rewards/cosine_scaled_reward": -0.21337573695927858, |
| "rewards/format_reward": 0.1666666716337204, |
| "step": 11 |
| }, |
| { |
| "completion_length": 1942.1250228881836, |
| "epoch": 0.013714285714285714, |
| "grad_norm": 0.033392105251550674, |
| "kl": 4.201382398605347e-05, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": 0.0569, |
| "reward": 0.6705481559038162, |
| "reward_std": 0.7221638858318329, |
| "rewards/cosine_scaled_reward": -0.02930926624685526, |
| "rewards/format_reward": 0.7291666772216558, |
| "step": 12 |
| }, |
| { |
| "completion_length": 2874.3541946411133, |
| "epoch": 0.014857142857142857, |
| "grad_norm": 0.019545605406165123, |
| "kl": 3.758817911148071e-05, |
| "learning_rate": 2.4e-07, |
| "loss": 0.0097, |
| "reward": 0.141155153978616, |
| "reward_std": 0.42822966538369656, |
| "rewards/cosine_scaled_reward": -0.07525575812906027, |
| "rewards/format_reward": 0.2916666679084301, |
| "step": 13 |
| }, |
| { |
| "completion_length": 2432.3750534057617, |
| "epoch": 0.016, |
| "grad_norm": 0.017046086490154266, |
| "kl": 2.7783215045928955e-05, |
| "learning_rate": 2.6e-07, |
| "loss": 0.0137, |
| "reward": 0.04899838753044605, |
| "reward_std": 0.42438305931864306, |
| "rewards/cosine_scaled_reward": -0.20466747391037643, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 14 |
| }, |
| { |
| "completion_length": 2732.5625038146973, |
| "epoch": 0.017142857142857144, |
| "grad_norm": 0.03251831606030464, |
| "kl": 4.025548696517944e-05, |
| "learning_rate": 2.8e-07, |
| "loss": -0.0008, |
| "reward": 0.6431202292442322, |
| "reward_std": 0.4675060287117958, |
| "rewards/cosine_scaled_reward": 0.11322677787393332, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 15 |
| }, |
| { |
| "completion_length": 3581.7916870117188, |
| "epoch": 0.018285714285714287, |
| "grad_norm": 0.029273949563503265, |
| "kl": 4.838407039642334e-05, |
| "learning_rate": 3e-07, |
| "loss": 0.0011, |
| "reward": -0.35534902289509773, |
| "reward_std": 0.5198515467345715, |
| "rewards/cosine_scaled_reward": -0.1985078384168446, |
| "rewards/format_reward": 0.0416666679084301, |
| "step": 16 |
| }, |
| { |
| "completion_length": 2177.9792251586914, |
| "epoch": 0.019428571428571427, |
| "grad_norm": 0.03885098919272423, |
| "kl": 5.97536563873291e-05, |
| "learning_rate": 3.2e-07, |
| "loss": 0.0438, |
| "reward": 0.4878217186778784, |
| "reward_std": 0.6091510504484177, |
| "rewards/cosine_scaled_reward": -0.026922473683953285, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 17 |
| }, |
| { |
| "completion_length": 2967.854217529297, |
| "epoch": 0.02057142857142857, |
| "grad_norm": 0.04610211029648781, |
| "kl": 3.8489699363708496e-05, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.0619, |
| "reward": 0.5143160903826356, |
| "reward_std": 0.820105541497469, |
| "rewards/cosine_scaled_reward": 0.02799136098474264, |
| "rewards/format_reward": 0.45833333767950535, |
| "step": 18 |
| }, |
| { |
| "completion_length": 2860.062530517578, |
| "epoch": 0.021714285714285714, |
| "grad_norm": 0.040472596883773804, |
| "kl": 3.56137752532959e-05, |
| "learning_rate": 3.6e-07, |
| "loss": 0.0495, |
| "reward": 0.7190339863300323, |
| "reward_std": 0.6223967634141445, |
| "rewards/cosine_scaled_reward": 0.15118366852402687, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 19 |
| }, |
| { |
| "completion_length": 1933.0625495910645, |
| "epoch": 0.022857142857142857, |
| "grad_norm": 0.028757991269230843, |
| "kl": 3.4768134355545044e-05, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": 0.017, |
| "reward": 0.5407668675179593, |
| "reward_std": 0.6035287424456328, |
| "rewards/cosine_scaled_reward": -0.08378324937075377, |
| "rewards/format_reward": 0.7083333376795053, |
| "step": 20 |
| }, |
| { |
| "completion_length": 2637.354190826416, |
| "epoch": 0.024, |
| "grad_norm": 0.021128255873918533, |
| "kl": 4.544854164123535e-05, |
| "learning_rate": 4e-07, |
| "loss": 0.0244, |
| "reward": 0.29663072153925896, |
| "reward_std": 0.34191627195104957, |
| "rewards/cosine_scaled_reward": -0.028767995536327362, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 21 |
| }, |
| { |
| "completion_length": 1485.1041946411133, |
| "epoch": 0.025142857142857144, |
| "grad_norm": 0.024596063420176506, |
| "kl": 3.890693187713623e-05, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.019, |
| "reward": 0.7402454526163638, |
| "reward_std": 0.44790306128561497, |
| "rewards/cosine_scaled_reward": -0.04654395952820778, |
| "rewards/format_reward": 0.8333333395421505, |
| "step": 22 |
| }, |
| { |
| "completion_length": 2294.4166870117188, |
| "epoch": 0.026285714285714287, |
| "grad_norm": 0.0354381762444973, |
| "kl": 5.1334500312805176e-05, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0626, |
| "reward": 0.5391949899494648, |
| "reward_std": 0.8765826672315598, |
| "rewards/cosine_scaled_reward": 0.009180818684399128, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 23 |
| }, |
| { |
| "completion_length": 2430.979217529297, |
| "epoch": 0.027428571428571427, |
| "grad_norm": 0.03837364539504051, |
| "kl": 2.7820002287626266e-05, |
| "learning_rate": 4.6e-07, |
| "loss": 0.0828, |
| "reward": 0.6404989643488079, |
| "reward_std": 0.7054540691897273, |
| "rewards/cosine_scaled_reward": 0.018166150897741318, |
| "rewards/format_reward": 0.6041666734963655, |
| "step": 24 |
| }, |
| { |
| "completion_length": 2256.9583702087402, |
| "epoch": 0.02857142857142857, |
| "grad_norm": 0.026419438421726227, |
| "kl": 5.500763654708862e-05, |
| "learning_rate": 4.8e-07, |
| "loss": -0.012, |
| "reward": 0.27087176591157913, |
| "reward_std": 0.62013322673738, |
| "rewards/cosine_scaled_reward": -0.12498080357909203, |
| "rewards/format_reward": 0.520833333954215, |
| "step": 25 |
| }, |
| { |
| "completion_length": 2868.500030517578, |
| "epoch": 0.029714285714285714, |
| "grad_norm": 0.028371552005410194, |
| "kl": 4.79966402053833e-05, |
| "learning_rate": 5e-07, |
| "loss": 0.0107, |
| "reward": 0.24959533289074898, |
| "reward_std": 0.6043985933065414, |
| "rewards/cosine_scaled_reward": -0.10436900053173304, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 26 |
| }, |
| { |
| "completion_length": 2872.520851135254, |
| "epoch": 0.030857142857142857, |
| "grad_norm": 0.03675654157996178, |
| "kl": 5.967915058135986e-05, |
| "learning_rate": 5.2e-07, |
| "loss": 0.0283, |
| "reward": 0.0008413083851337433, |
| "reward_std": 0.5718730166554451, |
| "rewards/cosine_scaled_reward": -0.17666268534958363, |
| "rewards/format_reward": 0.3541666753590107, |
| "step": 27 |
| }, |
| { |
| "completion_length": 2665.0416870117188, |
| "epoch": 0.032, |
| "grad_norm": 0.039023082703351974, |
| "kl": 4.050694406032562e-05, |
| "learning_rate": 5.4e-07, |
| "loss": 0.0179, |
| "reward": 0.4467268669977784, |
| "reward_std": 0.684929771348834, |
| "rewards/cosine_scaled_reward": 0.00461340369656682, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 28 |
| }, |
| { |
| "completion_length": 2884.125030517578, |
| "epoch": 0.03314285714285714, |
| "grad_norm": 0.03789398819208145, |
| "kl": 4.595518112182617e-05, |
| "learning_rate": 5.6e-07, |
| "loss": 0.0371, |
| "reward": -0.05399216455407441, |
| "reward_std": 0.5582299008965492, |
| "rewards/cosine_scaled_reward": -0.1936627607792616, |
| "rewards/format_reward": 0.3333333469927311, |
| "step": 29 |
| }, |
| { |
| "completion_length": 2812.604217529297, |
| "epoch": 0.03428571428571429, |
| "grad_norm": 0.05193966627120972, |
| "kl": 4.1157007217407227e-05, |
| "learning_rate": 5.8e-07, |
| "loss": 0.0381, |
| "reward": 0.5208531729876995, |
| "reward_std": 1.0489587001502514, |
| "rewards/cosine_scaled_reward": 0.010426569730043411, |
| "rewards/format_reward": 0.5000000111758709, |
| "step": 30 |
| }, |
| { |
| "completion_length": 2949.8125228881836, |
| "epoch": 0.03542857142857143, |
| "grad_norm": 0.03635473921895027, |
| "kl": 4.1857361793518066e-05, |
| "learning_rate": 6e-07, |
| "loss": -0.0211, |
| "reward": 0.07118985429406166, |
| "reward_std": 0.7498712912201881, |
| "rewards/cosine_scaled_reward": -0.09982176125049591, |
| "rewards/format_reward": 0.2708333395421505, |
| "step": 31 |
| }, |
| { |
| "completion_length": 2791.0000610351562, |
| "epoch": 0.036571428571428574, |
| "grad_norm": 0.028467582538723946, |
| "kl": 4.823505878448486e-05, |
| "learning_rate": 6.2e-07, |
| "loss": 0.0302, |
| "reward": 0.47064110776409507, |
| "reward_std": 0.5171910058706999, |
| "rewards/cosine_scaled_reward": 0.016570553183555603, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 32 |
| }, |
| { |
| "completion_length": 3185.166702270508, |
| "epoch": 0.037714285714285714, |
| "grad_norm": 0.03910686448216438, |
| "kl": 4.856288433074951e-05, |
| "learning_rate": 6.4e-07, |
| "loss": 0.0154, |
| "reward": 0.18704182095825672, |
| "reward_std": 0.7629665322601795, |
| "rewards/cosine_scaled_reward": -0.10439574904739857, |
| "rewards/format_reward": 0.39583334140479565, |
| "step": 33 |
| }, |
| { |
| "completion_length": 2163.333351135254, |
| "epoch": 0.038857142857142854, |
| "grad_norm": 0.03700648993253708, |
| "kl": 4.020100459456444e-05, |
| "learning_rate": 6.6e-07, |
| "loss": 0.0315, |
| "reward": 1.0100164553150535, |
| "reward_std": 0.8100163154304028, |
| "rewards/cosine_scaled_reward": 0.21334155462682247, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 34 |
| }, |
| { |
| "completion_length": 3131.5208435058594, |
| "epoch": 0.04, |
| "grad_norm": 0.04378014802932739, |
| "kl": 6.04093074798584e-05, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": 0.007, |
| "reward": -0.16884424164891243, |
| "reward_std": 0.6548870000988245, |
| "rewards/cosine_scaled_reward": -0.18858879059553146, |
| "rewards/format_reward": 0.2083333358168602, |
| "step": 35 |
| }, |
| { |
| "completion_length": 3189.0208435058594, |
| "epoch": 0.04114285714285714, |
| "grad_norm": 0.034937676042318344, |
| "kl": 5.710124969482422e-05, |
| "learning_rate": 7e-07, |
| "loss": 0.0264, |
| "reward": -0.3049023966304958, |
| "reward_std": 0.4520086422562599, |
| "rewards/cosine_scaled_reward": -0.25661787390708923, |
| "rewards/format_reward": 0.20833334140479565, |
| "step": 36 |
| }, |
| { |
| "completion_length": 3393.3541870117188, |
| "epoch": 0.04228571428571429, |
| "grad_norm": 0.031590525060892105, |
| "kl": 4.273653030395508e-05, |
| "learning_rate": 7.2e-07, |
| "loss": 0.0103, |
| "reward": -0.2297830693423748, |
| "reward_std": 0.4669038709253073, |
| "rewards/cosine_scaled_reward": -0.21905820816755295, |
| "rewards/format_reward": 0.2083333358168602, |
| "step": 37 |
| }, |
| { |
| "completion_length": 3292.7708587646484, |
| "epoch": 0.04342857142857143, |
| "grad_norm": 0.026573847979307175, |
| "kl": 4.4655054807662964e-05, |
| "learning_rate": 7.4e-07, |
| "loss": 0.0033, |
| "reward": -0.19998125731945038, |
| "reward_std": 0.48700529895722866, |
| "rewards/cosine_scaled_reward": -0.172907296102494, |
| "rewards/format_reward": 0.14583333395421505, |
| "step": 38 |
| }, |
| { |
| "completion_length": 2868.5417137145996, |
| "epoch": 0.044571428571428574, |
| "grad_norm": 0.03660503029823303, |
| "kl": 3.191828727722168e-05, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": 0.0278, |
| "reward": 0.5126112666912377, |
| "reward_std": 0.5605221642181277, |
| "rewards/cosine_scaled_reward": 0.02713894983753562, |
| "rewards/format_reward": 0.4583333395421505, |
| "step": 39 |
| }, |
| { |
| "completion_length": 2619.3958892822266, |
| "epoch": 0.045714285714285714, |
| "grad_norm": 0.04862872138619423, |
| "kl": 3.953278064727783e-05, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.0356, |
| "reward": 0.4027645355090499, |
| "reward_std": 0.7209917511790991, |
| "rewards/cosine_scaled_reward": -0.05903442995622754, |
| "rewards/format_reward": 0.5208333469927311, |
| "step": 40 |
| }, |
| { |
| "completion_length": 3137.8750915527344, |
| "epoch": 0.046857142857142854, |
| "grad_norm": 0.04735239967703819, |
| "kl": 4.553794860839844e-05, |
| "learning_rate": 8e-07, |
| "loss": -0.0455, |
| "reward": 0.18608580250293016, |
| "reward_std": 0.7598953321576118, |
| "rewards/cosine_scaled_reward": -0.1152904350310564, |
| "rewards/format_reward": 0.4166666753590107, |
| "step": 41 |
| }, |
| { |
| "completion_length": 2854.625, |
| "epoch": 0.048, |
| "grad_norm": 0.018570901826024055, |
| "kl": 6.361305713653564e-05, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": 0.0098, |
| "reward": -0.3359166495501995, |
| "reward_std": 0.2978406958281994, |
| "rewards/cosine_scaled_reward": -0.3137916624546051, |
| "rewards/format_reward": 0.2916666679084301, |
| "step": 42 |
| }, |
| { |
| "completion_length": 2866.2083740234375, |
| "epoch": 0.04914285714285714, |
| "grad_norm": 0.03492705151438713, |
| "kl": 4.682131111621857e-05, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.0498, |
| "reward": 0.017277991399168968, |
| "reward_std": 0.693747516721487, |
| "rewards/cosine_scaled_reward": -0.14761100709438324, |
| "rewards/format_reward": 0.3125000074505806, |
| "step": 43 |
| }, |
| { |
| "completion_length": 2410.562530517578, |
| "epoch": 0.05028571428571429, |
| "grad_norm": 0.020008008927106857, |
| "kl": 4.0531158447265625e-05, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0014, |
| "reward": 0.5113720148801804, |
| "reward_std": 0.5214159078896046, |
| "rewards/cosine_scaled_reward": 0.04735266789793968, |
| "rewards/format_reward": 0.4166666679084301, |
| "step": 44 |
| }, |
| { |
| "completion_length": 3197.2291870117188, |
| "epoch": 0.05142857142857143, |
| "grad_norm": 0.03368664160370827, |
| "kl": 4.8667192459106445e-05, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0349, |
| "reward": -0.1545441746711731, |
| "reward_std": 0.6294996133074164, |
| "rewards/cosine_scaled_reward": -0.20227208855794743, |
| "rewards/format_reward": 0.2500000037252903, |
| "step": 45 |
| }, |
| { |
| "completion_length": 3049.458366394043, |
| "epoch": 0.052571428571428575, |
| "grad_norm": 0.023678315803408623, |
| "kl": 4.738569259643555e-05, |
| "learning_rate": 9e-07, |
| "loss": 0.0258, |
| "reward": -0.08186815958470106, |
| "reward_std": 0.4881300590932369, |
| "rewards/cosine_scaled_reward": -0.17635074770078063, |
| "rewards/format_reward": 0.2708333395421505, |
| "step": 46 |
| }, |
| { |
| "completion_length": 2636.1458740234375, |
| "epoch": 0.053714285714285714, |
| "grad_norm": 0.04777181148529053, |
| "kl": 4.3764710426330566e-05, |
| "learning_rate": 9.2e-07, |
| "loss": 0.0374, |
| "reward": 0.3579790024086833, |
| "reward_std": 0.8919647391885519, |
| "rewards/cosine_scaled_reward": -0.08142716065049171, |
| "rewards/format_reward": 0.5208333432674408, |
| "step": 47 |
| }, |
| { |
| "completion_length": 2504.8125534057617, |
| "epoch": 0.054857142857142854, |
| "grad_norm": 0.040610894560813904, |
| "kl": 3.6403536796569824e-05, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": 0.0161, |
| "reward": 0.3527611903846264, |
| "reward_std": 0.796766672283411, |
| "rewards/cosine_scaled_reward": -0.06320274842437357, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 48 |
| }, |
| { |
| "completion_length": 1906.791690826416, |
| "epoch": 0.056, |
| "grad_norm": 0.03556185960769653, |
| "kl": 2.3037195205688477e-05, |
| "learning_rate": 9.6e-07, |
| "loss": 0.0207, |
| "reward": 0.5905593386851251, |
| "reward_std": 0.6989730293862522, |
| "rewards/cosine_scaled_reward": -0.02763699647039175, |
| "rewards/format_reward": 0.645833345130086, |
| "step": 49 |
| }, |
| { |
| "completion_length": 3025.2500228881836, |
| "epoch": 0.05714285714285714, |
| "grad_norm": 0.03857988119125366, |
| "kl": 3.446638584136963e-05, |
| "learning_rate": 9.8e-07, |
| "loss": 0.0268, |
| "reward": 0.29927817918360233, |
| "reward_std": 0.555895734578371, |
| "rewards/cosine_scaled_reward": -0.017027591355144978, |
| "rewards/format_reward": 0.3333333395421505, |
| "step": 50 |
| }, |
| { |
| "completion_length": 2294.208335876465, |
| "epoch": 0.05828571428571429, |
| "grad_norm": 0.020732907578349113, |
| "kl": 4.1797757148742676e-05, |
| "learning_rate": 1e-06, |
| "loss": 0.0055, |
| "reward": 0.25655205361545086, |
| "reward_std": 0.4376220293343067, |
| "rewards/cosine_scaled_reward": -0.12172399461269379, |
| "rewards/format_reward": 0.5, |
| "step": 51 |
| }, |
| { |
| "completion_length": 2716.0834045410156, |
| "epoch": 0.05942857142857143, |
| "grad_norm": 0.0520336888730526, |
| "kl": 4.2897649109363556e-05, |
| "learning_rate": 9.999890338174275e-07, |
| "loss": 0.0296, |
| "reward": 0.6955370828509331, |
| "reward_std": 0.8111556768417358, |
| "rewards/cosine_scaled_reward": 0.07693520188331604, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 52 |
| }, |
| { |
| "completion_length": 2616.000015258789, |
| "epoch": 0.060571428571428575, |
| "grad_norm": 0.04615948721766472, |
| "kl": 4.25875186920166e-05, |
| "learning_rate": 9.999561358041868e-07, |
| "loss": 0.0573, |
| "reward": 0.6107768043875694, |
| "reward_std": 1.0185654014348984, |
| "rewards/cosine_scaled_reward": 0.03455505130114034, |
| "rewards/format_reward": 0.5416666753590107, |
| "step": 53 |
| }, |
| { |
| "completion_length": 2352.0625381469727, |
| "epoch": 0.061714285714285715, |
| "grad_norm": 0.04578646644949913, |
| "kl": 2.933293581008911e-05, |
| "learning_rate": 9.999013075636804e-07, |
| "loss": 0.0947, |
| "reward": 0.9733524695038795, |
| "reward_std": 0.9531826749444008, |
| "rewards/cosine_scaled_reward": 0.1637595584616065, |
| "rewards/format_reward": 0.6458333432674408, |
| "step": 54 |
| }, |
| { |
| "completion_length": 2831.3958435058594, |
| "epoch": 0.06285714285714286, |
| "grad_norm": 0.04407193139195442, |
| "kl": 3.0428171157836914e-05, |
| "learning_rate": 9.998245517681593e-07, |
| "loss": 0.0082, |
| "reward": 0.6569849252700806, |
| "reward_std": 0.8474150542169809, |
| "rewards/cosine_scaled_reward": 0.08890912728384137, |
| "rewards/format_reward": 0.4791666753590107, |
| "step": 55 |
| }, |
| { |
| "completion_length": 3047.125015258789, |
| "epoch": 0.064, |
| "grad_norm": 0.026214739307761192, |
| "kl": 4.430115222930908e-05, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": 0.0257, |
| "reward": -0.18962648510932922, |
| "reward_std": 0.5728549361228943, |
| "rewards/cosine_scaled_reward": -0.2302299104630947, |
| "rewards/format_reward": 0.2708333358168602, |
| "step": 56 |
| }, |
| { |
| "completion_length": 3043.875030517578, |
| "epoch": 0.06514285714285714, |
| "grad_norm": 0.03775238245725632, |
| "kl": 2.8438866138458252e-05, |
| "learning_rate": 9.996052735444862e-07, |
| "loss": 0.0011, |
| "reward": -0.02485821396112442, |
| "reward_std": 0.6774866338819265, |
| "rewards/cosine_scaled_reward": -0.1895124390721321, |
| "rewards/format_reward": 0.3541666679084301, |
| "step": 57 |
| }, |
| { |
| "completion_length": 2135.1042556762695, |
| "epoch": 0.06628571428571428, |
| "grad_norm": 0.04270089045166969, |
| "kl": 3.1717121601104736e-05, |
| "learning_rate": 9.994627618036452e-07, |
| "loss": 0.0935, |
| "reward": 0.6859063357114792, |
| "reward_std": 0.959196649491787, |
| "rewards/cosine_scaled_reward": 0.020036499947309494, |
| "rewards/format_reward": 0.6458333376795053, |
| "step": 58 |
| }, |
| { |
| "completion_length": 2852.1041717529297, |
| "epoch": 0.06742857142857143, |
| "grad_norm": 0.035348936915397644, |
| "kl": 2.6881694793701172e-05, |
| "learning_rate": 9.992983438818915e-07, |
| "loss": -0.0016, |
| "reward": -0.11129460483789444, |
| "reward_std": 0.6194134466350079, |
| "rewards/cosine_scaled_reward": -0.21189730428159237, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 59 |
| }, |
| { |
| "completion_length": 2817.291702270508, |
| "epoch": 0.06857142857142857, |
| "grad_norm": 0.044353168457746506, |
| "kl": 2.095848321914673e-05, |
| "learning_rate": 9.991120277927223e-07, |
| "loss": 0.0249, |
| "reward": 0.22440146282315254, |
| "reward_std": 0.9068924002349377, |
| "rewards/cosine_scaled_reward": -0.10654927371069789, |
| "rewards/format_reward": 0.43750000186264515, |
| "step": 60 |
| }, |
| { |
| "completion_length": 2894.875030517578, |
| "epoch": 0.06971428571428571, |
| "grad_norm": 0.0331726111471653, |
| "kl": 3.4576281905174255e-05, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": 0.0075, |
| "reward": 0.6926130764186382, |
| "reward_std": 0.6958329901099205, |
| "rewards/cosine_scaled_reward": 0.07547319121658802, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 61 |
| }, |
| { |
| "completion_length": 2464.4791946411133, |
| "epoch": 0.07085714285714285, |
| "grad_norm": 0.03456748649477959, |
| "kl": 2.349168062210083e-05, |
| "learning_rate": 9.98673738502114e-07, |
| "loss": 0.0285, |
| "reward": 0.6481589786708355, |
| "reward_std": 0.683325620368123, |
| "rewards/cosine_scaled_reward": 0.06366281025111675, |
| "rewards/format_reward": 0.520833333954215, |
| "step": 62 |
| }, |
| { |
| "completion_length": 2122.625045776367, |
| "epoch": 0.072, |
| "grad_norm": 0.7239912748336792, |
| "kl": 2.844538539648056e-05, |
| "learning_rate": 9.98421786662277e-07, |
| "loss": 0.0578, |
| "reward": 0.7681009210646152, |
| "reward_std": 0.7113255299627781, |
| "rewards/cosine_scaled_reward": 0.019467118196189404, |
| "rewards/format_reward": 0.7291666828095913, |
| "step": 63 |
| }, |
| { |
| "completion_length": 2740.1875534057617, |
| "epoch": 0.07314285714285715, |
| "grad_norm": 0.037132617086172104, |
| "kl": 4.111975431442261e-05, |
| "learning_rate": 9.981479793771866e-07, |
| "loss": 0.0213, |
| "reward": 0.6051971167325974, |
| "reward_std": 0.7252629213035107, |
| "rewards/cosine_scaled_reward": 0.052598556503653526, |
| "rewards/format_reward": 0.5000000111758709, |
| "step": 64 |
| }, |
| { |
| "completion_length": 2566.7916946411133, |
| "epoch": 0.07428571428571429, |
| "grad_norm": 0.01891911029815674, |
| "kl": 2.2094696760177612e-05, |
| "learning_rate": 9.97852329991824e-07, |
| "loss": -0.0092, |
| "reward": -0.04870823957026005, |
| "reward_std": 0.48125316202640533, |
| "rewards/cosine_scaled_reward": -0.25352078676223755, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 65 |
| }, |
| { |
| "completion_length": 2060.4375038146973, |
| "epoch": 0.07542857142857143, |
| "grad_norm": 0.016883157193660736, |
| "kl": 2.9248185455799103e-05, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": -0.0004, |
| "reward": 0.33483995124697685, |
| "reward_std": 0.5551962554454803, |
| "rewards/cosine_scaled_reward": -0.08258003555238247, |
| "rewards/format_reward": 0.5, |
| "step": 66 |
| }, |
| { |
| "completion_length": 3343.9791870117188, |
| "epoch": 0.07657142857142857, |
| "grad_norm": 0.019418727606534958, |
| "kl": 2.110004425048828e-05, |
| "learning_rate": 9.971955636222684e-07, |
| "loss": 0.0382, |
| "reward": -0.4314758640830405, |
| "reward_std": 0.37546689808368683, |
| "rewards/cosine_scaled_reward": -0.2678212709724903, |
| "rewards/format_reward": 0.10416666977107525, |
| "step": 67 |
| }, |
| { |
| "completion_length": 1701.6458358764648, |
| "epoch": 0.07771428571428571, |
| "grad_norm": 0.02836545556783676, |
| "kl": 2.9101967811584473e-05, |
| "learning_rate": 9.968344786479415e-07, |
| "loss": -0.0354, |
| "reward": 0.6903760433197021, |
| "reward_std": 0.8633946646004915, |
| "rewards/cosine_scaled_reward": -0.019395311828702688, |
| "rewards/format_reward": 0.7291666716337204, |
| "step": 68 |
| }, |
| { |
| "completion_length": 2116.8333740234375, |
| "epoch": 0.07885714285714286, |
| "grad_norm": 0.027675610035657883, |
| "kl": 2.561137080192566e-05, |
| "learning_rate": 9.964516155915151e-07, |
| "loss": 0.0049, |
| "reward": 0.3666526600718498, |
| "reward_std": 0.5472365878522396, |
| "rewards/cosine_scaled_reward": -0.12917369417846203, |
| "rewards/format_reward": 0.6250000037252903, |
| "step": 69 |
| }, |
| { |
| "completion_length": 2915.4583587646484, |
| "epoch": 0.08, |
| "grad_norm": 0.033280231058597565, |
| "kl": 2.2306106984615326e-05, |
| "learning_rate": 9.960469931131936e-07, |
| "loss": 0.0425, |
| "reward": 0.07917704619467258, |
| "reward_std": 0.48793724877759814, |
| "rewards/cosine_scaled_reward": -0.13749481900595129, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 70 |
| }, |
| { |
| "completion_length": 2776.1458587646484, |
| "epoch": 0.08114285714285714, |
| "grad_norm": 0.02456558682024479, |
| "kl": 1.961737871170044e-05, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": 0.0067, |
| "reward": 0.22148176468908787, |
| "reward_std": 0.5928666815161705, |
| "rewards/cosine_scaled_reward": -0.04550914093852043, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 71 |
| }, |
| { |
| "completion_length": 2835.916702270508, |
| "epoch": 0.08228571428571428, |
| "grad_norm": 0.032275594770908356, |
| "kl": 4.030764102935791e-05, |
| "learning_rate": 9.951725498333448e-07, |
| "loss": 0.03, |
| "reward": 0.0798844201490283, |
| "reward_std": 0.5809437595307827, |
| "rewards/cosine_scaled_reward": -0.1788077955134213, |
| "rewards/format_reward": 0.43750000186264515, |
| "step": 72 |
| }, |
| { |
| "completion_length": 3413.3958740234375, |
| "epoch": 0.08342857142857144, |
| "grad_norm": 0.0283922478556633, |
| "kl": 2.5540590286254883e-05, |
| "learning_rate": 9.947027716509488e-07, |
| "loss": 0.0536, |
| "reward": -0.20378641900606453, |
| "reward_std": 0.5730555988848209, |
| "rewards/cosine_scaled_reward": -0.18522655218839645, |
| "rewards/format_reward": 0.16666666977107525, |
| "step": 73 |
| }, |
| { |
| "completion_length": 2542.875030517578, |
| "epoch": 0.08457142857142858, |
| "grad_norm": 0.05036113038659096, |
| "kl": 3.673252649605274e-05, |
| "learning_rate": 9.942113192828444e-07, |
| "loss": 0.1141, |
| "reward": 0.5641527697443962, |
| "reward_std": 0.7980917133390903, |
| "rewards/cosine_scaled_reward": 0.06332638207823038, |
| "rewards/format_reward": 0.4375000111758709, |
| "step": 74 |
| }, |
| { |
| "completion_length": 2902.291748046875, |
| "epoch": 0.08571428571428572, |
| "grad_norm": 0.030917614698410034, |
| "kl": 0.00010601989924907684, |
| "learning_rate": 9.93698216681727e-07, |
| "loss": -0.0053, |
| "reward": 0.5767261572182178, |
| "reward_std": 0.6028854753822088, |
| "rewards/cosine_scaled_reward": 0.0696130646392703, |
| "rewards/format_reward": 0.43750001303851604, |
| "step": 75 |
| }, |
| { |
| "completion_length": 2528.979217529297, |
| "epoch": 0.08685714285714285, |
| "grad_norm": 0.03214319422841072, |
| "kl": 2.886354923248291e-05, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": 0.0212, |
| "reward": 0.16732670925557613, |
| "reward_std": 0.5912668146193027, |
| "rewards/cosine_scaled_reward": -0.17675332073122263, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 76 |
| }, |
| { |
| "completion_length": 2850.666679382324, |
| "epoch": 0.088, |
| "grad_norm": 0.0167290810495615, |
| "kl": 2.2485852241516113e-05, |
| "learning_rate": 9.926071618660237e-07, |
| "loss": -0.002, |
| "reward": -0.04175177216529846, |
| "reward_std": 0.4047280023805797, |
| "rewards/cosine_scaled_reward": -0.17712588026188314, |
| "rewards/format_reward": 0.3125, |
| "step": 77 |
| }, |
| { |
| "completion_length": 3262.0416717529297, |
| "epoch": 0.08914285714285715, |
| "grad_norm": 0.04398359730839729, |
| "kl": 1.8542632460594177e-05, |
| "learning_rate": 9.9202926282791e-07, |
| "loss": 0.0164, |
| "reward": 0.04341258108615875, |
| "reward_std": 0.8593644499778748, |
| "rewards/cosine_scaled_reward": -0.10329371684929356, |
| "rewards/format_reward": 0.25000000931322575, |
| "step": 78 |
| }, |
| { |
| "completion_length": 2167.2708435058594, |
| "epoch": 0.09028571428571429, |
| "grad_norm": 0.03241743519902229, |
| "kl": 3.7103891372680664e-06, |
| "learning_rate": 9.91429819907136e-07, |
| "loss": 0.0034, |
| "reward": 0.34780592005699873, |
| "reward_std": 0.5333101600408554, |
| "rewards/cosine_scaled_reward": -0.12818037904798985, |
| "rewards/format_reward": 0.6041666697710752, |
| "step": 79 |
| }, |
| { |
| "completion_length": 3245.750015258789, |
| "epoch": 0.09142857142857143, |
| "grad_norm": 0.025279633700847626, |
| "kl": 1.8217913748230785e-05, |
| "learning_rate": 9.908088623197048e-07, |
| "loss": 0.0211, |
| "reward": -0.14338213577866554, |
| "reward_std": 0.3987116068601608, |
| "rewards/cosine_scaled_reward": -0.19669106788933277, |
| "rewards/format_reward": 0.25000000558793545, |
| "step": 80 |
| }, |
| { |
| "completion_length": 3126.250030517578, |
| "epoch": 0.09257142857142857, |
| "grad_norm": 0.0291125550866127, |
| "kl": 3.684312105178833e-05, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.0013, |
| "reward": -0.2766275405883789, |
| "reward_std": 0.49616212770342827, |
| "rewards/cosine_scaled_reward": -0.25289710983633995, |
| "rewards/format_reward": 0.22916666977107525, |
| "step": 81 |
| }, |
| { |
| "completion_length": 2787.916732788086, |
| "epoch": 0.09371428571428571, |
| "grad_norm": 0.053675856441259384, |
| "kl": 2.8818845748901367e-05, |
| "learning_rate": 9.895025252503755e-07, |
| "loss": 0.0989, |
| "reward": 0.4680606871843338, |
| "reward_std": 0.9560514762997627, |
| "rewards/cosine_scaled_reward": 0.0152803435921669, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 82 |
| }, |
| { |
| "completion_length": 2561.979175567627, |
| "epoch": 0.09485714285714286, |
| "grad_norm": 0.025762459263205528, |
| "kl": 3.769155591726303e-05, |
| "learning_rate": 9.888172094375033e-07, |
| "loss": -0.0082, |
| "reward": 0.18416154105216265, |
| "reward_std": 0.45330405980348587, |
| "rewards/cosine_scaled_reward": -0.10583589226007462, |
| "rewards/format_reward": 0.39583333395421505, |
| "step": 83 |
| }, |
| { |
| "completion_length": 2878.229217529297, |
| "epoch": 0.096, |
| "grad_norm": 0.050619326531887054, |
| "kl": 2.41938978433609e-05, |
| "learning_rate": 9.881105062929221e-07, |
| "loss": 0.0362, |
| "reward": 0.36311598774045706, |
| "reward_std": 0.9684205129742622, |
| "rewards/cosine_scaled_reward": -0.03719199728220701, |
| "rewards/format_reward": 0.43750001303851604, |
| "step": 84 |
| }, |
| { |
| "completion_length": 3074.5416870117188, |
| "epoch": 0.09714285714285714, |
| "grad_norm": 0.03848778456449509, |
| "kl": 1.1827796697616577e-05, |
| "learning_rate": 9.873824502603459e-07, |
| "loss": 0.048, |
| "reward": 0.007899533025920391, |
| "reward_std": 0.7812178507447243, |
| "rewards/cosine_scaled_reward": -0.1523002306057606, |
| "rewards/format_reward": 0.3125000074505806, |
| "step": 85 |
| }, |
| { |
| "completion_length": 2757.7708587646484, |
| "epoch": 0.09828571428571428, |
| "grad_norm": 0.030428804457187653, |
| "kl": 3.374367952346802e-05, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": -0.007, |
| "reward": 0.2510148920118809, |
| "reward_std": 0.6730287857353687, |
| "rewards/cosine_scaled_reward": -0.09324256191030145, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 86 |
| }, |
| { |
| "completion_length": 2643.520866394043, |
| "epoch": 0.09942857142857142, |
| "grad_norm": 0.03848496824502945, |
| "kl": 2.314150333404541e-05, |
| "learning_rate": 9.85862422507884e-07, |
| "loss": 0.0456, |
| "reward": 0.45505039021372795, |
| "reward_std": 0.7903980556875467, |
| "rewards/cosine_scaled_reward": -0.012058130465447903, |
| "rewards/format_reward": 0.47916668094694614, |
| "step": 87 |
| }, |
| { |
| "completion_length": 2060.8959197998047, |
| "epoch": 0.10057142857142858, |
| "grad_norm": 0.04052264988422394, |
| "kl": 2.466072328388691e-05, |
| "learning_rate": 9.850705248720068e-07, |
| "loss": 0.0715, |
| "reward": 0.655466640368104, |
| "reward_std": 0.89967380464077, |
| "rewards/cosine_scaled_reward": 0.025649975519627333, |
| "rewards/format_reward": 0.6041666734963655, |
| "step": 88 |
| }, |
| { |
| "completion_length": 2851.687515258789, |
| "epoch": 0.10171428571428572, |
| "grad_norm": 0.026882044970989227, |
| "kl": 1.2446194887161255e-05, |
| "learning_rate": 9.8425742251254e-07, |
| "loss": 0.0021, |
| "reward": 0.33032611198723316, |
| "reward_std": 0.6415284462273121, |
| "rewards/cosine_scaled_reward": -0.032753610983490944, |
| "rewards/format_reward": 0.39583333395421505, |
| "step": 89 |
| }, |
| { |
| "completion_length": 2466.0208892822266, |
| "epoch": 0.10285714285714286, |
| "grad_norm": 0.023797115311026573, |
| "kl": 5.2119605243206024e-05, |
| "learning_rate": 9.83423155058946e-07, |
| "loss": 0.031, |
| "reward": 0.10800119815394282, |
| "reward_std": 0.4383825846016407, |
| "rewards/cosine_scaled_reward": -0.18558274419046938, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 90 |
| }, |
| { |
| "completion_length": 3075.833335876465, |
| "epoch": 0.104, |
| "grad_norm": 0.05504177510738373, |
| "kl": 2.738088369369507e-06, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0478, |
| "reward": 0.1143021360039711, |
| "reward_std": 0.8147715367376804, |
| "rewards/cosine_scaled_reward": -0.08868227154016495, |
| "rewards/format_reward": 0.29166666977107525, |
| "step": 91 |
| }, |
| { |
| "completion_length": 2582.791717529297, |
| "epoch": 0.10514285714285715, |
| "grad_norm": 0.08598244190216064, |
| "kl": 2.1602027118206024e-05, |
| "learning_rate": 9.816912885430258e-07, |
| "loss": -0.0326, |
| "reward": 0.117531917989254, |
| "reward_std": 0.5453543923795223, |
| "rewards/cosine_scaled_reward": -0.19123404938727617, |
| "rewards/format_reward": 0.5000000037252903, |
| "step": 92 |
| }, |
| { |
| "completion_length": 3582.6458435058594, |
| "epoch": 0.10628571428571429, |
| "grad_norm": 0.02585573121905327, |
| "kl": 2.4219392798841e-05, |
| "learning_rate": 9.807937738894303e-07, |
| "loss": 0.0008, |
| "reward": -0.4189921743236482, |
| "reward_std": 0.49103348329663277, |
| "rewards/cosine_scaled_reward": -0.23032942693680525, |
| "rewards/format_reward": 0.0416666679084301, |
| "step": 93 |
| }, |
| { |
| "completion_length": 2344.5416946411133, |
| "epoch": 0.10742857142857143, |
| "grad_norm": 0.01883937232196331, |
| "kl": 3.026425838470459e-05, |
| "learning_rate": 9.798752629550546e-07, |
| "loss": -0.0014, |
| "reward": 0.2969733662903309, |
| "reward_std": 0.4309884384274483, |
| "rewards/cosine_scaled_reward": -0.10151330940425396, |
| "rewards/format_reward": 0.5, |
| "step": 94 |
| }, |
| { |
| "completion_length": 3309.8958435058594, |
| "epoch": 0.10857142857142857, |
| "grad_norm": 0.039376407861709595, |
| "kl": 1.1745840311050415e-05, |
| "learning_rate": 9.78935800506826e-07, |
| "loss": 0.0141, |
| "reward": -0.019916832447052002, |
| "reward_std": 0.6842259094119072, |
| "rewards/cosine_scaled_reward": -0.1349584199488163, |
| "rewards/format_reward": 0.25, |
| "step": 95 |
| }, |
| { |
| "completion_length": 2395.979217529297, |
| "epoch": 0.10971428571428571, |
| "grad_norm": 0.03585467487573624, |
| "kl": 4.587695002555847e-06, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": 0.0035, |
| "reward": 0.3708239998668432, |
| "reward_std": 0.6535003744065762, |
| "rewards/cosine_scaled_reward": -0.08542133821174502, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 96 |
| }, |
| { |
| "completion_length": 3134.062545776367, |
| "epoch": 0.11085714285714286, |
| "grad_norm": 0.05161568522453308, |
| "kl": 2.466142177581787e-05, |
| "learning_rate": 9.769942052400235e-07, |
| "loss": 0.079, |
| "reward": 0.09229021240025759, |
| "reward_std": 0.7892664521932602, |
| "rewards/cosine_scaled_reward": -0.08927156520076096, |
| "rewards/format_reward": 0.27083334140479565, |
| "step": 97 |
| }, |
| { |
| "completion_length": 2911.7708892822266, |
| "epoch": 0.112, |
| "grad_norm": 0.032276708632707596, |
| "kl": 7.407739758491516e-06, |
| "learning_rate": 9.759921670520634e-07, |
| "loss": 0.0629, |
| "reward": 0.311330777592957, |
| "reward_std": 0.5608916245400906, |
| "rewards/cosine_scaled_reward": -0.05266796611249447, |
| "rewards/format_reward": 0.41666667349636555, |
| "step": 98 |
| }, |
| { |
| "completion_length": 2801.041702270508, |
| "epoch": 0.11314285714285714, |
| "grad_norm": 0.029017232358455658, |
| "kl": 9.931332897394896e-06, |
| "learning_rate": 9.749693666068663e-07, |
| "loss": 0.0092, |
| "reward": 0.1490808641538024, |
| "reward_std": 0.5960719883441925, |
| "rewards/cosine_scaled_reward": -0.08170956326648593, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 99 |
| }, |
| { |
| "completion_length": 2526.5208587646484, |
| "epoch": 0.11428571428571428, |
| "grad_norm": 0.04443981498479843, |
| "kl": 3.993883728981018e-05, |
| "learning_rate": 9.739258537542835e-07, |
| "loss": -0.0003, |
| "reward": 0.4763328084954992, |
| "reward_std": 0.8469073474407196, |
| "rewards/cosine_scaled_reward": -0.0014169311616569757, |
| "rewards/format_reward": 0.4791666679084301, |
| "step": 100 |
| }, |
| { |
| "completion_length": 2756.645866394043, |
| "epoch": 0.11542857142857142, |
| "grad_norm": 0.03890707716345787, |
| "kl": 1.1265277862548828e-05, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": 0.0634, |
| "reward": 0.35849976912140846, |
| "reward_std": 0.6256566159427166, |
| "rewards/cosine_scaled_reward": -0.01866680383682251, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 101 |
| }, |
| { |
| "completion_length": 2056.3125762939453, |
| "epoch": 0.11657142857142858, |
| "grad_norm": 0.03498723357915878, |
| "kl": 4.578381776809692e-06, |
| "learning_rate": 9.717768952713511e-07, |
| "loss": 0.04, |
| "reward": 0.6048059780150652, |
| "reward_std": 0.6034604609012604, |
| "rewards/cosine_scaled_reward": -0.0517636826261878, |
| "rewards/format_reward": 0.7083333488553762, |
| "step": 102 |
| }, |
| { |
| "completion_length": 2677.979202270508, |
| "epoch": 0.11771428571428572, |
| "grad_norm": 0.033229317516088486, |
| "kl": 4.5493245124816895e-05, |
| "learning_rate": 9.706715543782064e-07, |
| "loss": 0.0244, |
| "reward": 0.22921494115144014, |
| "reward_std": 0.5147040607407689, |
| "rewards/cosine_scaled_reward": -0.09372587502002716, |
| "rewards/format_reward": 0.4166666679084301, |
| "step": 103 |
| }, |
| { |
| "completion_length": 2607.9375, |
| "epoch": 0.11885714285714286, |
| "grad_norm": 0.04574336111545563, |
| "kl": 2.5503337383270264e-05, |
| "learning_rate": 9.695457105469804e-07, |
| "loss": 0.0431, |
| "reward": 0.19741635338868946, |
| "reward_std": 0.7826841361820698, |
| "rewards/cosine_scaled_reward": -0.14087516069412231, |
| "rewards/format_reward": 0.4791666753590107, |
| "step": 104 |
| }, |
| { |
| "completion_length": 2667.354217529297, |
| "epoch": 0.12, |
| "grad_norm": 0.06220825016498566, |
| "kl": 2.7738511562347412e-05, |
| "learning_rate": 9.683994186497132e-07, |
| "loss": 0.11, |
| "reward": 0.3817732520401478, |
| "reward_std": 1.0063364561647177, |
| "rewards/cosine_scaled_reward": -0.007030060514807701, |
| "rewards/format_reward": 0.39583334140479565, |
| "step": 105 |
| }, |
| { |
| "completion_length": 2350.1667137145996, |
| "epoch": 0.12114285714285715, |
| "grad_norm": 0.038587577641010284, |
| "kl": -3.159046173095703e-06, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": 0.0546, |
| "reward": 0.8650688156485558, |
| "reward_std": 0.8709494546055794, |
| "rewards/cosine_scaled_reward": 0.14086772687733173, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 106 |
| }, |
| { |
| "completion_length": 2880.0833435058594, |
| "epoch": 0.12228571428571429, |
| "grad_norm": 0.03471750020980835, |
| "kl": 5.945563316345215e-06, |
| "learning_rate": 9.66045715125541e-07, |
| "loss": 0.0146, |
| "reward": 0.45387474820017815, |
| "reward_std": 0.527577817440033, |
| "rewards/cosine_scaled_reward": -0.0022292807698249817, |
| "rewards/format_reward": 0.4583333432674408, |
| "step": 107 |
| }, |
| { |
| "completion_length": 2783.312530517578, |
| "epoch": 0.12342857142857143, |
| "grad_norm": 0.031732089817523956, |
| "kl": 9.931158274412155e-06, |
| "learning_rate": 9.648384182148252e-07, |
| "loss": 0.0456, |
| "reward": 0.1532750353217125, |
| "reward_std": 0.5677235350012779, |
| "rewards/cosine_scaled_reward": -0.10044581908732653, |
| "rewards/format_reward": 0.35416667722165585, |
| "step": 108 |
| }, |
| { |
| "completion_length": 3024.354179382324, |
| "epoch": 0.12457142857142857, |
| "grad_norm": 0.023847682401537895, |
| "kl": -6.922753527760506e-06, |
| "learning_rate": 9.636109026648554e-07, |
| "loss": 0.0082, |
| "reward": 0.10752910934388638, |
| "reward_std": 0.4340343438088894, |
| "rewards/cosine_scaled_reward": -0.09206879511475563, |
| "rewards/format_reward": 0.29166667349636555, |
| "step": 109 |
| }, |
| { |
| "completion_length": 2805.270851135254, |
| "epoch": 0.12571428571428572, |
| "grad_norm": 0.037369612604379654, |
| "kl": -4.994682967662811e-06, |
| "learning_rate": 9.623632283030077e-07, |
| "loss": -0.0008, |
| "reward": 0.11598340794444084, |
| "reward_std": 0.7331918552517891, |
| "rewards/cosine_scaled_reward": -0.16075829323381186, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 110 |
| }, |
| { |
| "completion_length": 3019.500045776367, |
| "epoch": 0.12685714285714286, |
| "grad_norm": 0.03501195088028908, |
| "kl": 1.4557619579136372e-05, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": 0.0307, |
| "reward": 0.0815199427306652, |
| "reward_std": 0.6842854991555214, |
| "rewards/cosine_scaled_reward": -0.1050733660813421, |
| "rewards/format_reward": 0.29166666977107525, |
| "step": 111 |
| }, |
| { |
| "completion_length": 3347.0833740234375, |
| "epoch": 0.128, |
| "grad_norm": 0.04136764630675316, |
| "kl": 1.260777935385704e-05, |
| "learning_rate": 9.598076473627796e-07, |
| "loss": 0.0226, |
| "reward": 0.20088393986225128, |
| "reward_std": 0.6008856520056725, |
| "rewards/cosine_scaled_reward": -0.04539137287065387, |
| "rewards/format_reward": 0.29166666977107525, |
| "step": 112 |
| }, |
| { |
| "completion_length": 2591.145854949951, |
| "epoch": 0.12914285714285714, |
| "grad_norm": 0.028484085574746132, |
| "kl": 6.703287363052368e-05, |
| "learning_rate": 9.58499865339809e-07, |
| "loss": 0.0377, |
| "reward": 0.1814913973212242, |
| "reward_std": 0.5701524242758751, |
| "rewards/cosine_scaled_reward": -0.1280043087899685, |
| "rewards/format_reward": 0.4375000111758709, |
| "step": 113 |
| }, |
| { |
| "completion_length": 2411.8958587646484, |
| "epoch": 0.13028571428571428, |
| "grad_norm": 0.03289726749062538, |
| "kl": 8.968636393547058e-06, |
| "learning_rate": 9.571721736097088e-07, |
| "loss": 0.0053, |
| "reward": 0.12762277480214834, |
| "reward_std": 0.43860352924093604, |
| "rewards/cosine_scaled_reward": -0.22785530053079128, |
| "rewards/format_reward": 0.5833333358168602, |
| "step": 114 |
| }, |
| { |
| "completion_length": 2959.895835876465, |
| "epoch": 0.13142857142857142, |
| "grad_norm": 0.0381602868437767, |
| "kl": 2.3990869522094727e-05, |
| "learning_rate": 9.55824636882301e-07, |
| "loss": 0.0392, |
| "reward": 0.28831441630609334, |
| "reward_std": 0.6521844463422894, |
| "rewards/cosine_scaled_reward": -0.01209279743488878, |
| "rewards/format_reward": 0.3125, |
| "step": 115 |
| }, |
| { |
| "completion_length": 3210.875, |
| "epoch": 0.13257142857142856, |
| "grad_norm": 0.027132300660014153, |
| "kl": 1.683831214904785e-05, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": 0.002, |
| "reward": -0.266488716006279, |
| "reward_std": 0.5367148518562317, |
| "rewards/cosine_scaled_reward": -0.21657769149169326, |
| "rewards/format_reward": 0.1666666679084301, |
| "step": 116 |
| }, |
| { |
| "completion_length": 3263.1666717529297, |
| "epoch": 0.1337142857142857, |
| "grad_norm": 0.026704585179686546, |
| "kl": 8.605420589447021e-06, |
| "learning_rate": 9.530702921077358e-07, |
| "loss": 0.005, |
| "reward": -0.2799858786165714, |
| "reward_std": 0.43628356233239174, |
| "rewards/cosine_scaled_reward": -0.2233262713998556, |
| "rewards/format_reward": 0.1666666679084301, |
| "step": 117 |
| }, |
| { |
| "completion_length": 2891.6875610351562, |
| "epoch": 0.13485714285714287, |
| "grad_norm": 0.04840896278619766, |
| "kl": -8.52346420288086e-06, |
| "learning_rate": 9.516636183034564e-07, |
| "loss": 0.0269, |
| "reward": 0.8288209615275264, |
| "reward_std": 0.9317138865590096, |
| "rewards/cosine_scaled_reward": 0.15399381378665566, |
| "rewards/format_reward": 0.5208333414047956, |
| "step": 118 |
| }, |
| { |
| "completion_length": 1989.208366394043, |
| "epoch": 0.136, |
| "grad_norm": 0.03215627372264862, |
| "kl": 6.823241710662842e-05, |
| "learning_rate": 9.502373679810839e-07, |
| "loss": 0.0413, |
| "reward": 0.5099336765706539, |
| "reward_std": 0.5711662992835045, |
| "rewards/cosine_scaled_reward": -0.06794983521103859, |
| "rewards/format_reward": 0.6458333358168602, |
| "step": 119 |
| }, |
| { |
| "completion_length": 2788.00004196167, |
| "epoch": 0.13714285714285715, |
| "grad_norm": 0.04669712111353874, |
| "kl": 1.7192214727401733e-05, |
| "learning_rate": 9.487916106540465e-07, |
| "loss": 0.0839, |
| "reward": 0.23134121485054493, |
| "reward_std": 0.6627340568229556, |
| "rewards/cosine_scaled_reward": -0.06141273118555546, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 120 |
| }, |
| { |
| "completion_length": 2010.562515258789, |
| "epoch": 0.1382857142857143, |
| "grad_norm": 0.03950565680861473, |
| "kl": 4.994124174118042e-05, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0413, |
| "reward": 0.5943057034164667, |
| "reward_std": 0.6402759049087763, |
| "rewards/cosine_scaled_reward": -0.0049304962158203125, |
| "rewards/format_reward": 0.6041666697710752, |
| "step": 121 |
| }, |
| { |
| "completion_length": 3024.6666870117188, |
| "epoch": 0.13942857142857143, |
| "grad_norm": 0.05417582765221596, |
| "kl": 3.859400749206543e-06, |
| "learning_rate": 9.458418577899774e-07, |
| "loss": 0.0436, |
| "reward": 0.3850366286933422, |
| "reward_std": 0.8862559981644154, |
| "rewards/cosine_scaled_reward": 0.005018303170800209, |
| "rewards/format_reward": 0.3750000037252903, |
| "step": 122 |
| }, |
| { |
| "completion_length": 2911.104232788086, |
| "epoch": 0.14057142857142857, |
| "grad_norm": 0.05250710994005203, |
| "kl": 7.831677794456482e-05, |
| "learning_rate": 9.443380060197385e-07, |
| "loss": 0.1311, |
| "reward": -0.02938712202012539, |
| "reward_std": 0.7504731137305498, |
| "rewards/cosine_scaled_reward": -0.18136022426187992, |
| "rewards/format_reward": 0.33333333395421505, |
| "step": 123 |
| }, |
| { |
| "completion_length": 2103.000026702881, |
| "epoch": 0.1417142857142857, |
| "grad_norm": 0.03832955285906792, |
| "kl": 2.69375741481781e-05, |
| "learning_rate": 9.428149347714143e-07, |
| "loss": -0.0021, |
| "reward": 0.6034899442456663, |
| "reward_std": 0.6659752391278744, |
| "rewards/cosine_scaled_reward": -0.00033837370574474335, |
| "rewards/format_reward": 0.6041666679084301, |
| "step": 124 |
| }, |
| { |
| "completion_length": 2813.000015258789, |
| "epoch": 0.14285714285714285, |
| "grad_norm": 0.033055417239665985, |
| "kl": 4.522502422332764e-06, |
| "learning_rate": 9.412727182773486e-07, |
| "loss": 0.0306, |
| "reward": 0.4082658104598522, |
| "reward_std": 0.5916998572647572, |
| "rewards/cosine_scaled_reward": 0.047882895451039076, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 125 |
| }, |
| { |
| "completion_length": 2842.0416870117188, |
| "epoch": 0.144, |
| "grad_norm": 0.04313673824071884, |
| "kl": 2.8833746910095215e-06, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": 0.0148, |
| "reward": 0.1651114380802028, |
| "reward_std": 0.7616937188431621, |
| "rewards/cosine_scaled_reward": -0.12577760918065906, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 126 |
| }, |
| { |
| "completion_length": 3460.750030517578, |
| "epoch": 0.14514285714285713, |
| "grad_norm": 0.02390168234705925, |
| "kl": 3.158301115036011e-05, |
| "learning_rate": 9.381311511432658e-07, |
| "loss": 0.022, |
| "reward": -0.43700045719742775, |
| "reward_std": 0.405607839114964, |
| "rewards/cosine_scaled_reward": -0.2705835606902838, |
| "rewards/format_reward": 0.1041666679084301, |
| "step": 127 |
| }, |
| { |
| "completion_length": 2771.4583740234375, |
| "epoch": 0.1462857142857143, |
| "grad_norm": 0.04748675227165222, |
| "kl": 1.5042722225189209e-05, |
| "learning_rate": 9.36531953618799e-07, |
| "loss": 0.0512, |
| "reward": 0.8968718275427818, |
| "reward_std": 0.8156895972788334, |
| "rewards/cosine_scaled_reward": 0.1671859212219715, |
| "rewards/format_reward": 0.5625000055879354, |
| "step": 128 |
| }, |
| { |
| "completion_length": 3298.250030517578, |
| "epoch": 0.14742857142857144, |
| "grad_norm": 0.04449458047747612, |
| "kl": 1.4479272067546844e-05, |
| "learning_rate": 9.34913917072228e-07, |
| "loss": 0.009, |
| "reward": -0.10969683527946472, |
| "reward_std": 0.7850774228572845, |
| "rewards/cosine_scaled_reward": -0.17984841065481305, |
| "rewards/format_reward": 0.2500000037252903, |
| "step": 129 |
| }, |
| { |
| "completion_length": 3292.7083587646484, |
| "epoch": 0.14857142857142858, |
| "grad_norm": 0.04384580999612808, |
| "kl": 2.863258123397827e-05, |
| "learning_rate": 9.332771203643714e-07, |
| "loss": 0.0654, |
| "reward": -0.18806715682148933, |
| "reward_std": 0.7124089896678925, |
| "rewards/cosine_scaled_reward": -0.16695024794898927, |
| "rewards/format_reward": 0.14583333767950535, |
| "step": 130 |
| }, |
| { |
| "completion_length": 2807.4792289733887, |
| "epoch": 0.14971428571428572, |
| "grad_norm": 0.05430476740002632, |
| "kl": 4.511140286922455e-05, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.1049, |
| "reward": 0.31562044098973274, |
| "reward_std": 0.7439009035006166, |
| "rewards/cosine_scaled_reward": -0.0401064483448863, |
| "rewards/format_reward": 0.3958333395421505, |
| "step": 131 |
| }, |
| { |
| "completion_length": 2769.083339691162, |
| "epoch": 0.15085714285714286, |
| "grad_norm": 0.0387246310710907, |
| "kl": -2.1746382117271423e-05, |
| "learning_rate": 9.299475664759068e-07, |
| "loss": -0.0041, |
| "reward": 0.15846600383520126, |
| "reward_std": 0.6973005346953869, |
| "rewards/cosine_scaled_reward": -0.09785035438835621, |
| "rewards/format_reward": 0.35416666977107525, |
| "step": 132 |
| }, |
| { |
| "completion_length": 3294.4583740234375, |
| "epoch": 0.152, |
| "grad_norm": 0.043031562119722366, |
| "kl": 1.8611550331115723e-05, |
| "learning_rate": 9.282549715730579e-07, |
| "loss": 0.0035, |
| "reward": 0.021218243055045605, |
| "reward_std": 0.645587831735611, |
| "rewards/cosine_scaled_reward": -0.12480754964053631, |
| "rewards/format_reward": 0.27083333767950535, |
| "step": 133 |
| }, |
| { |
| "completion_length": 2709.437526702881, |
| "epoch": 0.15314285714285714, |
| "grad_norm": 0.049496814608573914, |
| "kl": 9.149685502052307e-05, |
| "learning_rate": 9.265439410565328e-07, |
| "loss": 0.039, |
| "reward": 0.39167931117117405, |
| "reward_std": 0.6326965596526861, |
| "rewards/cosine_scaled_reward": -0.0020770253613591194, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 134 |
| }, |
| { |
| "completion_length": 2031.5417022705078, |
| "epoch": 0.15428571428571428, |
| "grad_norm": 0.05865013226866722, |
| "kl": 8.575990796089172e-05, |
| "learning_rate": 9.248145583195447e-07, |
| "loss": 0.1061, |
| "reward": 1.1254341015592217, |
| "reward_std": 0.8684800090268254, |
| "rewards/cosine_scaled_reward": 0.25021704845130444, |
| "rewards/format_reward": 0.6250000074505806, |
| "step": 135 |
| }, |
| { |
| "completion_length": 2618.687515258789, |
| "epoch": 0.15542857142857142, |
| "grad_norm": 0.034709468483924866, |
| "kl": 4.778057336807251e-05, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": 0.023, |
| "reward": 0.48167144507169724, |
| "reward_std": 0.6561040449887514, |
| "rewards/cosine_scaled_reward": 0.02208572020754218, |
| "rewards/format_reward": 0.43750000186264515, |
| "step": 136 |
| }, |
| { |
| "completion_length": 3057.8958435058594, |
| "epoch": 0.15657142857142858, |
| "grad_norm": 0.021808914840221405, |
| "kl": 1.4843419194221497e-05, |
| "learning_rate": 9.213010742252327e-07, |
| "loss": 0.0142, |
| "reward": -0.08836949244141579, |
| "reward_std": 0.4218329731374979, |
| "rewards/cosine_scaled_reward": -0.1796014215797186, |
| "rewards/format_reward": 0.2708333395421505, |
| "step": 137 |
| }, |
| { |
| "completion_length": 2785.1875228881836, |
| "epoch": 0.15771428571428572, |
| "grad_norm": 0.0407271683216095, |
| "kl": 1.3926997780799866e-05, |
| "learning_rate": 9.195171441101668e-07, |
| "loss": 0.003, |
| "reward": 0.33298980072140694, |
| "reward_std": 0.7713625803589821, |
| "rewards/cosine_scaled_reward": -0.06267178524285555, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 138 |
| }, |
| { |
| "completion_length": 2804.0833435058594, |
| "epoch": 0.15885714285714286, |
| "grad_norm": 0.04251856729388237, |
| "kl": 0.00016787275671958923, |
| "learning_rate": 9.177152042508077e-07, |
| "loss": 0.0123, |
| "reward": 0.13880036072805524, |
| "reward_std": 0.7100770510733128, |
| "rewards/cosine_scaled_reward": -0.13893315801396966, |
| "rewards/format_reward": 0.41666667349636555, |
| "step": 139 |
| }, |
| { |
| "completion_length": 3116.958396911621, |
| "epoch": 0.16, |
| "grad_norm": 0.03125656023621559, |
| "kl": 8.677691221237183e-05, |
| "learning_rate": 9.158953424711624e-07, |
| "loss": 0.0472, |
| "reward": 0.06694573163986206, |
| "reward_std": 0.5762135218828917, |
| "rewards/cosine_scaled_reward": -0.07069380325265229, |
| "rewards/format_reward": 0.2083333358168602, |
| "step": 140 |
| }, |
| { |
| "completion_length": 2992.2708892822266, |
| "epoch": 0.16114285714285714, |
| "grad_norm": 0.041330691426992416, |
| "kl": 9.230338037014008e-06, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0029, |
| "reward": 0.19635527953505516, |
| "reward_std": 0.7302406523376703, |
| "rewards/cosine_scaled_reward": -0.12057237513363361, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 141 |
| }, |
| { |
| "completion_length": 2647.187545776367, |
| "epoch": 0.16228571428571428, |
| "grad_norm": 0.037891265004873276, |
| "kl": 3.4689903259277344e-05, |
| "learning_rate": 9.122022088101613e-07, |
| "loss": 0.0244, |
| "reward": 0.33850586879998446, |
| "reward_std": 0.7777349427342415, |
| "rewards/cosine_scaled_reward": -0.12241373397409916, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 142 |
| }, |
| { |
| "completion_length": 2517.7917098999023, |
| "epoch": 0.16342857142857142, |
| "grad_norm": 0.038469068706035614, |
| "kl": 4.398822784423828e-05, |
| "learning_rate": 9.103291169269299e-07, |
| "loss": 0.0177, |
| "reward": 0.10013854561839253, |
| "reward_std": 0.6364383529871702, |
| "rewards/cosine_scaled_reward": -0.17909739445894957, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 143 |
| }, |
| { |
| "completion_length": 3101.375030517578, |
| "epoch": 0.16457142857142856, |
| "grad_norm": 0.047153256833553314, |
| "kl": 2.6047229766845703e-05, |
| "learning_rate": 9.084384631108882e-07, |
| "loss": 0.0287, |
| "reward": 0.22925031930208206, |
| "reward_std": 0.7248009741306305, |
| "rewards/cosine_scaled_reward": -0.020791523158550262, |
| "rewards/format_reward": 0.2708333395421505, |
| "step": 144 |
| }, |
| { |
| "completion_length": 2370.1875228881836, |
| "epoch": 0.1657142857142857, |
| "grad_norm": 0.0311027429997921, |
| "kl": 4.681199789047241e-05, |
| "learning_rate": 9.065303395098358e-07, |
| "loss": 0.0081, |
| "reward": 0.4629867151379585, |
| "reward_std": 0.7178505845367908, |
| "rewards/cosine_scaled_reward": -0.028923317790031433, |
| "rewards/format_reward": 0.520833333954215, |
| "step": 145 |
| }, |
| { |
| "completion_length": 2521.020866394043, |
| "epoch": 0.16685714285714287, |
| "grad_norm": 0.022678615525364876, |
| "kl": 6.041303277015686e-05, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": 0.0343, |
| "reward": 0.04914231039583683, |
| "reward_std": 0.4269477091729641, |
| "rewards/cosine_scaled_reward": -0.2775121796876192, |
| "rewards/format_reward": 0.604166679084301, |
| "step": 146 |
| }, |
| { |
| "completion_length": 3500.4166870117188, |
| "epoch": 0.168, |
| "grad_norm": 0.04476216807961464, |
| "kl": 4.99039888381958e-05, |
| "learning_rate": 9.026620557966279e-07, |
| "loss": 0.0183, |
| "reward": -0.014710072427988052, |
| "reward_std": 0.8408091142773628, |
| "rewards/cosine_scaled_reward": -0.09068836877122521, |
| "rewards/format_reward": 0.1666666716337204, |
| "step": 147 |
| }, |
| { |
| "completion_length": 2774.5416717529297, |
| "epoch": 0.16914285714285715, |
| "grad_norm": 0.028264645487070084, |
| "kl": 2.7420930564403534e-05, |
| "learning_rate": 9.007020842191634e-07, |
| "loss": 0.0294, |
| "reward": 0.31881215423345566, |
| "reward_std": 0.535629041492939, |
| "rewards/cosine_scaled_reward": -0.06976059079170227, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 148 |
| }, |
| { |
| "completion_length": 2830.875015258789, |
| "epoch": 0.1702857142857143, |
| "grad_norm": 0.037253376096487045, |
| "kl": 3.8120895624160767e-05, |
| "learning_rate": 8.987250199168808e-07, |
| "loss": 0.047, |
| "reward": 0.2539705242961645, |
| "reward_std": 0.6319136489182711, |
| "rewards/cosine_scaled_reward": -0.08134806924499571, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 149 |
| }, |
| { |
| "completion_length": 2714.9167098999023, |
| "epoch": 0.17142857142857143, |
| "grad_norm": 0.03853278234601021, |
| "kl": 0.00010446656960994005, |
| "learning_rate": 8.967309592491052e-07, |
| "loss": 0.0443, |
| "reward": -0.02947249379940331, |
| "reward_std": 0.6823416836559772, |
| "rewards/cosine_scaled_reward": -0.19181958585977554, |
| "rewards/format_reward": 0.35416666977107525, |
| "step": 150 |
| }, |
| { |
| "completion_length": 2669.3334045410156, |
| "epoch": 0.17257142857142857, |
| "grad_norm": 0.05166688933968544, |
| "kl": 0.0001301094889640808, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0583, |
| "reward": 0.3849489726126194, |
| "reward_std": 0.9069008305668831, |
| "rewards/cosine_scaled_reward": -0.04710885416716337, |
| "rewards/format_reward": 0.47916667349636555, |
| "step": 151 |
| }, |
| { |
| "completion_length": 3023.062530517578, |
| "epoch": 0.1737142857142857, |
| "grad_norm": 0.03680495545268059, |
| "kl": 4.774332046508789e-05, |
| "learning_rate": 8.926922383915315e-07, |
| "loss": 0.0118, |
| "reward": -0.15647598542273045, |
| "reward_std": 0.4975847424939275, |
| "rewards/cosine_scaled_reward": -0.2240713369101286, |
| "rewards/format_reward": 0.2916666679084301, |
| "step": 152 |
| }, |
| { |
| "completion_length": 2933.187545776367, |
| "epoch": 0.17485714285714285, |
| "grad_norm": 0.04766285791993141, |
| "kl": 4.67933714389801e-05, |
| "learning_rate": 8.906477750432903e-07, |
| "loss": 0.0522, |
| "reward": 0.08462437242269516, |
| "reward_std": 0.8217967357486486, |
| "rewards/cosine_scaled_reward": -0.12435449287295341, |
| "rewards/format_reward": 0.3333333395421505, |
| "step": 153 |
| }, |
| { |
| "completion_length": 3438.8333740234375, |
| "epoch": 0.176, |
| "grad_norm": 0.06172926723957062, |
| "kl": 5.605979822576046e-06, |
| "learning_rate": 8.88586709003076e-07, |
| "loss": -0.0067, |
| "reward": 0.6275491081178188, |
| "reward_std": 1.102588940411806, |
| "rewards/cosine_scaled_reward": 0.10544120147824287, |
| "rewards/format_reward": 0.416666679084301, |
| "step": 154 |
| }, |
| { |
| "completion_length": 2463.5208435058594, |
| "epoch": 0.17714285714285713, |
| "grad_norm": 0.026917118579149246, |
| "kl": 6.724148988723755e-05, |
| "learning_rate": 8.865091407243394e-07, |
| "loss": 0.0085, |
| "reward": 0.40650070272386074, |
| "reward_std": 0.5913192741572857, |
| "rewards/cosine_scaled_reward": 0.005333678796887398, |
| "rewards/format_reward": 0.39583333395421505, |
| "step": 155 |
| }, |
| { |
| "completion_length": 2991.145835876465, |
| "epoch": 0.1782857142857143, |
| "grad_norm": 0.05746571347117424, |
| "kl": 2.1982938051223755e-05, |
| "learning_rate": 8.844151714648274e-07, |
| "loss": 0.0599, |
| "reward": 0.1549793779850006, |
| "reward_std": 0.6098005510866642, |
| "rewards/cosine_scaled_reward": -0.06834365800023079, |
| "rewards/format_reward": 0.2916666716337204, |
| "step": 156 |
| }, |
| { |
| "completion_length": 3096.5208435058594, |
| "epoch": 0.17942857142857144, |
| "grad_norm": 0.037943121045827866, |
| "kl": 5.745887756347656e-05, |
| "learning_rate": 8.823049032816478e-07, |
| "loss": 0.0273, |
| "reward": 0.06849924847483635, |
| "reward_std": 0.5304312091320753, |
| "rewards/cosine_scaled_reward": -0.13241703808307648, |
| "rewards/format_reward": 0.3333333432674408, |
| "step": 157 |
| }, |
| { |
| "completion_length": 2825.979232788086, |
| "epoch": 0.18057142857142858, |
| "grad_norm": 0.04266660660505295, |
| "kl": 0.00021417438983917236, |
| "learning_rate": 8.801784390262943e-07, |
| "loss": 0.063, |
| "reward": 0.6505898106843233, |
| "reward_std": 0.7261264026165009, |
| "rewards/cosine_scaled_reward": 0.0961282174102962, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 158 |
| }, |
| { |
| "completion_length": 3209.979202270508, |
| "epoch": 0.18171428571428572, |
| "grad_norm": 0.0403718538582325, |
| "kl": 3.383122384548187e-05, |
| "learning_rate": 8.780358823396352e-07, |
| "loss": 0.0048, |
| "reward": -0.04634229780640453, |
| "reward_std": 0.7521483562886715, |
| "rewards/cosine_scaled_reward": -0.15858781896531582, |
| "rewards/format_reward": 0.27083333767950535, |
| "step": 159 |
| }, |
| { |
| "completion_length": 3112.020866394043, |
| "epoch": 0.18285714285714286, |
| "grad_norm": 0.06450565159320831, |
| "kl": 0.00014490634202957153, |
| "learning_rate": 8.758773376468604e-07, |
| "loss": 0.0331, |
| "reward": 0.17703042179346085, |
| "reward_std": 0.8230392001569271, |
| "rewards/cosine_scaled_reward": -0.0364847918972373, |
| "rewards/format_reward": 0.25000000186264515, |
| "step": 160 |
| }, |
| { |
| "completion_length": 1949.7291870117188, |
| "epoch": 0.184, |
| "grad_norm": 0.04517294839024544, |
| "kl": 0.00019407272338867188, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": 0.0781, |
| "reward": 0.4693678207695484, |
| "reward_std": 0.6783907152712345, |
| "rewards/cosine_scaled_reward": -0.11948275938630104, |
| "rewards/format_reward": 0.7083333507180214, |
| "step": 161 |
| }, |
| { |
| "completion_length": 3424.2084350585938, |
| "epoch": 0.18514285714285714, |
| "grad_norm": 0.05282009020447731, |
| "kl": 0.00010896846652030945, |
| "learning_rate": 8.715127058347614e-07, |
| "loss": 0.0252, |
| "reward": 0.03611734416335821, |
| "reward_std": 0.8644157852977514, |
| "rewards/cosine_scaled_reward": -0.09652466559782624, |
| "rewards/format_reward": 0.22916667349636555, |
| "step": 162 |
| }, |
| { |
| "completion_length": 2668.750045776367, |
| "epoch": 0.18628571428571428, |
| "grad_norm": 0.030406512320041656, |
| "kl": 9.234249591827393e-05, |
| "learning_rate": 8.693068314414344e-07, |
| "loss": 0.0055, |
| "reward": 0.8739382587373257, |
| "reward_std": 0.6097183618694544, |
| "rewards/cosine_scaled_reward": 0.14530246332287788, |
| "rewards/format_reward": 0.5833333358168602, |
| "step": 163 |
| }, |
| { |
| "completion_length": 2629.8125228881836, |
| "epoch": 0.18742857142857142, |
| "grad_norm": 0.04248577728867531, |
| "kl": 8.138641715049744e-05, |
| "learning_rate": 8.670853944836176e-07, |
| "loss": 0.0345, |
| "reward": 0.42669382283929735, |
| "reward_std": 0.7748677339404821, |
| "rewards/cosine_scaled_reward": -0.0054031130857765675, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 164 |
| }, |
| { |
| "completion_length": 3354.8541870117188, |
| "epoch": 0.18857142857142858, |
| "grad_norm": 0.051660098135471344, |
| "kl": 9.429454803466797e-05, |
| "learning_rate": 8.648485032310144e-07, |
| "loss": 0.0124, |
| "reward": -0.10903086792677641, |
| "reward_std": 0.8036016914993525, |
| "rewards/cosine_scaled_reward": -0.14826543792150915, |
| "rewards/format_reward": 0.18750000558793545, |
| "step": 165 |
| }, |
| { |
| "completion_length": 2896.4583740234375, |
| "epoch": 0.18971428571428572, |
| "grad_norm": 0.03565189614892006, |
| "kl": 0.00016220659017562866, |
| "learning_rate": 8.625962667065487e-07, |
| "loss": 0.0169, |
| "reward": -0.00203709676861763, |
| "reward_std": 0.6630012057721615, |
| "rewards/cosine_scaled_reward": -0.1468518888577819, |
| "rewards/format_reward": 0.29166667349636555, |
| "step": 166 |
| }, |
| { |
| "completion_length": 2433.2292251586914, |
| "epoch": 0.19085714285714286, |
| "grad_norm": 0.07079535722732544, |
| "kl": 0.0008552521467208862, |
| "learning_rate": 8.603287946810513e-07, |
| "loss": -0.0086, |
| "reward": 0.42900961078703403, |
| "reward_std": 0.7506549386307597, |
| "rewards/cosine_scaled_reward": -0.0771618754370138, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 167 |
| }, |
| { |
| "completion_length": 3103.7708740234375, |
| "epoch": 0.192, |
| "grad_norm": 0.053577445447444916, |
| "kl": 8.881837129592896e-05, |
| "learning_rate": 8.580461976679099e-07, |
| "loss": 0.0283, |
| "reward": 0.275710541754961, |
| "reward_std": 0.9691016413271427, |
| "rewards/cosine_scaled_reward": -0.0288114077411592, |
| "rewards/format_reward": 0.3333333395421505, |
| "step": 168 |
| }, |
| { |
| "completion_length": 2718.1041870117188, |
| "epoch": 0.19314285714285714, |
| "grad_norm": 0.028121525421738625, |
| "kl": 5.266070365905762e-05, |
| "learning_rate": 8.557485869176825e-07, |
| "loss": 0.0111, |
| "reward": 0.6644045971333981, |
| "reward_std": 0.6455990113317966, |
| "rewards/cosine_scaled_reward": 0.08220228180289268, |
| "rewards/format_reward": 0.5000000055879354, |
| "step": 169 |
| }, |
| { |
| "completion_length": 2760.3125228881836, |
| "epoch": 0.19428571428571428, |
| "grad_norm": 0.02649330534040928, |
| "kl": 7.764250040054321e-05, |
| "learning_rate": 8.534360744126753e-07, |
| "loss": 0.008, |
| "reward": 0.22724516410380602, |
| "reward_std": 0.5219327211380005, |
| "rewards/cosine_scaled_reward": -0.06346075981855392, |
| "rewards/format_reward": 0.3541666679084301, |
| "step": 170 |
| }, |
| { |
| "completion_length": 2697.1250228881836, |
| "epoch": 0.19542857142857142, |
| "grad_norm": 0.030017036944627762, |
| "kl": 0.00019634515047073364, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": 0.0456, |
| "reward": 0.48459129920229316, |
| "reward_std": 0.6084076017141342, |
| "rewards/cosine_scaled_reward": 0.0339623149484396, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 171 |
| }, |
| { |
| "completion_length": 3093.937530517578, |
| "epoch": 0.19657142857142856, |
| "grad_norm": 0.037974875420331955, |
| "kl": 0.0001561082899570465, |
| "learning_rate": 8.487667956935087e-07, |
| "loss": 0.0175, |
| "reward": 0.4516556728631258, |
| "reward_std": 0.6006606463342905, |
| "rewards/cosine_scaled_reward": 0.059161149663850665, |
| "rewards/format_reward": 0.33333333395421505, |
| "step": 172 |
| }, |
| { |
| "completion_length": 2046.0000343322754, |
| "epoch": 0.1977142857142857, |
| "grad_norm": 0.029817163944244385, |
| "kl": 6.169034168124199e-05, |
| "learning_rate": 8.464102570534061e-07, |
| "loss": -0.008, |
| "reward": 0.14771767449565232, |
| "reward_std": 0.6277614803984761, |
| "rewards/cosine_scaled_reward": -0.19697449961677194, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 173 |
| }, |
| { |
| "completion_length": 2523.833381652832, |
| "epoch": 0.19885714285714284, |
| "grad_norm": 0.02026381902396679, |
| "kl": 0.0006473064422607422, |
| "learning_rate": 8.440392717955475e-07, |
| "loss": 0.0157, |
| "reward": 0.3172628991305828, |
| "reward_std": 0.5097487587481737, |
| "rewards/cosine_scaled_reward": -0.10178520064800978, |
| "rewards/format_reward": 0.5208333376795053, |
| "step": 174 |
| }, |
| { |
| "completion_length": 2944.562545776367, |
| "epoch": 0.2, |
| "grad_norm": 0.04808598756790161, |
| "kl": 9.900331497192383e-05, |
| "learning_rate": 8.416539554784089e-07, |
| "loss": 0.0413, |
| "reward": 0.3816917687654495, |
| "reward_std": 0.6620169542729855, |
| "rewards/cosine_scaled_reward": 0.0033458725083619356, |
| "rewards/format_reward": 0.3750000074505806, |
| "step": 175 |
| }, |
| { |
| "completion_length": 2707.2500228881836, |
| "epoch": 0.20114285714285715, |
| "grad_norm": 0.048613887280225754, |
| "kl": 8.374452590942383e-05, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": 0.0113, |
| "reward": 0.5291388533078134, |
| "reward_std": 0.7917350004427135, |
| "rewards/cosine_scaled_reward": 0.0458194212988019, |
| "rewards/format_reward": 0.4375000037252903, |
| "step": 176 |
| }, |
| { |
| "completion_length": 3056.229248046875, |
| "epoch": 0.2022857142857143, |
| "grad_norm": 0.04693533480167389, |
| "kl": 0.00028914958238601685, |
| "learning_rate": 8.368407953869103e-07, |
| "loss": 0.038, |
| "reward": 0.3346361154690385, |
| "reward_std": 0.8135349042713642, |
| "rewards/cosine_scaled_reward": -0.030598610173910856, |
| "rewards/format_reward": 0.39583334140479565, |
| "step": 177 |
| }, |
| { |
| "completion_length": 2260.6250076293945, |
| "epoch": 0.20342857142857143, |
| "grad_norm": 0.02989472635090351, |
| "kl": 0.0001444593071937561, |
| "learning_rate": 8.344131861991828e-07, |
| "loss": 0.0159, |
| "reward": 0.4688864201307297, |
| "reward_std": 0.6323110768571496, |
| "rewards/cosine_scaled_reward": -0.04680680809542537, |
| "rewards/format_reward": 0.5625, |
| "step": 178 |
| }, |
| { |
| "completion_length": 2971.125015258789, |
| "epoch": 0.20457142857142857, |
| "grad_norm": 0.03039778396487236, |
| "kl": 0.0001380816102027893, |
| "learning_rate": 8.319717151140072e-07, |
| "loss": 0.007, |
| "reward": -0.12367597967386246, |
| "reward_std": 0.5375447850674391, |
| "rewards/cosine_scaled_reward": -0.21808799169957638, |
| "rewards/format_reward": 0.3125000074505806, |
| "step": 179 |
| }, |
| { |
| "completion_length": 2201.8125343322754, |
| "epoch": 0.2057142857142857, |
| "grad_norm": 0.019160043448209763, |
| "kl": 0.00012689828872680664, |
| "learning_rate": 8.295165011252396e-07, |
| "loss": -0.0167, |
| "reward": 0.7198411021381617, |
| "reward_std": 0.4092927183955908, |
| "rewards/cosine_scaled_reward": 0.047420548275113106, |
| "rewards/format_reward": 0.625, |
| "step": 180 |
| }, |
| { |
| "completion_length": 3133.6875228881836, |
| "epoch": 0.20685714285714285, |
| "grad_norm": 0.0425766259431839, |
| "kl": 8.432567119598389e-05, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": 0.031, |
| "reward": 0.2028632378205657, |
| "reward_std": 0.7808081433176994, |
| "rewards/cosine_scaled_reward": -0.03398505225777626, |
| "rewards/format_reward": 0.27083333395421505, |
| "step": 181 |
| }, |
| { |
| "completion_length": 2342.3750381469727, |
| "epoch": 0.208, |
| "grad_norm": 0.04425498843193054, |
| "kl": 0.00025501102209091187, |
| "learning_rate": 8.245653237555705e-07, |
| "loss": 0.004, |
| "reward": 0.31899992749094963, |
| "reward_std": 0.8116986956447363, |
| "rewards/cosine_scaled_reward": -0.11133337735373061, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 182 |
| }, |
| { |
| "completion_length": 1913.8541870117188, |
| "epoch": 0.20914285714285713, |
| "grad_norm": 0.029876578599214554, |
| "kl": 0.00026257336139678955, |
| "learning_rate": 8.220696016880687e-07, |
| "loss": 0.0399, |
| "reward": 0.5424154847860336, |
| "reward_std": 0.636641725897789, |
| "rewards/cosine_scaled_reward": -0.07254226750228554, |
| "rewards/format_reward": 0.6875, |
| "step": 183 |
| }, |
| { |
| "completion_length": 3032.395854949951, |
| "epoch": 0.2102857142857143, |
| "grad_norm": 0.020256591960787773, |
| "kl": 0.00012014806270599365, |
| "learning_rate": 8.195606193320136e-07, |
| "loss": 0.011, |
| "reward": -0.12789930403232574, |
| "reward_std": 0.4264537561684847, |
| "rewards/cosine_scaled_reward": -0.17853298410773277, |
| "rewards/format_reward": 0.22916666977107525, |
| "step": 184 |
| }, |
| { |
| "completion_length": 2749.9375343322754, |
| "epoch": 0.21142857142857144, |
| "grad_norm": 0.03656504303216934, |
| "kl": 0.00017721951007843018, |
| "learning_rate": 8.170384989716657e-07, |
| "loss": 0.0238, |
| "reward": 0.023411770351231098, |
| "reward_std": 0.640469016507268, |
| "rewards/cosine_scaled_reward": -0.1549607841297984, |
| "rewards/format_reward": 0.33333333395421505, |
| "step": 185 |
| }, |
| { |
| "completion_length": 2899.375015258789, |
| "epoch": 0.21257142857142858, |
| "grad_norm": 0.03554477170109749, |
| "kl": 9.587407112121582e-05, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": 0.0213, |
| "reward": 0.3093617632985115, |
| "reward_std": 0.6292181555181742, |
| "rewards/cosine_scaled_reward": -0.022402465343475342, |
| "rewards/format_reward": 0.35416666977107525, |
| "step": 186 |
| }, |
| { |
| "completion_length": 2228.0416870117188, |
| "epoch": 0.21371428571428572, |
| "grad_norm": 0.023758431896567345, |
| "kl": 0.00029593706130981445, |
| "learning_rate": 8.119553365707802e-07, |
| "loss": 0.0325, |
| "reward": 0.09883421659469604, |
| "reward_std": 0.5377654749900103, |
| "rewards/cosine_scaled_reward": -0.2109995698556304, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 187 |
| }, |
| { |
| "completion_length": 3543.2916870117188, |
| "epoch": 0.21485714285714286, |
| "grad_norm": 0.03328506276011467, |
| "kl": 5.07943332195282e-05, |
| "learning_rate": 8.093945422764069e-07, |
| "loss": 0.0109, |
| "reward": -0.2927169185131788, |
| "reward_std": 0.5551654025912285, |
| "rewards/cosine_scaled_reward": -0.1880251243710518, |
| "rewards/format_reward": 0.0833333358168602, |
| "step": 188 |
| }, |
| { |
| "completion_length": 2309.729202270508, |
| "epoch": 0.216, |
| "grad_norm": 0.032304707914590836, |
| "kl": 0.0003388524055480957, |
| "learning_rate": 8.068211054579943e-07, |
| "loss": 0.0375, |
| "reward": 0.05538259819149971, |
| "reward_std": 0.521863205358386, |
| "rewards/cosine_scaled_reward": -0.24314204324036837, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 189 |
| }, |
| { |
| "completion_length": 2878.229202270508, |
| "epoch": 0.21714285714285714, |
| "grad_norm": 0.026849228888750076, |
| "kl": 0.0002093091607093811, |
| "learning_rate": 8.04235151541222e-07, |
| "loss": 0.007, |
| "reward": 0.20538577064871788, |
| "reward_std": 0.4814210291951895, |
| "rewards/cosine_scaled_reward": -0.06397378008114174, |
| "rewards/format_reward": 0.3333333358168602, |
| "step": 190 |
| }, |
| { |
| "completion_length": 2574.1041870117188, |
| "epoch": 0.21828571428571428, |
| "grad_norm": 0.04074694588780403, |
| "kl": 0.00031504780054092407, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": -0.0004, |
| "reward": 0.32336311414837837, |
| "reward_std": 0.6287105903029442, |
| "rewards/cosine_scaled_reward": -0.06748512433841825, |
| "rewards/format_reward": 0.45833333395421505, |
| "step": 191 |
| }, |
| { |
| "completion_length": 3383.8958435058594, |
| "epoch": 0.21942857142857142, |
| "grad_norm": 0.03389768674969673, |
| "kl": 0.00012552272528409958, |
| "learning_rate": 7.990261971595048e-07, |
| "loss": 0.0249, |
| "reward": -0.01775958389043808, |
| "reward_std": 0.6146044600754976, |
| "rewards/cosine_scaled_reward": -0.14429646357893944, |
| "rewards/format_reward": 0.2708333395421505, |
| "step": 192 |
| }, |
| { |
| "completion_length": 2860.687530517578, |
| "epoch": 0.22057142857142858, |
| "grad_norm": 0.04098944365978241, |
| "kl": 0.00015905871987342834, |
| "learning_rate": 7.964034505716476e-07, |
| "loss": 0.0101, |
| "reward": 0.28354677464812994, |
| "reward_std": 0.7337615005671978, |
| "rewards/cosine_scaled_reward": -0.08739327266812325, |
| "rewards/format_reward": 0.4583333395421505, |
| "step": 193 |
| }, |
| { |
| "completion_length": 3281.0000610351562, |
| "epoch": 0.22171428571428572, |
| "grad_norm": 0.05677646026015282, |
| "kl": 7.194280624389648e-05, |
| "learning_rate": 7.93768694627233e-07, |
| "loss": 0.0276, |
| "reward": 0.6990666054189205, |
| "reward_std": 0.836215304210782, |
| "rewards/cosine_scaled_reward": 0.14119997806847095, |
| "rewards/format_reward": 0.41666666977107525, |
| "step": 194 |
| }, |
| { |
| "completion_length": 2777.250030517578, |
| "epoch": 0.22285714285714286, |
| "grad_norm": 0.025516223162412643, |
| "kl": 0.0005084574222564697, |
| "learning_rate": 7.911220577405484e-07, |
| "loss": 0.0117, |
| "reward": -0.06681104190647602, |
| "reward_std": 0.5002289786934853, |
| "rewards/cosine_scaled_reward": -0.23132219538092613, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 195 |
| }, |
| { |
| "completion_length": 3459.750030517578, |
| "epoch": 0.224, |
| "grad_norm": 0.029978394508361816, |
| "kl": 0.00012940913438796997, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": 0.0098, |
| "reward": 0.272402954287827, |
| "reward_std": 0.5220753028988838, |
| "rewards/cosine_scaled_reward": -0.020048514008522034, |
| "rewards/format_reward": 0.31250000558793545, |
| "step": 196 |
| }, |
| { |
| "completion_length": 2773.645851135254, |
| "epoch": 0.22514285714285714, |
| "grad_norm": 0.048345357179641724, |
| "kl": 0.00043816864490509033, |
| "learning_rate": 7.857936576865356e-07, |
| "loss": 0.02, |
| "reward": 0.21063009148929268, |
| "reward_std": 0.9322120416909456, |
| "rewards/cosine_scaled_reward": -0.08218496525660157, |
| "rewards/format_reward": 0.37500000186264515, |
| "step": 197 |
| }, |
| { |
| "completion_length": 2772.1875076293945, |
| "epoch": 0.22628571428571428, |
| "grad_norm": 0.03043530508875847, |
| "kl": 0.0005050599575042725, |
| "learning_rate": 7.831121542179086e-07, |
| "loss": 0.0052, |
| "reward": 0.29729510098695755, |
| "reward_std": 0.5535988472402096, |
| "rewards/cosine_scaled_reward": -0.028435785323381424, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 198 |
| }, |
| { |
| "completion_length": 3581.5625, |
| "epoch": 0.22742857142857142, |
| "grad_norm": 0.02494131401181221, |
| "kl": 8.360063657164574e-05, |
| "learning_rate": 7.804192891917571e-07, |
| "loss": 0.0005, |
| "reward": -0.43263228237628937, |
| "reward_std": 0.47081154212355614, |
| "rewards/cosine_scaled_reward": -0.25798280723392963, |
| "rewards/format_reward": 0.0833333358168602, |
| "step": 199 |
| }, |
| { |
| "completion_length": 2370.666717529297, |
| "epoch": 0.22857142857142856, |
| "grad_norm": 0.03381456062197685, |
| "kl": 0.0002932697534561157, |
| "learning_rate": 7.777151938545235e-07, |
| "loss": 0.0061, |
| "reward": 1.13365722540766, |
| "reward_std": 0.56923015601933, |
| "rewards/cosine_scaled_reward": 0.24391193874180317, |
| "rewards/format_reward": 0.6458333358168602, |
| "step": 200 |
| }, |
| { |
| "completion_length": 2703.2708587646484, |
| "epoch": 0.2297142857142857, |
| "grad_norm": 0.04875003919005394, |
| "kl": 9.842216968536377e-05, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0188, |
| "reward": 0.6816193205304444, |
| "reward_std": 0.9491816088557243, |
| "rewards/cosine_scaled_reward": 0.09080967318732291, |
| "rewards/format_reward": 0.5000000074505806, |
| "step": 201 |
| }, |
| { |
| "completion_length": 2428.520866394043, |
| "epoch": 0.23085714285714284, |
| "grad_norm": 0.029104772955179214, |
| "kl": 0.0004838407039642334, |
| "learning_rate": 7.72273839962904e-07, |
| "loss": 0.0076, |
| "reward": 0.8386933319270611, |
| "reward_std": 0.4117593439295888, |
| "rewards/cosine_scaled_reward": 0.1901800061459653, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 202 |
| }, |
| { |
| "completion_length": 3084.0416717529297, |
| "epoch": 0.232, |
| "grad_norm": 0.01628260500729084, |
| "kl": 0.00013211369514465332, |
| "learning_rate": 7.695368466124296e-07, |
| "loss": 0.0065, |
| "reward": -0.11864904500544071, |
| "reward_std": 0.2944770138710737, |
| "rewards/cosine_scaled_reward": -0.17390786111354828, |
| "rewards/format_reward": 0.2291666716337204, |
| "step": 203 |
| }, |
| { |
| "completion_length": 2296.666717529297, |
| "epoch": 0.23314285714285715, |
| "grad_norm": 0.028948936611413956, |
| "kl": 0.0003097057342529297, |
| "learning_rate": 7.667891533457718e-07, |
| "loss": 0.0015, |
| "reward": 0.3847005255520344, |
| "reward_std": 0.6527448669075966, |
| "rewards/cosine_scaled_reward": -0.09931641444563866, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 204 |
| }, |
| { |
| "completion_length": 2873.229232788086, |
| "epoch": 0.2342857142857143, |
| "grad_norm": 0.04777153953909874, |
| "kl": 0.00016327202320098877, |
| "learning_rate": 7.640308940816239e-07, |
| "loss": 0.0657, |
| "reward": 0.5415339916944504, |
| "reward_std": 0.8121820725500584, |
| "rewards/cosine_scaled_reward": 0.041600316762924194, |
| "rewards/format_reward": 0.4583333395421505, |
| "step": 205 |
| }, |
| { |
| "completion_length": 2879.979202270508, |
| "epoch": 0.23542857142857143, |
| "grad_norm": 0.03969888389110565, |
| "kl": 5.835294723510742e-05, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": 0.0206, |
| "reward": 0.03187836706638336, |
| "reward_std": 0.6640341132879257, |
| "rewards/cosine_scaled_reward": -0.16114415228366852, |
| "rewards/format_reward": 0.3541666753590107, |
| "step": 206 |
| }, |
| { |
| "completion_length": 3115.6666870117188, |
| "epoch": 0.23657142857142857, |
| "grad_norm": 0.03586557134985924, |
| "kl": 0.00026457011699676514, |
| "learning_rate": 7.584832158039378e-07, |
| "loss": 0.0158, |
| "reward": -0.057422760874032974, |
| "reward_std": 0.601855780929327, |
| "rewards/cosine_scaled_reward": -0.20579471392557025, |
| "rewards/format_reward": 0.35416668094694614, |
| "step": 207 |
| }, |
| { |
| "completion_length": 2807.125015258789, |
| "epoch": 0.2377142857142857, |
| "grad_norm": 0.028317734599113464, |
| "kl": 0.0002864748239517212, |
| "learning_rate": 7.556940671764124e-07, |
| "loss": 0.0223, |
| "reward": 0.241438128054142, |
| "reward_std": 0.5762086287140846, |
| "rewards/cosine_scaled_reward": -0.066780935972929, |
| "rewards/format_reward": 0.3750000111758709, |
| "step": 208 |
| }, |
| { |
| "completion_length": 2613.0417098999023, |
| "epoch": 0.23885714285714285, |
| "grad_norm": 0.042905014008283615, |
| "kl": 0.00037148594856262207, |
| "learning_rate": 7.528948933102438e-07, |
| "loss": 0.0532, |
| "reward": 0.4992841109633446, |
| "reward_std": 0.6773411780595779, |
| "rewards/cosine_scaled_reward": 0.01005872106179595, |
| "rewards/format_reward": 0.4791666753590107, |
| "step": 209 |
| }, |
| { |
| "completion_length": 2671.1458435058594, |
| "epoch": 0.24, |
| "grad_norm": 0.017170142382383347, |
| "kl": 0.00027191638946533203, |
| "learning_rate": 7.500858306332172e-07, |
| "loss": -0.0012, |
| "reward": 0.31039364635944366, |
| "reward_std": 0.4712617564946413, |
| "rewards/cosine_scaled_reward": -0.03230317682027817, |
| "rewards/format_reward": 0.375, |
| "step": 210 |
| }, |
| { |
| "completion_length": 2630.500030517578, |
| "epoch": 0.24114285714285713, |
| "grad_norm": 0.03910549357533455, |
| "kl": 0.000303804874420166, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0362, |
| "reward": 0.4297244977205992, |
| "reward_std": 0.6497951280325651, |
| "rewards/cosine_scaled_reward": -0.024721081601455808, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 211 |
| }, |
| { |
| "completion_length": 2433.3541870117188, |
| "epoch": 0.2422857142857143, |
| "grad_norm": 0.01393934153020382, |
| "kl": 0.00036037713289260864, |
| "learning_rate": 7.444385869608921e-07, |
| "loss": -0.0072, |
| "reward": 0.4994283504784107, |
| "reward_std": 0.31923972349613905, |
| "rewards/cosine_scaled_reward": -0.00028583407402038574, |
| "rewards/format_reward": 0.5, |
| "step": 212 |
| }, |
| { |
| "completion_length": 2127.208396911621, |
| "epoch": 0.24342857142857144, |
| "grad_norm": 0.03985314443707466, |
| "kl": 0.0005207061767578125, |
| "learning_rate": 7.416006812042827e-07, |
| "loss": 0.0386, |
| "reward": 0.9866954833269119, |
| "reward_std": 0.8575046211481094, |
| "rewards/cosine_scaled_reward": 0.17043107002973557, |
| "rewards/format_reward": 0.6458333395421505, |
| "step": 213 |
| }, |
| { |
| "completion_length": 2820.625045776367, |
| "epoch": 0.24457142857142858, |
| "grad_norm": 0.05272352322936058, |
| "kl": 0.00022479891777038574, |
| "learning_rate": 7.387534371007797e-07, |
| "loss": 0.0394, |
| "reward": 0.6067041866481304, |
| "reward_std": 0.8792510256171227, |
| "rewards/cosine_scaled_reward": 0.07418542727828026, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 214 |
| }, |
| { |
| "completion_length": 2600.395896911621, |
| "epoch": 0.24571428571428572, |
| "grad_norm": 0.02042786218225956, |
| "kl": 0.0003108680248260498, |
| "learning_rate": 7.358969934210438e-07, |
| "loss": 0.0222, |
| "reward": 0.038707589730620384, |
| "reward_std": 0.41035953164100647, |
| "rewards/cosine_scaled_reward": -0.2514795530587435, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 215 |
| }, |
| { |
| "completion_length": 2171.187526702881, |
| "epoch": 0.24685714285714286, |
| "grad_norm": 0.030419498682022095, |
| "kl": 0.0006196498870849609, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": 0.0272, |
| "reward": 0.47143199387937784, |
| "reward_std": 0.6538783833384514, |
| "rewards/cosine_scaled_reward": -0.03511733375489712, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 216 |
| }, |
| { |
| "completion_length": 2755.916679382324, |
| "epoch": 0.248, |
| "grad_norm": 0.042481135576963425, |
| "kl": 0.0003572404384613037, |
| "learning_rate": 7.301570646506027e-07, |
| "loss": 0.0345, |
| "reward": 0.2117303553968668, |
| "reward_std": 0.9634007029235363, |
| "rewards/cosine_scaled_reward": -0.11288482137024403, |
| "rewards/format_reward": 0.4375000111758709, |
| "step": 217 |
| }, |
| { |
| "completion_length": 2861.312530517578, |
| "epoch": 0.24914285714285714, |
| "grad_norm": 0.03387823700904846, |
| "kl": 0.0003177523612976074, |
| "learning_rate": 7.27273859315928e-07, |
| "loss": 0.0076, |
| "reward": 0.04665176011621952, |
| "reward_std": 0.6162600554525852, |
| "rewards/cosine_scaled_reward": -0.12250744178891182, |
| "rewards/format_reward": 0.2916666679084301, |
| "step": 218 |
| }, |
| { |
| "completion_length": 2398.937515258789, |
| "epoch": 0.2502857142857143, |
| "grad_norm": 0.03438759222626686, |
| "kl": 0.00035077333450317383, |
| "learning_rate": 7.243820139034464e-07, |
| "loss": 0.0038, |
| "reward": 0.5890939775854349, |
| "reward_std": 0.7752751149237156, |
| "rewards/cosine_scaled_reward": 0.02371366578154266, |
| "rewards/format_reward": 0.5416666734963655, |
| "step": 219 |
| }, |
| { |
| "completion_length": 2606.1041946411133, |
| "epoch": 0.25142857142857145, |
| "grad_norm": 0.012811764143407345, |
| "kl": 0.0001837611198425293, |
| "learning_rate": 7.214816693576234e-07, |
| "loss": 0.0032, |
| "reward": -0.1477857008576393, |
| "reward_std": 0.2786343488842249, |
| "rewards/cosine_scaled_reward": -0.282226188108325, |
| "rewards/format_reward": 0.4166666679084301, |
| "step": 220 |
| }, |
| { |
| "completion_length": 2278.2708587646484, |
| "epoch": 0.25257142857142856, |
| "grad_norm": 0.026400132104754448, |
| "kl": 0.00043514370918273926, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": 0.003, |
| "reward": 0.5976126529276371, |
| "reward_std": 0.723968205973506, |
| "rewards/cosine_scaled_reward": 0.027972997166216373, |
| "rewards/format_reward": 0.5416666734963655, |
| "step": 221 |
| }, |
| { |
| "completion_length": 2221.833366394043, |
| "epoch": 0.2537142857142857, |
| "grad_norm": 0.032811444252729416, |
| "kl": 0.000776827335357666, |
| "learning_rate": 7.156560487081051e-07, |
| "loss": 0.0661, |
| "reward": 0.5920759253203869, |
| "reward_std": 0.5432851929217577, |
| "rewards/cosine_scaled_reward": 0.01478794775903225, |
| "rewards/format_reward": 0.5625000055879354, |
| "step": 222 |
| }, |
| { |
| "completion_length": 2540.0208892822266, |
| "epoch": 0.25485714285714284, |
| "grad_norm": 0.03456755727529526, |
| "kl": 0.0004057884216308594, |
| "learning_rate": 7.127310565369415e-07, |
| "loss": 0.0262, |
| "reward": 0.34556883573532104, |
| "reward_std": 0.7227263785898685, |
| "rewards/cosine_scaled_reward": -0.077215576035087, |
| "rewards/format_reward": 0.5000000093132257, |
| "step": 223 |
| }, |
| { |
| "completion_length": 3318.875030517578, |
| "epoch": 0.256, |
| "grad_norm": 0.04437468200922012, |
| "kl": 0.00018399953842163086, |
| "learning_rate": 7.097981330836616e-07, |
| "loss": 0.0454, |
| "reward": 0.16465971246361732, |
| "reward_std": 0.7901786658912897, |
| "rewards/cosine_scaled_reward": -0.06350347894476727, |
| "rewards/format_reward": 0.29166667722165585, |
| "step": 224 |
| }, |
| { |
| "completion_length": 2950.5417404174805, |
| "epoch": 0.2571428571428571, |
| "grad_norm": 0.04053080826997757, |
| "kl": 0.00041607022285461426, |
| "learning_rate": 7.068574212948169e-07, |
| "loss": 0.0464, |
| "reward": 0.01986863650381565, |
| "reward_std": 0.649408801458776, |
| "rewards/cosine_scaled_reward": -0.1463156845420599, |
| "rewards/format_reward": 0.31250000558793545, |
| "step": 225 |
| }, |
| { |
| "completion_length": 2706.7500610351562, |
| "epoch": 0.2582857142857143, |
| "grad_norm": 0.0555625818669796, |
| "kl": 0.0008983612060546875, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": 0.0504, |
| "reward": 0.7885134015232325, |
| "reward_std": 0.8663289472460747, |
| "rewards/cosine_scaled_reward": 0.14425668492913246, |
| "rewards/format_reward": 0.5000000055879354, |
| "step": 226 |
| }, |
| { |
| "completion_length": 1995.1041831970215, |
| "epoch": 0.25942857142857145, |
| "grad_norm": 0.03679594770073891, |
| "kl": 0.0006301403045654297, |
| "learning_rate": 7.009532063876148e-07, |
| "loss": 0.0354, |
| "reward": 0.3761005823034793, |
| "reward_std": 0.7344776913523674, |
| "rewards/cosine_scaled_reward": -0.12444971542572603, |
| "rewards/format_reward": 0.6250000037252903, |
| "step": 227 |
| }, |
| { |
| "completion_length": 2323.4583435058594, |
| "epoch": 0.26057142857142856, |
| "grad_norm": 0.03594999015331268, |
| "kl": 0.00032375752925872803, |
| "learning_rate": 6.979899910323624e-07, |
| "loss": 0.019, |
| "reward": 0.7633006721735001, |
| "reward_std": 0.7087479755282402, |
| "rewards/cosine_scaled_reward": 0.1316503193229437, |
| "rewards/format_reward": 0.5000000037252903, |
| "step": 228 |
| }, |
| { |
| "completion_length": 3394.75, |
| "epoch": 0.26171428571428573, |
| "grad_norm": 0.030971722677350044, |
| "kl": 0.00020450353622436523, |
| "learning_rate": 6.950195628537299e-07, |
| "loss": 0.0338, |
| "reward": 0.06433436647057533, |
| "reward_std": 0.4984890129417181, |
| "rewards/cosine_scaled_reward": -0.05116615444421768, |
| "rewards/format_reward": 0.16666667349636555, |
| "step": 229 |
| }, |
| { |
| "completion_length": 3045.0625534057617, |
| "epoch": 0.26285714285714284, |
| "grad_norm": 0.0360412634909153, |
| "kl": 0.00022391974925994873, |
| "learning_rate": 6.920420666261961e-07, |
| "loss": 0.038, |
| "reward": 0.02990809455513954, |
| "reward_std": 0.6434181109070778, |
| "rewards/cosine_scaled_reward": -0.14129596762359142, |
| "rewards/format_reward": 0.31250000558793545, |
| "step": 230 |
| }, |
| { |
| "completion_length": 2590.312515258789, |
| "epoch": 0.264, |
| "grad_norm": 0.025739798322319984, |
| "kl": 0.00043526291847229004, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": -0.0002, |
| "reward": 0.34738841000944376, |
| "reward_std": 0.5048788581043482, |
| "rewards/cosine_scaled_reward": -0.06588914524763823, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 231 |
| }, |
| { |
| "completion_length": 3344.5625610351562, |
| "epoch": 0.2651428571428571, |
| "grad_norm": 0.034720636904239655, |
| "kl": 0.0002605915069580078, |
| "learning_rate": 6.860664508377001e-07, |
| "loss": 0.0408, |
| "reward": -0.1548066809773445, |
| "reward_std": 0.537173880264163, |
| "rewards/cosine_scaled_reward": -0.160736670717597, |
| "rewards/format_reward": 0.16666666977107525, |
| "step": 232 |
| }, |
| { |
| "completion_length": 2555.1666946411133, |
| "epoch": 0.2662857142857143, |
| "grad_norm": 0.04706970229744911, |
| "kl": 0.00042426586151123047, |
| "learning_rate": 6.83068622519821e-07, |
| "loss": 0.0466, |
| "reward": 0.36857184290420264, |
| "reward_std": 0.7978673111647367, |
| "rewards/cosine_scaled_reward": -0.08654742129147053, |
| "rewards/format_reward": 0.5416666753590107, |
| "step": 233 |
| }, |
| { |
| "completion_length": 2744.833335876465, |
| "epoch": 0.2674285714285714, |
| "grad_norm": 0.030540212988853455, |
| "kl": 0.0005177855491638184, |
| "learning_rate": 6.800643086250121e-07, |
| "loss": 0.0196, |
| "reward": 0.16414142958819866, |
| "reward_std": 0.5068646352738142, |
| "rewards/cosine_scaled_reward": -0.09501262754201889, |
| "rewards/format_reward": 0.3541666679084301, |
| "step": 234 |
| }, |
| { |
| "completion_length": 2465.2708625793457, |
| "epoch": 0.26857142857142857, |
| "grad_norm": 0.023667052388191223, |
| "kl": 0.00041544437408447266, |
| "learning_rate": 6.770536555792944e-07, |
| "loss": 0.0041, |
| "reward": 0.6389858163893223, |
| "reward_std": 0.5823408327996731, |
| "rewards/cosine_scaled_reward": 0.11115959542803466, |
| "rewards/format_reward": 0.4166666679084301, |
| "step": 235 |
| }, |
| { |
| "completion_length": 2748.5417098999023, |
| "epoch": 0.26971428571428574, |
| "grad_norm": 0.0339183434844017, |
| "kl": 0.0003077387809753418, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": -0.0088, |
| "reward": 0.16632992029190063, |
| "reward_std": 0.7507955506443977, |
| "rewards/cosine_scaled_reward": -0.12516836973372847, |
| "rewards/format_reward": 0.41666667349636555, |
| "step": 236 |
| }, |
| { |
| "completion_length": 2843.1458740234375, |
| "epoch": 0.27085714285714285, |
| "grad_norm": 0.03987835720181465, |
| "kl": 0.0008280277252197266, |
| "learning_rate": 6.710139192768694e-07, |
| "loss": 0.0305, |
| "reward": 0.2756032687611878, |
| "reward_std": 0.7616374446079135, |
| "rewards/cosine_scaled_reward": -0.06011504400521517, |
| "rewards/format_reward": 0.3958333395421505, |
| "step": 237 |
| }, |
| { |
| "completion_length": 3365.2083740234375, |
| "epoch": 0.272, |
| "grad_norm": 0.051739439368247986, |
| "kl": 0.00021576881408691406, |
| "learning_rate": 6.679851303883891e-07, |
| "loss": 0.0456, |
| "reward": 0.186602633446455, |
| "reward_std": 0.8221456026658416, |
| "rewards/cosine_scaled_reward": -0.07336535991635174, |
| "rewards/format_reward": 0.33333334513008595, |
| "step": 238 |
| }, |
| { |
| "completion_length": 1754.3750076293945, |
| "epoch": 0.27314285714285713, |
| "grad_norm": 0.027313968166708946, |
| "kl": 0.0010634064674377441, |
| "learning_rate": 6.649505910711058e-07, |
| "loss": 0.0085, |
| "reward": 1.1690001524984837, |
| "reward_std": 0.7178722098469734, |
| "rewards/cosine_scaled_reward": 0.20950003527104855, |
| "rewards/format_reward": 0.7500000055879354, |
| "step": 239 |
| }, |
| { |
| "completion_length": 3099.708335876465, |
| "epoch": 0.2742857142857143, |
| "grad_norm": 0.023615289479494095, |
| "kl": 0.0005241632461547852, |
| "learning_rate": 6.619104492241847e-07, |
| "loss": 0.0084, |
| "reward": -0.22005018219351768, |
| "reward_std": 0.3783438242971897, |
| "rewards/cosine_scaled_reward": -0.2246084287762642, |
| "rewards/format_reward": 0.2291666716337204, |
| "step": 240 |
| }, |
| { |
| "completion_length": 3463.6041870117188, |
| "epoch": 0.2754285714285714, |
| "grad_norm": 0.03201238811016083, |
| "kl": 0.0002460479736328125, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": 0.0075, |
| "reward": -0.10366834700107574, |
| "reward_std": 0.5739485621452332, |
| "rewards/cosine_scaled_reward": -0.18725084699690342, |
| "rewards/format_reward": 0.2708333432674408, |
| "step": 241 |
| }, |
| { |
| "completion_length": 2606.791717529297, |
| "epoch": 0.2765714285714286, |
| "grad_norm": 0.03816881403326988, |
| "kl": 0.0006372928619384766, |
| "learning_rate": 6.558139508961654e-07, |
| "loss": 0.0182, |
| "reward": 0.16975077614188194, |
| "reward_std": 0.6602888819179498, |
| "rewards/cosine_scaled_reward": -0.14429128426127136, |
| "rewards/format_reward": 0.4583333395421505, |
| "step": 242 |
| }, |
| { |
| "completion_length": 2902.8542098999023, |
| "epoch": 0.2777142857142857, |
| "grad_norm": 0.047052547335624695, |
| "kl": 0.0002627074718475342, |
| "learning_rate": 6.527578915497951e-07, |
| "loss": 0.0463, |
| "reward": 0.24082460720092058, |
| "reward_std": 0.658990764990449, |
| "rewards/cosine_scaled_reward": -0.05667102336883545, |
| "rewards/format_reward": 0.35416667722165585, |
| "step": 243 |
| }, |
| { |
| "completion_length": 3030.4375534057617, |
| "epoch": 0.27885714285714286, |
| "grad_norm": 0.04656903073191643, |
| "kl": 0.0015395283699035645, |
| "learning_rate": 6.496968239287603e-07, |
| "loss": 0.0069, |
| "reward": 0.38501549884676933, |
| "reward_std": 0.8437606617808342, |
| "rewards/cosine_scaled_reward": 0.01542440801858902, |
| "rewards/format_reward": 0.35416667349636555, |
| "step": 244 |
| }, |
| { |
| "completion_length": 2680.8541870117188, |
| "epoch": 0.28, |
| "grad_norm": 0.03648906201124191, |
| "kl": 0.000688149593770504, |
| "learning_rate": 6.466308972251785e-07, |
| "loss": 0.0536, |
| "reward": 0.19830208271741867, |
| "reward_std": 0.6960498318076134, |
| "rewards/cosine_scaled_reward": -0.09876563586294651, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 245 |
| }, |
| { |
| "completion_length": 2983.104202270508, |
| "epoch": 0.28114285714285714, |
| "grad_norm": 0.05028293654322624, |
| "kl": 0.00043523311614990234, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": 0.0866, |
| "reward": 0.4245212096720934, |
| "reward_std": 0.8826902657747269, |
| "rewards/cosine_scaled_reward": -0.006489396095275879, |
| "rewards/format_reward": 0.43750000186264515, |
| "step": 246 |
| }, |
| { |
| "completion_length": 3257.187530517578, |
| "epoch": 0.2822857142857143, |
| "grad_norm": 0.02985483780503273, |
| "kl": 0.0003070831298828125, |
| "learning_rate": 6.404850645156841e-07, |
| "loss": 0.0182, |
| "reward": -0.2367397672496736, |
| "reward_std": 0.5003039184957743, |
| "rewards/cosine_scaled_reward": -0.2121198857203126, |
| "rewards/format_reward": 0.18750000186264515, |
| "step": 247 |
| }, |
| { |
| "completion_length": 2173.875030517578, |
| "epoch": 0.2834285714285714, |
| "grad_norm": 0.041356880217790604, |
| "kl": 0.0006842613220214844, |
| "learning_rate": 6.374054580489873e-07, |
| "loss": 0.0086, |
| "reward": 1.2376226875931025, |
| "reward_std": 0.6682583466172218, |
| "rewards/cosine_scaled_reward": 0.3271446730941534, |
| "rewards/format_reward": 0.583333333954215, |
| "step": 248 |
| }, |
| { |
| "completion_length": 2189.1666870117188, |
| "epoch": 0.2845714285714286, |
| "grad_norm": 0.04028550907969475, |
| "kl": 0.0006466060876846313, |
| "learning_rate": 6.343215915635761e-07, |
| "loss": 0.0563, |
| "reward": 0.7381115891039371, |
| "reward_std": 0.7054900173097849, |
| "rewards/cosine_scaled_reward": 0.08780577592551708, |
| "rewards/format_reward": 0.5625000055879354, |
| "step": 249 |
| }, |
| { |
| "completion_length": 2997.187515258789, |
| "epoch": 0.2857142857142857, |
| "grad_norm": 0.06118670478463173, |
| "kl": 0.0006084442138671875, |
| "learning_rate": 6.31233615362752e-07, |
| "loss": 0.0546, |
| "reward": 0.07891043275594711, |
| "reward_std": 0.8724489659070969, |
| "rewards/cosine_scaled_reward": -0.12721144780516624, |
| "rewards/format_reward": 0.33333333767950535, |
| "step": 250 |
| }, |
| { |
| "completion_length": 2369.50004196167, |
| "epoch": 0.28685714285714287, |
| "grad_norm": 0.03798350319266319, |
| "kl": 0.00027310848236083984, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": 0.0488, |
| "reward": 0.7616634331643581, |
| "reward_std": 0.7558214440941811, |
| "rewards/cosine_scaled_reward": 0.0787483798339963, |
| "rewards/format_reward": 0.6041666772216558, |
| "step": 251 |
| }, |
| { |
| "completion_length": 2793.125015258789, |
| "epoch": 0.288, |
| "grad_norm": 0.026387983933091164, |
| "kl": 0.0006684064865112305, |
| "learning_rate": 6.25045936022246e-07, |
| "loss": 0.0151, |
| "reward": 0.24926170520484447, |
| "reward_std": 0.5719782337546349, |
| "rewards/cosine_scaled_reward": -0.07328582555055618, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 252 |
| }, |
| { |
| "completion_length": 2910.3125534057617, |
| "epoch": 0.28914285714285715, |
| "grad_norm": 0.03298799693584442, |
| "kl": 0.00051116943359375, |
| "learning_rate": 6.219465344613258e-07, |
| "loss": 0.0004, |
| "reward": 0.2467479296028614, |
| "reward_std": 0.5817495100200176, |
| "rewards/cosine_scaled_reward": -0.04329271428287029, |
| "rewards/format_reward": 0.3333333358168602, |
| "step": 253 |
| }, |
| { |
| "completion_length": 2709.437530517578, |
| "epoch": 0.29028571428571426, |
| "grad_norm": 0.040090933442115784, |
| "kl": 0.0008482933044433594, |
| "learning_rate": 6.188436263278172e-07, |
| "loss": 0.0401, |
| "reward": 0.19492281647399068, |
| "reward_std": 0.803937803953886, |
| "rewards/cosine_scaled_reward": -0.13170527666807175, |
| "rewards/format_reward": 0.4583333469927311, |
| "step": 254 |
| }, |
| { |
| "completion_length": 3190.4375228881836, |
| "epoch": 0.2914285714285714, |
| "grad_norm": 0.032341450452804565, |
| "kl": 0.0004298090934753418, |
| "learning_rate": 6.157373628530852e-07, |
| "loss": 0.0009, |
| "reward": -0.1900683492422104, |
| "reward_std": 0.6940745413303375, |
| "rewards/cosine_scaled_reward": -0.1992008574306965, |
| "rewards/format_reward": 0.2083333358168602, |
| "step": 255 |
| }, |
| { |
| "completion_length": 3150.5625610351562, |
| "epoch": 0.2925714285714286, |
| "grad_norm": 0.05708003789186478, |
| "kl": 0.00038230419158935547, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": 0.079, |
| "reward": 0.40191663336008787, |
| "reward_std": 0.9853764846920967, |
| "rewards/cosine_scaled_reward": 0.023874984588474035, |
| "rewards/format_reward": 0.35416667349636555, |
| "step": 256 |
| }, |
| { |
| "completion_length": 3104.395896911621, |
| "epoch": 0.2937142857142857, |
| "grad_norm": 0.04910450056195259, |
| "kl": 0.00029408931732177734, |
| "learning_rate": 6.095153756157051e-07, |
| "loss": 0.0179, |
| "reward": 0.5222748387604952, |
| "reward_std": 0.8857487104833126, |
| "rewards/cosine_scaled_reward": 0.07363741181325167, |
| "rewards/format_reward": 0.37500000558793545, |
| "step": 257 |
| }, |
| { |
| "completion_length": 3502.9166870117188, |
| "epoch": 0.2948571428571429, |
| "grad_norm": 0.04182790219783783, |
| "kl": 0.0003981590270996094, |
| "learning_rate": 6.06399955103937e-07, |
| "loss": 0.0028, |
| "reward": -0.13565716426819563, |
| "reward_std": 0.6289124507457018, |
| "rewards/cosine_scaled_reward": -0.19282858772203326, |
| "rewards/format_reward": 0.25000000186264515, |
| "step": 258 |
| }, |
| { |
| "completion_length": 2970.1458740234375, |
| "epoch": 0.296, |
| "grad_norm": 0.04299155995249748, |
| "kl": 0.0008516311645507812, |
| "learning_rate": 6.032817857379256e-07, |
| "loss": 0.064, |
| "reward": 0.15694763511419296, |
| "reward_std": 0.5294061750173569, |
| "rewards/cosine_scaled_reward": -0.056942859664559364, |
| "rewards/format_reward": 0.27083333767950535, |
| "step": 259 |
| }, |
| { |
| "completion_length": 2157.0625228881836, |
| "epoch": 0.29714285714285715, |
| "grad_norm": 0.04957187548279762, |
| "kl": 0.0002192854881286621, |
| "learning_rate": 6.001610194928464e-07, |
| "loss": 0.0369, |
| "reward": 0.9001389928162098, |
| "reward_std": 0.9185776375234127, |
| "rewards/cosine_scaled_reward": 0.1584028152283281, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 260 |
| }, |
| { |
| "completion_length": 3176.3750228881836, |
| "epoch": 0.29828571428571427, |
| "grad_norm": 0.04083000496029854, |
| "kl": 0.0002732276916503906, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0182, |
| "reward": 0.18310315907001495, |
| "reward_std": 0.6294377241283655, |
| "rewards/cosine_scaled_reward": -0.043865093030035496, |
| "rewards/format_reward": 0.2708333358168602, |
| "step": 261 |
| }, |
| { |
| "completion_length": 3245.2291717529297, |
| "epoch": 0.29942857142857143, |
| "grad_norm": 0.019805744290351868, |
| "kl": 0.00045013427734375, |
| "learning_rate": 5.939123048916173e-07, |
| "loss": 0.0029, |
| "reward": -0.25881996005773544, |
| "reward_std": 0.3580850623548031, |
| "rewards/cosine_scaled_reward": -0.20232664234936237, |
| "rewards/format_reward": 0.14583333395421505, |
| "step": 262 |
| }, |
| { |
| "completion_length": 2710.8125228881836, |
| "epoch": 0.30057142857142854, |
| "grad_norm": 0.030118314549326897, |
| "kl": 0.0011047124862670898, |
| "learning_rate": 5.907846610890011e-07, |
| "loss": 0.0096, |
| "reward": -0.03488955553621054, |
| "reward_std": 0.6062153112143278, |
| "rewards/cosine_scaled_reward": -0.19452812150120735, |
| "rewards/format_reward": 0.35416666977107525, |
| "step": 263 |
| }, |
| { |
| "completion_length": 2831.041702270508, |
| "epoch": 0.3017142857142857, |
| "grad_norm": 0.025041300803422928, |
| "kl": 0.0004094839096069336, |
| "learning_rate": 5.87655029499542e-07, |
| "loss": 0.0005, |
| "reward": 0.15325931343249977, |
| "reward_std": 0.5651900470256805, |
| "rewards/cosine_scaled_reward": -0.11087033338844776, |
| "rewards/format_reward": 0.3750000037252903, |
| "step": 264 |
| }, |
| { |
| "completion_length": 2064.687545776367, |
| "epoch": 0.3028571428571429, |
| "grad_norm": 0.03248697519302368, |
| "kl": 0.001496434211730957, |
| "learning_rate": 5.845235626570683e-07, |
| "loss": 0.0528, |
| "reward": 0.6930933259427547, |
| "reward_std": 0.6866586022078991, |
| "rewards/cosine_scaled_reward": 0.04446331039071083, |
| "rewards/format_reward": 0.6041666772216558, |
| "step": 265 |
| }, |
| { |
| "completion_length": 3208.5416717529297, |
| "epoch": 0.304, |
| "grad_norm": 0.0215072650462389, |
| "kl": 0.000579833984375, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": -0.0004, |
| "reward": -0.09674983285367489, |
| "reward_std": 0.40319861471652985, |
| "rewards/cosine_scaled_reward": -0.11087492946535349, |
| "rewards/format_reward": 0.125, |
| "step": 266 |
| }, |
| { |
| "completion_length": 3249.3541870117188, |
| "epoch": 0.30514285714285716, |
| "grad_norm": 0.03392777964472771, |
| "kl": 0.0008037090301513672, |
| "learning_rate": 5.78255733788191e-07, |
| "loss": -0.0111, |
| "reward": -0.2167503908276558, |
| "reward_std": 0.5864087976515293, |
| "rewards/cosine_scaled_reward": -0.20212520100176334, |
| "rewards/format_reward": 0.1875000074505806, |
| "step": 267 |
| }, |
| { |
| "completion_length": 2485.8750228881836, |
| "epoch": 0.3062857142857143, |
| "grad_norm": 0.040282804518938065, |
| "kl": 0.0007268190383911133, |
| "learning_rate": 5.751196772469237e-07, |
| "loss": 0.0522, |
| "reward": 0.48096081614494324, |
| "reward_std": 0.8175039198249578, |
| "rewards/cosine_scaled_reward": 0.021730393171310425, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 268 |
| }, |
| { |
| "completion_length": 2987.9166870117188, |
| "epoch": 0.30742857142857144, |
| "grad_norm": 0.03961595892906189, |
| "kl": 0.0005930662155151367, |
| "learning_rate": 5.71982396408026e-07, |
| "loss": 0.0168, |
| "reward": 0.15192145481705666, |
| "reward_std": 0.7950946986675262, |
| "rewards/cosine_scaled_reward": -0.08028928004205227, |
| "rewards/format_reward": 0.31250000558793545, |
| "step": 269 |
| }, |
| { |
| "completion_length": 3080.625030517578, |
| "epoch": 0.30857142857142855, |
| "grad_norm": 0.04469272121787071, |
| "kl": 0.00038649141788482666, |
| "learning_rate": 5.688440441781398e-07, |
| "loss": 0.0194, |
| "reward": 0.43130027689039707, |
| "reward_std": 0.9086147882044315, |
| "rewards/cosine_scaled_reward": 0.03856680044555105, |
| "rewards/format_reward": 0.35416667349636555, |
| "step": 270 |
| }, |
| { |
| "completion_length": 2210.687530517578, |
| "epoch": 0.3097142857142857, |
| "grad_norm": 0.03942432254552841, |
| "kl": 0.0009759664535522461, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": 0.0599, |
| "reward": 0.6358127221465111, |
| "reward_std": 0.803847155533731, |
| "rewards/cosine_scaled_reward": -0.02584364265203476, |
| "rewards/format_reward": 0.687500013038516, |
| "step": 271 |
| }, |
| { |
| "completion_length": 2895.625030517578, |
| "epoch": 0.31085714285714283, |
| "grad_norm": 0.029458731412887573, |
| "kl": 0.00041615962982177734, |
| "learning_rate": 5.625647374256061e-07, |
| "loss": 0.0218, |
| "reward": 0.2146900595689658, |
| "reward_std": 0.5223457869142294, |
| "rewards/cosine_scaled_reward": -0.06973831099458039, |
| "rewards/format_reward": 0.35416666977107525, |
| "step": 272 |
| }, |
| { |
| "completion_length": 2339.8958435058594, |
| "epoch": 0.312, |
| "grad_norm": 0.02511209435760975, |
| "kl": 0.0005612373352050781, |
| "learning_rate": 5.594240889475106e-07, |
| "loss": 0.0052, |
| "reward": 0.6176238246262074, |
| "reward_std": 0.6360815949738026, |
| "rewards/cosine_scaled_reward": 0.058811913593672216, |
| "rewards/format_reward": 0.5000000055879354, |
| "step": 273 |
| }, |
| { |
| "completion_length": 1936.6250305175781, |
| "epoch": 0.31314285714285717, |
| "grad_norm": 0.0384528823196888, |
| "kl": 0.00042247772216796875, |
| "learning_rate": 5.562829811526154e-07, |
| "loss": 0.0107, |
| "reward": 1.0110880993306637, |
| "reward_std": 0.8275207653641701, |
| "rewards/cosine_scaled_reward": 0.16179403848946095, |
| "rewards/format_reward": 0.6875000018626451, |
| "step": 274 |
| }, |
| { |
| "completion_length": 2277.541702270508, |
| "epoch": 0.3142857142857143, |
| "grad_norm": 0.03816717863082886, |
| "kl": 0.001892685890197754, |
| "learning_rate": 5.531415671340826e-07, |
| "loss": 0.048, |
| "reward": 0.8937440374866128, |
| "reward_std": 0.7764279395341873, |
| "rewards/cosine_scaled_reward": 0.1656220075674355, |
| "rewards/format_reward": 0.5625000055879354, |
| "step": 275 |
| }, |
| { |
| "completion_length": 2728.437530517578, |
| "epoch": 0.31542857142857145, |
| "grad_norm": 0.03653620183467865, |
| "kl": 0.0014216899871826172, |
| "learning_rate": 5.5e-07, |
| "loss": -0.0137, |
| "reward": 0.44955473113805056, |
| "reward_std": 0.6848132107406855, |
| "rewards/cosine_scaled_reward": 0.026860691141337156, |
| "rewards/format_reward": 0.3958333395421505, |
| "step": 276 |
| }, |
| { |
| "completion_length": 2431.7500381469727, |
| "epoch": 0.31657142857142856, |
| "grad_norm": 0.06454865634441376, |
| "kl": 0.0006378889083862305, |
| "learning_rate": 5.468584328659172e-07, |
| "loss": 0.0178, |
| "reward": 0.48344410210847855, |
| "reward_std": 0.7250744290649891, |
| "rewards/cosine_scaled_reward": 0.012555371969938278, |
| "rewards/format_reward": 0.45833333395421505, |
| "step": 277 |
| }, |
| { |
| "completion_length": 2068.0416870117188, |
| "epoch": 0.3177142857142857, |
| "grad_norm": 0.04215235263109207, |
| "kl": 0.0013701915740966797, |
| "learning_rate": 5.437170188473847e-07, |
| "loss": 0.0084, |
| "reward": 0.961402993183583, |
| "reward_std": 0.8674832321703434, |
| "rewards/cosine_scaled_reward": 0.12653482053428888, |
| "rewards/format_reward": 0.7083333395421505, |
| "step": 278 |
| }, |
| { |
| "completion_length": 3224.625, |
| "epoch": 0.31885714285714284, |
| "grad_norm": 0.022943608462810516, |
| "kl": 0.0012241601943969727, |
| "learning_rate": 5.405759110524894e-07, |
| "loss": -0.0093, |
| "reward": -0.18678050953894854, |
| "reward_std": 0.4048001943156123, |
| "rewards/cosine_scaled_reward": -0.17672359757125378, |
| "rewards/format_reward": 0.1666666679084301, |
| "step": 279 |
| }, |
| { |
| "completion_length": 2174.9583625793457, |
| "epoch": 0.32, |
| "grad_norm": 0.039122022688388824, |
| "kl": 0.0006496906280517578, |
| "learning_rate": 5.37435262574394e-07, |
| "loss": 0.0228, |
| "reward": 0.7332258597016335, |
| "reward_std": 0.9436057284474373, |
| "rewards/cosine_scaled_reward": 0.08536292100325227, |
| "rewards/format_reward": 0.5625000018626451, |
| "step": 280 |
| }, |
| { |
| "completion_length": 3451.625, |
| "epoch": 0.3211428571428571, |
| "grad_norm": 0.02864747680723667, |
| "kl": 0.00041675567626953125, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0233, |
| "reward": -0.32699717301875353, |
| "reward_std": 0.46905585937201977, |
| "rewards/cosine_scaled_reward": -0.21558191592339426, |
| "rewards/format_reward": 0.10416666977107525, |
| "step": 281 |
| }, |
| { |
| "completion_length": 2733.9583740234375, |
| "epoch": 0.3222857142857143, |
| "grad_norm": 0.04495016112923622, |
| "kl": 0.0015126466751098633, |
| "learning_rate": 5.311559558218603e-07, |
| "loss": -0.0161, |
| "reward": 0.39272309094667435, |
| "reward_std": 0.6825822591781616, |
| "rewards/cosine_scaled_reward": -0.03280512057244778, |
| "rewards/format_reward": 0.4583333395421505, |
| "step": 282 |
| }, |
| { |
| "completion_length": 2954.0208587646484, |
| "epoch": 0.32342857142857145, |
| "grad_norm": 0.03212931752204895, |
| "kl": 0.0006183385848999023, |
| "learning_rate": 5.28017603591974e-07, |
| "loss": 0.0181, |
| "reward": 0.7553600296378136, |
| "reward_std": 0.6611499711871147, |
| "rewards/cosine_scaled_reward": 0.13809666596353054, |
| "rewards/format_reward": 0.4791666679084301, |
| "step": 283 |
| }, |
| { |
| "completion_length": 2272.000045776367, |
| "epoch": 0.32457142857142857, |
| "grad_norm": 0.04936413839459419, |
| "kl": 0.0004665851593017578, |
| "learning_rate": 5.248803227530763e-07, |
| "loss": 0.0378, |
| "reward": 0.5260394886136055, |
| "reward_std": 0.8827262446284294, |
| "rewards/cosine_scaled_reward": -0.05989692127332091, |
| "rewards/format_reward": 0.6458333432674408, |
| "step": 284 |
| }, |
| { |
| "completion_length": 2675.916732788086, |
| "epoch": 0.32571428571428573, |
| "grad_norm": 0.021733392030000687, |
| "kl": 0.0008116960525512695, |
| "learning_rate": 5.21744266211809e-07, |
| "loss": 0.0038, |
| "reward": 0.5852530999109149, |
| "reward_std": 0.5077806441113353, |
| "rewards/cosine_scaled_reward": -0.030290118884295225, |
| "rewards/format_reward": 0.6458333395421505, |
| "step": 285 |
| }, |
| { |
| "completion_length": 2508.9583892822266, |
| "epoch": 0.32685714285714285, |
| "grad_norm": 0.04166712611913681, |
| "kl": 0.0005065202713012695, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": 0.0488, |
| "reward": 0.4353667050600052, |
| "reward_std": 0.8739612437784672, |
| "rewards/cosine_scaled_reward": -0.03231665026396513, |
| "rewards/format_reward": 0.5, |
| "step": 286 |
| }, |
| { |
| "completion_length": 1978.8125343322754, |
| "epoch": 0.328, |
| "grad_norm": 0.03577245771884918, |
| "kl": 0.0008919239044189453, |
| "learning_rate": 5.154764373429315e-07, |
| "loss": 0.0558, |
| "reward": 0.5270357728004456, |
| "reward_std": 0.6659693606197834, |
| "rewards/cosine_scaled_reward": -0.01773212105035782, |
| "rewards/format_reward": 0.5625, |
| "step": 287 |
| }, |
| { |
| "completion_length": 3146.0208587646484, |
| "epoch": 0.3291428571428571, |
| "grad_norm": 0.04421306028962135, |
| "kl": 0.0004572868347167969, |
| "learning_rate": 5.123449705004581e-07, |
| "loss": 0.0147, |
| "reward": 0.1639587520621717, |
| "reward_std": 0.741458848118782, |
| "rewards/cosine_scaled_reward": -0.0742706386372447, |
| "rewards/format_reward": 0.3125000074505806, |
| "step": 288 |
| }, |
| { |
| "completion_length": 2259.9375038146973, |
| "epoch": 0.3302857142857143, |
| "grad_norm": 0.030598286539316177, |
| "kl": 0.0006687641143798828, |
| "learning_rate": 5.09215338910999e-07, |
| "loss": -0.0, |
| "reward": 0.5977206081151962, |
| "reward_std": 0.6630996689200401, |
| "rewards/cosine_scaled_reward": 0.03844361938536167, |
| "rewards/format_reward": 0.520833333954215, |
| "step": 289 |
| }, |
| { |
| "completion_length": 1871.6875305175781, |
| "epoch": 0.3314285714285714, |
| "grad_norm": 0.041915781795978546, |
| "kl": 0.0013911724090576172, |
| "learning_rate": 5.060876951083828e-07, |
| "loss": 0.0246, |
| "reward": 0.7659082859754562, |
| "reward_std": 0.9081554226577282, |
| "rewards/cosine_scaled_reward": 0.007954126223921776, |
| "rewards/format_reward": 0.7500000074505806, |
| "step": 290 |
| }, |
| { |
| "completion_length": 2547.1250534057617, |
| "epoch": 0.3325714285714286, |
| "grad_norm": 0.0511026568710804, |
| "kl": 0.0010018348693847656, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0576, |
| "reward": 0.62677151709795, |
| "reward_std": 0.9465496614575386, |
| "rewards/cosine_scaled_reward": 0.0529690757393837, |
| "rewards/format_reward": 0.5208333414047956, |
| "step": 291 |
| }, |
| { |
| "completion_length": 3320.250030517578, |
| "epoch": 0.33371428571428574, |
| "grad_norm": 0.04419681057333946, |
| "kl": 0.0006957054138183594, |
| "learning_rate": 4.998389805071536e-07, |
| "loss": 0.0262, |
| "reward": -0.012292207218706608, |
| "reward_std": 0.8140805494040251, |
| "rewards/cosine_scaled_reward": -0.17281277664005756, |
| "rewards/format_reward": 0.3333333395421505, |
| "step": 292 |
| }, |
| { |
| "completion_length": 2636.7916870117188, |
| "epoch": 0.33485714285714285, |
| "grad_norm": 0.017586873844265938, |
| "kl": 0.0006794929504394531, |
| "learning_rate": 4.967182142620745e-07, |
| "loss": 0.0078, |
| "reward": 0.3901491202414036, |
| "reward_std": 0.326943714171648, |
| "rewards/cosine_scaled_reward": -0.054925426840782166, |
| "rewards/format_reward": 0.5000000055879354, |
| "step": 293 |
| }, |
| { |
| "completion_length": 3162.4583587646484, |
| "epoch": 0.336, |
| "grad_norm": 0.027655253186821938, |
| "kl": 0.0004392862319946289, |
| "learning_rate": 4.93600044896063e-07, |
| "loss": 0.0199, |
| "reward": 0.11368493363261223, |
| "reward_std": 0.44556378945708275, |
| "rewards/cosine_scaled_reward": -0.0994075471535325, |
| "rewards/format_reward": 0.31250000558793545, |
| "step": 294 |
| }, |
| { |
| "completion_length": 3195.6458435058594, |
| "epoch": 0.33714285714285713, |
| "grad_norm": 0.03356190025806427, |
| "kl": 0.0003917217254638672, |
| "learning_rate": 4.904846243842949e-07, |
| "loss": 0.0145, |
| "reward": -0.07907827943563461, |
| "reward_std": 0.5164209427312016, |
| "rewards/cosine_scaled_reward": -0.12287248979555443, |
| "rewards/format_reward": 0.1666666679084301, |
| "step": 295 |
| }, |
| { |
| "completion_length": 3008.5625534057617, |
| "epoch": 0.3382857142857143, |
| "grad_norm": 0.03211989253759384, |
| "kl": 0.0007897615432739258, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": 0.0291, |
| "reward": -0.17747718811733648, |
| "reward_std": 0.5453593544661999, |
| "rewards/cosine_scaled_reward": -0.23457193188369274, |
| "rewards/format_reward": 0.2916666716337204, |
| "step": 296 |
| }, |
| { |
| "completion_length": 3558.3958740234375, |
| "epoch": 0.3394285714285714, |
| "grad_norm": 0.0319380946457386, |
| "kl": 0.00032269954681396484, |
| "learning_rate": 4.842626371469149e-07, |
| "loss": 0.0192, |
| "reward": -0.3608126677572727, |
| "reward_std": 0.5530665349215269, |
| "rewards/cosine_scaled_reward": -0.20123967458494008, |
| "rewards/format_reward": 0.0416666679084301, |
| "step": 297 |
| }, |
| { |
| "completion_length": 2848.7708740234375, |
| "epoch": 0.3405714285714286, |
| "grad_norm": 0.03126157075166702, |
| "kl": 0.00034427642822265625, |
| "learning_rate": 4.811563736721829e-07, |
| "loss": 0.0216, |
| "reward": 0.294229757739231, |
| "reward_std": 0.6536255031824112, |
| "rewards/cosine_scaled_reward": -0.06121844658628106, |
| "rewards/format_reward": 0.41666666977107525, |
| "step": 298 |
| }, |
| { |
| "completion_length": 3238.6666717529297, |
| "epoch": 0.3417142857142857, |
| "grad_norm": 0.03061261959373951, |
| "kl": 0.0004975795745849609, |
| "learning_rate": 4.780534655386743e-07, |
| "loss": 0.0007, |
| "reward": 0.2360800839960575, |
| "reward_std": 0.48118265718221664, |
| "rewards/cosine_scaled_reward": -0.01737664802931249, |
| "rewards/format_reward": 0.27083333395421505, |
| "step": 299 |
| }, |
| { |
| "completion_length": 3446.6041870117188, |
| "epoch": 0.34285714285714286, |
| "grad_norm": 0.0574151873588562, |
| "kl": 0.0003980398178100586, |
| "learning_rate": 4.749540639777539e-07, |
| "loss": 0.061, |
| "reward": -0.1822480447590351, |
| "reward_std": 0.7430878691375256, |
| "rewards/cosine_scaled_reward": -0.1744573526084423, |
| "rewards/format_reward": 0.16666666977107525, |
| "step": 300 |
| }, |
| { |
| "completion_length": 2296.041679382324, |
| "epoch": 0.344, |
| "grad_norm": 0.036582205444574356, |
| "kl": 0.0013877153396606445, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.0257, |
| "reward": 0.3718048664741218, |
| "reward_std": 0.7346825525164604, |
| "rewards/cosine_scaled_reward": -0.1161809004843235, |
| "rewards/format_reward": 0.6041666809469461, |
| "step": 301 |
| }, |
| { |
| "completion_length": 2405.125026702881, |
| "epoch": 0.34514285714285714, |
| "grad_norm": 0.032144494354724884, |
| "kl": 0.0007233619689941406, |
| "learning_rate": 4.68766384637248e-07, |
| "loss": 0.0155, |
| "reward": 0.7637341096997261, |
| "reward_std": 0.5499859433621168, |
| "rewards/cosine_scaled_reward": 0.12145039439201355, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 302 |
| }, |
| { |
| "completion_length": 2446.1250381469727, |
| "epoch": 0.3462857142857143, |
| "grad_norm": 0.04364825040102005, |
| "kl": 0.0014946460723876953, |
| "learning_rate": 4.656784084364238e-07, |
| "loss": 0.062, |
| "reward": 0.37057532370090485, |
| "reward_std": 0.7990845926105976, |
| "rewards/cosine_scaled_reward": -0.08554568700492382, |
| "rewards/format_reward": 0.5416666734963655, |
| "step": 303 |
| }, |
| { |
| "completion_length": 2854.937530517578, |
| "epoch": 0.3474285714285714, |
| "grad_norm": 0.03800087422132492, |
| "kl": 0.0013117790222167969, |
| "learning_rate": 4.6259454195101267e-07, |
| "loss": 0.0191, |
| "reward": -0.04802834615111351, |
| "reward_std": 0.5804362390190363, |
| "rewards/cosine_scaled_reward": -0.19068085309118032, |
| "rewards/format_reward": 0.33333333395421505, |
| "step": 304 |
| }, |
| { |
| "completion_length": 2861.166679382324, |
| "epoch": 0.3485714285714286, |
| "grad_norm": 0.03797345981001854, |
| "kl": 0.001414179801940918, |
| "learning_rate": 4.59514935484316e-07, |
| "loss": -0.001, |
| "reward": 0.20239191502332687, |
| "reward_std": 0.6676407773047686, |
| "rewards/cosine_scaled_reward": -0.07588737271726131, |
| "rewards/format_reward": 0.3541666679084301, |
| "step": 305 |
| }, |
| { |
| "completion_length": 2406.4791870117188, |
| "epoch": 0.3497142857142857, |
| "grad_norm": 0.041493017226457596, |
| "kl": 0.0019769668579101562, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": 0.0715, |
| "reward": 0.4393942207098007, |
| "reward_std": 0.6489860704168677, |
| "rewards/cosine_scaled_reward": -0.03030291385948658, |
| "rewards/format_reward": 0.5000000037252903, |
| "step": 306 |
| }, |
| { |
| "completion_length": 2538.68754196167, |
| "epoch": 0.35085714285714287, |
| "grad_norm": 0.03819847106933594, |
| "kl": 0.0011295080184936523, |
| "learning_rate": 4.5336910277482155e-07, |
| "loss": 0.0372, |
| "reward": 0.3050793409347534, |
| "reward_std": 0.7908993605524302, |
| "rewards/cosine_scaled_reward": -0.09746033605188131, |
| "rewards/format_reward": 0.5000000093132257, |
| "step": 307 |
| }, |
| { |
| "completion_length": 3305.375, |
| "epoch": 0.352, |
| "grad_norm": 0.025576284155249596, |
| "kl": 0.0006071329116821289, |
| "learning_rate": 4.503031760712397e-07, |
| "loss": -0.0213, |
| "reward": -0.008270788937807083, |
| "reward_std": 0.5344270393252373, |
| "rewards/cosine_scaled_reward": -0.10830206237733364, |
| "rewards/format_reward": 0.2083333358168602, |
| "step": 308 |
| }, |
| { |
| "completion_length": 3398.062530517578, |
| "epoch": 0.35314285714285715, |
| "grad_norm": 0.036179594695568085, |
| "kl": 0.0006154775619506836, |
| "learning_rate": 4.4724210845020494e-07, |
| "loss": 0.0303, |
| "reward": 0.17174796015024185, |
| "reward_std": 0.718455646187067, |
| "rewards/cosine_scaled_reward": -0.039126007817685604, |
| "rewards/format_reward": 0.2500000037252903, |
| "step": 309 |
| }, |
| { |
| "completion_length": 2386.5208892822266, |
| "epoch": 0.35428571428571426, |
| "grad_norm": 0.019351672381162643, |
| "kl": 0.0006570816040039062, |
| "learning_rate": 4.441860491038345e-07, |
| "loss": 0.0162, |
| "reward": 0.014205506071448326, |
| "reward_std": 0.39019110426306725, |
| "rewards/cosine_scaled_reward": -0.23248059302568436, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 310 |
| }, |
| { |
| "completion_length": 2644.9791984558105, |
| "epoch": 0.3554285714285714, |
| "grad_norm": 0.03210190311074257, |
| "kl": 0.0006589889526367188, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": 0.0523, |
| "reward": 0.45767392963171005, |
| "reward_std": 0.5884373337030411, |
| "rewards/cosine_scaled_reward": 0.010086966678500175, |
| "rewards/format_reward": 0.4375000149011612, |
| "step": 311 |
| }, |
| { |
| "completion_length": 2269.062530517578, |
| "epoch": 0.3565714285714286, |
| "grad_norm": 0.054676324129104614, |
| "kl": 0.0008935928344726562, |
| "learning_rate": 4.3808955077581546e-07, |
| "loss": 0.1022, |
| "reward": 0.5176713224500418, |
| "reward_std": 0.6970217786729336, |
| "rewards/cosine_scaled_reward": 0.02966897562146187, |
| "rewards/format_reward": 0.45833334885537624, |
| "step": 312 |
| }, |
| { |
| "completion_length": 3095.1250534057617, |
| "epoch": 0.3577142857142857, |
| "grad_norm": 0.051631879061460495, |
| "kl": 0.0010917186737060547, |
| "learning_rate": 4.350494089288943e-07, |
| "loss": 0.0596, |
| "reward": 0.1791437454521656, |
| "reward_std": 0.832375283818692, |
| "rewards/cosine_scaled_reward": -0.03542814147658646, |
| "rewards/format_reward": 0.2500000037252903, |
| "step": 313 |
| }, |
| { |
| "completion_length": 2249.2291946411133, |
| "epoch": 0.3588571428571429, |
| "grad_norm": 0.0347190760076046, |
| "kl": 0.0015243291854858398, |
| "learning_rate": 4.3201486961161093e-07, |
| "loss": 0.0493, |
| "reward": 0.6420404259115458, |
| "reward_std": 0.6056624613702297, |
| "rewards/cosine_scaled_reward": 0.029353542253375053, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 314 |
| }, |
| { |
| "completion_length": 2830.104217529297, |
| "epoch": 0.36, |
| "grad_norm": 0.029440823942422867, |
| "kl": 0.0004910826683044434, |
| "learning_rate": 4.2898608072313045e-07, |
| "loss": 0.0184, |
| "reward": 0.5590658411383629, |
| "reward_std": 0.5381645411252975, |
| "rewards/cosine_scaled_reward": 0.060782887041568756, |
| "rewards/format_reward": 0.43750000186264515, |
| "step": 315 |
| }, |
| { |
| "completion_length": 3479.0625, |
| "epoch": 0.36114285714285715, |
| "grad_norm": 0.039480630308389664, |
| "kl": 0.00043773651123046875, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": 0.0194, |
| "reward": -0.34035787358880043, |
| "reward_std": 0.6184021793305874, |
| "rewards/cosine_scaled_reward": -0.22226226888597012, |
| "rewards/format_reward": 0.10416666977107525, |
| "step": 316 |
| }, |
| { |
| "completion_length": 3024.6458892822266, |
| "epoch": 0.36228571428571427, |
| "grad_norm": 0.04857990890741348, |
| "kl": 0.0009016990661621094, |
| "learning_rate": 4.2294634442070553e-07, |
| "loss": 0.0733, |
| "reward": 0.15108232758939266, |
| "reward_std": 0.8656900152564049, |
| "rewards/cosine_scaled_reward": -0.09112551622092724, |
| "rewards/format_reward": 0.33333333767950535, |
| "step": 317 |
| }, |
| { |
| "completion_length": 2164.416732788086, |
| "epoch": 0.36342857142857143, |
| "grad_norm": 0.06030331179499626, |
| "kl": 0.002919435501098633, |
| "learning_rate": 4.1993569137498776e-07, |
| "loss": 0.0517, |
| "reward": 0.6099264821968973, |
| "reward_std": 0.9467958249151707, |
| "rewards/cosine_scaled_reward": -0.028370092622935772, |
| "rewards/format_reward": 0.6666666716337204, |
| "step": 318 |
| }, |
| { |
| "completion_length": 2844.4375228881836, |
| "epoch": 0.36457142857142855, |
| "grad_norm": 0.02526162751019001, |
| "kl": 0.0008511543273925781, |
| "learning_rate": 4.1693137748017915e-07, |
| "loss": 0.041, |
| "reward": -0.2095866478048265, |
| "reward_std": 0.45644592493772507, |
| "rewards/cosine_scaled_reward": -0.24020998924970627, |
| "rewards/format_reward": 0.27083333767950535, |
| "step": 319 |
| }, |
| { |
| "completion_length": 2073.020835876465, |
| "epoch": 0.3657142857142857, |
| "grad_norm": 0.05878617987036705, |
| "kl": 0.001214742660522461, |
| "learning_rate": 4.1393354916230005e-07, |
| "loss": 0.0585, |
| "reward": 0.3800775744020939, |
| "reward_std": 0.6021665297448635, |
| "rewards/cosine_scaled_reward": -0.1016278937458992, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 320 |
| }, |
| { |
| "completion_length": 1917.3750228881836, |
| "epoch": 0.3668571428571429, |
| "grad_norm": 0.03980216011404991, |
| "kl": 0.001294553279876709, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": 0.0804, |
| "reward": 0.7248329035937786, |
| "reward_std": 0.6771653518080711, |
| "rewards/cosine_scaled_reward": 0.008249786798842251, |
| "rewards/format_reward": 0.7083333395421505, |
| "step": 321 |
| }, |
| { |
| "completion_length": 2903.875011444092, |
| "epoch": 0.368, |
| "grad_norm": 0.0290102269500494, |
| "kl": 0.0006527900695800781, |
| "learning_rate": 4.079579333738039e-07, |
| "loss": 0.0155, |
| "reward": -0.1609306987375021, |
| "reward_std": 0.4162718243896961, |
| "rewards/cosine_scaled_reward": -0.1950486795976758, |
| "rewards/format_reward": 0.2291666716337204, |
| "step": 322 |
| }, |
| { |
| "completion_length": 2931.895851135254, |
| "epoch": 0.36914285714285716, |
| "grad_norm": 0.019431445747613907, |
| "kl": 0.0010836124420166016, |
| "learning_rate": 4.0498043714627006e-07, |
| "loss": 0.0199, |
| "reward": 0.35527924820780754, |
| "reward_std": 0.44612617418169975, |
| "rewards/cosine_scaled_reward": -0.020277027040719986, |
| "rewards/format_reward": 0.39583333395421505, |
| "step": 323 |
| }, |
| { |
| "completion_length": 2903.729217529297, |
| "epoch": 0.3702857142857143, |
| "grad_norm": 0.0290395338088274, |
| "kl": 0.0006427764892578125, |
| "learning_rate": 4.020100089676376e-07, |
| "loss": 0.019, |
| "reward": 0.27246633544564247, |
| "reward_std": 0.5725754294544458, |
| "rewards/cosine_scaled_reward": -0.05126684159040451, |
| "rewards/format_reward": 0.37500000558793545, |
| "step": 324 |
| }, |
| { |
| "completion_length": 2823.7500610351562, |
| "epoch": 0.37142857142857144, |
| "grad_norm": 0.027312133461236954, |
| "kl": 0.0011664628982543945, |
| "learning_rate": 3.9904679361238526e-07, |
| "loss": 0.0044, |
| "reward": 0.5526620149612427, |
| "reward_std": 0.578266765922308, |
| "rewards/cosine_scaled_reward": 0.05758099630475044, |
| "rewards/format_reward": 0.43750000186264515, |
| "step": 325 |
| }, |
| { |
| "completion_length": 2344.4792098999023, |
| "epoch": 0.37257142857142855, |
| "grad_norm": 0.025410566478967667, |
| "kl": 0.0010523796081542969, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": 0.036, |
| "reward": 0.6994030121713877, |
| "reward_std": 0.48659895546734333, |
| "rewards/cosine_scaled_reward": 0.06845148652791977, |
| "rewards/format_reward": 0.5625000018626451, |
| "step": 326 |
| }, |
| { |
| "completion_length": 2858.5833435058594, |
| "epoch": 0.3737142857142857, |
| "grad_norm": 0.033931881189346313, |
| "kl": 0.0009119510650634766, |
| "learning_rate": 3.931425787051832e-07, |
| "loss": -0.0144, |
| "reward": 0.597864555194974, |
| "reward_std": 0.6859371922910213, |
| "rewards/cosine_scaled_reward": 0.10101561388000846, |
| "rewards/format_reward": 0.3958333395421505, |
| "step": 327 |
| }, |
| { |
| "completion_length": 3584.0, |
| "epoch": 0.37485714285714283, |
| "grad_norm": 0.02540241740643978, |
| "kl": 0.0006210803985595703, |
| "learning_rate": 3.902018669163384e-07, |
| "loss": 0.0, |
| "reward": -0.4537691902369261, |
| "reward_std": 0.4180416464805603, |
| "rewards/cosine_scaled_reward": -0.237301261164248, |
| "rewards/format_reward": 0.02083333395421505, |
| "step": 328 |
| }, |
| { |
| "completion_length": 2313.520881652832, |
| "epoch": 0.376, |
| "grad_norm": 0.0391564816236496, |
| "kl": 0.0012059211730957031, |
| "learning_rate": 3.872689434630585e-07, |
| "loss": 0.007, |
| "reward": 0.6544864065945148, |
| "reward_std": 0.6928528789430857, |
| "rewards/cosine_scaled_reward": 0.056409850250929594, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 329 |
| }, |
| { |
| "completion_length": 2332.1250228881836, |
| "epoch": 0.37714285714285717, |
| "grad_norm": 0.04612577334046364, |
| "kl": 0.0016050338745117188, |
| "learning_rate": 3.843439512918949e-07, |
| "loss": 0.0728, |
| "reward": 0.27069247141480446, |
| "reward_std": 0.5735555719584227, |
| "rewards/cosine_scaled_reward": -0.1250704526901245, |
| "rewards/format_reward": 0.520833345130086, |
| "step": 330 |
| }, |
| { |
| "completion_length": 2372.687515258789, |
| "epoch": 0.3782857142857143, |
| "grad_norm": 0.02713729254901409, |
| "kl": 0.0010589361190795898, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": 0.0437, |
| "reward": 0.070272296667099, |
| "reward_std": 0.49196940287947655, |
| "rewards/cosine_scaled_reward": -0.18361386097967625, |
| "rewards/format_reward": 0.4375, |
| "step": 331 |
| }, |
| { |
| "completion_length": 2441.0833587646484, |
| "epoch": 0.37942857142857145, |
| "grad_norm": 0.31212538480758667, |
| "kl": 0.0019620656967163086, |
| "learning_rate": 3.785183306423767e-07, |
| "loss": 0.037, |
| "reward": 0.42729340121150017, |
| "reward_std": 0.5920264758169651, |
| "rewards/cosine_scaled_reward": -0.046769993379712105, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 332 |
| }, |
| { |
| "completion_length": 2342.8958435058594, |
| "epoch": 0.38057142857142856, |
| "grad_norm": 0.024794911965727806, |
| "kl": 0.0012290477752685547, |
| "learning_rate": 3.7561798609655373e-07, |
| "loss": 0.0199, |
| "reward": 0.3754307944327593, |
| "reward_std": 0.525186138227582, |
| "rewards/cosine_scaled_reward": -0.06228460434067529, |
| "rewards/format_reward": 0.5000000037252903, |
| "step": 333 |
| }, |
| { |
| "completion_length": 3427.000030517578, |
| "epoch": 0.38171428571428573, |
| "grad_norm": 0.03751340135931969, |
| "kl": 0.0007123947143554688, |
| "learning_rate": 3.72726140684072e-07, |
| "loss": 0.0284, |
| "reward": -0.16761411912739277, |
| "reward_std": 0.5903099803254008, |
| "rewards/cosine_scaled_reward": -0.1671403832733631, |
| "rewards/format_reward": 0.16666666977107525, |
| "step": 334 |
| }, |
| { |
| "completion_length": 2378.3958892822266, |
| "epoch": 0.38285714285714284, |
| "grad_norm": 0.03808344900608063, |
| "kl": 0.0016427040100097656, |
| "learning_rate": 3.6984293534939737e-07, |
| "loss": 0.0017, |
| "reward": 0.486670202575624, |
| "reward_std": 0.7270042449235916, |
| "rewards/cosine_scaled_reward": -0.03791489452123642, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 335 |
| }, |
| { |
| "completion_length": 3137.541717529297, |
| "epoch": 0.384, |
| "grad_norm": 0.05256379023194313, |
| "kl": 0.0007867813110351562, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": 0.0447, |
| "reward": 0.24836627580225468, |
| "reward_std": 0.8177415505051613, |
| "rewards/cosine_scaled_reward": -0.05290019465610385, |
| "rewards/format_reward": 0.35416668094694614, |
| "step": 336 |
| }, |
| { |
| "completion_length": 3336.8333435058594, |
| "epoch": 0.3851428571428571, |
| "grad_norm": 0.040195003151893616, |
| "kl": 0.0008945465087890625, |
| "learning_rate": 3.641030065789562e-07, |
| "loss": -0.0036, |
| "reward": -0.056929025799036026, |
| "reward_std": 0.6236543748527765, |
| "rewards/cosine_scaled_reward": -0.13263118267059326, |
| "rewards/format_reward": 0.2083333395421505, |
| "step": 337 |
| }, |
| { |
| "completion_length": 2392.1250534057617, |
| "epoch": 0.3862857142857143, |
| "grad_norm": 0.06893110275268555, |
| "kl": 0.0015697479248046875, |
| "learning_rate": 3.612465628992203e-07, |
| "loss": 0.1161, |
| "reward": 0.6895103082060814, |
| "reward_std": 0.8643823079764843, |
| "rewards/cosine_scaled_reward": 0.07392181595787406, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 338 |
| }, |
| { |
| "completion_length": 3073.0625228881836, |
| "epoch": 0.38742857142857146, |
| "grad_norm": 0.03818909078836441, |
| "kl": 0.0008697509765625, |
| "learning_rate": 3.5839931879571725e-07, |
| "loss": 0.0467, |
| "reward": -0.09941295813769102, |
| "reward_std": 0.49821673333644867, |
| "rewards/cosine_scaled_reward": -0.1538731474429369, |
| "rewards/format_reward": 0.2083333358168602, |
| "step": 339 |
| }, |
| { |
| "completion_length": 2484.5625381469727, |
| "epoch": 0.38857142857142857, |
| "grad_norm": 0.040925703942775726, |
| "kl": 0.0010941028594970703, |
| "learning_rate": 3.555614130391079e-07, |
| "loss": 0.0031, |
| "reward": 0.39815794909372926, |
| "reward_std": 0.7465380132198334, |
| "rewards/cosine_scaled_reward": -0.030087683349847794, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 340 |
| }, |
| { |
| "completion_length": 2501.6041946411133, |
| "epoch": 0.38971428571428574, |
| "grad_norm": 0.03836272656917572, |
| "kl": 0.0011022090911865234, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": 0.036, |
| "reward": 0.49983580166008323, |
| "reward_std": 0.7316713891923428, |
| "rewards/cosine_scaled_reward": -8.210213854908943e-05, |
| "rewards/format_reward": 0.5000000055879354, |
| "step": 341 |
| }, |
| { |
| "completion_length": 3069.2291870117188, |
| "epoch": 0.39085714285714285, |
| "grad_norm": 0.03620956465601921, |
| "kl": 0.0009083747863769531, |
| "learning_rate": 3.4991416936678276e-07, |
| "loss": 0.0476, |
| "reward": 0.07453913427889347, |
| "reward_std": 0.5634984392672777, |
| "rewards/cosine_scaled_reward": -0.0668970886617899, |
| "rewards/format_reward": 0.2083333358168602, |
| "step": 342 |
| }, |
| { |
| "completion_length": 3340.1875, |
| "epoch": 0.392, |
| "grad_norm": 0.0366065539419651, |
| "kl": 0.0004994869232177734, |
| "learning_rate": 3.471051066897562e-07, |
| "loss": 0.048, |
| "reward": 0.05115175619721413, |
| "reward_std": 0.7214457457885146, |
| "rewards/cosine_scaled_reward": -0.09942414052784443, |
| "rewards/format_reward": 0.2500000074505806, |
| "step": 343 |
| }, |
| { |
| "completion_length": 2400.5000915527344, |
| "epoch": 0.3931428571428571, |
| "grad_norm": 0.026572655886411667, |
| "kl": 0.0007510185241699219, |
| "learning_rate": 3.4430593282358777e-07, |
| "loss": 0.0261, |
| "reward": 0.9928230717778206, |
| "reward_std": 0.5358506329357624, |
| "rewards/cosine_scaled_reward": 0.18391155824065208, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 344 |
| }, |
| { |
| "completion_length": 2765.3541870117188, |
| "epoch": 0.3942857142857143, |
| "grad_norm": 0.04380709305405617, |
| "kl": 0.0007009506225585938, |
| "learning_rate": 3.4151678419606233e-07, |
| "loss": 0.0862, |
| "reward": 0.2515047416090965, |
| "reward_std": 0.5667482540011406, |
| "rewards/cosine_scaled_reward": -0.051330966874957085, |
| "rewards/format_reward": 0.35416666977107525, |
| "step": 345 |
| }, |
| { |
| "completion_length": 3413.5833740234375, |
| "epoch": 0.3954285714285714, |
| "grad_norm": 0.04162891209125519, |
| "kl": 0.0006406307220458984, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": 0.0233, |
| "reward": -0.05835883319377899, |
| "reward_std": 0.6794736217707396, |
| "rewards/cosine_scaled_reward": -0.17501276172697544, |
| "rewards/format_reward": 0.29166667349636555, |
| "step": 346 |
| }, |
| { |
| "completion_length": 3015.500030517578, |
| "epoch": 0.3965714285714286, |
| "grad_norm": 0.04500904306769371, |
| "kl": 0.0009529590606689453, |
| "learning_rate": 3.359691059183761e-07, |
| "loss": 0.0428, |
| "reward": -0.06104391813278198, |
| "reward_std": 0.7086496539413929, |
| "rewards/cosine_scaled_reward": -0.20760529045946896, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 347 |
| }, |
| { |
| "completion_length": 2832.8541946411133, |
| "epoch": 0.3977142857142857, |
| "grad_norm": 0.019439518451690674, |
| "kl": 0.0007596015930175781, |
| "learning_rate": 3.3321084665422803e-07, |
| "loss": 0.0, |
| "reward": 0.2556960296933539, |
| "reward_std": 0.40971279237419367, |
| "rewards/cosine_scaled_reward": -0.0596519922837615, |
| "rewards/format_reward": 0.375, |
| "step": 348 |
| }, |
| { |
| "completion_length": 2908.812515258789, |
| "epoch": 0.39885714285714285, |
| "grad_norm": 0.03599149361252785, |
| "kl": 0.0007562637329101562, |
| "learning_rate": 3.3046315338757026e-07, |
| "loss": 0.0131, |
| "reward": 0.2589077763259411, |
| "reward_std": 0.626338753849268, |
| "rewards/cosine_scaled_reward": -0.08929613418877125, |
| "rewards/format_reward": 0.4375000149011612, |
| "step": 349 |
| }, |
| { |
| "completion_length": 2553.083339691162, |
| "epoch": 0.4, |
| "grad_norm": 0.027066772803664207, |
| "kl": 0.0008411407470703125, |
| "learning_rate": 3.2772616003709616e-07, |
| "loss": 0.0154, |
| "reward": 0.4995659068226814, |
| "reward_std": 0.4548289757221937, |
| "rewards/cosine_scaled_reward": 0.010199605487287045, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 350 |
| }, |
| { |
| "completion_length": 3268.625030517578, |
| "epoch": 0.40114285714285713, |
| "grad_norm": 0.03507920354604721, |
| "kl": 0.0008692741394042969, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": 0.0301, |
| "reward": -0.001983426511287689, |
| "reward_std": 0.5756698325276375, |
| "rewards/cosine_scaled_reward": -0.13640837743878365, |
| "rewards/format_reward": 0.2708333395421505, |
| "step": 351 |
| }, |
| { |
| "completion_length": 2692.9583587646484, |
| "epoch": 0.4022857142857143, |
| "grad_norm": 0.03933379054069519, |
| "kl": 0.0012662410736083984, |
| "learning_rate": 3.222848061454764e-07, |
| "loss": 0.0339, |
| "reward": 0.39516958221793175, |
| "reward_std": 0.7729976400732994, |
| "rewards/cosine_scaled_reward": -0.010748542845249176, |
| "rewards/format_reward": 0.41666667349636555, |
| "step": 352 |
| }, |
| { |
| "completion_length": 2451.9375, |
| "epoch": 0.4034285714285714, |
| "grad_norm": 0.02567175030708313, |
| "kl": 0.0020258426666259766, |
| "learning_rate": 3.195807108082429e-07, |
| "loss": -0.0063, |
| "reward": 0.22795281372964382, |
| "reward_std": 0.49539407063275576, |
| "rewards/cosine_scaled_reward": -0.12560690939426422, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 353 |
| }, |
| { |
| "completion_length": 1874.9791793823242, |
| "epoch": 0.4045714285714286, |
| "grad_norm": 0.029671069234609604, |
| "kl": 0.0017531514167785645, |
| "learning_rate": 3.168878457820915e-07, |
| "loss": 0.0081, |
| "reward": 0.940945464768447, |
| "reward_std": 0.7216029353439808, |
| "rewards/cosine_scaled_reward": 0.11630604974925518, |
| "rewards/format_reward": 0.7083333358168602, |
| "step": 354 |
| }, |
| { |
| "completion_length": 2448.7500381469727, |
| "epoch": 0.4057142857142857, |
| "grad_norm": 0.03424989804625511, |
| "kl": 0.0019562244415283203, |
| "learning_rate": 3.142063423134644e-07, |
| "loss": -0.0181, |
| "reward": 0.7008613422513008, |
| "reward_std": 0.6350522413849831, |
| "rewards/cosine_scaled_reward": 0.06918067391961813, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 355 |
| }, |
| { |
| "completion_length": 2569.250015258789, |
| "epoch": 0.40685714285714286, |
| "grad_norm": 0.032561905682086945, |
| "kl": 0.0011096000671386719, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": 0.0258, |
| "reward": 0.5701954085379839, |
| "reward_std": 0.7334288004785776, |
| "rewards/cosine_scaled_reward": -0.006568958284333348, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 356 |
| }, |
| { |
| "completion_length": 3241.1250610351562, |
| "epoch": 0.408, |
| "grad_norm": 0.03412913531064987, |
| "kl": 0.0009722709655761719, |
| "learning_rate": 3.0887794225945143e-07, |
| "loss": 0.0399, |
| "reward": -0.1981919500976801, |
| "reward_std": 0.6031934916973114, |
| "rewards/cosine_scaled_reward": -0.22409598156809807, |
| "rewards/format_reward": 0.25000000558793545, |
| "step": 357 |
| }, |
| { |
| "completion_length": 2790.041748046875, |
| "epoch": 0.40914285714285714, |
| "grad_norm": 0.05226265639066696, |
| "kl": 0.0027484893798828125, |
| "learning_rate": 3.062313053727671e-07, |
| "loss": 0.0425, |
| "reward": 0.5322411619126797, |
| "reward_std": 0.9997622594237328, |
| "rewards/cosine_scaled_reward": 0.03695392981171608, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 358 |
| }, |
| { |
| "completion_length": 2785.3958587646484, |
| "epoch": 0.4102857142857143, |
| "grad_norm": 0.03838330879807472, |
| "kl": 0.0009531974792480469, |
| "learning_rate": 3.0359654942835247e-07, |
| "loss": 0.0174, |
| "reward": 0.24072397220879793, |
| "reward_std": 0.742347358725965, |
| "rewards/cosine_scaled_reward": -0.08797135204076767, |
| "rewards/format_reward": 0.41666667722165585, |
| "step": 359 |
| }, |
| { |
| "completion_length": 2664.7083892822266, |
| "epoch": 0.4114285714285714, |
| "grad_norm": 0.04071582481265068, |
| "kl": 0.0034384727478027344, |
| "learning_rate": 3.0097380284049523e-07, |
| "loss": 0.0468, |
| "reward": 0.322513684630394, |
| "reward_std": 0.7571201398968697, |
| "rewards/cosine_scaled_reward": -0.0366598404943943, |
| "rewards/format_reward": 0.3958333432674408, |
| "step": 360 |
| }, |
| { |
| "completion_length": 3133.4167098999023, |
| "epoch": 0.4125714285714286, |
| "grad_norm": 0.04097485542297363, |
| "kl": 0.0013079643249511719, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": 0.0059, |
| "reward": 0.24762597493827343, |
| "reward_std": 0.7427336126565933, |
| "rewards/cosine_scaled_reward": -0.032437026500701904, |
| "rewards/format_reward": 0.3125000074505806, |
| "step": 361 |
| }, |
| { |
| "completion_length": 1420.1875114440918, |
| "epoch": 0.4137142857142857, |
| "grad_norm": 0.03498203307390213, |
| "kl": 0.0024487972259521484, |
| "learning_rate": 2.9576484845877793e-07, |
| "loss": 0.0363, |
| "reward": 1.1095450557768345, |
| "reward_std": 0.6055663973093033, |
| "rewards/cosine_scaled_reward": 0.127689179033041, |
| "rewards/format_reward": 0.8541666753590107, |
| "step": 362 |
| }, |
| { |
| "completion_length": 2218.708381652832, |
| "epoch": 0.41485714285714287, |
| "grad_norm": 0.03273200988769531, |
| "kl": 0.001424551010131836, |
| "learning_rate": 2.931788945420058e-07, |
| "loss": 0.0456, |
| "reward": 0.5941947225946933, |
| "reward_std": 0.6653445195406675, |
| "rewards/cosine_scaled_reward": 0.015847355127334595, |
| "rewards/format_reward": 0.5625000037252903, |
| "step": 363 |
| }, |
| { |
| "completion_length": 2686.854202270508, |
| "epoch": 0.416, |
| "grad_norm": 0.033127423375844955, |
| "kl": 0.0012614727020263672, |
| "learning_rate": 2.9060545772359305e-07, |
| "loss": 0.0182, |
| "reward": 0.13789374008774757, |
| "reward_std": 0.517003508284688, |
| "rewards/cosine_scaled_reward": -0.16021979413926601, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 364 |
| }, |
| { |
| "completion_length": 2956.166702270508, |
| "epoch": 0.41714285714285715, |
| "grad_norm": 0.048571985214948654, |
| "kl": 0.0009555816650390625, |
| "learning_rate": 2.8804466342921987e-07, |
| "loss": 0.0338, |
| "reward": 0.0561866071075201, |
| "reward_std": 0.7425644807517529, |
| "rewards/cosine_scaled_reward": -0.1177400229498744, |
| "rewards/format_reward": 0.29166667349636555, |
| "step": 365 |
| }, |
| { |
| "completion_length": 1896.750036239624, |
| "epoch": 0.41828571428571426, |
| "grad_norm": 0.030456840991973877, |
| "kl": 0.0005739927291870117, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": 0.0198, |
| "reward": 0.781156275421381, |
| "reward_std": 0.6628811918199062, |
| "rewards/cosine_scaled_reward": 0.046828120946884155, |
| "rewards/format_reward": 0.6875, |
| "step": 366 |
| }, |
| { |
| "completion_length": 2907.187530517578, |
| "epoch": 0.41942857142857143, |
| "grad_norm": 0.04903361201286316, |
| "kl": 0.0008039474487304688, |
| "learning_rate": 2.829615010283344e-07, |
| "loss": -0.001, |
| "reward": 0.41779946256428957, |
| "reward_std": 0.8784012943506241, |
| "rewards/cosine_scaled_reward": -0.030683615244925022, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 367 |
| }, |
| { |
| "completion_length": 2909.0000076293945, |
| "epoch": 0.4205714285714286, |
| "grad_norm": 0.03092154674232006, |
| "kl": 0.0025424957275390625, |
| "learning_rate": 2.8043938066798645e-07, |
| "loss": -0.0078, |
| "reward": 0.0029743313789367676, |
| "reward_std": 0.4080376699566841, |
| "rewards/cosine_scaled_reward": -0.12351284455507994, |
| "rewards/format_reward": 0.25, |
| "step": 368 |
| }, |
| { |
| "completion_length": 3040.6042098999023, |
| "epoch": 0.4217142857142857, |
| "grad_norm": 0.05602690950036049, |
| "kl": 0.0014162063598632812, |
| "learning_rate": 2.7793039831193133e-07, |
| "loss": 0.0144, |
| "reward": 0.3254664018750191, |
| "reward_std": 0.957383282482624, |
| "rewards/cosine_scaled_reward": -0.035183459520339966, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 369 |
| }, |
| { |
| "completion_length": 3252.25, |
| "epoch": 0.4228571428571429, |
| "grad_norm": 0.02123953402042389, |
| "kl": 0.0016574859619140625, |
| "learning_rate": 2.7543467624442956e-07, |
| "loss": 0.0164, |
| "reward": 0.04274958372116089, |
| "reward_std": 0.37660638615489006, |
| "rewards/cosine_scaled_reward": -0.051541879773139954, |
| "rewards/format_reward": 0.14583333395421505, |
| "step": 370 |
| }, |
| { |
| "completion_length": 1920.6250305175781, |
| "epoch": 0.424, |
| "grad_norm": 0.013661705888807774, |
| "kl": 0.0009822845458984375, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0059, |
| "reward": 0.5551425702869892, |
| "reward_std": 0.278125814627856, |
| "rewards/cosine_scaled_reward": 0.01715458557009697, |
| "rewards/format_reward": 0.520833333954215, |
| "step": 371 |
| }, |
| { |
| "completion_length": 3028.520896911621, |
| "epoch": 0.42514285714285716, |
| "grad_norm": 0.0525103434920311, |
| "kl": 0.00101470947265625, |
| "learning_rate": 2.7048349887476037e-07, |
| "loss": 0.0585, |
| "reward": 0.7306250557303429, |
| "reward_std": 0.8802761994302273, |
| "rewards/cosine_scaled_reward": 0.167395843192935, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 372 |
| }, |
| { |
| "completion_length": 1821.583351135254, |
| "epoch": 0.42628571428571427, |
| "grad_norm": 0.028777100145816803, |
| "kl": 0.0010874271392822266, |
| "learning_rate": 2.6802828488599294e-07, |
| "loss": 0.039, |
| "reward": 0.31937169190496206, |
| "reward_std": 0.6239332426339388, |
| "rewards/cosine_scaled_reward": -0.15281416662037373, |
| "rewards/format_reward": 0.6250000037252903, |
| "step": 373 |
| }, |
| { |
| "completion_length": 2500.604202270508, |
| "epoch": 0.42742857142857144, |
| "grad_norm": 0.0391635037958622, |
| "kl": 0.0013954639434814453, |
| "learning_rate": 2.655868138008171e-07, |
| "loss": 0.0541, |
| "reward": 0.37847861149930395, |
| "reward_std": 0.7538540698587894, |
| "rewards/cosine_scaled_reward": -0.039927379228174686, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 374 |
| }, |
| { |
| "completion_length": 2978.7291717529297, |
| "epoch": 0.42857142857142855, |
| "grad_norm": 0.03049525059759617, |
| "kl": 0.0025815963745117188, |
| "learning_rate": 2.631592046130896e-07, |
| "loss": 0.0099, |
| "reward": 0.37722547352313995, |
| "reward_std": 0.5434530153870583, |
| "rewards/cosine_scaled_reward": 0.0011127367615699768, |
| "rewards/format_reward": 0.37500000558793545, |
| "step": 375 |
| }, |
| { |
| "completion_length": 2328.083396911621, |
| "epoch": 0.4297142857142857, |
| "grad_norm": 0.04338321462273598, |
| "kl": 0.0022089481353759766, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": 0.0388, |
| "reward": 0.33715473487973213, |
| "reward_std": 0.6841284055262804, |
| "rewards/cosine_scaled_reward": -0.1126726483926177, |
| "rewards/format_reward": 0.5625000111758709, |
| "step": 376 |
| }, |
| { |
| "completion_length": 3434.3541870117188, |
| "epoch": 0.4308571428571429, |
| "grad_norm": 0.04104272648692131, |
| "kl": 0.0009138584136962891, |
| "learning_rate": 2.583460445215911e-07, |
| "loss": 0.0306, |
| "reward": -0.12041943147778511, |
| "reward_std": 0.6869102958589792, |
| "rewards/cosine_scaled_reward": -0.17479303665459156, |
| "rewards/format_reward": 0.22916667349636555, |
| "step": 377 |
| }, |
| { |
| "completion_length": 2245.437545776367, |
| "epoch": 0.432, |
| "grad_norm": 0.02392558939754963, |
| "kl": 0.0010918378829956055, |
| "learning_rate": 2.5596072820445254e-07, |
| "loss": 0.021, |
| "reward": 0.6035024983575568, |
| "reward_std": 0.6711528412997723, |
| "rewards/cosine_scaled_reward": 0.010084576904773712, |
| "rewards/format_reward": 0.583333333954215, |
| "step": 378 |
| }, |
| { |
| "completion_length": 3069.1041870117188, |
| "epoch": 0.43314285714285716, |
| "grad_norm": 0.02428600750863552, |
| "kl": 0.0008909702301025391, |
| "learning_rate": 2.5358974294659373e-07, |
| "loss": 0.0264, |
| "reward": -0.1455503827892244, |
| "reward_std": 0.5249228626489639, |
| "rewards/cosine_scaled_reward": -0.16652518138289452, |
| "rewards/format_reward": 0.18750000186264515, |
| "step": 379 |
| }, |
| { |
| "completion_length": 2903.437515258789, |
| "epoch": 0.4342857142857143, |
| "grad_norm": 0.05160444974899292, |
| "kl": 0.0016338825225830078, |
| "learning_rate": 2.512332043064913e-07, |
| "loss": 0.0391, |
| "reward": 0.152958738617599, |
| "reward_std": 0.8814124129712582, |
| "rewards/cosine_scaled_reward": -0.11102063581347466, |
| "rewards/format_reward": 0.37500000931322575, |
| "step": 380 |
| }, |
| { |
| "completion_length": 2945.3125534057617, |
| "epoch": 0.43542857142857144, |
| "grad_norm": 0.04407874122262001, |
| "kl": 0.0018510818481445312, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0489, |
| "reward": -0.054892294108867645, |
| "reward_std": 0.6566630490124226, |
| "rewards/cosine_scaled_reward": -0.18369614984840155, |
| "rewards/format_reward": 0.3125, |
| "step": 381 |
| }, |
| { |
| "completion_length": 2357.4375534057617, |
| "epoch": 0.43657142857142855, |
| "grad_norm": 0.03399001434445381, |
| "kl": 0.0019459724426269531, |
| "learning_rate": 2.465639255873246e-07, |
| "loss": 0.0484, |
| "reward": 0.3038070909678936, |
| "reward_std": 0.4995873998850584, |
| "rewards/cosine_scaled_reward": -0.13976313127204776, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 382 |
| }, |
| { |
| "completion_length": 2986.9583435058594, |
| "epoch": 0.4377142857142857, |
| "grad_norm": 0.06184740364551544, |
| "kl": 0.0022149085998535156, |
| "learning_rate": 2.4425141308231765e-07, |
| "loss": 0.0698, |
| "reward": 0.3715968355536461, |
| "reward_std": 0.6343178395181894, |
| "rewards/cosine_scaled_reward": -0.0017015933990478516, |
| "rewards/format_reward": 0.3750000074505806, |
| "step": 383 |
| }, |
| { |
| "completion_length": 2208.4583892822266, |
| "epoch": 0.43885714285714283, |
| "grad_norm": 0.04220281541347504, |
| "kl": 0.002426624298095703, |
| "learning_rate": 2.4195380233209006e-07, |
| "loss": -0.0163, |
| "reward": 1.1835149601101875, |
| "reward_std": 0.8550020232796669, |
| "rewards/cosine_scaled_reward": 0.2584241358563304, |
| "rewards/format_reward": 0.6666666716337204, |
| "step": 384 |
| }, |
| { |
| "completion_length": 2592.8542098999023, |
| "epoch": 0.44, |
| "grad_norm": 0.02935204468667507, |
| "kl": 0.0010533332824707031, |
| "learning_rate": 2.3967120531894857e-07, |
| "loss": -0.0031, |
| "reward": 0.2314518727362156, |
| "reward_std": 0.5928132049739361, |
| "rewards/cosine_scaled_reward": -0.15510739805176854, |
| "rewards/format_reward": 0.5416666734963655, |
| "step": 385 |
| }, |
| { |
| "completion_length": 2843.5416870117188, |
| "epoch": 0.44114285714285717, |
| "grad_norm": 0.03138664364814758, |
| "kl": 0.0016608238220214844, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": 0.0227, |
| "reward": 0.29218395706266165, |
| "reward_std": 0.5787023734301329, |
| "rewards/cosine_scaled_reward": -0.07265802705660462, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 386 |
| }, |
| { |
| "completion_length": 2934.5625, |
| "epoch": 0.4422857142857143, |
| "grad_norm": 0.020530929788947105, |
| "kl": 0.0008387565612792969, |
| "learning_rate": 2.3515149676898552e-07, |
| "loss": 0.013, |
| "reward": -0.04479955695569515, |
| "reward_std": 0.36785753816366196, |
| "rewards/cosine_scaled_reward": -0.147399777546525, |
| "rewards/format_reward": 0.25000000558793545, |
| "step": 387 |
| }, |
| { |
| "completion_length": 2610.062515258789, |
| "epoch": 0.44342857142857145, |
| "grad_norm": 0.040762562304735184, |
| "kl": 0.0011615753173828125, |
| "learning_rate": 2.3291460551638237e-07, |
| "loss": 0.0708, |
| "reward": 0.23008868657052517, |
| "reward_std": 0.6174043659120798, |
| "rewards/cosine_scaled_reward": -0.10370565578341484, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 388 |
| }, |
| { |
| "completion_length": 2543.354202270508, |
| "epoch": 0.44457142857142856, |
| "grad_norm": 0.04136168956756592, |
| "kl": 0.0022974014282226562, |
| "learning_rate": 2.306931685585657e-07, |
| "loss": 0.0045, |
| "reward": 0.37671348359435797, |
| "reward_std": 0.7336033247411251, |
| "rewards/cosine_scaled_reward": -0.07205992937088013, |
| "rewards/format_reward": 0.5208333414047956, |
| "step": 389 |
| }, |
| { |
| "completion_length": 3020.4791870117188, |
| "epoch": 0.44571428571428573, |
| "grad_norm": 0.05496221408247948, |
| "kl": 0.0008085966110229492, |
| "learning_rate": 2.2848729416523859e-07, |
| "loss": 0.0478, |
| "reward": 0.24858180806040764, |
| "reward_std": 0.8255434706807137, |
| "rewards/cosine_scaled_reward": -0.07362576154991984, |
| "rewards/format_reward": 0.39583334140479565, |
| "step": 390 |
| }, |
| { |
| "completion_length": 2627.8542098999023, |
| "epoch": 0.44685714285714284, |
| "grad_norm": 0.03822245076298714, |
| "kl": 0.0011529922485351562, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": -0.0056, |
| "reward": 1.1691500842571259, |
| "reward_std": 0.6973480880260468, |
| "rewards/cosine_scaled_reward": 0.30332503467798233, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 391 |
| }, |
| { |
| "completion_length": 2101.8750038146973, |
| "epoch": 0.448, |
| "grad_norm": 0.04114387929439545, |
| "kl": 0.0023064613342285156, |
| "learning_rate": 2.2412266235313973e-07, |
| "loss": 0.0151, |
| "reward": 0.4915292300283909, |
| "reward_std": 0.7246231567114592, |
| "rewards/cosine_scaled_reward": -0.07715206500142813, |
| "rewards/format_reward": 0.6458333358168602, |
| "step": 392 |
| }, |
| { |
| "completion_length": 2701.479232788086, |
| "epoch": 0.4491428571428571, |
| "grad_norm": 0.05024963617324829, |
| "kl": 0.0015339851379394531, |
| "learning_rate": 2.2196411766036487e-07, |
| "loss": 0.0441, |
| "reward": 0.4650035873055458, |
| "reward_std": 0.9533145241439342, |
| "rewards/cosine_scaled_reward": -0.04874821309931576, |
| "rewards/format_reward": 0.5625000167638063, |
| "step": 393 |
| }, |
| { |
| "completion_length": 3071.2083435058594, |
| "epoch": 0.4502857142857143, |
| "grad_norm": 0.03059108555316925, |
| "kl": 0.002701997756958008, |
| "learning_rate": 2.1982156097370557e-07, |
| "loss": 0.0348, |
| "reward": 0.016890098340809345, |
| "reward_std": 0.5630042045377195, |
| "rewards/cosine_scaled_reward": -0.10613830108195543, |
| "rewards/format_reward": 0.2291666679084301, |
| "step": 394 |
| }, |
| { |
| "completion_length": 2223.4375076293945, |
| "epoch": 0.4514285714285714, |
| "grad_norm": 0.016346530988812447, |
| "kl": 0.0012898445129394531, |
| "learning_rate": 2.1769509671835223e-07, |
| "loss": -0.0074, |
| "reward": 0.1286599598824978, |
| "reward_std": 0.4426659531891346, |
| "rewards/cosine_scaled_reward": -0.18567002192139626, |
| "rewards/format_reward": 0.5, |
| "step": 395 |
| }, |
| { |
| "completion_length": 3036.125030517578, |
| "epoch": 0.45257142857142857, |
| "grad_norm": 0.078156977891922, |
| "kl": 0.002143383026123047, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": 0.1641, |
| "reward": 0.2860925877466798, |
| "reward_std": 1.1900311894714832, |
| "rewards/cosine_scaled_reward": -0.02362038509454578, |
| "rewards/format_reward": 0.3333333395421505, |
| "step": 396 |
| }, |
| { |
| "completion_length": 2902.562545776367, |
| "epoch": 0.45371428571428574, |
| "grad_norm": 0.05982230231165886, |
| "kl": 0.0010967254638671875, |
| "learning_rate": 2.134908592756607e-07, |
| "loss": 0.0246, |
| "reward": 0.3248216570354998, |
| "reward_std": 1.0227275677025318, |
| "rewards/cosine_scaled_reward": -0.07717251125723124, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 397 |
| }, |
| { |
| "completion_length": 2611.916702270508, |
| "epoch": 0.45485714285714285, |
| "grad_norm": 0.05539969727396965, |
| "kl": 0.0018358230590820312, |
| "learning_rate": 2.1141329099692406e-07, |
| "loss": 0.0695, |
| "reward": 0.2927081920206547, |
| "reward_std": 0.8153704311698675, |
| "rewards/cosine_scaled_reward": -0.10364590771496296, |
| "rewards/format_reward": 0.5000000037252903, |
| "step": 398 |
| }, |
| { |
| "completion_length": 2259.0625610351562, |
| "epoch": 0.456, |
| "grad_norm": 0.04596502333879471, |
| "kl": 0.0021970272064208984, |
| "learning_rate": 2.0935222495670968e-07, |
| "loss": 0.0574, |
| "reward": 0.6550063313916326, |
| "reward_std": 0.7109723538160324, |
| "rewards/cosine_scaled_reward": 0.004586491733789444, |
| "rewards/format_reward": 0.6458333414047956, |
| "step": 399 |
| }, |
| { |
| "completion_length": 1735.2917175292969, |
| "epoch": 0.45714285714285713, |
| "grad_norm": 0.048301637172698975, |
| "kl": 0.0027017593383789062, |
| "learning_rate": 2.0730776160846853e-07, |
| "loss": 0.0258, |
| "reward": 0.9397249203175306, |
| "reward_std": 0.7951749358326197, |
| "rewards/cosine_scaled_reward": 0.09486244898289442, |
| "rewards/format_reward": 0.7500000093132257, |
| "step": 400 |
| }, |
| { |
| "completion_length": 3075.8333435058594, |
| "epoch": 0.4582857142857143, |
| "grad_norm": 0.02250814624130726, |
| "kl": 0.0019578933715820312, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0031, |
| "reward": 0.09915575897321105, |
| "reward_std": 0.5018129777163267, |
| "rewards/cosine_scaled_reward": -0.1066721323877573, |
| "rewards/format_reward": 0.3125000074505806, |
| "step": 401 |
| }, |
| { |
| "completion_length": 2486.0000076293945, |
| "epoch": 0.4594285714285714, |
| "grad_norm": 0.031174227595329285, |
| "kl": 0.0019636154174804688, |
| "learning_rate": 2.032690407508949e-07, |
| "loss": 0.0438, |
| "reward": 0.2905198944499716, |
| "reward_std": 0.5358554609119892, |
| "rewards/cosine_scaled_reward": -0.0839067182969302, |
| "rewards/format_reward": 0.45833333395421505, |
| "step": 402 |
| }, |
| { |
| "completion_length": 1904.0625, |
| "epoch": 0.4605714285714286, |
| "grad_norm": 0.03299061581492424, |
| "kl": 0.002137422561645508, |
| "learning_rate": 2.0127498008311922e-07, |
| "loss": 0.0207, |
| "reward": 0.615192599594593, |
| "reward_std": 0.5438333973288536, |
| "rewards/cosine_scaled_reward": -0.03615369647741318, |
| "rewards/format_reward": 0.6875, |
| "step": 403 |
| }, |
| { |
| "completion_length": 2334.2291870117188, |
| "epoch": 0.4617142857142857, |
| "grad_norm": 0.046507738530635834, |
| "kl": 0.0019326210021972656, |
| "learning_rate": 1.9929791578083655e-07, |
| "loss": 0.0536, |
| "reward": 0.22189591638743877, |
| "reward_std": 0.7510978206992149, |
| "rewards/cosine_scaled_reward": -0.11821871809661388, |
| "rewards/format_reward": 0.45833333395421505, |
| "step": 404 |
| }, |
| { |
| "completion_length": 2323.229206085205, |
| "epoch": 0.46285714285714286, |
| "grad_norm": 0.049295179545879364, |
| "kl": 0.0019037723541259766, |
| "learning_rate": 1.9733794420337213e-07, |
| "loss": 0.0334, |
| "reward": 0.8066165037453175, |
| "reward_std": 0.84293257817626, |
| "rewards/cosine_scaled_reward": 0.12205823619660805, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 405 |
| }, |
| { |
| "completion_length": 2272.812545776367, |
| "epoch": 0.464, |
| "grad_norm": 0.040780920535326004, |
| "kl": 0.0019860267639160156, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": 0.0374, |
| "reward": 0.6606193948537111, |
| "reward_std": 0.8417980894446373, |
| "rewards/cosine_scaled_reward": 0.007393034175038338, |
| "rewards/format_reward": 0.645833345130086, |
| "step": 406 |
| }, |
| { |
| "completion_length": 2579.8333778381348, |
| "epoch": 0.46514285714285714, |
| "grad_norm": 0.03767210990190506, |
| "kl": 0.0013513565063476562, |
| "learning_rate": 1.934696604901642e-07, |
| "loss": 0.0199, |
| "reward": 0.5471592950634658, |
| "reward_std": 0.5273555251769722, |
| "rewards/cosine_scaled_reward": 0.04441297799348831, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 407 |
| }, |
| { |
| "completion_length": 2723.500030517578, |
| "epoch": 0.4662857142857143, |
| "grad_norm": 0.03993203490972519, |
| "kl": 0.0015883445739746094, |
| "learning_rate": 1.915615368891117e-07, |
| "loss": 0.0578, |
| "reward": 0.43359660543501377, |
| "reward_std": 0.5000373655930161, |
| "rewards/cosine_scaled_reward": 0.008464958518743515, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 408 |
| }, |
| { |
| "completion_length": 3507.3958740234375, |
| "epoch": 0.4674285714285714, |
| "grad_norm": 0.026279030367732048, |
| "kl": 0.0009474754333496094, |
| "learning_rate": 1.8967088307307e-07, |
| "loss": 0.022, |
| "reward": -0.263646949082613, |
| "reward_std": 0.45825990103185177, |
| "rewards/cosine_scaled_reward": -0.18390680570155382, |
| "rewards/format_reward": 0.1041666679084301, |
| "step": 409 |
| }, |
| { |
| "completion_length": 2405.93754196167, |
| "epoch": 0.4685714285714286, |
| "grad_norm": 0.04245587810873985, |
| "kl": 0.0011708736419677734, |
| "learning_rate": 1.8779779118983867e-07, |
| "loss": 0.035, |
| "reward": 0.4530321164056659, |
| "reward_std": 0.8689349945634604, |
| "rewards/cosine_scaled_reward": -0.033900603419169784, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 410 |
| }, |
| { |
| "completion_length": 3133.0, |
| "epoch": 0.4697142857142857, |
| "grad_norm": 0.045392949134111404, |
| "kl": 0.001537322998046875, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.0303, |
| "reward": 0.1537340972572565, |
| "reward_std": 0.6930182911455631, |
| "rewards/cosine_scaled_reward": -0.08979963045567274, |
| "rewards/format_reward": 0.3333333395421505, |
| "step": 411 |
| }, |
| { |
| "completion_length": 2966.1666870117188, |
| "epoch": 0.47085714285714286, |
| "grad_norm": 0.03469331935048103, |
| "kl": 0.001659393310546875, |
| "learning_rate": 1.8410465752883758e-07, |
| "loss": -0.0277, |
| "reward": 0.5148756094276905, |
| "reward_std": 0.6638762913644314, |
| "rewards/cosine_scaled_reward": 0.049104465171694756, |
| "rewards/format_reward": 0.4166666679084301, |
| "step": 412 |
| }, |
| { |
| "completion_length": 2527.187511444092, |
| "epoch": 0.472, |
| "grad_norm": 0.0342419259250164, |
| "kl": 0.002641916275024414, |
| "learning_rate": 1.822847957491922e-07, |
| "loss": 0.0405, |
| "reward": 0.49921591579914093, |
| "reward_std": 0.7078933194279671, |
| "rewards/cosine_scaled_reward": 0.010024622781202197, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 413 |
| }, |
| { |
| "completion_length": 3006.729202270508, |
| "epoch": 0.47314285714285714, |
| "grad_norm": 0.03782186284661293, |
| "kl": 0.0012745857238769531, |
| "learning_rate": 1.804828558898332e-07, |
| "loss": 0.0095, |
| "reward": -0.013905536383390427, |
| "reward_std": 0.6790427416563034, |
| "rewards/cosine_scaled_reward": -0.1423694370314479, |
| "rewards/format_reward": 0.2708333358168602, |
| "step": 414 |
| }, |
| { |
| "completion_length": 3249.333335876465, |
| "epoch": 0.4742857142857143, |
| "grad_norm": 0.017366327345371246, |
| "kl": 0.0010700225830078125, |
| "learning_rate": 1.7869892577476722e-07, |
| "loss": 0.0009, |
| "reward": -0.3696242421865463, |
| "reward_std": 0.3092060061171651, |
| "rewards/cosine_scaled_reward": -0.25772880017757416, |
| "rewards/format_reward": 0.14583333395421505, |
| "step": 415 |
| }, |
| { |
| "completion_length": 1827.9375267028809, |
| "epoch": 0.4754285714285714, |
| "grad_norm": 0.030641866847872734, |
| "kl": 0.0006153583526611328, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": 0.0127, |
| "reward": 0.7272525690495968, |
| "reward_std": 0.704660689458251, |
| "rewards/cosine_scaled_reward": -0.021790377446450293, |
| "rewards/format_reward": 0.7708333395421505, |
| "step": 416 |
| }, |
| { |
| "completion_length": 3335.5833435058594, |
| "epoch": 0.4765714285714286, |
| "grad_norm": 0.033628471195697784, |
| "kl": 0.0013918876647949219, |
| "learning_rate": 1.7518544168045524e-07, |
| "loss": 0.0198, |
| "reward": -0.24187161587178707, |
| "reward_std": 0.571142709814012, |
| "rewards/cosine_scaled_reward": -0.23551913630217314, |
| "rewards/format_reward": 0.22916667349636555, |
| "step": 417 |
| }, |
| { |
| "completion_length": 2151.937526702881, |
| "epoch": 0.4777142857142857, |
| "grad_norm": 0.02097875066101551, |
| "kl": 0.0009946823120117188, |
| "learning_rate": 1.7345605894346726e-07, |
| "loss": 0.0207, |
| "reward": 0.5706320106983185, |
| "reward_std": 0.5004640761762857, |
| "rewards/cosine_scaled_reward": -0.027184022590517998, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 418 |
| }, |
| { |
| "completion_length": 2525.7708587646484, |
| "epoch": 0.47885714285714287, |
| "grad_norm": 0.036434732377529144, |
| "kl": 0.0015592575073242188, |
| "learning_rate": 1.7174502842694212e-07, |
| "loss": 0.0261, |
| "reward": 0.16563719511032104, |
| "reward_std": 0.6850580945611, |
| "rewards/cosine_scaled_reward": -0.13593140104785562, |
| "rewards/format_reward": 0.4375, |
| "step": 419 |
| }, |
| { |
| "completion_length": 1836.8542098999023, |
| "epoch": 0.48, |
| "grad_norm": 0.0419175811111927, |
| "kl": 0.0034418106079101562, |
| "learning_rate": 1.7005243352409333e-07, |
| "loss": 0.0254, |
| "reward": 0.5675348229706287, |
| "reward_std": 0.7641635444015265, |
| "rewards/cosine_scaled_reward": -0.0808159353910014, |
| "rewards/format_reward": 0.7291666883975267, |
| "step": 420 |
| }, |
| { |
| "completion_length": 3365.437530517578, |
| "epoch": 0.48114285714285715, |
| "grad_norm": 0.04015079140663147, |
| "kl": 0.004153251647949219, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": 0.0357, |
| "reward": -0.140715591609478, |
| "reward_std": 0.6398463062942028, |
| "rewards/cosine_scaled_reward": -0.13285779720172286, |
| "rewards/format_reward": 0.12500000186264515, |
| "step": 421 |
| }, |
| { |
| "completion_length": 2927.6666870117188, |
| "epoch": 0.48228571428571426, |
| "grad_norm": 0.02774575911462307, |
| "kl": 0.0009286403656005859, |
| "learning_rate": 1.6672287963562852e-07, |
| "loss": 0.0137, |
| "reward": 0.16687930934131145, |
| "reward_std": 0.49705400690436363, |
| "rewards/cosine_scaled_reward": -0.08322702161967754, |
| "rewards/format_reward": 0.3333333358168602, |
| "step": 422 |
| }, |
| { |
| "completion_length": 2936.25, |
| "epoch": 0.48342857142857143, |
| "grad_norm": 0.031355079263448715, |
| "kl": 0.0010771751403808594, |
| "learning_rate": 1.6508608292777203e-07, |
| "loss": 0.005, |
| "reward": 0.07443832606077194, |
| "reward_std": 0.5912903603166342, |
| "rewards/cosine_scaled_reward": -0.13986417837440968, |
| "rewards/format_reward": 0.3541666679084301, |
| "step": 423 |
| }, |
| { |
| "completion_length": 3266.125030517578, |
| "epoch": 0.4845714285714286, |
| "grad_norm": 0.04135151579976082, |
| "kl": 0.0009894371032714844, |
| "learning_rate": 1.6346804638120098e-07, |
| "loss": 0.0246, |
| "reward": -0.2541657374240458, |
| "reward_std": 0.6840569227933884, |
| "rewards/cosine_scaled_reward": -0.220832874532789, |
| "rewards/format_reward": 0.1875000074505806, |
| "step": 424 |
| }, |
| { |
| "completion_length": 2052.7500610351562, |
| "epoch": 0.4857142857142857, |
| "grad_norm": 0.051734067499637604, |
| "kl": 0.0017888545989990234, |
| "learning_rate": 1.6186884885673413e-07, |
| "loss": 0.0244, |
| "reward": 1.310227697074879, |
| "reward_std": 1.0274488306604326, |
| "rewards/cosine_scaled_reward": 0.2696971520781517, |
| "rewards/format_reward": 0.770833345130086, |
| "step": 425 |
| }, |
| { |
| "completion_length": 2170.4583740234375, |
| "epoch": 0.4868571428571429, |
| "grad_norm": 0.03390904888510704, |
| "kl": 0.0038836002349853516, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": -0.0068, |
| "reward": 0.5739936158061028, |
| "reward_std": 0.8549008993431926, |
| "rewards/cosine_scaled_reward": 0.005746812559664249, |
| "rewards/format_reward": 0.5625, |
| "step": 426 |
| }, |
| { |
| "completion_length": 3058.1458740234375, |
| "epoch": 0.488, |
| "grad_norm": 0.04581299424171448, |
| "kl": 0.0008256435394287109, |
| "learning_rate": 1.5872728172265146e-07, |
| "loss": 0.0541, |
| "reward": 0.21424740552902222, |
| "reward_std": 0.5585381351411343, |
| "rewards/cosine_scaled_reward": -0.049126310274004936, |
| "rewards/format_reward": 0.3125000074505806, |
| "step": 427 |
| }, |
| { |
| "completion_length": 2518.250030517578, |
| "epoch": 0.48914285714285716, |
| "grad_norm": 0.03789118677377701, |
| "kl": 0.0015597343444824219, |
| "learning_rate": 1.5718506522858572e-07, |
| "loss": 0.0503, |
| "reward": 0.22319819265976548, |
| "reward_std": 0.7634806632995605, |
| "rewards/cosine_scaled_reward": -0.14881757437251508, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 428 |
| }, |
| { |
| "completion_length": 2405.25008392334, |
| "epoch": 0.49028571428571427, |
| "grad_norm": 0.03913323953747749, |
| "kl": 0.002506732940673828, |
| "learning_rate": 1.5566199398026147e-07, |
| "loss": 0.0521, |
| "reward": 0.43615793297067285, |
| "reward_std": 0.7961943745613098, |
| "rewards/cosine_scaled_reward": -0.06317103141918778, |
| "rewards/format_reward": 0.5625000186264515, |
| "step": 429 |
| }, |
| { |
| "completion_length": 2516.6041984558105, |
| "epoch": 0.49142857142857144, |
| "grad_norm": 0.03834352642297745, |
| "kl": 0.001491546630859375, |
| "learning_rate": 1.5415814221002265e-07, |
| "loss": 0.0167, |
| "reward": 0.698354821652174, |
| "reward_std": 0.5752123966813087, |
| "rewards/cosine_scaled_reward": 0.057510748505592346, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 430 |
| }, |
| { |
| "completion_length": 2556.229179382324, |
| "epoch": 0.49257142857142855, |
| "grad_norm": 0.021933771669864655, |
| "kl": 0.0019750595092773438, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": 0.027, |
| "reward": 0.1585888583213091, |
| "reward_std": 0.39748182776384056, |
| "rewards/cosine_scaled_reward": -0.16028890456072986, |
| "rewards/format_reward": 0.47916667722165585, |
| "step": 431 |
| }, |
| { |
| "completion_length": 2948.791717529297, |
| "epoch": 0.4937142857142857, |
| "grad_norm": 0.03498787805438042, |
| "kl": 0.0016126632690429688, |
| "learning_rate": 1.5120838934595337e-07, |
| "loss": 0.0353, |
| "reward": 0.1526618276257068, |
| "reward_std": 0.5853009335696697, |
| "rewards/cosine_scaled_reward": -0.1007524230517447, |
| "rewards/format_reward": 0.35416668094694614, |
| "step": 432 |
| }, |
| { |
| "completion_length": 2779.395866394043, |
| "epoch": 0.4948571428571429, |
| "grad_norm": 0.046449173241853714, |
| "kl": 0.001285552978515625, |
| "learning_rate": 1.4976263201891613e-07, |
| "loss": 0.0361, |
| "reward": 0.5470792800188065, |
| "reward_std": 0.7514309994876385, |
| "rewards/cosine_scaled_reward": 0.04437295813113451, |
| "rewards/format_reward": 0.4583333395421505, |
| "step": 433 |
| }, |
| { |
| "completion_length": 2833.0208740234375, |
| "epoch": 0.496, |
| "grad_norm": 0.03701100870966911, |
| "kl": 0.002968311309814453, |
| "learning_rate": 1.483363816965435e-07, |
| "loss": 0.0405, |
| "reward": -0.023492522537708282, |
| "reward_std": 0.5920409262180328, |
| "rewards/cosine_scaled_reward": -0.17841292917728424, |
| "rewards/format_reward": 0.33333334140479565, |
| "step": 434 |
| }, |
| { |
| "completion_length": 2294.895866394043, |
| "epoch": 0.49714285714285716, |
| "grad_norm": 0.03758431226015091, |
| "kl": 0.0011827945709228516, |
| "learning_rate": 1.469297078922642e-07, |
| "loss": 0.041, |
| "reward": 0.41764480620622635, |
| "reward_std": 0.6425671465694904, |
| "rewards/cosine_scaled_reward": -0.051594268530607224, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 435 |
| }, |
| { |
| "completion_length": 2106.375015258789, |
| "epoch": 0.4982857142857143, |
| "grad_norm": 0.021261492744088173, |
| "kl": 0.002359628677368164, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": -0.0056, |
| "reward": 0.863133005797863, |
| "reward_std": 0.5500740725547075, |
| "rewards/cosine_scaled_reward": 0.15031648427248, |
| "rewards/format_reward": 0.5625000018626451, |
| "step": 436 |
| }, |
| { |
| "completion_length": 2651.645866394043, |
| "epoch": 0.49942857142857144, |
| "grad_norm": 0.040734272450208664, |
| "kl": 0.0017135143280029297, |
| "learning_rate": 1.4417536311769885e-07, |
| "loss": 0.09, |
| "reward": 0.01604996738024056, |
| "reward_std": 0.665538102388382, |
| "rewards/cosine_scaled_reward": -0.22114169038832188, |
| "rewards/format_reward": 0.4583333395421505, |
| "step": 437 |
| }, |
| { |
| "completion_length": 2993.9791870117188, |
| "epoch": 0.5005714285714286, |
| "grad_norm": 0.05256485193967819, |
| "kl": 0.0008656978607177734, |
| "learning_rate": 1.4282782639029128e-07, |
| "loss": 0.0184, |
| "reward": 0.19979790039360523, |
| "reward_std": 0.8984194695949554, |
| "rewards/cosine_scaled_reward": -0.07718438468873501, |
| "rewards/format_reward": 0.35416667349636555, |
| "step": 438 |
| }, |
| { |
| "completion_length": 2457.937515258789, |
| "epoch": 0.5017142857142857, |
| "grad_norm": 0.026962121948599815, |
| "kl": 0.002255082130432129, |
| "learning_rate": 1.4150013466019114e-07, |
| "loss": 0.0319, |
| "reward": -0.06446752324700356, |
| "reward_std": 0.5604766383767128, |
| "rewards/cosine_scaled_reward": -0.2509837690740824, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 439 |
| }, |
| { |
| "completion_length": 2894.0625076293945, |
| "epoch": 0.5028571428571429, |
| "grad_norm": 0.02316948212683201, |
| "kl": 0.001163482666015625, |
| "learning_rate": 1.4019235263722034e-07, |
| "loss": 0.021, |
| "reward": -0.2982286214828491, |
| "reward_std": 0.4396653138101101, |
| "rewards/cosine_scaled_reward": -0.26369765028357506, |
| "rewards/format_reward": 0.2291666716337204, |
| "step": 440 |
| }, |
| { |
| "completion_length": 3043.270866394043, |
| "epoch": 0.504, |
| "grad_norm": 0.044711362570524216, |
| "kl": 0.0011162757873535156, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": 0.0672, |
| "reward": 0.10117581891245209, |
| "reward_std": 0.7147481255233288, |
| "rewards/cosine_scaled_reward": -0.07441211328841746, |
| "rewards/format_reward": 0.2500000037252903, |
| "step": 441 |
| }, |
| { |
| "completion_length": 2906.2083587646484, |
| "epoch": 0.5051428571428571, |
| "grad_norm": 0.03437203913927078, |
| "kl": 0.001821756362915039, |
| "learning_rate": 1.3763677169699217e-07, |
| "loss": 0.0265, |
| "reward": -0.09266237542033195, |
| "reward_std": 0.5656116344034672, |
| "rewards/cosine_scaled_reward": -0.19216451607644558, |
| "rewards/format_reward": 0.2916666716337204, |
| "step": 442 |
| }, |
| { |
| "completion_length": 3142.979179382324, |
| "epoch": 0.5062857142857143, |
| "grad_norm": 0.03172105550765991, |
| "kl": 0.0008325576782226562, |
| "learning_rate": 1.3638909733514452e-07, |
| "loss": 0.0029, |
| "reward": 0.1617426760494709, |
| "reward_std": 0.566162696108222, |
| "rewards/cosine_scaled_reward": -0.04412867687642574, |
| "rewards/format_reward": 0.2500000037252903, |
| "step": 443 |
| }, |
| { |
| "completion_length": 2948.208335876465, |
| "epoch": 0.5074285714285715, |
| "grad_norm": 0.031045345589518547, |
| "kl": 0.0011382102966308594, |
| "learning_rate": 1.351615817851748e-07, |
| "loss": 0.037, |
| "reward": 0.016486244276165962, |
| "reward_std": 0.395066162571311, |
| "rewards/cosine_scaled_reward": -0.11675689741969109, |
| "rewards/format_reward": 0.25, |
| "step": 444 |
| }, |
| { |
| "completion_length": 2974.375045776367, |
| "epoch": 0.5085714285714286, |
| "grad_norm": 0.0441366508603096, |
| "kl": 0.0014166831970214844, |
| "learning_rate": 1.3395428487445914e-07, |
| "loss": 0.0557, |
| "reward": 0.44049030914902687, |
| "reward_std": 0.8217526823282242, |
| "rewards/cosine_scaled_reward": 0.02232850785367191, |
| "rewards/format_reward": 0.39583334140479565, |
| "step": 445 |
| }, |
| { |
| "completion_length": 2984.479217529297, |
| "epoch": 0.5097142857142857, |
| "grad_norm": 0.04864772781729698, |
| "kl": 0.0013647079467773438, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": 0.0512, |
| "reward": 0.024046601727604866, |
| "reward_std": 0.828495018184185, |
| "rewards/cosine_scaled_reward": -0.14422670053318143, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 446 |
| }, |
| { |
| "completion_length": 2257.5625495910645, |
| "epoch": 0.5108571428571429, |
| "grad_norm": 0.0407555028796196, |
| "kl": 0.0022869110107421875, |
| "learning_rate": 1.316005813502869e-07, |
| "loss": 0.0436, |
| "reward": 0.5557453141082078, |
| "reward_std": 0.7650147341191769, |
| "rewards/cosine_scaled_reward": -0.045044018886983395, |
| "rewards/format_reward": 0.6458333414047956, |
| "step": 447 |
| }, |
| { |
| "completion_length": 2130.416690826416, |
| "epoch": 0.512, |
| "grad_norm": 0.018810981884598732, |
| "kl": 0.0023889541625976562, |
| "learning_rate": 1.3045428945301953e-07, |
| "loss": 0.0052, |
| "reward": 0.33343374729156494, |
| "reward_std": 0.37268934305757284, |
| "rewards/cosine_scaled_reward": -0.10411648452281952, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 448 |
| }, |
| { |
| "completion_length": 2702.791679382324, |
| "epoch": 0.5131428571428571, |
| "grad_norm": 0.027145305648446083, |
| "kl": 0.0036830902099609375, |
| "learning_rate": 1.2932844562179352e-07, |
| "loss": 0.0092, |
| "reward": -0.003954991698265076, |
| "reward_std": 0.48061698488891125, |
| "rewards/cosine_scaled_reward": -0.16864416131284088, |
| "rewards/format_reward": 0.3333333358168602, |
| "step": 449 |
| }, |
| { |
| "completion_length": 2385.8333587646484, |
| "epoch": 0.5142857142857142, |
| "grad_norm": 0.13469496369361877, |
| "kl": 0.0027618408203125, |
| "learning_rate": 1.2822310472864885e-07, |
| "loss": 0.0508, |
| "reward": 0.16872227331623435, |
| "reward_std": 0.7135936170816422, |
| "rewards/cosine_scaled_reward": -0.1343888696283102, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 450 |
| }, |
| { |
| "completion_length": 2780.7916946411133, |
| "epoch": 0.5154285714285715, |
| "grad_norm": 0.03198925033211708, |
| "kl": 0.0016019344329833984, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": 0.001, |
| "reward": -0.05699261091649532, |
| "reward_std": 0.5354622071608901, |
| "rewards/cosine_scaled_reward": -0.17432964034378529, |
| "rewards/format_reward": 0.2916666679084301, |
| "step": 451 |
| }, |
| { |
| "completion_length": 3177.8958740234375, |
| "epoch": 0.5165714285714286, |
| "grad_norm": 0.03804057464003563, |
| "kl": 0.0010409355163574219, |
| "learning_rate": 1.260741462457165e-07, |
| "loss": 0.0253, |
| "reward": 0.47885728627443314, |
| "reward_std": 0.6608881391584873, |
| "rewards/cosine_scaled_reward": 0.06234530918300152, |
| "rewards/format_reward": 0.3541666679084301, |
| "step": 452 |
| }, |
| { |
| "completion_length": 2606.3125381469727, |
| "epoch": 0.5177142857142857, |
| "grad_norm": 0.036739181727170944, |
| "kl": 0.0011627674102783203, |
| "learning_rate": 1.2503063339313356e-07, |
| "loss": 0.0038, |
| "reward": 0.17389470152556896, |
| "reward_std": 0.671304065734148, |
| "rewards/cosine_scaled_reward": -0.1422193255275488, |
| "rewards/format_reward": 0.45833333395421505, |
| "step": 453 |
| }, |
| { |
| "completion_length": 2695.1459045410156, |
| "epoch": 0.5188571428571429, |
| "grad_norm": 0.034578122198581696, |
| "kl": 0.0020804405212402344, |
| "learning_rate": 1.2400783294793668e-07, |
| "loss": 0.0247, |
| "reward": 0.44620663672685623, |
| "reward_std": 0.7098315693438053, |
| "rewards/cosine_scaled_reward": -0.06856334558688104, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 454 |
| }, |
| { |
| "completion_length": 2958.7708740234375, |
| "epoch": 0.52, |
| "grad_norm": 0.030018191784620285, |
| "kl": 0.0013599395751953125, |
| "learning_rate": 1.2300579475997657e-07, |
| "loss": 0.0325, |
| "reward": -0.15596182458102703, |
| "reward_std": 0.5225675888359547, |
| "rewards/cosine_scaled_reward": -0.25506424461491406, |
| "rewards/format_reward": 0.3541666753590107, |
| "step": 455 |
| }, |
| { |
| "completion_length": 3160.291679382324, |
| "epoch": 0.5211428571428571, |
| "grad_norm": 0.04109412059187889, |
| "kl": 0.0009658336639404297, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": 0.0054, |
| "reward": 0.016125384718179703, |
| "reward_std": 0.6339304633438587, |
| "rewards/cosine_scaled_reward": -0.10652064997702837, |
| "rewards/format_reward": 0.22916666977107525, |
| "step": 456 |
| }, |
| { |
| "completion_length": 3051.416679382324, |
| "epoch": 0.5222857142857142, |
| "grad_norm": 0.046411171555519104, |
| "kl": 0.0009918212890625, |
| "learning_rate": 1.2106419949317388e-07, |
| "loss": 0.0284, |
| "reward": -0.02184194140136242, |
| "reward_std": 0.687283992767334, |
| "rewards/cosine_scaled_reward": -0.1255043102428317, |
| "rewards/format_reward": 0.2291666679084301, |
| "step": 457 |
| }, |
| { |
| "completion_length": 2320.4167098999023, |
| "epoch": 0.5234285714285715, |
| "grad_norm": 0.03369192034006119, |
| "kl": 0.0011844635009765625, |
| "learning_rate": 1.2012473704494537e-07, |
| "loss": -0.0035, |
| "reward": 0.22107653319835663, |
| "reward_std": 0.5749458260834217, |
| "rewards/cosine_scaled_reward": -0.13946174271404743, |
| "rewards/format_reward": 0.5000000055879354, |
| "step": 458 |
| }, |
| { |
| "completion_length": 1983.1667022705078, |
| "epoch": 0.5245714285714286, |
| "grad_norm": 0.04080894589424133, |
| "kl": 0.0028548240661621094, |
| "learning_rate": 1.1920622611056974e-07, |
| "loss": 0.0328, |
| "reward": 0.7140507474541664, |
| "reward_std": 0.9648182429373264, |
| "rewards/cosine_scaled_reward": 0.023692045360803604, |
| "rewards/format_reward": 0.6666666697710752, |
| "step": 459 |
| }, |
| { |
| "completion_length": 3069.437545776367, |
| "epoch": 0.5257142857142857, |
| "grad_norm": 0.05579410493373871, |
| "kl": 0.0021648406982421875, |
| "learning_rate": 1.1830871145697412e-07, |
| "loss": 0.0697, |
| "reward": 0.0923931347206235, |
| "reward_std": 0.8388851378113031, |
| "rewards/cosine_scaled_reward": -0.14130343310534954, |
| "rewards/format_reward": 0.37500000558793545, |
| "step": 460 |
| }, |
| { |
| "completion_length": 2985.0833435058594, |
| "epoch": 0.5268571428571428, |
| "grad_norm": 0.031334202736616135, |
| "kl": 0.0011229515075683594, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": 0.0152, |
| "reward": -0.03977155685424805, |
| "reward_std": 0.5789243541657925, |
| "rewards/cosine_scaled_reward": -0.15530244540423155, |
| "rewards/format_reward": 0.2708333395421505, |
| "step": 461 |
| }, |
| { |
| "completion_length": 2826.895835876465, |
| "epoch": 0.528, |
| "grad_norm": 0.01678531803190708, |
| "kl": 0.0008094310760498047, |
| "learning_rate": 1.1657684494105386e-07, |
| "loss": -0.0017, |
| "reward": -0.2524063400924206, |
| "reward_std": 0.340161694213748, |
| "rewards/cosine_scaled_reward": -0.2616198379546404, |
| "rewards/format_reward": 0.27083333395421505, |
| "step": 462 |
| }, |
| { |
| "completion_length": 2652.9791870117188, |
| "epoch": 0.5291428571428571, |
| "grad_norm": 0.030127134174108505, |
| "kl": 0.002656221389770508, |
| "learning_rate": 1.1574257748745986e-07, |
| "loss": 0.0208, |
| "reward": 0.33723626285791397, |
| "reward_std": 0.6605229675769806, |
| "rewards/cosine_scaled_reward": -0.050131882540881634, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 463 |
| }, |
| { |
| "completion_length": 1788.250015258789, |
| "epoch": 0.5302857142857142, |
| "grad_norm": 0.014693168923258781, |
| "kl": 0.0027604103088378906, |
| "learning_rate": 1.1492947512799328e-07, |
| "loss": -0.0001, |
| "reward": 0.8204499296844006, |
| "reward_std": 0.36152973771095276, |
| "rewards/cosine_scaled_reward": 0.09772495925426483, |
| "rewards/format_reward": 0.625, |
| "step": 464 |
| }, |
| { |
| "completion_length": 3023.291679382324, |
| "epoch": 0.5314285714285715, |
| "grad_norm": 0.02821505255997181, |
| "kl": 0.001667022705078125, |
| "learning_rate": 1.1413757749211602e-07, |
| "loss": 0.0299, |
| "reward": -0.02931244857609272, |
| "reward_std": 0.4763452261686325, |
| "rewards/cosine_scaled_reward": -0.1604895582422614, |
| "rewards/format_reward": 0.29166666977107525, |
| "step": 465 |
| }, |
| { |
| "completion_length": 2858.8958435058594, |
| "epoch": 0.5325714285714286, |
| "grad_norm": 0.016902461647987366, |
| "kl": 0.0013074874877929688, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": 0.0013, |
| "reward": 0.20501752942800522, |
| "reward_std": 0.389689764007926, |
| "rewards/cosine_scaled_reward": -0.012074556201696396, |
| "rewards/format_reward": 0.2291666716337204, |
| "step": 466 |
| }, |
| { |
| "completion_length": 3056.7291870117188, |
| "epoch": 0.5337142857142857, |
| "grad_norm": 0.024957498535513878, |
| "kl": 0.0012183189392089844, |
| "learning_rate": 1.1261754973965422e-07, |
| "loss": 0.0121, |
| "reward": -0.0974850058555603, |
| "reward_std": 0.47587360069155693, |
| "rewards/cosine_scaled_reward": -0.17374251037836075, |
| "rewards/format_reward": 0.2500000037252903, |
| "step": 467 |
| }, |
| { |
| "completion_length": 2902.0208740234375, |
| "epoch": 0.5348571428571428, |
| "grad_norm": 0.020729240030050278, |
| "kl": 0.0029973983764648438, |
| "learning_rate": 1.1188949370707787e-07, |
| "loss": 0.0248, |
| "reward": -0.3498317201156169, |
| "reward_std": 0.31344279972836375, |
| "rewards/cosine_scaled_reward": -0.3207492008805275, |
| "rewards/format_reward": 0.2916666716337204, |
| "step": 468 |
| }, |
| { |
| "completion_length": 2738.8958854675293, |
| "epoch": 0.536, |
| "grad_norm": 0.04061265289783478, |
| "kl": 0.0017533302307128906, |
| "learning_rate": 1.1118279056249653e-07, |
| "loss": 0.0035, |
| "reward": 0.16780934482812881, |
| "reward_std": 0.6997935622930527, |
| "rewards/cosine_scaled_reward": -0.10359533131122589, |
| "rewards/format_reward": 0.3750000074505806, |
| "step": 469 |
| }, |
| { |
| "completion_length": 3170.3334045410156, |
| "epoch": 0.5371428571428571, |
| "grad_norm": 0.05352408438920975, |
| "kl": 0.0013861656188964844, |
| "learning_rate": 1.1049747474962444e-07, |
| "loss": 0.0772, |
| "reward": -0.12879883614368737, |
| "reward_std": 0.7819614112377167, |
| "rewards/cosine_scaled_reward": -0.21023273840546608, |
| "rewards/format_reward": 0.2916666679084301, |
| "step": 470 |
| }, |
| { |
| "completion_length": 3140.2291870117188, |
| "epoch": 0.5382857142857143, |
| "grad_norm": 0.03319269046187401, |
| "kl": 0.0009810924530029297, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": -0.0023, |
| "reward": 0.078701913356781, |
| "reward_std": 0.5363645683974028, |
| "rewards/cosine_scaled_reward": -0.10648237727582455, |
| "rewards/format_reward": 0.2916666679084301, |
| "step": 471 |
| }, |
| { |
| "completion_length": 2882.3750610351562, |
| "epoch": 0.5394285714285715, |
| "grad_norm": 0.04058046266436577, |
| "kl": 0.0022449493408203125, |
| "learning_rate": 1.0919113768029517e-07, |
| "loss": -0.0056, |
| "reward": 0.23404515581205487, |
| "reward_std": 0.6296902745962143, |
| "rewards/cosine_scaled_reward": -0.11214408185333014, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 472 |
| }, |
| { |
| "completion_length": 3213.0625, |
| "epoch": 0.5405714285714286, |
| "grad_norm": 0.03943871706724167, |
| "kl": 0.0012938976287841797, |
| "learning_rate": 1.0857018009286381e-07, |
| "loss": 0.0228, |
| "reward": 0.07427317555993795, |
| "reward_std": 0.7803191654384136, |
| "rewards/cosine_scaled_reward": -0.06703009083867073, |
| "rewards/format_reward": 0.2083333358168602, |
| "step": 473 |
| }, |
| { |
| "completion_length": 2547.12504196167, |
| "epoch": 0.5417142857142857, |
| "grad_norm": 0.04717176407575607, |
| "kl": 0.0013964176177978516, |
| "learning_rate": 1.0797073717209013e-07, |
| "loss": 0.0238, |
| "reward": 0.7898019873537123, |
| "reward_std": 0.8049750197678804, |
| "rewards/cosine_scaled_reward": 0.15531766414642334, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 474 |
| }, |
| { |
| "completion_length": 2314.645881652832, |
| "epoch": 0.5428571428571428, |
| "grad_norm": 0.041776709258556366, |
| "kl": 0.0023360252380371094, |
| "learning_rate": 1.0739283813397639e-07, |
| "loss": 0.0467, |
| "reward": 0.635431744158268, |
| "reward_std": 0.7481589121744037, |
| "rewards/cosine_scaled_reward": 0.026049194857478142, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 475 |
| }, |
| { |
| "completion_length": 2867.8750610351562, |
| "epoch": 0.544, |
| "grad_norm": 0.053181666880846024, |
| "kl": 0.002093791961669922, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": 0.0942, |
| "reward": 0.23149395920336246, |
| "reward_std": 0.7845707032829523, |
| "rewards/cosine_scaled_reward": -0.050919692032039165, |
| "rewards/format_reward": 0.33333334513008595, |
| "step": 476 |
| }, |
| { |
| "completion_length": 2275.062545776367, |
| "epoch": 0.5451428571428572, |
| "grad_norm": 0.0416170097887516, |
| "kl": 0.001888275146484375, |
| "learning_rate": 1.063017833182728e-07, |
| "loss": 0.031, |
| "reward": 1.290647640824318, |
| "reward_std": 0.759942751377821, |
| "rewards/cosine_scaled_reward": 0.22865712642669678, |
| "rewards/format_reward": 0.8333333488553762, |
| "step": 477 |
| }, |
| { |
| "completion_length": 3111.270896911621, |
| "epoch": 0.5462857142857143, |
| "grad_norm": 0.0378287248313427, |
| "kl": 0.0014688968658447266, |
| "learning_rate": 1.0578868071715544e-07, |
| "loss": 0.0332, |
| "reward": 0.21425859350711107, |
| "reward_std": 0.8052146192640066, |
| "rewards/cosine_scaled_reward": -0.06995403207838535, |
| "rewards/format_reward": 0.35416666977107525, |
| "step": 478 |
| }, |
| { |
| "completion_length": 2907.12508392334, |
| "epoch": 0.5474285714285714, |
| "grad_norm": 0.04446679353713989, |
| "kl": 0.0024013519287109375, |
| "learning_rate": 1.0529722834905125e-07, |
| "loss": 0.0509, |
| "reward": 0.1522473245859146, |
| "reward_std": 0.8119659163057804, |
| "rewards/cosine_scaled_reward": -0.13220967527013272, |
| "rewards/format_reward": 0.41666667722165585, |
| "step": 479 |
| }, |
| { |
| "completion_length": 2317.8125381469727, |
| "epoch": 0.5485714285714286, |
| "grad_norm": 0.040562789887189865, |
| "kl": 0.0023407936096191406, |
| "learning_rate": 1.0482745016665526e-07, |
| "loss": 0.0618, |
| "reward": 0.23905727022793144, |
| "reward_std": 0.696567952632904, |
| "rewards/cosine_scaled_reward": -0.14088802970945835, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 480 |
| }, |
| { |
| "completion_length": 3088.7291870117188, |
| "epoch": 0.5497142857142857, |
| "grad_norm": 0.04644802585244179, |
| "kl": 0.001194000244140625, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0554, |
| "reward": -0.07575955055654049, |
| "reward_std": 0.6837183889001608, |
| "rewards/cosine_scaled_reward": -0.22537978272885084, |
| "rewards/format_reward": 0.3750000111758709, |
| "step": 481 |
| }, |
| { |
| "completion_length": 2879.500030517578, |
| "epoch": 0.5508571428571428, |
| "grad_norm": 0.033522870391607285, |
| "kl": 0.0019969940185546875, |
| "learning_rate": 1.0395300688680625e-07, |
| "loss": 0.0197, |
| "reward": 0.3703960571438074, |
| "reward_std": 0.726120725274086, |
| "rewards/cosine_scaled_reward": 0.02894800715148449, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 482 |
| }, |
| { |
| "completion_length": 2807.6458587646484, |
| "epoch": 0.552, |
| "grad_norm": 0.029158450663089752, |
| "kl": 0.0014438629150390625, |
| "learning_rate": 1.0354838440848501e-07, |
| "loss": 0.0186, |
| "reward": 0.02249845489859581, |
| "reward_std": 0.5169437192380428, |
| "rewards/cosine_scaled_reward": -0.2179174479097128, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 483 |
| }, |
| { |
| "completion_length": 2524.9583778381348, |
| "epoch": 0.5531428571428572, |
| "grad_norm": 0.042325735092163086, |
| "kl": 0.0013332366943359375, |
| "learning_rate": 1.0316552135205837e-07, |
| "loss": 0.0593, |
| "reward": 0.270411791279912, |
| "reward_std": 0.7619899287819862, |
| "rewards/cosine_scaled_reward": -0.08354412391781807, |
| "rewards/format_reward": 0.4375000037252903, |
| "step": 484 |
| }, |
| { |
| "completion_length": 1929.500015258789, |
| "epoch": 0.5542857142857143, |
| "grad_norm": 0.03541100025177002, |
| "kl": 0.0027222633361816406, |
| "learning_rate": 1.0280443637773163e-07, |
| "loss": 0.0206, |
| "reward": 0.4553542360663414, |
| "reward_std": 0.7294214889407158, |
| "rewards/cosine_scaled_reward": -0.12648956006160006, |
| "rewards/format_reward": 0.7083333358168602, |
| "step": 485 |
| }, |
| { |
| "completion_length": 2066.2292137145996, |
| "epoch": 0.5554285714285714, |
| "grad_norm": 0.025865087285637856, |
| "kl": 0.0031957626342773438, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": 0.0397, |
| "reward": 0.31694683339446783, |
| "reward_std": 0.37571004405617714, |
| "rewards/cosine_scaled_reward": -0.11235993192531168, |
| "rewards/format_reward": 0.5416666697710752, |
| "step": 486 |
| }, |
| { |
| "completion_length": 1944.8750400543213, |
| "epoch": 0.5565714285714286, |
| "grad_norm": 0.028248826041817665, |
| "kl": 0.0013153553009033203, |
| "learning_rate": 1.0214767000817596e-07, |
| "loss": 0.0261, |
| "reward": 1.1763640493154526, |
| "reward_std": 0.6212100638076663, |
| "rewards/cosine_scaled_reward": 0.24443202713155188, |
| "rewards/format_reward": 0.6875, |
| "step": 487 |
| }, |
| { |
| "completion_length": 2371.125030517578, |
| "epoch": 0.5577142857142857, |
| "grad_norm": 0.01648075133562088, |
| "kl": 0.0020608901977539062, |
| "learning_rate": 1.0185202062281336e-07, |
| "loss": 0.0115, |
| "reward": 0.06456807442009449, |
| "reward_std": 0.45360255613923073, |
| "rewards/cosine_scaled_reward": -0.19688262417912483, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 488 |
| }, |
| { |
| "completion_length": 3052.4166717529297, |
| "epoch": 0.5588571428571428, |
| "grad_norm": 0.027904629707336426, |
| "kl": 0.0017666816711425781, |
| "learning_rate": 1.0157821333772304e-07, |
| "loss": 0.012, |
| "reward": -0.22679759562015533, |
| "reward_std": 0.4569165948778391, |
| "rewards/cosine_scaled_reward": -0.22798212803900242, |
| "rewards/format_reward": 0.22916666977107525, |
| "step": 489 |
| }, |
| { |
| "completion_length": 2675.854217529297, |
| "epoch": 0.56, |
| "grad_norm": 0.03642764315009117, |
| "kl": 0.0011510848999023438, |
| "learning_rate": 1.013262614978859e-07, |
| "loss": 0.0183, |
| "reward": 0.6243100725114346, |
| "reward_std": 0.6654925271868706, |
| "rewards/cosine_scaled_reward": 0.03090503066778183, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 490 |
| }, |
| { |
| "completion_length": 2528.312515258789, |
| "epoch": 0.5611428571428572, |
| "grad_norm": 0.053110428154468536, |
| "kl": 0.006933927536010742, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": 0.0718, |
| "reward": 0.5572945028543472, |
| "reward_std": 0.755212415009737, |
| "rewards/cosine_scaled_reward": 0.00781390443444252, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 491 |
| }, |
| { |
| "completion_length": 2572.291679382324, |
| "epoch": 0.5622857142857143, |
| "grad_norm": 0.028202729299664497, |
| "kl": 0.0012972354888916016, |
| "learning_rate": 1.0088797220727779e-07, |
| "loss": -0.0014, |
| "reward": 0.14036076702177525, |
| "reward_std": 0.5402057655155659, |
| "rewards/cosine_scaled_reward": -0.12773629277944565, |
| "rewards/format_reward": 0.39583333395421505, |
| "step": 492 |
| }, |
| { |
| "completion_length": 2267.0833740234375, |
| "epoch": 0.5634285714285714, |
| "grad_norm": 0.04113708436489105, |
| "kl": 0.001312255859375, |
| "learning_rate": 1.0070165611810855e-07, |
| "loss": 0.0483, |
| "reward": 0.8241308415308595, |
| "reward_std": 0.7886662483215332, |
| "rewards/cosine_scaled_reward": 0.08914875192567706, |
| "rewards/format_reward": 0.645833345130086, |
| "step": 493 |
| }, |
| { |
| "completion_length": 2292.354232788086, |
| "epoch": 0.5645714285714286, |
| "grad_norm": 0.05553987994790077, |
| "kl": 0.0015769004821777344, |
| "learning_rate": 1.005372381963547e-07, |
| "loss": 0.0064, |
| "reward": 0.927642391063273, |
| "reward_std": 1.0857042334973812, |
| "rewards/cosine_scaled_reward": 0.10965454811230302, |
| "rewards/format_reward": 0.7083333376795053, |
| "step": 494 |
| }, |
| { |
| "completion_length": 3287.5208740234375, |
| "epoch": 0.5657142857142857, |
| "grad_norm": 0.05122198164463043, |
| "kl": 0.0015921592712402344, |
| "learning_rate": 1.0039472645551372e-07, |
| "loss": 0.048, |
| "reward": 0.158145634457469, |
| "reward_std": 0.8010805640369654, |
| "rewards/cosine_scaled_reward": -0.07717718556523323, |
| "rewards/format_reward": 0.3125000074505806, |
| "step": 495 |
| }, |
| { |
| "completion_length": 1689.2500038146973, |
| "epoch": 0.5668571428571428, |
| "grad_norm": 0.03907603397965431, |
| "kl": 0.001201629638671875, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": -0.003, |
| "reward": 1.0886746868491173, |
| "reward_std": 0.9113657027482986, |
| "rewards/cosine_scaled_reward": 0.16933735646307468, |
| "rewards/format_reward": 0.7500000111758709, |
| "step": 496 |
| }, |
| { |
| "completion_length": 2586.895851135254, |
| "epoch": 0.568, |
| "grad_norm": 0.039880234748125076, |
| "kl": 0.0023741722106933594, |
| "learning_rate": 1.0017544823184055e-07, |
| "loss": 0.0129, |
| "reward": 0.7896264218725264, |
| "reward_std": 0.6654196940362453, |
| "rewards/cosine_scaled_reward": 0.1656465344130993, |
| "rewards/format_reward": 0.45833333395421505, |
| "step": 497 |
| }, |
| { |
| "completion_length": 2759.7500228881836, |
| "epoch": 0.5691428571428572, |
| "grad_norm": 0.03684534505009651, |
| "kl": 0.0011496543884277344, |
| "learning_rate": 1.0009869243631952e-07, |
| "loss": -0.0041, |
| "reward": 0.49048804119229317, |
| "reward_std": 0.7747926935553551, |
| "rewards/cosine_scaled_reward": -0.004755992442369461, |
| "rewards/format_reward": 0.5000000018626451, |
| "step": 498 |
| }, |
| { |
| "completion_length": 2801.791702270508, |
| "epoch": 0.5702857142857143, |
| "grad_norm": 0.03754589334130287, |
| "kl": 0.0016314983367919922, |
| "learning_rate": 1.000438641958131e-07, |
| "loss": 0.0704, |
| "reward": 0.5553555823862553, |
| "reward_std": 0.9310576990246773, |
| "rewards/cosine_scaled_reward": 0.03809444326907396, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 499 |
| }, |
| { |
| "completion_length": 3048.500045776367, |
| "epoch": 0.5714285714285714, |
| "grad_norm": 0.03999260067939758, |
| "kl": 0.0013794898986816406, |
| "learning_rate": 1.0001096618257236e-07, |
| "loss": 0.0509, |
| "reward": -0.01913883164525032, |
| "reward_std": 0.7366000525653362, |
| "rewards/cosine_scaled_reward": -0.14498609118163586, |
| "rewards/format_reward": 0.27083333767950535, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "step": 500, |
| "total_flos": 0.0, |
| "train_loss": 0.027085746813583684, |
| "train_runtime": 165463.4808, |
| "train_samples_per_second": 0.145, |
| "train_steps_per_second": 0.003 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|