| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5714285714285714, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "advantage_max": 0.1587137309834361, |
| "advantage_mean": -8.537123882823572e-09, |
| "advantage_min": -0.14114269940182567, |
| "advantage_std": 0.12265788647346199, |
| "completion_length": 2253.854206085205, |
| "epoch": 0.001142857142857143, |
| "grad_norm": 0.0020193569362163544, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.0016, |
| "reward": 0.16043265676125884, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12265789229422808, |
| "rewards/cosine_scaled_reward": 0.16032031644135714, |
| "rewards/format_reward": 0.6250000037252903, |
| "step": 1 |
| }, |
| { |
| "advantage_max": 0.15091374469920993, |
| "advantage_mean": -5.6655459629295635e-09, |
| "advantage_min": -0.1861058697104454, |
| "advantage_std": 0.13657334074378014, |
| "completion_length": 2566.395854949951, |
| "epoch": 0.002285714285714286, |
| "grad_norm": 0.003380397567525506, |
| "kl": 0.0, |
| "learning_rate": 2e-08, |
| "loss": 0.0117, |
| "reward": 0.13084001699462533, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1365733384154737, |
| "rewards/cosine_scaled_reward": 0.12725313939154148, |
| "rewards/format_reward": 0.5208333414047956, |
| "step": 2 |
| }, |
| { |
| "advantage_max": 0.16393633373081684, |
| "advantage_mean": -4.2685618234505895e-09, |
| "advantage_min": -0.14902829099446535, |
| "advantage_std": 0.12841436569578946, |
| "completion_length": 2859.7708740234375, |
| "epoch": 0.0034285714285714284, |
| "grad_norm": 0.0022618037182837725, |
| "kl": 0.00017625093460083008, |
| "learning_rate": 4e-08, |
| "loss": 0.0052, |
| "reward": 0.038310326635837555, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1284143622033298, |
| "rewards/cosine_scaled_reward": -0.08417008072137833, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 3 |
| }, |
| { |
| "advantage_max": 0.14144689589738846, |
| "advantage_mean": -9.934107814135729e-09, |
| "advantage_min": -0.1055870414711535, |
| "advantage_std": 0.10548431565985084, |
| "completion_length": 1437.4375305175781, |
| "epoch": 0.004571428571428572, |
| "grad_norm": 0.0020079181995242834, |
| "kl": 0.00010022521018981934, |
| "learning_rate": 6e-08, |
| "loss": 0.0045, |
| "reward": 0.2122791176661849, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10548432543873787, |
| "rewards/cosine_scaled_reward": 0.207040709676221, |
| "rewards/format_reward": 0.8333333358168602, |
| "step": 4 |
| }, |
| { |
| "advantage_max": 0.21683576330542564, |
| "advantage_mean": -4.967053768289986e-09, |
| "advantage_min": -0.2010047109797597, |
| "advantage_std": 0.1657102182507515, |
| "completion_length": 3105.1458587646484, |
| "epoch": 0.005714285714285714, |
| "grad_norm": 0.004448694176971912, |
| "kl": 0.00015798211097717285, |
| "learning_rate": 8e-08, |
| "loss": 0.014, |
| "reward": 0.08359449577983469, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1657102219760418, |
| "rewards/cosine_scaled_reward": 0.01688160002231598, |
| "rewards/format_reward": 0.45833335258066654, |
| "step": 5 |
| }, |
| { |
| "advantage_max": 0.22248137276619673, |
| "advantage_mean": -2.79396782099095e-09, |
| "advantage_min": -0.15466022863984108, |
| "advantage_std": 0.14450703794136643, |
| "completion_length": 2365.9583435058594, |
| "epoch": 0.006857142857142857, |
| "grad_norm": 0.0027177755255252123, |
| "kl": 0.00011008977890014648, |
| "learning_rate": 1e-07, |
| "loss": 0.0044, |
| "reward": 0.06316112671629526, |
| "reward_advantage_correlation": 0.9999999999999997, |
| "reward_std": 0.14450704446062446, |
| "rewards/cosine_scaled_reward": -0.11656539421528578, |
| "rewards/format_reward": 0.6041666772216558, |
| "step": 6 |
| }, |
| { |
| "advantage_max": 0.16089605633169413, |
| "advantage_mean": -9.934107828013516e-09, |
| "advantage_min": -0.19252846017479897, |
| "advantage_std": 0.14089244278147817, |
| "completion_length": 2459.416732788086, |
| "epoch": 0.008, |
| "grad_norm": 0.0031194211915135384, |
| "kl": 0.00013837218284606934, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0105, |
| "reward": 0.11613669246435165, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14089244417846203, |
| "rewards/cosine_scaled_reward": -0.010624844580888748, |
| "rewards/format_reward": 0.7083333432674408, |
| "step": 7 |
| }, |
| { |
| "advantage_max": 0.16271019540727139, |
| "advantage_mean": 2.1730860721991263e-09, |
| "advantage_min": -0.18036252213642, |
| "advantage_std": 0.1353623152244836, |
| "completion_length": 1742.7083702087402, |
| "epoch": 0.009142857142857144, |
| "grad_norm": 0.0014795621391385794, |
| "kl": 6.800517439842224e-05, |
| "learning_rate": 1.4e-07, |
| "loss": -0.0062, |
| "reward": 0.2013200237415731, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1353623173199594, |
| "rewards/cosine_scaled_reward": 0.24841690342873335, |
| "rewards/format_reward": 0.6875000055879354, |
| "step": 8 |
| }, |
| { |
| "advantage_max": 0.18095109798014164, |
| "advantage_mean": -1.5522042678961512e-09, |
| "advantage_min": -0.14702739380300045, |
| "advantage_std": 0.1355222244746983, |
| "completion_length": 2552.8125228881836, |
| "epoch": 0.010285714285714285, |
| "grad_norm": 0.0029326872900128365, |
| "kl": 0.00014982372522354126, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0096, |
| "reward": 0.05854184099007398, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13552221888676286, |
| "rewards/cosine_scaled_reward": -0.06791253259871155, |
| "rewards/format_reward": 0.47916667349636555, |
| "step": 9 |
| }, |
| { |
| "advantage_max": 0.12060405313968658, |
| "advantage_mean": 4.113341514622171e-09, |
| "advantage_min": -0.142030019313097, |
| "advantage_std": 0.11728623230010271, |
| "completion_length": 2442.5833587646484, |
| "epoch": 0.011428571428571429, |
| "grad_norm": 0.002123701386153698, |
| "kl": 0.00010627508163452148, |
| "learning_rate": 1.8e-07, |
| "loss": 0.0069, |
| "reward": 0.12899871496483684, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11728623881936073, |
| "rewards/cosine_scaled_reward": 0.09703383408486843, |
| "rewards/format_reward": 0.5625000037252903, |
| "step": 10 |
| }, |
| { |
| "advantage_max": 0.21631120378151536, |
| "advantage_mean": 3.1044089521259366e-10, |
| "advantage_min": -0.177880696952343, |
| "advantage_std": 0.1529299677349627, |
| "completion_length": 3065.0833587646484, |
| "epoch": 0.012571428571428572, |
| "grad_norm": 0.0030999763403087854, |
| "kl": 0.0001526474952697754, |
| "learning_rate": 2e-07, |
| "loss": 0.014, |
| "reward": 0.025770303327590227, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15292996680364013, |
| "rewards/cosine_scaled_reward": -0.09046578034758568, |
| "rewards/format_reward": 0.3333333469927311, |
| "step": 11 |
| }, |
| { |
| "advantage_max": 0.17459326144307852, |
| "advantage_mean": -6.8296991118099726e-09, |
| "advantage_min": -0.15691349003463984, |
| "advantage_std": 0.14575656410306692, |
| "completion_length": 1936.3542022705078, |
| "epoch": 0.013714285714285714, |
| "grad_norm": 0.00448928028345108, |
| "kl": 0.00013959407806396484, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": 0.0119, |
| "reward": 0.15124552277848125, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.14575657108798623, |
| "rewards/cosine_scaled_reward": 0.06061667948961258, |
| "rewards/format_reward": 0.7708333358168602, |
| "step": 12 |
| }, |
| { |
| "advantage_max": 0.1999238901771605, |
| "advantage_mean": -6.053596887656276e-09, |
| "advantage_min": -0.17521686758846045, |
| "advantage_std": 0.1415856694802642, |
| "completion_length": 2761.5625381469727, |
| "epoch": 0.014857142857142857, |
| "grad_norm": 0.0024026259779930115, |
| "kl": 0.00013381242752075195, |
| "learning_rate": 2.4e-07, |
| "loss": 0.0116, |
| "reward": 0.07723551848903298, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14158567460253835, |
| "rewards/cosine_scaled_reward": 0.02042432501912117, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 13 |
| }, |
| { |
| "advantage_max": 0.14435320254415274, |
| "advantage_mean": -1.5522043650406658e-09, |
| "advantage_min": -0.13152476958930492, |
| "advantage_std": 0.10402885114308447, |
| "completion_length": 2247.8750228881836, |
| "epoch": 0.016, |
| "grad_norm": 0.001248424407094717, |
| "kl": 0.00012195110321044922, |
| "learning_rate": 2.6e-07, |
| "loss": 0.0017, |
| "reward": 0.0968486382625997, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1040288528893143, |
| "rewards/cosine_scaled_reward": -0.01446421816945076, |
| "rewards/format_reward": 0.6041666697710752, |
| "step": 14 |
| }, |
| { |
| "advantage_max": 0.07975641544908285, |
| "advantage_mean": -9.313225607376907e-10, |
| "advantage_min": -0.09877181611955166, |
| "advantage_std": 0.0715660082641989, |
| "completion_length": 2612.458366394043, |
| "epoch": 0.017142857142857144, |
| "grad_norm": 0.0010066244285553694, |
| "kl": 0.000102996826171875, |
| "learning_rate": 2.8e-07, |
| "loss": -0.0006, |
| "reward": 0.12488156370818615, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.07156601082533598, |
| "rewards/cosine_scaled_reward": 0.11940987780690193, |
| "rewards/format_reward": 0.5, |
| "step": 15 |
| }, |
| { |
| "advantage_max": 0.12272757943719625, |
| "advantage_mean": -4.656613150633149e-10, |
| "advantage_min": -0.13491731509566307, |
| "advantage_std": 0.1079402850009501, |
| "completion_length": 3536.8958435058594, |
| "epoch": 0.018285714285714287, |
| "grad_norm": 0.002257565502077341, |
| "kl": 0.0002008676528930664, |
| "learning_rate": 3e-07, |
| "loss": 0.0014, |
| "reward": 0.007092840503901243, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1079402850009501, |
| "rewards/cosine_scaled_reward": -0.06191747821867466, |
| "rewards/format_reward": 0.16666667349636555, |
| "step": 16 |
| }, |
| { |
| "advantage_max": 0.15527505613863468, |
| "advantage_mean": -1.2417634698280722e-09, |
| "advantage_min": -0.17817360255867243, |
| "advantage_std": 0.13301802705973387, |
| "completion_length": 1949.2916793823242, |
| "epoch": 0.019428571428571427, |
| "grad_norm": 0.0018681100336834788, |
| "kl": 0.00013206154108047485, |
| "learning_rate": 3.2e-07, |
| "loss": -0.0032, |
| "reward": 0.16996530396863818, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.13301803125068545, |
| "rewards/cosine_scaled_reward": 0.15604307316243649, |
| "rewards/format_reward": 0.6875000055879354, |
| "step": 17 |
| }, |
| { |
| "advantage_max": 0.1609904021024704, |
| "advantage_mean": 4.163336342344337e-17, |
| "advantage_min": -0.1347449072636664, |
| "advantage_std": 0.1273661465384066, |
| "completion_length": 2442.2083587646484, |
| "epoch": 0.02057142857142857, |
| "grad_norm": 0.002498304471373558, |
| "kl": 0.0001045428216457367, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.0077, |
| "reward": 0.11062846053391695, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12736614840105176, |
| "rewards/cosine_scaled_reward": 0.013204759918153286, |
| "rewards/format_reward": 0.625, |
| "step": 18 |
| }, |
| { |
| "advantage_max": 0.20167000405490398, |
| "advantage_mean": -1.4668330938771845e-08, |
| "advantage_min": -0.2308518048375845, |
| "advantage_std": 0.16828895779326558, |
| "completion_length": 2169.2500648498535, |
| "epoch": 0.021714285714285714, |
| "grad_norm": 0.0021624648943543434, |
| "kl": 0.00011816620826721191, |
| "learning_rate": 3.6e-07, |
| "loss": 0.0, |
| "reward": 0.23799258447252214, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.16828896710649133, |
| "rewards/cosine_scaled_reward": 0.3468063613399863, |
| "rewards/format_reward": 0.708333345130086, |
| "step": 19 |
| }, |
| { |
| "advantage_max": 0.10925093479454517, |
| "advantage_mean": 1.7074248404025383e-09, |
| "advantage_min": -0.21123075019568205, |
| "advantage_std": 0.13155136164277792, |
| "completion_length": 1562.8958702087402, |
| "epoch": 0.022857142857142857, |
| "grad_norm": 0.0014371563447639346, |
| "kl": 7.089972496032715e-05, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": 0.0053, |
| "reward": 0.24731143051758409, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.13155136303976178, |
| "rewards/cosine_scaled_reward": 0.28850897774100304, |
| "rewards/format_reward": 0.8750000055879354, |
| "step": 20 |
| }, |
| { |
| "advantage_max": 0.2152009317651391, |
| "advantage_mean": -6.519257994552774e-09, |
| "advantage_min": -0.19526144675910473, |
| "advantage_std": 0.1745052202604711, |
| "completion_length": 2338.8333587646484, |
| "epoch": 0.024, |
| "grad_norm": 0.0031636806670576334, |
| "kl": 0.0001360177993774414, |
| "learning_rate": 4e-07, |
| "loss": 0.0088, |
| "reward": 0.12647646106779575, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.17450521886348724, |
| "rewards/cosine_scaled_reward": 0.10836795996874571, |
| "rewards/format_reward": 0.5208333376795053, |
| "step": 21 |
| }, |
| { |
| "advantage_max": 0.134979996830225, |
| "advantage_mean": -8.498318746635869e-09, |
| "advantage_min": -0.09254785464145243, |
| "advantage_std": 0.0842497721023392, |
| "completion_length": 1393.520851135254, |
| "epoch": 0.025142857142857144, |
| "grad_norm": 0.0015769954770803452, |
| "kl": 8.021295070648193e-05, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.0068, |
| "reward": 0.144379162156838, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.08424977766117081, |
| "rewards/cosine_scaled_reward": -0.010229567997157574, |
| "rewards/format_reward": 0.8750000149011612, |
| "step": 22 |
| }, |
| { |
| "advantage_max": 0.24604428745806217, |
| "advantage_mean": -2.173086099954702e-09, |
| "advantage_min": -0.1898985542356968, |
| "advantage_std": 0.17475335206836462, |
| "completion_length": 2572.395866394043, |
| "epoch": 0.026285714285714287, |
| "grad_norm": 0.0029409886337816715, |
| "kl": 0.00012180209159851074, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0091, |
| "reward": 0.12211265473160893, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.17475335579365492, |
| "rewards/cosine_scaled_reward": 0.02622226788662374, |
| "rewards/format_reward": 0.6666666753590107, |
| "step": 23 |
| }, |
| { |
| "advantage_max": 0.1982029126957059, |
| "advantage_mean": -9.778887047340312e-09, |
| "advantage_min": -0.17622305173426867, |
| "advantage_std": 0.15681974356994033, |
| "completion_length": 1935.3958892822266, |
| "epoch": 0.027428571428571427, |
| "grad_norm": 0.002483836840838194, |
| "kl": 7.27921724319458e-05, |
| "learning_rate": 4.6e-07, |
| "loss": 0.0088, |
| "reward": 0.17156797600910068, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.15681974682956934, |
| "rewards/cosine_scaled_reward": 0.11053762771189213, |
| "rewards/format_reward": 0.7916666716337204, |
| "step": 24 |
| }, |
| { |
| "advantage_max": 0.1764061450958252, |
| "advantage_mean": -7.372970400876255e-09, |
| "advantage_min": -0.1701870709657669, |
| "advantage_std": 0.13987470557913184, |
| "completion_length": 2115.000045776367, |
| "epoch": 0.02857142857142857, |
| "grad_norm": 0.0017042590770870447, |
| "kl": 0.00013157352805137634, |
| "learning_rate": 4.8e-07, |
| "loss": 0.0077, |
| "reward": 0.10251787485321984, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13987471163272858, |
| "rewards/cosine_scaled_reward": 0.020700588822364807, |
| "rewards/format_reward": 0.5625000055879354, |
| "step": 25 |
| }, |
| { |
| "advantage_max": 0.18950836267322302, |
| "advantage_mean": 3.104407703125034e-10, |
| "advantage_min": -0.10491138324141502, |
| "advantage_std": 0.11330888234078884, |
| "completion_length": 2396.9791946411133, |
| "epoch": 0.029714285714285714, |
| "grad_norm": 0.0017541834386065602, |
| "kl": 0.00014390423893928528, |
| "learning_rate": 5e-07, |
| "loss": 0.0003, |
| "reward": 0.03794134716736153, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1133088837377727, |
| "rewards/cosine_scaled_reward": -0.1579243573360145, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 26 |
| }, |
| { |
| "advantage_max": 0.13833303237333894, |
| "advantage_mean": 1.5522041985072121e-10, |
| "advantage_min": -0.15812412649393082, |
| "advantage_std": 0.12111463444307446, |
| "completion_length": 2430.4167098999023, |
| "epoch": 0.030857142857142857, |
| "grad_norm": 0.0028511241544038057, |
| "kl": 0.0001679062843322754, |
| "learning_rate": 5.2e-07, |
| "loss": 0.0071, |
| "reward": 0.14495101105421782, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12111463863402605, |
| "rewards/cosine_scaled_reward": 0.10645421966910362, |
| "rewards/format_reward": 0.6458333432674408, |
| "step": 27 |
| }, |
| { |
| "advantage_max": 0.2110468242317438, |
| "advantage_mean": 3.1044086884479682e-09, |
| "advantage_min": -0.11917469836771488, |
| "advantage_std": 0.12643845193088055, |
| "completion_length": 2344.7500648498535, |
| "epoch": 0.032, |
| "grad_norm": 0.002487706718966365, |
| "kl": 0.00012912601232528687, |
| "learning_rate": 5.4e-07, |
| "loss": 0.0023, |
| "reward": 0.12900030775927007, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12643845239654183, |
| "rewards/cosine_scaled_reward": 0.11002454720437527, |
| "rewards/format_reward": 0.5416666734963655, |
| "step": 28 |
| }, |
| { |
| "advantage_max": 0.11905187461525202, |
| "advantage_mean": -8.537123993845874e-10, |
| "advantage_min": -0.14191375905647874, |
| "advantage_std": 0.1037194412201643, |
| "completion_length": 2789.500015258789, |
| "epoch": 0.03314285714285714, |
| "grad_norm": 0.0016927722608670592, |
| "kl": 0.00016814470291137695, |
| "learning_rate": 5.6e-07, |
| "loss": 0.0041, |
| "reward": 0.032076418632641435, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.10371943796053529, |
| "rewards/cosine_scaled_reward": -0.12424678239040077, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 29 |
| }, |
| { |
| "advantage_max": 0.2387219499796629, |
| "advantage_mean": -7.140139979266991e-09, |
| "advantage_min": -0.1722863893955946, |
| "advantage_std": 0.16893034940585494, |
| "completion_length": 2417.9375610351562, |
| "epoch": 0.03428571428571429, |
| "grad_norm": 0.0027509131468832493, |
| "kl": 0.00010019540786743164, |
| "learning_rate": 5.8e-07, |
| "loss": 0.0047, |
| "reward": 0.17001164075918496, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.16893034894019365, |
| "rewards/cosine_scaled_reward": 0.1693029752932489, |
| "rewards/format_reward": 0.6666666772216558, |
| "step": 30 |
| }, |
| { |
| "advantage_max": 0.15399669483304024, |
| "advantage_mean": -5.432715249886755e-09, |
| "advantage_min": -0.16008222801610827, |
| "advantage_std": 0.1239645341411233, |
| "completion_length": 2762.229217529297, |
| "epoch": 0.03542857142857143, |
| "grad_norm": 0.0022213158663362265, |
| "kl": 0.00014109909534454346, |
| "learning_rate": 6e-07, |
| "loss": 0.0054, |
| "reward": 0.06478143483400345, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.123964530415833, |
| "rewards/cosine_scaled_reward": -0.027757282368838787, |
| "rewards/format_reward": 0.4375000037252903, |
| "step": 31 |
| }, |
| { |
| "advantage_max": 0.21103184670209885, |
| "advantage_mean": -2.7163574722877115e-09, |
| "advantage_min": -0.2043379843235016, |
| "advantage_std": 0.17043100483715534, |
| "completion_length": 2347.166732788086, |
| "epoch": 0.036571428571428574, |
| "grad_norm": 0.003295725677162409, |
| "kl": 0.00011375546455383301, |
| "learning_rate": 6.2e-07, |
| "loss": 0.019, |
| "reward": 0.17621385538950562, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.17043101135641336, |
| "rewards/cosine_scaled_reward": 0.17827051342464983, |
| "rewards/format_reward": 0.6875000111758709, |
| "step": 32 |
| }, |
| { |
| "advantage_max": 0.16886692307889462, |
| "advantage_mean": -9.778887394285007e-09, |
| "advantage_min": -0.19185489788651466, |
| "advantage_std": 0.15452855033800006, |
| "completion_length": 2871.562545776367, |
| "epoch": 0.037714285714285714, |
| "grad_norm": 0.0032696141861379147, |
| "kl": 0.00010472536087036133, |
| "learning_rate": 6.4e-07, |
| "loss": 0.0087, |
| "reward": 0.1361775571713224, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1545285526663065, |
| "rewards/cosine_scaled_reward": 0.12209459021687508, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 33 |
| }, |
| { |
| "advantage_max": 0.12458384316414595, |
| "advantage_mean": -7.140139868244688e-09, |
| "advantage_min": -0.15718477871268988, |
| "advantage_std": 0.12218879768624902, |
| "completion_length": 1836.4583587646484, |
| "epoch": 0.038857142857142854, |
| "grad_norm": 0.001347140409052372, |
| "kl": 0.00014571286737918854, |
| "learning_rate": 6.6e-07, |
| "loss": -0.0048, |
| "reward": 0.2071411805227399, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1221888018772006, |
| "rewards/cosine_scaled_reward": 0.25896920543164015, |
| "rewards/format_reward": 0.7083333358168602, |
| "step": 34 |
| }, |
| { |
| "advantage_max": 0.1953197568655014, |
| "advantage_mean": -5.587935614226325e-09, |
| "advantage_min": -0.22316306363791227, |
| "advantage_std": 0.1690281443297863, |
| "completion_length": 2355.6042289733887, |
| "epoch": 0.04, |
| "grad_norm": 0.0035231634974479675, |
| "kl": 0.0001258254051208496, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": 0.0114, |
| "reward": 0.1693128461483866, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.16902815271168947, |
| "rewards/cosine_scaled_reward": 0.1864055972546339, |
| "rewards/format_reward": 0.625000013038516, |
| "step": 35 |
| }, |
| { |
| "advantage_max": 0.16095423232764006, |
| "advantage_mean": -9.313226440044176e-10, |
| "advantage_min": -0.16731015034019947, |
| "advantage_std": 0.1404950194992125, |
| "completion_length": 3038.5208892822266, |
| "epoch": 0.04114285714285714, |
| "grad_norm": 0.003486273344606161, |
| "kl": 0.00018405914306640625, |
| "learning_rate": 7e-07, |
| "loss": 0.0089, |
| "reward": 0.04707640549167991, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.14049502136185765, |
| "rewards/cosine_scaled_reward": -0.10159374866634607, |
| "rewards/format_reward": 0.4791666753590107, |
| "step": 36 |
| }, |
| { |
| "advantage_max": 0.14632703876122832, |
| "advantage_mean": -2.0954758830904474e-09, |
| "advantage_min": -0.1389367524534464, |
| "advantage_std": 0.11993890162557364, |
| "completion_length": 2859.4166717529297, |
| "epoch": 0.04228571428571429, |
| "grad_norm": 0.0022138648200780153, |
| "kl": 0.00013276375830173492, |
| "learning_rate": 7.2e-07, |
| "loss": 0.0039, |
| "reward": 0.04992359317839146, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11993890441954136, |
| "rewards/cosine_scaled_reward": -0.03023509867489338, |
| "rewards/format_reward": 0.35416667349636555, |
| "step": 37 |
| }, |
| { |
| "advantage_max": 0.12173652416095138, |
| "advantage_mean": 4.579002788052122e-09, |
| "advantage_min": -0.14573213178664446, |
| "advantage_std": 0.10948428139090538, |
| "completion_length": 3242.8958435058594, |
| "epoch": 0.04342857142857143, |
| "grad_norm": 0.0019941311329603195, |
| "kl": 0.00017582625150680542, |
| "learning_rate": 7.4e-07, |
| "loss": -0.0007, |
| "reward": -0.012153132352977991, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10948428325355053, |
| "rewards/cosine_scaled_reward": -0.10800831019878387, |
| "rewards/format_reward": 0.1458333395421505, |
| "step": 38 |
| }, |
| { |
| "advantage_max": 0.11650802264921367, |
| "advantage_mean": -3.2790316486369653e-09, |
| "advantage_min": -0.1185021202545613, |
| "advantage_std": 0.09036520542576909, |
| "completion_length": 2290.125072479248, |
| "epoch": 0.044571428571428574, |
| "grad_norm": 0.00172609428409487, |
| "kl": 0.00010192953050136566, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": 0.0069, |
| "reward": 0.09569389157695696, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09036520589143038, |
| "rewards/cosine_scaled_reward": -0.028405792079865932, |
| "rewards/format_reward": 0.6250000111758709, |
| "step": 39 |
| }, |
| { |
| "advantage_max": 0.13623419683426619, |
| "advantage_mean": -5.89837637066104e-09, |
| "advantage_min": -0.10135996155440807, |
| "advantage_std": 0.09501322568394244, |
| "completion_length": 2165.000068664551, |
| "epoch": 0.045714285714285714, |
| "grad_norm": 0.0017249195370823145, |
| "kl": 0.00013747811317443848, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.005, |
| "reward": 0.08313871989957988, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.0950132254511118, |
| "rewards/cosine_scaled_reward": -0.06722887419164181, |
| "rewards/format_reward": 0.6250000093132257, |
| "step": 40 |
| }, |
| { |
| "advantage_max": 0.12406869698315859, |
| "advantage_mean": -1.2650465452956894e-08, |
| "advantage_min": -0.20387937780469656, |
| "advantage_std": 0.12945719296112657, |
| "completion_length": 2695.000045776367, |
| "epoch": 0.046857142857142854, |
| "grad_norm": 0.0025204960256814957, |
| "kl": 0.00013300776481628418, |
| "learning_rate": 8e-07, |
| "loss": 0.0037, |
| "reward": 0.18478128965944052, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12945719715207815, |
| "rewards/cosine_scaled_reward": 0.22261435391556006, |
| "rewards/format_reward": 0.6458333432674408, |
| "step": 41 |
| }, |
| { |
| "advantage_max": 0.15024259313941002, |
| "advantage_mean": 2.638747415018017e-09, |
| "advantage_min": -0.09568950766697526, |
| "advantage_std": 0.10328615305479616, |
| "completion_length": 2578.604169845581, |
| "epoch": 0.048, |
| "grad_norm": 0.002379926387220621, |
| "kl": 0.00016836822032928467, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": 0.003, |
| "reward": -0.006026094313710928, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10328615212347358, |
| "rewards/cosine_scaled_reward": -0.236235816963017, |
| "rewards/format_reward": 0.43750000186264515, |
| "step": 42 |
| }, |
| { |
| "advantage_max": 0.19424660969525576, |
| "advantage_mean": -2.3283065059276353e-09, |
| "advantage_min": -0.18449017591774464, |
| "advantage_std": 0.15552015556022525, |
| "completion_length": 2532.604232788086, |
| "epoch": 0.04914285714285714, |
| "grad_norm": 0.0030434627551585436, |
| "kl": 0.00013034045696258545, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.0078, |
| "reward": 0.11620450200280175, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.15552015462890267, |
| "rewards/cosine_scaled_reward": 0.05307496804744005, |
| "rewards/format_reward": 0.5833333469927311, |
| "step": 43 |
| }, |
| { |
| "advantage_max": 0.1315040308982134, |
| "advantage_mean": -7.2177500920478366e-09, |
| "advantage_min": -0.09994148276746273, |
| "advantage_std": 0.08571109781041741, |
| "completion_length": 2133.1458740234375, |
| "epoch": 0.05028571428571429, |
| "grad_norm": 0.0013428078964352608, |
| "kl": 0.000141829252243042, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0031, |
| "reward": 0.18769649555906653, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08571109967306256, |
| "rewards/cosine_scaled_reward": 0.197374586481601, |
| "rewards/format_reward": 0.7083333414047956, |
| "step": 44 |
| }, |
| { |
| "advantage_max": 0.19893326330929995, |
| "advantage_mean": -6.208817363018149e-09, |
| "advantage_min": -0.18502129800617695, |
| "advantage_std": 0.16644518170505762, |
| "completion_length": 3050.1458587646484, |
| "epoch": 0.05142857142857143, |
| "grad_norm": 0.0026690622325986624, |
| "kl": 0.0001322031021118164, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0064, |
| "reward": 0.09938832372426987, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.16644519148394465, |
| "rewards/cosine_scaled_reward": 0.08417692640796304, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 45 |
| }, |
| { |
| "advantage_max": 0.13772548642009497, |
| "advantage_mean": -2.173086030565763e-09, |
| "advantage_min": -0.13554238621145487, |
| "advantage_std": 0.10642092488706112, |
| "completion_length": 2790.6041870117188, |
| "epoch": 0.052571428571428575, |
| "grad_norm": 0.002247325610369444, |
| "kl": 0.0001497715711593628, |
| "learning_rate": 9e-07, |
| "loss": 0.0117, |
| "reward": 0.01905112573876977, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10642092442139983, |
| "rewards/cosine_scaled_reward": -0.16121318750083447, |
| "rewards/format_reward": 0.4375000111758709, |
| "step": 46 |
| }, |
| { |
| "advantage_max": 0.15123218018561602, |
| "advantage_mean": -4.656613192266512e-09, |
| "advantage_min": -0.13677284540608525, |
| "advantage_std": 0.11217275122180581, |
| "completion_length": 2225.750015258789, |
| "epoch": 0.053714285714285714, |
| "grad_norm": 0.0019338211277499795, |
| "kl": 0.00010457634925842285, |
| "learning_rate": 9.2e-07, |
| "loss": 0.005, |
| "reward": 0.12341659748926759, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1121727554127574, |
| "rewards/cosine_scaled_reward": 0.05270408093929291, |
| "rewards/format_reward": 0.6250000037252903, |
| "step": 47 |
| }, |
| { |
| "advantage_max": 0.180091115180403, |
| "advantage_mean": -1.1098261233633e-08, |
| "advantage_min": -0.20166349643841386, |
| "advantage_std": 0.1485751592554152, |
| "completion_length": 2522.645881652832, |
| "epoch": 0.054857142857142854, |
| "grad_norm": 0.00238457671366632, |
| "kl": 0.00011201947927474976, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": 0.0041, |
| "reward": 0.14124550856649876, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14857516065239906, |
| "rewards/cosine_scaled_reward": 0.15710615552961826, |
| "rewards/format_reward": 0.5208333414047956, |
| "step": 48 |
| }, |
| { |
| "advantage_max": 0.17702306748833507, |
| "advantage_mean": -9.410238656012981e-10, |
| "advantage_min": -0.10869404303957708, |
| "advantage_std": 0.11685236936318688, |
| "completion_length": 1774.020851135254, |
| "epoch": 0.056, |
| "grad_norm": 0.0023415982723236084, |
| "kl": 9.121932089328766e-05, |
| "learning_rate": 9.6e-07, |
| "loss": 0.0043, |
| "reward": 0.11408375017344952, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11685237029450946, |
| "rewards/cosine_scaled_reward": -0.03847436048090458, |
| "rewards/format_reward": 0.7500000093132257, |
| "step": 49 |
| }, |
| { |
| "advantage_max": 0.09399801213294268, |
| "advantage_mean": -2.638747317873502e-09, |
| "advantage_min": -0.08518766891211271, |
| "advantage_std": 0.07225358131108806, |
| "completion_length": 2767.6041831970215, |
| "epoch": 0.05714285714285714, |
| "grad_norm": 0.0011482579866424203, |
| "kl": 9.690225124359131e-05, |
| "learning_rate": 9.8e-07, |
| "loss": 0.0011, |
| "reward": 0.13219367619603872, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.07225357874995098, |
| "rewards/cosine_scaled_reward": 0.1503643374890089, |
| "rewards/format_reward": 0.4791666679084301, |
| "step": 50 |
| }, |
| { |
| "advantage_max": 0.12806095415726304, |
| "advantage_mean": -5.452117693080516e-09, |
| "advantage_min": -0.10972547065466642, |
| "advantage_std": 0.0959368993062526, |
| "completion_length": 2330.2291717529297, |
| "epoch": 0.05828571428571429, |
| "grad_norm": 0.0021673294249922037, |
| "kl": 0.0001529306173324585, |
| "learning_rate": 1e-06, |
| "loss": -0.0003, |
| "reward": 0.0236921610776335, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09593690512701869, |
| "rewards/cosine_scaled_reward": -0.1689696293324232, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 51 |
| }, |
| { |
| "advantage_max": 0.18533404497429729, |
| "advantage_mean": -1.1020650517168384e-08, |
| "advantage_min": -0.20416315738111734, |
| "advantage_std": 0.16357391513884068, |
| "completion_length": 2461.3959159851074, |
| "epoch": 0.05942857142857143, |
| "grad_norm": 0.0032797979656606913, |
| "kl": 0.00010143965482711792, |
| "learning_rate": 9.999890338174275e-07, |
| "loss": 0.0137, |
| "reward": 0.18816695609712042, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.16357392026111484, |
| "rewards/cosine_scaled_reward": 0.2337212460115552, |
| "rewards/format_reward": 0.6458333414047956, |
| "step": 52 |
| }, |
| { |
| "advantage_max": 0.1701141782104969, |
| "advantage_mean": -1.8626452463754717e-09, |
| "advantage_min": -0.2136994767934084, |
| "advantage_std": 0.15749749122187495, |
| "completion_length": 2312.6250381469727, |
| "epoch": 0.060571428571428575, |
| "grad_norm": 0.002415906172245741, |
| "kl": 0.00011346861720085144, |
| "learning_rate": 9.999561358041868e-07, |
| "loss": 0.0041, |
| "reward": 0.15444708871655166, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.1574974935501814, |
| "rewards/cosine_scaled_reward": 0.11371952062472701, |
| "rewards/format_reward": 0.6875000074505806, |
| "step": 53 |
| }, |
| { |
| "advantage_max": 0.17725317552685738, |
| "advantage_mean": -7.605801322085881e-09, |
| "advantage_min": -0.2166620921343565, |
| "advantage_std": 0.1593807926401496, |
| "completion_length": 1764.9166946411133, |
| "epoch": 0.061714285714285715, |
| "grad_norm": 0.002244009170681238, |
| "kl": 7.051974534988403e-05, |
| "learning_rate": 9.999013075636804e-07, |
| "loss": -0.0003, |
| "reward": 0.23366648191586137, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1593807958997786, |
| "rewards/cosine_scaled_reward": 0.2675904119387269, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 54 |
| }, |
| { |
| "advantage_max": 0.18578462721779943, |
| "advantage_mean": -7.761021547647573e-10, |
| "advantage_min": -0.2353920480236411, |
| "advantage_std": 0.17948928149417043, |
| "completion_length": 2628.7083892822266, |
| "epoch": 0.06285714285714286, |
| "grad_norm": 0.00294486409984529, |
| "kl": 0.00012754648923873901, |
| "learning_rate": 9.998245517681593e-07, |
| "loss": 0.0058, |
| "reward": 0.1491623887559399, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.17948928708210588, |
| "rewards/cosine_scaled_reward": 0.15094960387796164, |
| "rewards/format_reward": 0.583333345130086, |
| "step": 55 |
| }, |
| { |
| "advantage_max": 0.10377925122156739, |
| "advantage_mean": -4.0357312075522955e-09, |
| "advantage_min": -0.1440494479611516, |
| "advantage_std": 0.1021224157884717, |
| "completion_length": 2625.583366394043, |
| "epoch": 0.064, |
| "grad_norm": 0.0012550298124551773, |
| "kl": 0.00013631582260131836, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": -0.0009, |
| "reward": 0.07884217612445354, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10212242277339101, |
| "rewards/cosine_scaled_reward": -0.04887582641094923, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 56 |
| }, |
| { |
| "advantage_max": 0.2065029153600335, |
| "advantage_mean": 3.10440798068079e-10, |
| "advantage_min": -0.21144532784819603, |
| "advantage_std": 0.18026947043836117, |
| "completion_length": 2979.4167098999023, |
| "epoch": 0.06514285714285714, |
| "grad_norm": 0.003122963709756732, |
| "kl": 0.00011894106864929199, |
| "learning_rate": 9.996052735444862e-07, |
| "loss": 0.0089, |
| "reward": 0.10382829024456441, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.1802694769576192, |
| "rewards/cosine_scaled_reward": 0.04622313613072038, |
| "rewards/format_reward": 0.5208333469927311, |
| "step": 57 |
| }, |
| { |
| "advantage_max": 0.10899663623422384, |
| "advantage_mean": 1.5522043095295146e-09, |
| "advantage_min": -0.15494094602763653, |
| "advantage_std": 0.10533166327513754, |
| "completion_length": 1586.5625343322754, |
| "epoch": 0.06628571428571428, |
| "grad_norm": 0.0014194791438058019, |
| "kl": 8.42362642288208e-05, |
| "learning_rate": 9.994627618036452e-07, |
| "loss": 0.0066, |
| "reward": 0.18032316933386028, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.10533167002722621, |
| "rewards/cosine_scaled_reward": 0.13439065031707287, |
| "rewards/format_reward": 0.7916666716337204, |
| "step": 58 |
| }, |
| { |
| "advantage_max": 0.1733364863321185, |
| "advantage_mean": -6.286427572943509e-09, |
| "advantage_min": -0.1383139775134623, |
| "advantage_std": 0.13875256897881627, |
| "completion_length": 2547.6041679382324, |
| "epoch": 0.06742857142857143, |
| "grad_norm": 0.0021414561197161674, |
| "kl": 9.694695472717285e-05, |
| "learning_rate": 9.992983438818915e-07, |
| "loss": 0.0023, |
| "reward": 0.09846353763714433, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13875256897881627, |
| "rewards/cosine_scaled_reward": 0.0630117068067193, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 59 |
| }, |
| { |
| "advantage_max": 0.12070130999200046, |
| "advantage_mean": -6.0535968737784884e-09, |
| "advantage_min": -0.18512004520744085, |
| "advantage_std": 0.12240555556491017, |
| "completion_length": 2303.979232788086, |
| "epoch": 0.06857142857142857, |
| "grad_norm": 0.0023025507107377052, |
| "kl": 0.00011995434761047363, |
| "learning_rate": 9.991120277927223e-07, |
| "loss": 0.0079, |
| "reward": 0.14335119677707553, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12240555975586176, |
| "rewards/cosine_scaled_reward": 0.08939424622803926, |
| "rewards/format_reward": 0.6666666865348816, |
| "step": 60 |
| }, |
| { |
| "advantage_max": 0.12060253554955125, |
| "advantage_mean": 6.208817071584605e-10, |
| "advantage_min": -0.14025499392300844, |
| "advantage_std": 0.09711946547031403, |
| "completion_length": 2346.750030517578, |
| "epoch": 0.06971428571428571, |
| "grad_norm": 0.00180693285074085, |
| "kl": 8.498877286911011e-05, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": -0.0049, |
| "reward": 0.09262806148035452, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09711947059258819, |
| "rewards/cosine_scaled_reward": -0.00842318870127201, |
| "rewards/format_reward": 0.5625000149011612, |
| "step": 61 |
| }, |
| { |
| "advantage_max": 0.16065927874296904, |
| "advantage_mean": -4.501392619760125e-09, |
| "advantage_min": -0.23090818990021944, |
| "advantage_std": 0.16813216032460332, |
| "completion_length": 2060.812515258789, |
| "epoch": 0.07085714285714285, |
| "grad_norm": 0.0032414915040135384, |
| "kl": 7.87973403930664e-05, |
| "learning_rate": 9.98673738502114e-07, |
| "loss": 0.0045, |
| "reward": 0.18154411297291517, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.16813215287402272, |
| "rewards/cosine_scaled_reward": 0.184435760602355, |
| "rewards/format_reward": 0.7083333432674408, |
| "step": 62 |
| }, |
| { |
| "advantage_max": 0.10771181527525187, |
| "advantage_mean": -2.173086099954702e-09, |
| "advantage_min": -0.09812528779730201, |
| "advantage_std": 0.08161175763234496, |
| "completion_length": 1515.2916717529297, |
| "epoch": 0.072, |
| "grad_norm": 0.0015955866547301412, |
| "kl": 8.296966552734375e-05, |
| "learning_rate": 9.98421786662277e-07, |
| "loss": -0.0048, |
| "reward": 0.22865570522844791, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.08161176042631269, |
| "rewards/cosine_scaled_reward": 0.23915747739374638, |
| "rewards/format_reward": 0.875, |
| "step": 63 |
| }, |
| { |
| "advantage_max": 0.14562141429632902, |
| "advantage_mean": -3.9581210253825105e-09, |
| "advantage_min": -0.20186926797032356, |
| "advantage_std": 0.1426104260608554, |
| "completion_length": 2601.916702270508, |
| "epoch": 0.07314285714285715, |
| "grad_norm": 0.0024525534827262163, |
| "kl": 0.00014842301607131958, |
| "learning_rate": 9.981479793771866e-07, |
| "loss": 0.0056, |
| "reward": 0.10283284028992057, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14261042792350054, |
| "rewards/cosine_scaled_reward": 0.051970590837299824, |
| "rewards/format_reward": 0.5000000074505806, |
| "step": 64 |
| }, |
| { |
| "advantage_max": 0.12431543041020632, |
| "advantage_mean": -4.423782326568038e-09, |
| "advantage_min": -0.12246682308614254, |
| "advantage_std": 0.10179906385019422, |
| "completion_length": 2456.2292098999023, |
| "epoch": 0.07428571428571429, |
| "grad_norm": 0.0021826298907399178, |
| "kl": 0.00010146945714950562, |
| "learning_rate": 9.97852329991824e-07, |
| "loss": -0.0063, |
| "reward": 0.08970492146909237, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10179906524717808, |
| "rewards/cosine_scaled_reward": -0.029155070893466473, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 65 |
| }, |
| { |
| "advantage_max": 0.11777388351038098, |
| "advantage_mean": -3.2596293303432944e-09, |
| "advantage_min": -0.08773828111588955, |
| "advantage_std": 0.07737651432398707, |
| "completion_length": 2061.0000038146973, |
| "epoch": 0.07542857142857143, |
| "grad_norm": 0.0009109475067816675, |
| "kl": 0.00010826066136360168, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": 0.0001, |
| "reward": 0.07414777716621757, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.0773765177000314, |
| "rewards/cosine_scaled_reward": -0.03114328160881996, |
| "rewards/format_reward": 0.5, |
| "step": 66 |
| }, |
| { |
| "advantage_max": 0.1809951732866466, |
| "advantage_mean": 8.537123508123301e-10, |
| "advantage_min": -0.09886624943464994, |
| "advantage_std": 0.11706549394875765, |
| "completion_length": 3042.4583587646484, |
| "epoch": 0.07657142857142857, |
| "grad_norm": 0.0019939993508160114, |
| "kl": 0.00010170042514801025, |
| "learning_rate": 9.971955636222684e-07, |
| "loss": 0.0053, |
| "reward": -0.01826105872169137, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.11706549627706409, |
| "rewards/cosine_scaled_reward": -0.22059013764373958, |
| "rewards/format_reward": 0.3333333358168602, |
| "step": 67 |
| }, |
| { |
| "advantage_max": 0.10667644999921322, |
| "advantage_mean": -6.6744789001260685e-09, |
| "advantage_min": -0.13124415185302496, |
| "advantage_std": 0.10147338453680277, |
| "completion_length": 1382.4583473205566, |
| "epoch": 0.07771428571428571, |
| "grad_norm": 0.0009440166177228093, |
| "kl": 7.583759725093842e-05, |
| "learning_rate": 9.968344786479415e-07, |
| "loss": -0.0006, |
| "reward": 0.16119565116241574, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10147338453680277, |
| "rewards/cosine_scaled_reward": 0.09969675447791815, |
| "rewards/format_reward": 0.75, |
| "step": 68 |
| }, |
| { |
| "advantage_max": 0.19610813772305846, |
| "advantage_mean": -3.7640954658746395e-09, |
| "advantage_min": -0.1299685575067997, |
| "advantage_std": 0.13548616133630276, |
| "completion_length": 1842.7292098999023, |
| "epoch": 0.07885714285714286, |
| "grad_norm": 0.002522306516766548, |
| "kl": 9.226799011230469e-05, |
| "learning_rate": 9.964516155915151e-07, |
| "loss": 0.0051, |
| "reward": 0.10152785666286945, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1354861636646092, |
| "rewards/cosine_scaled_reward": -0.05570081574842334, |
| "rewards/format_reward": 0.7083333358168602, |
| "step": 69 |
| }, |
| { |
| "advantage_max": 0.08541852049529552, |
| "advantage_mean": 5.432715666220389e-10, |
| "advantage_min": -0.14142528641968966, |
| "advantage_std": 0.08910802565515041, |
| "completion_length": 2474.979202270508, |
| "epoch": 0.08, |
| "grad_norm": 0.0011430344311520457, |
| "kl": 9.79304313659668e-05, |
| "learning_rate": 9.960469931131936e-07, |
| "loss": -0.0021, |
| "reward": 0.13445935118943453, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08910802891477942, |
| "rewards/cosine_scaled_reward": 0.08494364470243454, |
| "rewards/format_reward": 0.625, |
| "step": 70 |
| }, |
| { |
| "advantage_max": 0.10955090029165149, |
| "advantage_mean": -3.104408646814605e-09, |
| "advantage_min": -0.12331806868314743, |
| "advantage_std": 0.09641325660049915, |
| "completion_length": 2457.1458587646484, |
| "epoch": 0.08114285714285714, |
| "grad_norm": 0.0009280177182517946, |
| "kl": 0.00010801851749420166, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": -0.0, |
| "reward": 0.06796744232997298, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09641325753182173, |
| "rewards/cosine_scaled_reward": 0.0028737219981849194, |
| "rewards/format_reward": 0.39583333395421505, |
| "step": 71 |
| }, |
| { |
| "advantage_max": 0.1530079017393291, |
| "advantage_mean": -6.053597047250836e-09, |
| "advantage_min": -0.13696587504819036, |
| "advantage_std": 0.11541180987842381, |
| "completion_length": 2491.187572479248, |
| "epoch": 0.08228571428571428, |
| "grad_norm": 0.002064738655462861, |
| "kl": 0.0001221299171447754, |
| "learning_rate": 9.951725498333448e-07, |
| "loss": 0.0071, |
| "reward": 0.06420155242085457, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11541181430220604, |
| "rewards/cosine_scaled_reward": -0.10319141205400229, |
| "rewards/format_reward": 0.5833333414047956, |
| "step": 72 |
| }, |
| { |
| "advantage_max": 0.15204641316086054, |
| "advantage_mean": -5.161079424942372e-09, |
| "advantage_min": -0.1872421819716692, |
| "advantage_std": 0.1310334224253893, |
| "completion_length": 3098.479217529297, |
| "epoch": 0.08342857142857144, |
| "grad_norm": 0.0022661956027150154, |
| "kl": 0.00015395879745483398, |
| "learning_rate": 9.947027716509488e-07, |
| "loss": 0.0065, |
| "reward": 0.07659045979380608, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1310334224253893, |
| "rewards/cosine_scaled_reward": 0.00787665881216526, |
| "rewards/format_reward": 0.43750001303851604, |
| "step": 73 |
| }, |
| { |
| "advantage_max": 0.17096955608576536, |
| "advantage_mean": -4.190951724547531e-09, |
| "advantage_min": -0.17159210238605738, |
| "advantage_std": 0.13066924829035997, |
| "completion_length": 2244.520866394043, |
| "epoch": 0.08457142857142858, |
| "grad_norm": 0.0017521478002890944, |
| "kl": 0.00010526180267333984, |
| "learning_rate": 9.942113192828444e-07, |
| "loss": 0.0054, |
| "reward": 0.1286474959924817, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.13066925154998899, |
| "rewards/cosine_scaled_reward": 0.07717459555715322, |
| "rewards/format_reward": 0.6041666753590107, |
| "step": 74 |
| }, |
| { |
| "advantage_max": 0.14245222136378288, |
| "advantage_mean": -8.343098368418511e-09, |
| "advantage_min": -0.12452936079353094, |
| "advantage_std": 0.10217330139130354, |
| "completion_length": 2732.3958892822266, |
| "epoch": 0.08571428571428572, |
| "grad_norm": 0.0021405534353107214, |
| "kl": 0.00012764334678649902, |
| "learning_rate": 9.93698216681727e-07, |
| "loss": 0.0107, |
| "reward": 0.10671802004799247, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10217330139130354, |
| "rewards/cosine_scaled_reward": 0.07396111264824867, |
| "rewards/format_reward": 0.4791666828095913, |
| "step": 75 |
| }, |
| { |
| "advantage_max": 0.1363742845132947, |
| "advantage_mean": -1.9402554285452567e-09, |
| "advantage_min": -0.1722245216369629, |
| "advantage_std": 0.1291468944400549, |
| "completion_length": 2299.4583587646484, |
| "epoch": 0.08685714285714285, |
| "grad_norm": 0.002542425412684679, |
| "kl": 0.00011056661605834961, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": 0.0118, |
| "reward": 0.059120094403624535, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12914689676836133, |
| "rewards/cosine_scaled_reward": -0.0978600550442934, |
| "rewards/format_reward": 0.5416666753590107, |
| "step": 76 |
| }, |
| { |
| "advantage_max": 0.1297641615383327, |
| "advantage_mean": 8.537124410179509e-10, |
| "advantage_min": -0.1266080942004919, |
| "advantage_std": 0.10432415176182985, |
| "completion_length": 2606.7708892822266, |
| "epoch": 0.088, |
| "grad_norm": 0.0015011918731033802, |
| "kl": 0.00010880827903747559, |
| "learning_rate": 9.926071618660237e-07, |
| "loss": 0.0022, |
| "reward": 0.0441059676813893, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.10432415641844273, |
| "rewards/cosine_scaled_reward": -0.09794393740594387, |
| "rewards/format_reward": 0.45833333395421505, |
| "step": 77 |
| }, |
| { |
| "advantage_max": 0.21008419059216976, |
| "advantage_mean": -9.71445146547012e-17, |
| "advantage_min": -0.17278954200446606, |
| "advantage_std": 0.15343208238482475, |
| "completion_length": 2731.3542098999023, |
| "epoch": 0.08914285714285715, |
| "grad_norm": 0.0027688215486705303, |
| "kl": 0.00013116281479597092, |
| "learning_rate": 9.9202926282791e-07, |
| "loss": 0.0028, |
| "reward": 0.12476505199447274, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15343208704143763, |
| "rewards/cosine_scaled_reward": 0.09734340105205774, |
| "rewards/format_reward": 0.5416666772216558, |
| "step": 78 |
| }, |
| { |
| "advantage_max": 0.20920497737824917, |
| "advantage_mean": -5.898376342905465e-09, |
| "advantage_min": -0.1681775012984872, |
| "advantage_std": 0.15584641904570162, |
| "completion_length": 1935.1041984558105, |
| "epoch": 0.09028571428571429, |
| "grad_norm": 0.0024462228175252676, |
| "kl": 8.627399802207947e-05, |
| "learning_rate": 9.91429819907136e-07, |
| "loss": 0.0024, |
| "reward": 0.12909462582319975, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.15584642230533063, |
| "rewards/cosine_scaled_reward": 0.004465287551283836, |
| "rewards/format_reward": 0.75, |
| "step": 79 |
| }, |
| { |
| "advantage_max": 0.16656427085399628, |
| "advantage_mean": -1.5522044760629683e-10, |
| "advantage_min": -0.1844524722546339, |
| "advantage_std": 0.13766634557396173, |
| "completion_length": 2919.6250610351562, |
| "epoch": 0.09142857142857143, |
| "grad_norm": 0.002895612735301256, |
| "kl": 0.00016814470291137695, |
| "learning_rate": 9.908088623197048e-07, |
| "loss": -0.0024, |
| "reward": 0.08655929937958717, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13766634929925203, |
| "rewards/cosine_scaled_reward": 0.037981233559548855, |
| "rewards/format_reward": 0.43750000186264515, |
| "step": 80 |
| }, |
| { |
| "advantage_max": 0.17092055454850197, |
| "advantage_mean": -5.820766389719179e-09, |
| "advantage_min": -0.17761663650162518, |
| "advantage_std": 0.1536490712314844, |
| "completion_length": 2850.3333854675293, |
| "epoch": 0.09257142857142857, |
| "grad_norm": 0.0033340235240757465, |
| "kl": 0.0001818835735321045, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.0037, |
| "reward": 0.08180352626368403, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.153649078682065, |
| "rewards/cosine_scaled_reward": 0.003097064793109894, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 81 |
| }, |
| { |
| "advantage_max": 0.12402567837852985, |
| "advantage_mean": -5.0446641516876944e-09, |
| "advantage_min": -0.15394007693976164, |
| "advantage_std": 0.11961714894277975, |
| "completion_length": 2321.187530517578, |
| "epoch": 0.09371428571428571, |
| "grad_norm": 0.0027620706241577864, |
| "kl": 9.965896606445312e-05, |
| "learning_rate": 9.895025252503755e-07, |
| "loss": 0.0116, |
| "reward": 0.11978777777403593, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11961715843062848, |
| "rewards/cosine_scaled_reward": 0.08022868749685585, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 82 |
| }, |
| { |
| "advantage_max": 0.22959647234529257, |
| "advantage_mean": -2.405916771364147e-09, |
| "advantage_min": -0.17005170974880457, |
| "advantage_std": 0.1516456357203424, |
| "completion_length": 2457.958381652832, |
| "epoch": 0.09485714285714286, |
| "grad_norm": 0.002318345010280609, |
| "kl": 0.00013965368270874023, |
| "learning_rate": 9.888172094375033e-07, |
| "loss": 0.0055, |
| "reward": 0.08302936844120268, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.15164564363658428, |
| "rewards/cosine_scaled_reward": -0.005184752866625786, |
| "rewards/format_reward": 0.5000000018626451, |
| "step": 83 |
| }, |
| { |
| "advantage_max": 0.16241815499961376, |
| "advantage_mean": -7.722216546768301e-09, |
| "advantage_min": -0.1314363582059741, |
| "advantage_std": 0.12282431870698929, |
| "completion_length": 2358.8125228881836, |
| "epoch": 0.096, |
| "grad_norm": 0.0016149451257660985, |
| "kl": 0.00011220574378967285, |
| "learning_rate": 9.881105062929221e-07, |
| "loss": 0.0038, |
| "reward": 0.11924176779575646, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12282432615756989, |
| "rewards/cosine_scaled_reward": 0.04767691483721137, |
| "rewards/format_reward": 0.6041666679084301, |
| "step": 84 |
| }, |
| { |
| "advantage_max": 0.251274854876101, |
| "advantage_mean": -4.19095166903638e-09, |
| "advantage_min": -0.20053553488105536, |
| "advantage_std": 0.18136232160031796, |
| "completion_length": 2729.6250610351562, |
| "epoch": 0.09714285714285714, |
| "grad_norm": 0.0030045255552977324, |
| "kl": 9.201047942042351e-05, |
| "learning_rate": 9.873824502603459e-07, |
| "loss": 0.0082, |
| "reward": 0.10162241314537823, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.18136232951655984, |
| "rewards/cosine_scaled_reward": 0.009953074157238007, |
| "rewards/format_reward": 0.583333345130086, |
| "step": 85 |
| }, |
| { |
| "advantage_max": 0.10346131678670645, |
| "advantage_mean": 2.7939678903798892e-09, |
| "advantage_min": -0.11049740668386221, |
| "advantage_std": 0.08825402474030852, |
| "completion_length": 2581.270881652832, |
| "epoch": 0.09828571428571428, |
| "grad_norm": 0.0011986541794613004, |
| "kl": 0.00014419853687286377, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": 0.0016, |
| "reward": 0.0768510882044211, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08825402474030852, |
| "rewards/cosine_scaled_reward": -0.002405572682619095, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 86 |
| }, |
| { |
| "advantage_max": 0.12127275113016367, |
| "advantage_mean": -5.1998845854162035e-09, |
| "advantage_min": -0.16166772227734327, |
| "advantage_std": 0.10958564793691039, |
| "completion_length": 2274.125068664551, |
| "epoch": 0.09942857142857142, |
| "grad_norm": 0.0028325302992016077, |
| "kl": 0.00013668090105056763, |
| "learning_rate": 9.85862422507884e-07, |
| "loss": 0.0113, |
| "reward": 0.1180245433570235, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10958564607426524, |
| "rewards/cosine_scaled_reward": 0.0059294914826750755, |
| "rewards/format_reward": 0.687500013038516, |
| "step": 87 |
| }, |
| { |
| "advantage_max": 0.23650739900767803, |
| "advantage_mean": -1.746229910670749e-09, |
| "advantage_min": -0.18534229695796967, |
| "advantage_std": 0.1711007342673838, |
| "completion_length": 1488.5833587646484, |
| "epoch": 0.10057142857142858, |
| "grad_norm": 0.001865549013018608, |
| "kl": 8.325278759002686e-05, |
| "learning_rate": 9.850705248720068e-07, |
| "loss": 0.009, |
| "reward": 0.19773138477467, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.17110073706135154, |
| "rewards/cosine_scaled_reward": 0.15710189566016197, |
| "rewards/format_reward": 0.8541666697710752, |
| "step": 88 |
| }, |
| { |
| "advantage_max": 0.17963521927595139, |
| "advantage_mean": -7.605801405352608e-09, |
| "advantage_min": -0.19737609662115574, |
| "advantage_std": 0.16144832829013467, |
| "completion_length": 2659.791702270508, |
| "epoch": 0.10171428571428572, |
| "grad_norm": 0.0037124978844076395, |
| "kl": 0.00011199712753295898, |
| "learning_rate": 9.8425742251254e-07, |
| "loss": 0.0162, |
| "reward": 0.09529236517846584, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.16144833154976368, |
| "rewards/cosine_scaled_reward": 0.03067113645374775, |
| "rewards/format_reward": 0.5000000093132257, |
| "step": 89 |
| }, |
| { |
| "advantage_max": 0.13574577076360583, |
| "advantage_mean": -3.8999133436523614e-09, |
| "advantage_min": -0.1119693242944777, |
| "advantage_std": 0.10564618976786733, |
| "completion_length": 2417.0833435058594, |
| "epoch": 0.10285714285714286, |
| "grad_norm": 0.001697812112979591, |
| "kl": 0.00015023350715637207, |
| "learning_rate": 9.83423155058946e-07, |
| "loss": 0.003, |
| "reward": 0.040875127888284624, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10564619302749634, |
| "rewards/cosine_scaled_reward": -0.14221901632845402, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 90 |
| }, |
| { |
| "advantage_max": 0.16820846288464963, |
| "advantage_mean": 3.4924597380747713e-09, |
| "advantage_min": -0.16563586331903934, |
| "advantage_std": 0.13759498205035925, |
| "completion_length": 2622.062515258789, |
| "epoch": 0.104, |
| "grad_norm": 0.0026448904536664486, |
| "kl": 0.00011968612670898438, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0119, |
| "reward": 0.11700052605010569, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13759497925639153, |
| "rewards/cosine_scaled_reward": 0.07498859567567706, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 91 |
| }, |
| { |
| "advantage_max": 0.12286161910742521, |
| "advantage_mean": -6.286427836621478e-09, |
| "advantage_min": -0.1205812394618988, |
| "advantage_std": 0.10014992253854871, |
| "completion_length": 2147.791717529297, |
| "epoch": 0.10514285714285715, |
| "grad_norm": 0.0013253620127215981, |
| "kl": 8.553266525268555e-05, |
| "learning_rate": 9.816912885430258e-07, |
| "loss": -0.0002, |
| "reward": 0.12493289890699089, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10014992393553257, |
| "rewards/cosine_scaled_reward": 0.004685905296355486, |
| "rewards/format_reward": 0.7291666697710752, |
| "step": 92 |
| }, |
| { |
| "advantage_max": 0.18725960794836283, |
| "advantage_mean": 2.3283065753165744e-10, |
| "advantage_min": -0.09588433895260096, |
| "advantage_std": 0.10476150875911117, |
| "completion_length": 3570.2708740234375, |
| "epoch": 0.10628571428571429, |
| "grad_norm": 0.0024138952139765024, |
| "kl": 0.0002326369285583496, |
| "learning_rate": 9.807937738894303e-07, |
| "loss": 0.0014, |
| "reward": -0.049280768260359764, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10476151248440146, |
| "rewards/cosine_scaled_reward": -0.18744310783222318, |
| "rewards/format_reward": 0.0833333358168602, |
| "step": 93 |
| }, |
| { |
| "advantage_max": 0.13818231970071793, |
| "advantage_mean": -2.0566707503721915e-09, |
| "advantage_min": -0.12359800864942372, |
| "advantage_std": 0.09686838975176215, |
| "completion_length": 2422.6458740234375, |
| "epoch": 0.10742857142857143, |
| "grad_norm": 0.0018462935695424676, |
| "kl": 0.00012992694973945618, |
| "learning_rate": 9.798752629550546e-07, |
| "loss": -0.0004, |
| "reward": 0.10946622129995376, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09686839301139116, |
| "rewards/cosine_scaled_reward": 0.03989738831296563, |
| "rewards/format_reward": 0.5625000055879354, |
| "step": 94 |
| }, |
| { |
| "advantage_max": 0.1666426188312471, |
| "advantage_mean": 1.7074247293802358e-09, |
| "advantage_min": -0.1642469959333539, |
| "advantage_std": 0.13464828813448548, |
| "completion_length": 3139.666732788086, |
| "epoch": 0.10857142857142857, |
| "grad_norm": 0.002334951190277934, |
| "kl": 0.00013262033462524414, |
| "learning_rate": 9.78935800506826e-07, |
| "loss": 0.0073, |
| "reward": 0.05827001016587019, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13464829558506608, |
| "rewards/cosine_scaled_reward": -0.0356507133692503, |
| "rewards/format_reward": 0.41666667349636555, |
| "step": 95 |
| }, |
| { |
| "advantage_max": 0.2095841746777296, |
| "advantage_mean": 2.017865666226193e-09, |
| "advantage_min": -0.17328586243093014, |
| "advantage_std": 0.14680979307740927, |
| "completion_length": 2370.6458587646484, |
| "epoch": 0.10971428571428571, |
| "grad_norm": 0.0024344930425286293, |
| "kl": 0.00011432915925979614, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": 0.0048, |
| "reward": 0.1633957652375102, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14680979494005442, |
| "rewards/cosine_scaled_reward": 0.182056006626226, |
| "rewards/format_reward": 0.6041666734963655, |
| "step": 96 |
| }, |
| { |
| "advantage_max": 0.2188143515959382, |
| "advantage_mean": -4.423782243301311e-09, |
| "advantage_min": -0.17432072758674622, |
| "advantage_std": 0.15836003702133894, |
| "completion_length": 2806.6458892822266, |
| "epoch": 0.11085714285714286, |
| "grad_norm": 0.00301153352484107, |
| "kl": 0.00013709068298339844, |
| "learning_rate": 9.769942052400235e-07, |
| "loss": 0.0127, |
| "reward": 0.0940831717234687, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1583600454032421, |
| "rewards/cosine_scaled_reward": 0.039922329247929156, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 97 |
| }, |
| { |
| "advantage_max": 0.13472749013453722, |
| "advantage_mean": -4.811833625995021e-09, |
| "advantage_min": -0.09283868130296469, |
| "advantage_std": 0.08354483381845057, |
| "completion_length": 2286.208381652832, |
| "epoch": 0.112, |
| "grad_norm": 0.0012522657634690404, |
| "kl": 8.559972047805786e-05, |
| "learning_rate": 9.759921670520634e-07, |
| "loss": 0.0064, |
| "reward": 0.08262644917704165, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08354483777657151, |
| "rewards/cosine_scaled_reward": -0.015163867268711329, |
| "rewards/format_reward": 0.5208333376795053, |
| "step": 98 |
| }, |
| { |
| "advantage_max": 0.1714438796043396, |
| "advantage_mean": -4.268561934472892e-09, |
| "advantage_min": -0.20534021221101284, |
| "advantage_std": 0.1506601725704968, |
| "completion_length": 2804.3333587646484, |
| "epoch": 0.11314285714285714, |
| "grad_norm": 0.0023277695290744305, |
| "kl": 0.0001052170991897583, |
| "learning_rate": 9.749693666068663e-07, |
| "loss": 0.0042, |
| "reward": 0.12414564751088619, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.15066017862409353, |
| "rewards/cosine_scaled_reward": 0.16006462997756898, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 99 |
| }, |
| { |
| "advantage_max": 0.13038562145084143, |
| "advantage_mean": -8.30429328774196e-09, |
| "advantage_min": -0.11401992756873369, |
| "advantage_std": 0.10838590876664966, |
| "completion_length": 2251.979179382324, |
| "epoch": 0.11428571428571428, |
| "grad_norm": 0.0022543719969689846, |
| "kl": 0.00011374056339263916, |
| "learning_rate": 9.739258537542835e-07, |
| "loss": 0.0052, |
| "reward": 0.13824327662587166, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10838590934872627, |
| "rewards/cosine_scaled_reward": 0.09618911519646645, |
| "rewards/format_reward": 0.6250000037252903, |
| "step": 100 |
| }, |
| { |
| "advantage_max": 0.1658927546814084, |
| "advantage_mean": -2.6387474566513802e-09, |
| "advantage_min": -0.10248309839516878, |
| "advantage_std": 0.10365857649594545, |
| "completion_length": 2299.0417098999023, |
| "epoch": 0.11542857142857142, |
| "grad_norm": 0.0026296344585716724, |
| "kl": 0.0001501142978668213, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": 0.0098, |
| "reward": 0.1061628689058125, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.10365858301520348, |
| "rewards/cosine_scaled_reward": 0.03263038757722825, |
| "rewards/format_reward": 0.5625000055879354, |
| "step": 101 |
| }, |
| { |
| "advantage_max": 0.1605947259813547, |
| "advantage_mean": -3.4924597103191957e-09, |
| "advantage_min": -0.18639070075005293, |
| "advantage_std": 0.15262889862060547, |
| "completion_length": 2045.4375457763672, |
| "epoch": 0.11657142857142858, |
| "grad_norm": 0.003396169049665332, |
| "kl": 0.0001347959041595459, |
| "learning_rate": 9.717768952713511e-07, |
| "loss": 0.0105, |
| "reward": 0.1790522364899516, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.15262890281155705, |
| "rewards/cosine_scaled_reward": 0.15230208821594715, |
| "rewards/format_reward": 0.7500000074505806, |
| "step": 102 |
| }, |
| { |
| "advantage_max": 0.14158586133271456, |
| "advantage_mean": 4.4237822571790986e-09, |
| "advantage_min": -0.10933790914714336, |
| "advantage_std": 0.09761173883453012, |
| "completion_length": 2373.9792251586914, |
| "epoch": 0.11771428571428572, |
| "grad_norm": 0.0012007191544398665, |
| "kl": 0.00010399753227829933, |
| "learning_rate": 9.706715543782064e-07, |
| "loss": -0.0011, |
| "reward": 0.11079470813274384, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09761174395680428, |
| "rewards/cosine_scaled_reward": -0.038000524044036865, |
| "rewards/format_reward": 0.7291666697710752, |
| "step": 103 |
| }, |
| { |
| "advantage_max": 0.11513470765203238, |
| "advantage_mean": -8.925174974083738e-09, |
| "advantage_min": -0.11279049189761281, |
| "advantage_std": 0.09129662462510169, |
| "completion_length": 2454.00004196167, |
| "epoch": 0.11885714285714286, |
| "grad_norm": 0.0016338457353413105, |
| "kl": 0.00012861378490924835, |
| "learning_rate": 9.695457105469804e-07, |
| "loss": -0.0011, |
| "reward": 0.06962736044079065, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.0912966295145452, |
| "rewards/cosine_scaled_reward": -0.05629314109683037, |
| "rewards/format_reward": 0.5208333376795053, |
| "step": 104 |
| }, |
| { |
| "advantage_max": 0.15599115658551455, |
| "advantage_mean": -4.346172109703783e-09, |
| "advantage_min": -0.23021886963397264, |
| "advantage_std": 0.1603334224782884, |
| "completion_length": 2441.083366394043, |
| "epoch": 0.12, |
| "grad_norm": 0.002161208540201187, |
| "kl": 0.00013302266597747803, |
| "learning_rate": 9.683994186497132e-07, |
| "loss": 0.0091, |
| "reward": 0.14520665351301432, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.16033342946320772, |
| "rewards/cosine_scaled_reward": 0.178804699331522, |
| "rewards/format_reward": 0.5000000074505806, |
| "step": 105 |
| }, |
| { |
| "advantage_max": 0.10074207372963428, |
| "advantage_mean": -7.916242286687414e-09, |
| "advantage_min": -0.16330508375540376, |
| "advantage_std": 0.10734084341675043, |
| "completion_length": 1822.1041870117188, |
| "epoch": 0.12114285714285715, |
| "grad_norm": 0.001392633537761867, |
| "kl": 6.573088467121124e-05, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": 0.0026, |
| "reward": 0.21404909482225776, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10734084062278271, |
| "rewards/cosine_scaled_reward": 0.24547897465527058, |
| "rewards/format_reward": 0.7708333395421505, |
| "step": 106 |
| }, |
| { |
| "advantage_max": 0.12848057132214308, |
| "advantage_mean": -3.802900563898426e-09, |
| "advantage_min": -0.18607094045728445, |
| "advantage_std": 0.12191221117973328, |
| "completion_length": 2692.791702270508, |
| "epoch": 0.12228571428571429, |
| "grad_norm": 0.0020098562818020582, |
| "kl": 0.0001888573169708252, |
| "learning_rate": 9.66045715125541e-07, |
| "loss": 0.0038, |
| "reward": 0.07842084765434265, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12191221164539456, |
| "rewards/cosine_scaled_reward": 0.001535113900899887, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 107 |
| }, |
| { |
| "advantage_max": 0.11962755676358938, |
| "advantage_mean": 1.552204420551817e-09, |
| "advantage_min": -0.14954979997128248, |
| "advantage_std": 0.10876429115887731, |
| "completion_length": 2763.416702270508, |
| "epoch": 0.12342857142857143, |
| "grad_norm": 0.0018431171774864197, |
| "kl": 0.00015020370483398438, |
| "learning_rate": 9.648384182148252e-07, |
| "loss": 0.0066, |
| "reward": 0.051005338318645954, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10876429441850632, |
| "rewards/cosine_scaled_reward": -0.038408463820815086, |
| "rewards/format_reward": 0.3750000037252903, |
| "step": 108 |
| }, |
| { |
| "advantage_max": 0.15706831123679876, |
| "advantage_mean": -2.483526884144993e-09, |
| "advantage_min": -0.14608103781938553, |
| "advantage_std": 0.12137589370831847, |
| "completion_length": 2733.2292098999023, |
| "epoch": 0.12457142857142857, |
| "grad_norm": 0.0020496873185038567, |
| "kl": 0.00012265145778656006, |
| "learning_rate": 9.636109026648554e-07, |
| "loss": 0.0055, |
| "reward": 0.05345132830552757, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12137589510530233, |
| "rewards/cosine_scaled_reward": -0.05171956028789282, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 109 |
| }, |
| { |
| "advantage_max": 0.2400534199550748, |
| "advantage_mean": -5.355104998328031e-09, |
| "advantage_min": -0.1949408515356481, |
| "advantage_std": 0.19347044127061963, |
| "completion_length": 2730.4375228881836, |
| "epoch": 0.12571428571428572, |
| "grad_norm": 0.0037184932734817266, |
| "kl": 0.00014457106590270996, |
| "learning_rate": 9.623632283030077e-07, |
| "loss": 0.0093, |
| "reward": 0.12486143945716321, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.19347044127061963, |
| "rewards/cosine_scaled_reward": 0.09646196017274633, |
| "rewards/format_reward": 0.5416666734963655, |
| "step": 110 |
| }, |
| { |
| "advantage_max": 0.15602963138371706, |
| "advantage_mean": -2.9491884628862763e-09, |
| "advantage_min": -0.1375290732830763, |
| "advantage_std": 0.11554153729230165, |
| "completion_length": 2890.562530517578, |
| "epoch": 0.12685714285714286, |
| "grad_norm": 0.002402157988399267, |
| "kl": 0.0001817643642425537, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": -0.005, |
| "reward": 0.08678329293616116, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11554153636097908, |
| "rewards/cosine_scaled_reward": 0.03825441841036081, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 111 |
| }, |
| { |
| "advantage_max": 0.18926704861223698, |
| "advantage_mean": -2.6387472762401387e-09, |
| "advantage_min": -0.25633513927459717, |
| "advantage_std": 0.1728609693236649, |
| "completion_length": 2748.7500762939453, |
| "epoch": 0.128, |
| "grad_norm": 0.002987251617014408, |
| "kl": 0.0001405477523803711, |
| "learning_rate": 9.598076473627796e-07, |
| "loss": 0.0127, |
| "reward": 0.11737876618281007, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.17286097328178585, |
| "rewards/cosine_scaled_reward": 0.054929095320403576, |
| "rewards/format_reward": 0.5833333563059568, |
| "step": 112 |
| }, |
| { |
| "advantage_max": 0.20071829669177532, |
| "advantage_mean": -3.6476802481311132e-09, |
| "advantage_min": -0.14141817204654217, |
| "advantage_std": 0.13287563156336546, |
| "completion_length": 2007.8334159851074, |
| "epoch": 0.12914285714285714, |
| "grad_norm": 0.001896014902740717, |
| "kl": 0.00015522539615631104, |
| "learning_rate": 9.58499865339809e-07, |
| "loss": 0.0049, |
| "reward": 0.14661651686765254, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13287563901394606, |
| "rewards/cosine_scaled_reward": 0.03844105708412826, |
| "rewards/format_reward": 0.7916666697710752, |
| "step": 113 |
| }, |
| { |
| "advantage_max": 0.10647185088600963, |
| "advantage_mean": -2.7551627541921864e-09, |
| "advantage_min": -0.11569630762096494, |
| "advantage_std": 0.09618571458850056, |
| "completion_length": 2201.1042137145996, |
| "epoch": 0.13028571428571428, |
| "grad_norm": 0.0018926298944279552, |
| "kl": 8.186884224414825e-05, |
| "learning_rate": 9.571721736097088e-07, |
| "loss": 0.0013, |
| "reward": 0.07421890611294657, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09618572029285133, |
| "rewards/cosine_scaled_reward": -0.13425324205309153, |
| "rewards/format_reward": 0.708333333954215, |
| "step": 114 |
| }, |
| { |
| "advantage_max": 0.16031183023005724, |
| "advantage_mean": -1.552202394394797e-10, |
| "advantage_min": -0.16533454321324825, |
| "advantage_std": 0.14068402699194849, |
| "completion_length": 2915.5625228881836, |
| "epoch": 0.13142857142857142, |
| "grad_norm": 0.0025180347729474306, |
| "kl": 0.00015562772750854492, |
| "learning_rate": 9.55824636882301e-07, |
| "loss": 0.0035, |
| "reward": 0.06694650682038628, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14068402582779527, |
| "rewards/cosine_scaled_reward": 0.00980508397333324, |
| "rewards/format_reward": 0.37500000186264515, |
| "step": 115 |
| }, |
| { |
| "advantage_max": 0.1184748588129878, |
| "advantage_mean": -6.208816516473092e-10, |
| "advantage_min": -0.13819944020360708, |
| "advantage_std": 0.10394375585019588, |
| "completion_length": 3137.541679382324, |
| "epoch": 0.13257142857142856, |
| "grad_norm": 0.0017691698158159852, |
| "kl": 0.00019854307174682617, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": 0.0012, |
| "reward": 0.013275583041831851, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10394375957548618, |
| "rewards/cosine_scaled_reward": -0.09652221202850342, |
| "rewards/format_reward": 0.27083333395421505, |
| "step": 116 |
| }, |
| { |
| "advantage_max": 0.1553569696843624, |
| "advantage_mean": -1.8626452047421083e-09, |
| "advantage_min": -0.12070130556821823, |
| "advantage_std": 0.1120496722869575, |
| "completion_length": 2608.791732788086, |
| "epoch": 0.1337142857142857, |
| "grad_norm": 0.0024786260910332203, |
| "kl": 0.00016558915376663208, |
| "learning_rate": 9.530702921077358e-07, |
| "loss": 0.0038, |
| "reward": 0.01226228941231966, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1120496797375381, |
| "rewards/cosine_scaled_reward": -0.21452020248398185, |
| "rewards/format_reward": 0.5000000074505806, |
| "step": 117 |
| }, |
| { |
| "advantage_max": 0.1705867312848568, |
| "advantage_mean": 2.7755575615628914e-17, |
| "advantage_min": -0.28973726741969585, |
| "advantage_std": 0.1876918189227581, |
| "completion_length": 2918.7708740234375, |
| "epoch": 0.13485714285714287, |
| "grad_norm": 0.0038028184790164232, |
| "kl": 0.0001245737075805664, |
| "learning_rate": 9.516636183034564e-07, |
| "loss": 0.0099, |
| "reward": 0.18796764593571424, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.18769182451069355, |
| "rewards/cosine_scaled_reward": 0.2854941412806511, |
| "rewards/format_reward": 0.5416666828095913, |
| "step": 118 |
| }, |
| { |
| "advantage_max": 0.12709664832800627, |
| "advantage_mean": -4.656613011855271e-10, |
| "advantage_min": -0.14629495097324252, |
| "advantage_std": 0.10539561160840094, |
| "completion_length": 1824.270866394043, |
| "epoch": 0.136, |
| "grad_norm": 0.0015314030461013317, |
| "kl": 0.00011080782860517502, |
| "learning_rate": 9.502373679810839e-07, |
| "loss": 0.0013, |
| "reward": 0.16728377249091864, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10539561323821545, |
| "rewards/cosine_scaled_reward": 0.12748285697307438, |
| "rewards/format_reward": 0.7291666716337204, |
| "step": 119 |
| }, |
| { |
| "advantage_max": 0.13794818706810474, |
| "advantage_mean": -2.2118911702229127e-09, |
| "advantage_min": -0.13365713064558804, |
| "advantage_std": 0.10238635609857738, |
| "completion_length": 2093.2291870117188, |
| "epoch": 0.13714285714285715, |
| "grad_norm": 0.002086564665660262, |
| "kl": 0.00019008666276931763, |
| "learning_rate": 9.487916106540465e-07, |
| "loss": -0.0029, |
| "reward": 0.11550817801617086, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10238635912537575, |
| "rewards/cosine_scaled_reward": 0.03842526860535145, |
| "rewards/format_reward": 0.6041666697710752, |
| "step": 120 |
| }, |
| { |
| "advantage_max": 0.06518118735402822, |
| "advantage_mean": -9.041590292441226e-09, |
| "advantage_min": -0.10872633708640933, |
| "advantage_std": 0.06719960737973452, |
| "completion_length": 1744.4791946411133, |
| "epoch": 0.1382857142857143, |
| "grad_norm": 0.0014031269820407033, |
| "kl": 0.00011703372001647949, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0042, |
| "reward": 0.1042367173358798, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.06719961203634739, |
| "rewards/cosine_scaled_reward": -0.0471935048699379, |
| "rewards/format_reward": 0.7083333358168602, |
| "step": 121 |
| }, |
| { |
| "advantage_max": 0.1840760800987482, |
| "advantage_mean": -2.0178656245928295e-09, |
| "advantage_min": -0.13638843223452568, |
| "advantage_std": 0.12992733856663108, |
| "completion_length": 2759.375030517578, |
| "epoch": 0.13942857142857143, |
| "grad_norm": 0.002573323668912053, |
| "kl": 0.00018534809350967407, |
| "learning_rate": 9.458418577899774e-07, |
| "loss": 0.0003, |
| "reward": 0.09318617288954556, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12992734229192138, |
| "rewards/cosine_scaled_reward": -0.0037322649732232094, |
| "rewards/format_reward": 0.5625000055879354, |
| "step": 122 |
| }, |
| { |
| "advantage_max": 0.19431301951408386, |
| "advantage_mean": -3.026798457705926e-09, |
| "advantage_min": -0.1697028325870633, |
| "advantage_std": 0.14136297907680273, |
| "completion_length": 2471.1667098999023, |
| "epoch": 0.14057142857142857, |
| "grad_norm": 0.0024162298068404198, |
| "kl": 0.00014369189739227295, |
| "learning_rate": 9.443380060197385e-07, |
| "loss": 0.0062, |
| "reward": 0.09077914047520608, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1413629837334156, |
| "rewards/cosine_scaled_reward": -0.014139397069811821, |
| "rewards/format_reward": 0.5625000149011612, |
| "step": 123 |
| }, |
| { |
| "advantage_max": 0.1764494488015771, |
| "advantage_mean": -2.1265199989795036e-08, |
| "advantage_min": -0.19905486050993204, |
| "advantage_std": 0.14608955709263682, |
| "completion_length": 2030.208351135254, |
| "epoch": 0.1417142857142857, |
| "grad_norm": 0.0023175496608018875, |
| "kl": 8.096173405647278e-05, |
| "learning_rate": 9.428149347714143e-07, |
| "loss": 0.0027, |
| "reward": 0.20607653993647546, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14608956221491098, |
| "rewards/cosine_scaled_reward": 0.2967276629060507, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 124 |
| }, |
| { |
| "advantage_max": 0.17857370153069496, |
| "advantage_mean": -2.095475917784917e-09, |
| "advantage_min": -0.14456479204818606, |
| "advantage_std": 0.12773457053117454, |
| "completion_length": 2383.2917137145996, |
| "epoch": 0.14285714285714285, |
| "grad_norm": 0.002172604901716113, |
| "kl": 0.00010094791650772095, |
| "learning_rate": 9.412727182773486e-07, |
| "loss": 0.0083, |
| "reward": 0.10760475019924343, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1277345723938197, |
| "rewards/cosine_scaled_reward": 0.07878711789089721, |
| "rewards/format_reward": 0.4791666753590107, |
| "step": 125 |
| }, |
| { |
| "advantage_max": 0.153425102122128, |
| "advantage_mean": -1.629814533332663e-09, |
| "advantage_min": -0.17162158340215683, |
| "advantage_std": 0.13127819541841745, |
| "completion_length": 2324.458396911621, |
| "epoch": 0.144, |
| "grad_norm": 0.0026990657206624746, |
| "kl": 0.00010439753532409668, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": 0.0046, |
| "reward": 0.16764532215893269, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1312782042659819, |
| "rewards/cosine_scaled_reward": 0.1628055665642023, |
| "rewards/format_reward": 0.666666679084301, |
| "step": 126 |
| }, |
| { |
| "advantage_max": 0.11602956661954522, |
| "advantage_mean": -3.1044090909038147e-10, |
| "advantage_min": -0.13568047992885113, |
| "advantage_std": 0.09200059063732624, |
| "completion_length": 3087.354202270508, |
| "epoch": 0.14514285714285713, |
| "grad_norm": 0.0016305146273225546, |
| "kl": 0.00017446279525756836, |
| "learning_rate": 9.381311511432658e-07, |
| "loss": 0.0051, |
| "reward": 0.007634018547832966, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09200059436261654, |
| "rewards/cosine_scaled_reward": -0.1423247866332531, |
| "rewards/format_reward": 0.3333333395421505, |
| "step": 127 |
| }, |
| { |
| "advantage_max": 0.13058222271502018, |
| "advantage_mean": 8.537123716290118e-10, |
| "advantage_min": -0.15740781952627003, |
| "advantage_std": 0.121270950185135, |
| "completion_length": 2198.8542251586914, |
| "epoch": 0.1462857142857143, |
| "grad_norm": 0.0022203184198588133, |
| "kl": 0.00014868378639221191, |
| "learning_rate": 9.36531953618799e-07, |
| "loss": 0.0063, |
| "reward": 0.1890734031330794, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12127095158211887, |
| "rewards/cosine_scaled_reward": 0.21513771638274193, |
| "rewards/format_reward": 0.6875, |
| "step": 128 |
| }, |
| { |
| "advantage_max": 0.1487028319388628, |
| "advantage_mean": -1.0865429250772607e-09, |
| "advantage_min": -0.149984628893435, |
| "advantage_std": 0.12652261182665825, |
| "completion_length": 3222.437530517578, |
| "epoch": 0.14742857142857144, |
| "grad_norm": 0.002289236057549715, |
| "kl": 0.00017118453979492188, |
| "learning_rate": 9.34913917072228e-07, |
| "loss": 0.0006, |
| "reward": 0.11886557843536139, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12652262020856142, |
| "rewards/cosine_scaled_reward": 0.1228477107360959, |
| "rewards/format_reward": 0.45833333767950535, |
| "step": 129 |
| }, |
| { |
| "advantage_max": 0.18156287958845496, |
| "advantage_mean": -3.104409507237449e-10, |
| "advantage_min": -0.1829990753903985, |
| "advantage_std": 0.15616408130154014, |
| "completion_length": 2944.9375228881836, |
| "epoch": 0.14857142857142858, |
| "grad_norm": 0.003384327283129096, |
| "kl": 0.00020888447761535645, |
| "learning_rate": 9.332771203643714e-07, |
| "loss": 0.0086, |
| "reward": 0.0558637254871428, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15616408409550786, |
| "rewards/cosine_scaled_reward": -0.001300264149904251, |
| "rewards/format_reward": 0.3333333395421505, |
| "step": 130 |
| }, |
| { |
| "advantage_max": 0.11768224369734526, |
| "advantage_mean": -2.638747387262441e-09, |
| "advantage_min": -0.16481691598892212, |
| "advantage_std": 0.11381826514843851, |
| "completion_length": 2425.687545776367, |
| "epoch": 0.14971428571428572, |
| "grad_norm": 0.002549993572756648, |
| "kl": 0.0001547001302242279, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.0086, |
| "reward": 0.1262263646349311, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11381827830336988, |
| "rewards/cosine_scaled_reward": 0.09911833424121141, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 131 |
| }, |
| { |
| "advantage_max": 0.20303583005443215, |
| "advantage_mean": -1.7850349531833842e-09, |
| "advantage_min": -0.17521136440336704, |
| "advantage_std": 0.156132394913584, |
| "completion_length": 2537.3958435058594, |
| "epoch": 0.15085714285714286, |
| "grad_norm": 0.0023349204566329718, |
| "kl": 0.00015839934349060059, |
| "learning_rate": 9.299475664759068e-07, |
| "loss": 0.0022, |
| "reward": 0.14965266874060035, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15613240003585815, |
| "rewards/cosine_scaled_reward": 0.19302122248336673, |
| "rewards/format_reward": 0.5000000018626451, |
| "step": 132 |
| }, |
| { |
| "advantage_max": 0.21418469492346048, |
| "advantage_mean": -1.7074247987691749e-09, |
| "advantage_min": -0.14465673360973597, |
| "advantage_std": 0.13837812095880508, |
| "completion_length": 3043.8750762939453, |
| "epoch": 0.152, |
| "grad_norm": 0.003348530502989888, |
| "kl": 0.00022482872009277344, |
| "learning_rate": 9.282549715730579e-07, |
| "loss": 0.0107, |
| "reward": 0.05317601654678583, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13837812119163573, |
| "rewards/cosine_scaled_reward": -0.041860264958813787, |
| "rewards/format_reward": 0.39583333767950535, |
| "step": 133 |
| }, |
| { |
| "advantage_max": 0.13740387186408043, |
| "advantage_mean": -9.313227272711444e-10, |
| "advantage_min": -0.1500786654651165, |
| "advantage_std": 0.11633877758868039, |
| "completion_length": 2266.4375381469727, |
| "epoch": 0.15314285714285714, |
| "grad_norm": 0.002200792543590069, |
| "kl": 0.00014699995517730713, |
| "learning_rate": 9.265439410565328e-07, |
| "loss": 0.0056, |
| "reward": 0.14638759847730398, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11633878224529326, |
| "rewards/cosine_scaled_reward": 0.11991992685943842, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 134 |
| }, |
| { |
| "advantage_max": 0.08174204314127564, |
| "advantage_mean": 3.0267983952558808e-09, |
| "advantage_min": -0.1401548283174634, |
| "advantage_std": 0.08930786373093724, |
| "completion_length": 1240.6041946411133, |
| "epoch": 0.15428571428571428, |
| "grad_norm": 0.000816557090729475, |
| "kl": 6.493180990219116e-05, |
| "learning_rate": 9.248145583195447e-07, |
| "loss": 0.0025, |
| "reward": 0.24731288943439722, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08930786419659853, |
| "rewards/cosine_scaled_reward": 0.3153181979432702, |
| "rewards/format_reward": 0.8333333358168602, |
| "step": 135 |
| }, |
| { |
| "advantage_max": 0.14499722514301538, |
| "advantage_mean": -3.2596290250319626e-09, |
| "advantage_min": -0.2093830332159996, |
| "advantage_std": 0.1496648290194571, |
| "completion_length": 2443.604202270508, |
| "epoch": 0.15542857142857142, |
| "grad_norm": 0.0018037001136690378, |
| "kl": 0.00012743473052978516, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": 0.0026, |
| "reward": 0.15657305950298905, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14966483181342483, |
| "rewards/cosine_scaled_reward": 0.19108737260103226, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 136 |
| }, |
| { |
| "advantage_max": 0.2110500643029809, |
| "advantage_mean": 3.5700699757557075e-09, |
| "advantage_min": -0.16157893557101488, |
| "advantage_std": 0.14996290765702724, |
| "completion_length": 2808.6250534057617, |
| "epoch": 0.15657142857142858, |
| "grad_norm": 0.003018659772351384, |
| "kl": 0.00014230981469154358, |
| "learning_rate": 9.213010742252327e-07, |
| "loss": 0.0047, |
| "reward": 0.06567161390557885, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.14996290765702724, |
| "rewards/cosine_scaled_reward": -0.0245496213901788, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 137 |
| }, |
| { |
| "advantage_max": 0.13625410571694374, |
| "advantage_mean": -7.081932080696407e-10, |
| "advantage_min": -0.17112221661955118, |
| "advantage_std": 0.12045667658094317, |
| "completion_length": 2566.395881652832, |
| "epoch": 0.15771428571428572, |
| "grad_norm": 0.0023109198082238436, |
| "kl": 0.00012245774269104004, |
| "learning_rate": 9.195171441101668e-07, |
| "loss": 0.0024, |
| "reward": 0.08583869109861553, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12045668461360037, |
| "rewards/cosine_scaled_reward": -0.058677418157458305, |
| "rewards/format_reward": 0.6250000149011612, |
| "step": 138 |
| }, |
| { |
| "advantage_max": 0.17770376801490784, |
| "advantage_mean": -6.51925811251397e-09, |
| "advantage_min": -0.14586754702031612, |
| "advantage_std": 0.13520123437047005, |
| "completion_length": 2895.729202270508, |
| "epoch": 0.15885714285714286, |
| "grad_norm": 0.002674209652468562, |
| "kl": 0.00019547343254089355, |
| "learning_rate": 9.177152042508077e-07, |
| "loss": 0.0058, |
| "reward": 0.10707889473997056, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1352012469433248, |
| "rewards/cosine_scaled_reward": 0.024583726655691862, |
| "rewards/format_reward": 0.5833333376795053, |
| "step": 139 |
| }, |
| { |
| "advantage_max": 0.25041482876986265, |
| "advantage_mean": 2.4835269119005687e-09, |
| "advantage_min": -0.16461172699928284, |
| "advantage_std": 0.16128699900582433, |
| "completion_length": 2781.854217529297, |
| "epoch": 0.16, |
| "grad_norm": 0.0029202878940850496, |
| "kl": 0.00022208690643310547, |
| "learning_rate": 9.158953424711624e-07, |
| "loss": 0.0026, |
| "reward": 0.11235592421144247, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.16128700133413076, |
| "rewards/cosine_scaled_reward": 0.07993581797927618, |
| "rewards/format_reward": 0.5000000093132257, |
| "step": 140 |
| }, |
| { |
| "advantage_max": 0.21202841773629189, |
| "advantage_mean": -1.3877787807814457e-17, |
| "advantage_min": -0.13598172459751368, |
| "advantage_std": 0.13371713273227215, |
| "completion_length": 2812.6458740234375, |
| "epoch": 0.16114285714285714, |
| "grad_norm": 0.0021927033085376024, |
| "kl": 0.00016339123249053955, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0039, |
| "reward": 0.06701798271387815, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13371713738888502, |
| "rewards/cosine_scaled_reward": -0.03017127327620983, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 141 |
| }, |
| { |
| "advantage_max": 0.1576713090762496, |
| "advantage_mean": -4.928248802105184e-09, |
| "advantage_min": -0.144107595551759, |
| "advantage_std": 0.1275632563047111, |
| "completion_length": 2608.3125610351562, |
| "epoch": 0.16228571428571428, |
| "grad_norm": 0.0021633445285260677, |
| "kl": 0.0001731477677822113, |
| "learning_rate": 9.122022088101613e-07, |
| "loss": 0.0054, |
| "reward": 0.06165223941206932, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.127563264220953, |
| "rewards/cosine_scaled_reward": -0.0996420830488205, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 142 |
| }, |
| { |
| "advantage_max": 0.11721245618537068, |
| "advantage_mean": -6.907309196835243e-09, |
| "advantage_min": -0.12485382426530123, |
| "advantage_std": 0.10056370904203504, |
| "completion_length": 2204.1042098999023, |
| "epoch": 0.16342857142857142, |
| "grad_norm": 0.0021462785080075264, |
| "kl": 0.00016664713621139526, |
| "learning_rate": 9.103291169269299e-07, |
| "loss": -0.0011, |
| "reward": 0.11139208162785508, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.10056372033432126, |
| "rewards/cosine_scaled_reward": 0.004287723801098764, |
| "rewards/format_reward": 0.6458333358168602, |
| "step": 143 |
| }, |
| { |
| "advantage_max": 0.1503322133794427, |
| "advantage_mean": -4.967054101356894e-09, |
| "advantage_min": -0.14401227980852127, |
| "advantage_std": 0.12609638017602265, |
| "completion_length": 2357.0416946411133, |
| "epoch": 0.16457142857142856, |
| "grad_norm": 0.0021524883341044188, |
| "kl": 0.0001500248908996582, |
| "learning_rate": 9.084384631108882e-07, |
| "loss": -0.0005, |
| "reward": 0.20968507044017315, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12609639088623226, |
| "rewards/cosine_scaled_reward": 0.3309060502797365, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 144 |
| }, |
| { |
| "advantage_max": 0.15520280180498958, |
| "advantage_mean": -1.3193737144479023e-09, |
| "advantage_min": -0.14075595536269248, |
| "advantage_std": 0.12264994671568274, |
| "completion_length": 1946.333339691162, |
| "epoch": 0.1657142857142857, |
| "grad_norm": 0.0021455343812704086, |
| "kl": 0.00014033913612365723, |
| "learning_rate": 9.065303395098358e-07, |
| "loss": 0.005, |
| "reward": 0.1299537445884198, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12264994950965047, |
| "rewards/cosine_scaled_reward": 0.06085195206105709, |
| "rewards/format_reward": 0.6458333358168602, |
| "step": 145 |
| }, |
| { |
| "advantage_max": 0.11647297162562609, |
| "advantage_mean": 1.0865430360995632e-09, |
| "advantage_min": -0.11792057100683451, |
| "advantage_std": 0.0969981993548572, |
| "completion_length": 2206.2500610351562, |
| "epoch": 0.16685714285714287, |
| "grad_norm": 0.0013740290887653828, |
| "kl": 0.0001351572573184967, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": 0.0043, |
| "reward": 0.06711362052010372, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09699820261448622, |
| "rewards/cosine_scaled_reward": -0.10451544541865587, |
| "rewards/format_reward": 0.6041666697710752, |
| "step": 146 |
| }, |
| { |
| "advantage_max": 0.16654492495581508, |
| "advantage_mean": -2.0954757998237206e-09, |
| "advantage_min": -0.15912125445902348, |
| "advantage_std": 0.1402019909583032, |
| "completion_length": 3362.9375610351562, |
| "epoch": 0.168, |
| "grad_norm": 0.0023271015379577875, |
| "kl": 0.0002307295799255371, |
| "learning_rate": 9.026620557966279e-07, |
| "loss": 0.0032, |
| "reward": 0.016447328962385654, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14020200027152896, |
| "rewards/cosine_scaled_reward": -0.0677786897867918, |
| "rewards/format_reward": 0.22916667349636555, |
| "step": 147 |
| }, |
| { |
| "advantage_max": 0.09006885858252645, |
| "advantage_mean": -2.2506962404911235e-09, |
| "advantage_min": -0.11421419773250818, |
| "advantage_std": 0.08783065434545279, |
| "completion_length": 2373.8750534057617, |
| "epoch": 0.16914285714285715, |
| "grad_norm": 0.0019086383981630206, |
| "kl": 0.00016707181930541992, |
| "learning_rate": 9.007020842191634e-07, |
| "loss": 0.0023, |
| "reward": 0.0981850721873343, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08783066272735596, |
| "rewards/cosine_scaled_reward": 0.006881414446979761, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 148 |
| }, |
| { |
| "advantage_max": 0.13361886190250516, |
| "advantage_mean": -7.528191028893794e-09, |
| "advantage_min": -0.1478055864572525, |
| "advantage_std": 0.10741094080731273, |
| "completion_length": 2408.666732788086, |
| "epoch": 0.1702857142857143, |
| "grad_norm": 0.0021227849647402763, |
| "kl": 0.00010327436029911041, |
| "learning_rate": 8.987250199168808e-07, |
| "loss": 0.0048, |
| "reward": 0.15707366378046572, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10741094499826431, |
| "rewards/cosine_scaled_reward": 0.1423771446570754, |
| "rewards/format_reward": 0.6458333395421505, |
| "step": 149 |
| }, |
| { |
| "advantage_max": 0.20952890440821648, |
| "advantage_mean": -1.3969839313121568e-09, |
| "advantage_min": -0.183829627931118, |
| "advantage_std": 0.1736277553718537, |
| "completion_length": 2727.5208892822266, |
| "epoch": 0.17142857142857143, |
| "grad_norm": 0.002815204905346036, |
| "kl": 0.00017786026000976562, |
| "learning_rate": 8.967309592491052e-07, |
| "loss": 0.0096, |
| "reward": 0.0818605124950409, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1736277644522488, |
| "rewards/cosine_scaled_reward": -0.0079753203317523, |
| "rewards/format_reward": 0.5000000055879354, |
| "step": 150 |
| }, |
| { |
| "advantage_max": 0.20418076124042273, |
| "advantage_mean": -3.4148497640718034e-09, |
| "advantage_min": -0.1933022839948535, |
| "advantage_std": 0.16485009621828794, |
| "completion_length": 2286.5209045410156, |
| "epoch": 0.17257142857142857, |
| "grad_norm": 0.004568588919937611, |
| "kl": 0.00023168325424194336, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0172, |
| "reward": 0.2044738749973476, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.16485009947791696, |
| "rewards/cosine_scaled_reward": 0.2511125993914902, |
| "rewards/format_reward": 0.7083333414047956, |
| "step": 151 |
| }, |
| { |
| "advantage_max": 0.21830918407067657, |
| "advantage_mean": -2.1730860721991263e-09, |
| "advantage_min": -0.09182694740593433, |
| "advantage_std": 0.11862736381590366, |
| "completion_length": 2849.2291984558105, |
| "epoch": 0.1737142857142857, |
| "grad_norm": 0.002062483923509717, |
| "kl": 0.0002673119306564331, |
| "learning_rate": 8.926922383915315e-07, |
| "loss": 0.0052, |
| "reward": 0.005771389231085777, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11862737126648426, |
| "rewards/cosine_scaled_reward": -0.13862175261601806, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 152 |
| }, |
| { |
| "advantage_max": 0.18801967659965158, |
| "advantage_mean": -6.984919032060333e-10, |
| "advantage_min": -0.1435602605342865, |
| "advantage_std": 0.12760146823711693, |
| "completion_length": 2704.145866394043, |
| "epoch": 0.17485714285714285, |
| "grad_norm": 0.0024463790468871593, |
| "kl": 0.00021246075630187988, |
| "learning_rate": 8.906477750432903e-07, |
| "loss": 0.0017, |
| "reward": 0.05029630567878485, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12760147219523787, |
| "rewards/cosine_scaled_reward": -0.09206442115828395, |
| "rewards/format_reward": 0.47916667349636555, |
| "step": 153 |
| }, |
| { |
| "advantage_max": 0.14180888701230288, |
| "advantage_mean": -2.2506962960022747e-09, |
| "advantage_min": -0.23996069841086864, |
| "advantage_std": 0.15492864465340972, |
| "completion_length": 2850.729217529297, |
| "epoch": 0.176, |
| "grad_norm": 0.0028651999309659004, |
| "kl": 0.00016963481903076172, |
| "learning_rate": 8.88586709003076e-07, |
| "loss": 0.0099, |
| "reward": 0.15311535075306892, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1549286488443613, |
| "rewards/cosine_scaled_reward": 0.1717417575418949, |
| "rewards/format_reward": 0.562500013038516, |
| "step": 154 |
| }, |
| { |
| "advantage_max": 0.173433696385473, |
| "advantage_mean": -7.528191139916096e-09, |
| "advantage_min": -0.15252949902787805, |
| "advantage_std": 0.13626911328174174, |
| "completion_length": 2346.8333587646484, |
| "epoch": 0.17714285714285713, |
| "grad_norm": 0.0024740456137806177, |
| "kl": 0.00018147937953472137, |
| "learning_rate": 8.865091407243394e-07, |
| "loss": 0.005, |
| "reward": 0.10649993130937219, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13626911351457238, |
| "rewards/cosine_scaled_reward": 0.05439352709800005, |
| "rewards/format_reward": 0.5208333376795053, |
| "step": 155 |
| }, |
| { |
| "advantage_max": 0.1344355084002018, |
| "advantage_mean": -9.313225746154785e-10, |
| "advantage_min": -0.1348655167967081, |
| "advantage_std": 0.10152879962697625, |
| "completion_length": 2678.833366394043, |
| "epoch": 0.1782857142857143, |
| "grad_norm": 0.0013925960520282388, |
| "kl": 0.00019456446170806885, |
| "learning_rate": 8.844151714648274e-07, |
| "loss": 0.0022, |
| "reward": 0.07135126600041986, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10152880381792784, |
| "rewards/cosine_scaled_reward": 0.011746692471206188, |
| "rewards/format_reward": 0.3958333395421505, |
| "step": 156 |
| }, |
| { |
| "advantage_max": 0.11662959074601531, |
| "advantage_mean": 4.656613428188905e-10, |
| "advantage_min": -0.13163182232528925, |
| "advantage_std": 0.10447747865691781, |
| "completion_length": 2641.729202270508, |
| "epoch": 0.17942857142857144, |
| "grad_norm": 0.0014279123861342669, |
| "kl": 0.00022292137145996094, |
| "learning_rate": 8.823049032816478e-07, |
| "loss": 0.002, |
| "reward": 0.039941683411598206, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10447748005390167, |
| "rewards/cosine_scaled_reward": -0.09014054387807846, |
| "rewards/format_reward": 0.4166666679084301, |
| "step": 157 |
| }, |
| { |
| "advantage_max": 0.12401013169437647, |
| "advantage_mean": -2.17308623873258e-09, |
| "advantage_min": -0.10845753783360124, |
| "advantage_std": 0.09371542499866337, |
| "completion_length": 2349.4583587646484, |
| "epoch": 0.18057142857142858, |
| "grad_norm": 0.0023050915915519, |
| "kl": 0.0002039596438407898, |
| "learning_rate": 8.801784390262943e-07, |
| "loss": 0.0002, |
| "reward": 0.1618611067533493, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09371542546432465, |
| "rewards/cosine_scaled_reward": 0.1614538454450667, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 158 |
| }, |
| { |
| "advantage_max": 0.22995528485625982, |
| "advantage_mean": 1.4357891403582457e-09, |
| "advantage_min": -0.13199414312839508, |
| "advantage_std": 0.135023855837062, |
| "completion_length": 3314.125030517578, |
| "epoch": 0.18171428571428572, |
| "grad_norm": 0.0022246637381613255, |
| "kl": 0.0002588033676147461, |
| "learning_rate": 8.780358823396352e-07, |
| "loss": 0.0053, |
| "reward": 0.002544154027418699, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1350238579325378, |
| "rewards/cosine_scaled_reward": -0.11882635089568794, |
| "rewards/format_reward": 0.2500000074505806, |
| "step": 159 |
| }, |
| { |
| "advantage_max": 0.1304126875475049, |
| "advantage_mean": -2.483526828633842e-09, |
| "advantage_min": -0.15111864916980267, |
| "advantage_std": 0.11367706721648574, |
| "completion_length": 2916.458366394043, |
| "epoch": 0.18285714285714286, |
| "grad_norm": 0.002428490901365876, |
| "kl": 0.0002749040722846985, |
| "learning_rate": 8.758773376468604e-07, |
| "loss": 0.0027, |
| "reward": 0.10357946204021573, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1136770648881793, |
| "rewards/cosine_scaled_reward": 0.07447406277060509, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 160 |
| }, |
| { |
| "advantage_max": 0.22650268021970987, |
| "advantage_mean": -4.19095166903638e-09, |
| "advantage_min": -0.21486747544258833, |
| "advantage_std": 0.1732907984405756, |
| "completion_length": 2120.5833892822266, |
| "epoch": 0.184, |
| "grad_norm": 0.00323474477045238, |
| "kl": 0.00017549656331539154, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": 0.0164, |
| "reward": 0.16077806614339352, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.17329080402851105, |
| "rewards/cosine_scaled_reward": 0.11763785593211651, |
| "rewards/format_reward": 0.7083333395421505, |
| "step": 161 |
| }, |
| { |
| "advantage_max": 0.24303656117990613, |
| "advantage_mean": -1.823840096309981e-09, |
| "advantage_min": -0.20982816815376282, |
| "advantage_std": 0.1883863634429872, |
| "completion_length": 3135.729217529297, |
| "epoch": 0.18514285714285714, |
| "grad_norm": 0.003710554214194417, |
| "kl": 0.00029730796813964844, |
| "learning_rate": 8.715127058347614e-07, |
| "loss": 0.0127, |
| "reward": 0.06095714052207768, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1883863634429872, |
| "rewards/cosine_scaled_reward": -0.007758868858218193, |
| "rewards/format_reward": 0.3750000111758709, |
| "step": 162 |
| }, |
| { |
| "advantage_max": 0.19018926797434688, |
| "advantage_mean": -2.3283065614387866e-09, |
| "advantage_min": -0.16702749021351337, |
| "advantage_std": 0.1346902009099722, |
| "completion_length": 2321.00008392334, |
| "epoch": 0.18628571428571428, |
| "grad_norm": 0.002764482283964753, |
| "kl": 0.00019381940364837646, |
| "learning_rate": 8.693068314414344e-07, |
| "loss": 0.0069, |
| "reward": 0.15160547848790884, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13469020370393991, |
| "rewards/cosine_scaled_reward": 0.12644020980224013, |
| "rewards/format_reward": 0.645833345130086, |
| "step": 163 |
| }, |
| { |
| "advantage_max": 0.17234268225729465, |
| "advantage_mean": -9.313226051466117e-09, |
| "advantage_min": -0.17475124169141054, |
| "advantage_std": 0.13838404836133122, |
| "completion_length": 2284.3541946411133, |
| "epoch": 0.18742857142857142, |
| "grad_norm": 0.0023905131965875626, |
| "kl": 0.00018963217735290527, |
| "learning_rate": 8.670853944836176e-07, |
| "loss": 0.0053, |
| "reward": 0.14360764995217323, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13838405720889568, |
| "rewards/cosine_scaled_reward": 0.13964655436575413, |
| "rewards/format_reward": 0.5625000037252903, |
| "step": 164 |
| }, |
| { |
| "advantage_max": 0.17663770401850343, |
| "advantage_mean": 7.761016135310328e-11, |
| "advantage_min": -0.13784242887049913, |
| "advantage_std": 0.12715793796814978, |
| "completion_length": 2959.3958740234375, |
| "epoch": 0.18857142857142858, |
| "grad_norm": 0.0023930887691676617, |
| "kl": 0.00022131530568003654, |
| "learning_rate": 8.648485032310144e-07, |
| "loss": 0.0036, |
| "reward": 0.01792388770263642, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.127157939132303, |
| "rewards/cosine_scaled_reward": -0.13487120415084064, |
| "rewards/format_reward": 0.37500000931322575, |
| "step": 165 |
| }, |
| { |
| "advantage_max": 0.26935879176016897, |
| "advantage_mean": -7.411775679311283e-09, |
| "advantage_min": -0.21388308005407453, |
| "advantage_std": 0.19788388686720282, |
| "completion_length": 2474.6458892822266, |
| "epoch": 0.18971428571428572, |
| "grad_norm": 0.0030500064603984356, |
| "kl": 0.00017523393034934998, |
| "learning_rate": 8.625962667065487e-07, |
| "loss": 0.0142, |
| "reward": 0.13143354514613748, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1978838904760778, |
| "rewards/cosine_scaled_reward": 0.06401193561032414, |
| "rewards/format_reward": 0.6458333395421505, |
| "step": 166 |
| }, |
| { |
| "advantage_max": 0.09043146530166268, |
| "advantage_mean": -3.725290492750943e-09, |
| "advantage_min": -0.1378228161484003, |
| "advantage_std": 0.09450012096203864, |
| "completion_length": 2120.4166946411133, |
| "epoch": 0.19085714285714286, |
| "grad_norm": 0.001775244832970202, |
| "kl": 0.00016416609287261963, |
| "learning_rate": 8.603287946810513e-07, |
| "loss": 0.001, |
| "reward": 0.13699070224538445, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09450012096203864, |
| "rewards/cosine_scaled_reward": 0.049559676088392735, |
| "rewards/format_reward": 0.7083333432674408, |
| "step": 167 |
| }, |
| { |
| "advantage_max": 0.16301921661943197, |
| "advantage_mean": -3.3372394153685647e-09, |
| "advantage_min": -0.17344017466530204, |
| "advantage_std": 0.14057715935632586, |
| "completion_length": 2687.6875534057617, |
| "epoch": 0.192, |
| "grad_norm": 0.002732840832322836, |
| "kl": 0.00020739436149597168, |
| "learning_rate": 8.580461976679099e-07, |
| "loss": 0.0038, |
| "reward": 0.12937976652756333, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.140577157959342, |
| "rewards/cosine_scaled_reward": 0.0898700375109911, |
| "rewards/format_reward": 0.5833333469927311, |
| "step": 168 |
| }, |
| { |
| "advantage_max": 0.1822348004207015, |
| "advantage_mean": -1.2417635114614356e-09, |
| "advantage_min": -0.15554382000118494, |
| "advantage_std": 0.13149547297507524, |
| "completion_length": 2024.1875305175781, |
| "epoch": 0.19314285714285714, |
| "grad_norm": 0.0024218547623604536, |
| "kl": 0.00018212199211120605, |
| "learning_rate": 8.557485869176825e-07, |
| "loss": 0.0023, |
| "reward": 0.1839947861735709, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.1314954780973494, |
| "rewards/cosine_scaled_reward": 0.2302683750167489, |
| "rewards/format_reward": 0.6250000111758709, |
| "step": 169 |
| }, |
| { |
| "advantage_max": 0.10904745385050774, |
| "advantage_mean": 1.5813081122306727e-09, |
| "advantage_min": -0.06331885978579521, |
| "advantage_std": 0.06757040356751531, |
| "completion_length": 2624.2500228881836, |
| "epoch": 0.19428571428571428, |
| "grad_norm": 0.0012609402183443308, |
| "kl": 0.00022426247596740723, |
| "learning_rate": 8.534360744126753e-07, |
| "loss": 0.0029, |
| "reward": 0.044832271145423874, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.06757040473166853, |
| "rewards/cosine_scaled_reward": -0.10952581465244293, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 170 |
| }, |
| { |
| "advantage_max": 0.18128343019634485, |
| "advantage_mean": -7.683411469561197e-09, |
| "advantage_min": -0.1281019225716591, |
| "advantage_std": 0.12827163795009255, |
| "completion_length": 2249.0000381469727, |
| "epoch": 0.19542857142857142, |
| "grad_norm": 0.00255986419506371, |
| "kl": 0.0001919977366924286, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": 0.0088, |
| "reward": 0.11522350832819939, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12827164493501186, |
| "rewards/cosine_scaled_reward": 0.05948738753795624, |
| "rewards/format_reward": 0.5625000018626451, |
| "step": 171 |
| }, |
| { |
| "advantage_max": 0.1466981265693903, |
| "advantage_mean": 4.656613705744661e-10, |
| "advantage_min": -0.1400249758735299, |
| "advantage_std": 0.11561239557340741, |
| "completion_length": 2715.5833854675293, |
| "epoch": 0.19657142857142856, |
| "grad_norm": 0.0022185237612575293, |
| "kl": 0.0002675652503967285, |
| "learning_rate": 8.487667956935087e-07, |
| "loss": -0.0039, |
| "reward": 0.10251820925623178, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11561239883303642, |
| "rewards/cosine_scaled_reward": 0.08025055937469006, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 172 |
| }, |
| { |
| "advantage_max": 0.08615154423750937, |
| "advantage_mean": -3.531264874262474e-09, |
| "advantage_min": -0.08250692702131346, |
| "advantage_std": 0.07523247081553563, |
| "completion_length": 1911.5208435058594, |
| "epoch": 0.1977142857142857, |
| "grad_norm": 0.0018191589042544365, |
| "kl": 0.00011747702956199646, |
| "learning_rate": 8.464102570534061e-07, |
| "loss": 0.0009, |
| "reward": 0.0827999617322348, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.07523247634526342, |
| "rewards/cosine_scaled_reward": -0.08963774237781763, |
| "rewards/format_reward": 0.6666666716337204, |
| "step": 173 |
| }, |
| { |
| "advantage_max": 0.17214444186538458, |
| "advantage_mean": -2.793967904257677e-09, |
| "advantage_min": -0.12071564141660929, |
| "advantage_std": 0.119583026971668, |
| "completion_length": 1707.958381652832, |
| "epoch": 0.19885714285714284, |
| "grad_norm": 0.0018704604590311646, |
| "kl": 0.00013531744480133057, |
| "learning_rate": 8.440392717955475e-07, |
| "loss": -0.0021, |
| "reward": 0.11866510892286897, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11958303209394217, |
| "rewards/cosine_scaled_reward": -0.034700583666563034, |
| "rewards/format_reward": 0.7708333395421505, |
| "step": 174 |
| }, |
| { |
| "advantage_max": 0.0757163786329329, |
| "advantage_mean": 2.0954757928848267e-09, |
| "advantage_min": -0.11116283386945724, |
| "advantage_std": 0.0754783492302522, |
| "completion_length": 2308.666675567627, |
| "epoch": 0.2, |
| "grad_norm": 0.0010337267303839326, |
| "kl": 0.00017081201076507568, |
| "learning_rate": 8.416539554784089e-07, |
| "loss": -0.0001, |
| "reward": 0.11406097328290343, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.07547835109289736, |
| "rewards/cosine_scaled_reward": 0.09473255276679993, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 175 |
| }, |
| { |
| "advantage_max": 0.16840291186235845, |
| "advantage_mean": -6.131207153092788e-09, |
| "advantage_min": -0.19575551990419626, |
| "advantage_std": 0.15237143402919173, |
| "completion_length": 2512.3542251586914, |
| "epoch": 0.20114285714285715, |
| "grad_norm": 0.002759363502264023, |
| "kl": 0.00021241046488285065, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": 0.0106, |
| "reward": 0.16955551970750093, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.15237143402919173, |
| "rewards/cosine_scaled_reward": 0.19831032841466367, |
| "rewards/format_reward": 0.6041666753590107, |
| "step": 176 |
| }, |
| { |
| "advantage_max": 0.15896850870922208, |
| "advantage_mean": 8.537123022400728e-10, |
| "advantage_min": -0.16080441791564226, |
| "advantage_std": 0.1259885341860354, |
| "completion_length": 2636.500045776367, |
| "epoch": 0.2022857142857143, |
| "grad_norm": 0.0029087516013532877, |
| "kl": 0.00024193525314331055, |
| "learning_rate": 8.368407953869103e-07, |
| "loss": 0.0062, |
| "reward": 0.06026533106341958, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12598853604868054, |
| "rewards/cosine_scaled_reward": -0.08354691602289677, |
| "rewards/format_reward": 0.5208333507180214, |
| "step": 177 |
| }, |
| { |
| "advantage_max": 0.08082104474306107, |
| "advantage_mean": -2.6387474427735924e-09, |
| "advantage_min": -0.12136463588103652, |
| "advantage_std": 0.08366369269788265, |
| "completion_length": 2332.0000610351562, |
| "epoch": 0.20342857142857143, |
| "grad_norm": 0.001465087989345193, |
| "kl": 0.00019755959510803223, |
| "learning_rate": 8.344131861991828e-07, |
| "loss": 0.0088, |
| "reward": 0.10637267166748643, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08366369549185038, |
| "rewards/cosine_scaled_reward": 0.032770249992609024, |
| "rewards/format_reward": 0.5625, |
| "step": 178 |
| }, |
| { |
| "advantage_max": 0.16390905156731606, |
| "advantage_mean": 4.656613011855271e-10, |
| "advantage_min": -0.1330079366452992, |
| "advantage_std": 0.1229178715730086, |
| "completion_length": 2670.416679382324, |
| "epoch": 0.20457142857142857, |
| "grad_norm": 0.005331814754754305, |
| "kl": 0.00022807717323303223, |
| "learning_rate": 8.319717151140072e-07, |
| "loss": 0.0051, |
| "reward": 0.049615125404670835, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12291787634603679, |
| "rewards/cosine_scaled_reward": -0.0732691722587333, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 179 |
| }, |
| { |
| "advantage_max": 0.06850773748010397, |
| "advantage_mean": -9.778887456735053e-09, |
| "advantage_min": -0.0560016599483788, |
| "advantage_std": 0.05324950837530196, |
| "completion_length": 2127.2500343322754, |
| "epoch": 0.2057142857142857, |
| "grad_norm": 0.0010600673267617822, |
| "kl": 0.00023946166038513184, |
| "learning_rate": 8.295165011252396e-07, |
| "loss": -0.0008, |
| "reward": 0.1334767653606832, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.05324950924841687, |
| "rewards/cosine_scaled_reward": 0.09442592412233353, |
| "rewards/format_reward": 0.6041666716337204, |
| "step": 180 |
| }, |
| { |
| "advantage_max": 0.177369711920619, |
| "advantage_mean": -2.949188115941581e-09, |
| "advantage_min": -0.14135410264134407, |
| "advantage_std": 0.14470088807865977, |
| "completion_length": 2986.770866394043, |
| "epoch": 0.20685714285714285, |
| "grad_norm": 0.002808406949043274, |
| "kl": 0.0002924799919128418, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": 0.0055, |
| "reward": 0.08883980172686279, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.14470090251415968, |
| "rewards/cosine_scaled_reward": 0.044522762298583984, |
| "rewards/format_reward": 0.4375000037252903, |
| "step": 181 |
| }, |
| { |
| "advantage_max": 0.16322546359151602, |
| "advantage_mean": -4.190951738425319e-09, |
| "advantage_min": -0.10783215472474694, |
| "advantage_std": 0.10909037687815726, |
| "completion_length": 1764.2916946411133, |
| "epoch": 0.208, |
| "grad_norm": 0.0014723712811246514, |
| "kl": 0.00010727345943450928, |
| "learning_rate": 8.245653237555705e-07, |
| "loss": -0.0045, |
| "reward": 0.09074319200590253, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10909038223326206, |
| "rewards/cosine_scaled_reward": -0.10806664638221264, |
| "rewards/format_reward": 0.75, |
| "step": 182 |
| }, |
| { |
| "advantage_max": 0.1708065690472722, |
| "advantage_mean": -7.605801183308003e-09, |
| "advantage_min": -0.15987203177064657, |
| "advantage_std": 0.13489929027855396, |
| "completion_length": 1637.4792022705078, |
| "epoch": 0.20914285714285713, |
| "grad_norm": 0.0019362150924280286, |
| "kl": 0.0001348257064819336, |
| "learning_rate": 8.220696016880687e-07, |
| "loss": -0.0, |
| "reward": 0.17411585431545973, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.13489929679781199, |
| "rewards/cosine_scaled_reward": 0.08437284221872687, |
| "rewards/format_reward": 0.8541666716337204, |
| "step": 183 |
| }, |
| { |
| "advantage_max": 0.11368166282773018, |
| "advantage_mean": 2.910383149756779e-09, |
| "advantage_min": -0.06791064376011491, |
| "advantage_std": 0.07019675150513649, |
| "completion_length": 2715.479200363159, |
| "epoch": 0.2102857142857143, |
| "grad_norm": 0.0016745430184528232, |
| "kl": 0.00021871179342269897, |
| "learning_rate": 8.195606193320136e-07, |
| "loss": 0.0007, |
| "reward": 0.022002333775162697, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.070196753134951, |
| "rewards/cosine_scaled_reward": -0.13184033427387476, |
| "rewards/format_reward": 0.39583333395421505, |
| "step": 184 |
| }, |
| { |
| "advantage_max": 0.12346031097695231, |
| "advantage_mean": -3.2790315879216436e-09, |
| "advantage_min": -0.11551401333417743, |
| "advantage_std": 0.1026953593827784, |
| "completion_length": 2431.7083625793457, |
| "epoch": 0.21142857142857144, |
| "grad_norm": 0.00237295706756413, |
| "kl": 0.00017070770263671875, |
| "learning_rate": 8.170384989716657e-07, |
| "loss": 0.0061, |
| "reward": 0.07951303326990455, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10269536171108484, |
| "rewards/cosine_scaled_reward": -0.01576326903887093, |
| "rewards/format_reward": 0.5000000037252903, |
| "step": 185 |
| }, |
| { |
| "advantage_max": 0.11272265436127782, |
| "advantage_mean": 4.6566127342995145e-10, |
| "advantage_min": -0.07673908583819866, |
| "advantage_std": 0.07678107637912035, |
| "completion_length": 2727.812515258789, |
| "epoch": 0.21257142857142858, |
| "grad_norm": 0.0013483620714396238, |
| "kl": 0.00023304671049118042, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": -0.0035, |
| "reward": 0.05358618497848511, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.07678107777610421, |
| "rewards/cosine_scaled_reward": -0.02927381545305252, |
| "rewards/format_reward": 0.375, |
| "step": 186 |
| }, |
| { |
| "advantage_max": 0.1563552524894476, |
| "advantage_mean": -2.716357583310014e-09, |
| "advantage_min": -0.13328236620873213, |
| "advantage_std": 0.12376850796863437, |
| "completion_length": 2533.9166870117188, |
| "epoch": 0.21371428571428572, |
| "grad_norm": 0.0021735529880970716, |
| "kl": 0.0002467595040798187, |
| "learning_rate": 8.119553365707802e-07, |
| "loss": 0.0065, |
| "reward": 0.04993397952057421, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12376851122826338, |
| "rewards/cosine_scaled_reward": -0.12346055079251528, |
| "rewards/format_reward": 0.5416666734963655, |
| "step": 187 |
| }, |
| { |
| "advantage_max": 0.08845503395423293, |
| "advantage_mean": 3.8805124391583234e-10, |
| "advantage_min": -0.1204057689756155, |
| "advantage_std": 0.08113091951236129, |
| "completion_length": 3432.812530517578, |
| "epoch": 0.21485714285714286, |
| "grad_norm": 0.0014322166098281741, |
| "kl": 0.00029206275939941406, |
| "learning_rate": 8.093945422764069e-07, |
| "loss": 0.0022, |
| "reward": 0.045054638059809804, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.08113092044368386, |
| "rewards/cosine_scaled_reward": -0.022663846611976624, |
| "rewards/format_reward": 0.3125000074505806, |
| "step": 188 |
| }, |
| { |
| "advantage_max": 0.17914783209562302, |
| "advantage_mean": -3.5700699896334953e-09, |
| "advantage_min": -0.11173915676772594, |
| "advantage_std": 0.11490088887512684, |
| "completion_length": 1846.0000114440918, |
| "epoch": 0.216, |
| "grad_norm": 0.002113899914547801, |
| "kl": 0.0001726001501083374, |
| "learning_rate": 8.068211054579943e-07, |
| "loss": 0.0092, |
| "reward": 0.11756939408951439, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11490088701248169, |
| "rewards/cosine_scaled_reward": -0.01818160153925419, |
| "rewards/format_reward": 0.7291666734963655, |
| "step": 189 |
| }, |
| { |
| "advantage_max": 0.1964530674740672, |
| "advantage_mean": -1.5522044760629683e-10, |
| "advantage_min": -0.12849188223481178, |
| "advantage_std": 0.1331510180607438, |
| "completion_length": 2651.8750381469727, |
| "epoch": 0.21714285714285714, |
| "grad_norm": 0.002494914224371314, |
| "kl": 0.00020803511142730713, |
| "learning_rate": 8.04235151541222e-07, |
| "loss": 0.0084, |
| "reward": 0.06791404378600419, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.13315102504566312, |
| "rewards/cosine_scaled_reward": 0.0009567160159349442, |
| "rewards/format_reward": 0.39583333767950535, |
| "step": 190 |
| }, |
| { |
| "advantage_max": 0.0955268326215446, |
| "advantage_mean": -6.208817349140361e-10, |
| "advantage_min": -0.14831526763737202, |
| "advantage_std": 0.10267449170351028, |
| "completion_length": 2117.208351135254, |
| "epoch": 0.21828571428571428, |
| "grad_norm": 0.00150469527579844, |
| "kl": 0.00021576881408691406, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": -0.0023, |
| "reward": 0.15802897419780493, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10267449403181672, |
| "rewards/cosine_scaled_reward": 0.15254965890198946, |
| "rewards/format_reward": 0.625, |
| "step": 191 |
| }, |
| { |
| "advantage_max": 0.12947352742776275, |
| "advantage_mean": -1.785035036450111e-09, |
| "advantage_min": -0.1430590646341443, |
| "advantage_std": 0.11898831464350224, |
| "completion_length": 3050.4584045410156, |
| "epoch": 0.21942857142857142, |
| "grad_norm": 0.0025042772758752108, |
| "kl": 0.0002841353416442871, |
| "learning_rate": 7.990261971595048e-07, |
| "loss": 0.01, |
| "reward": 0.03274457482621074, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11898831464350224, |
| "rewards/cosine_scaled_reward": -0.11102783679962158, |
| "rewards/format_reward": 0.4166666828095913, |
| "step": 192 |
| }, |
| { |
| "advantage_max": 0.18321886658668518, |
| "advantage_mean": -5.587935447692871e-09, |
| "advantage_min": -0.21362949814647436, |
| "advantage_std": 0.16429298697039485, |
| "completion_length": 2551.6042098999023, |
| "epoch": 0.22057142857142858, |
| "grad_norm": 0.0033616998698562384, |
| "kl": 0.00018714368343353271, |
| "learning_rate": 7.964034505716476e-07, |
| "loss": 0.0094, |
| "reward": 0.10076185502111912, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.16429299302399158, |
| "rewards/cosine_scaled_reward": 0.03686658607330173, |
| "rewards/format_reward": 0.5208333432674408, |
| "step": 193 |
| }, |
| { |
| "advantage_max": 0.1511156321503222, |
| "advantage_mean": -1.3814618921026423e-08, |
| "advantage_min": -0.16178389079868793, |
| "advantage_std": 0.1266572391614318, |
| "completion_length": 2812.541717529297, |
| "epoch": 0.22171428571428572, |
| "grad_norm": 0.002837719861418009, |
| "kl": 0.00023984909057617188, |
| "learning_rate": 7.93768694627233e-07, |
| "loss": 0.0145, |
| "reward": 0.2117105281795375, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12665724474936724, |
| "rewards/cosine_scaled_reward": 0.3458312964066863, |
| "rewards/format_reward": 0.5625000055879354, |
| "step": 194 |
| }, |
| { |
| "advantage_max": 0.2311068344861269, |
| "advantage_mean": -1.2417635669725868e-09, |
| "advantage_min": -0.18620420899242163, |
| "advantage_std": 0.17218404030427337, |
| "completion_length": 2531.1666870117188, |
| "epoch": 0.22285714285714286, |
| "grad_norm": 0.002786431461572647, |
| "kl": 0.0001908913254737854, |
| "learning_rate": 7.911220577405484e-07, |
| "loss": 0.0001, |
| "reward": 0.042947592213749886, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.17218404030427337, |
| "rewards/cosine_scaled_reward": -0.10107360588153824, |
| "rewards/format_reward": 0.4583333469927311, |
| "step": 195 |
| }, |
| { |
| "advantage_max": 0.14308508206158876, |
| "advantage_mean": 1.552204059729334e-10, |
| "advantage_min": -0.12311006104573607, |
| "advantage_std": 0.10863854410126805, |
| "completion_length": 3364.9375610351562, |
| "epoch": 0.224, |
| "grad_norm": 0.002163918921723962, |
| "kl": 0.00028574466705322266, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": 0.0002, |
| "reward": 0.03790745767764747, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10863854829221964, |
| "rewards/cosine_scaled_reward": -0.09509009215980768, |
| "rewards/format_reward": 0.41666667349636555, |
| "step": 196 |
| }, |
| { |
| "advantage_max": 0.2604234963655472, |
| "advantage_mean": -1.1331091689936734e-08, |
| "advantage_min": -0.21268348023295403, |
| "advantage_std": 0.20125100389122963, |
| "completion_length": 2484.062568664551, |
| "epoch": 0.22514285714285714, |
| "grad_norm": 0.004311454016715288, |
| "kl": 0.00030350685119628906, |
| "learning_rate": 7.857936576865356e-07, |
| "loss": 0.0081, |
| "reward": 0.21867160964757204, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.2012510122731328, |
| "rewards/cosine_scaled_reward": 0.3315436402335763, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 197 |
| }, |
| { |
| "advantage_max": 0.1826078612357378, |
| "advantage_mean": -7.916242161787324e-09, |
| "advantage_min": -0.18366167414933443, |
| "advantage_std": 0.15232555009424686, |
| "completion_length": 2441.958381652832, |
| "epoch": 0.22628571428571428, |
| "grad_norm": 0.00223523355089128, |
| "kl": 0.00019724667072296143, |
| "learning_rate": 7.831121542179086e-07, |
| "loss": 0.0078, |
| "reward": 0.13793383864685893, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.15232555102556944, |
| "rewards/cosine_scaled_reward": 0.12416904792189598, |
| "rewards/format_reward": 0.5625000037252903, |
| "step": 198 |
| }, |
| { |
| "advantage_max": 0.17881701048463583, |
| "advantage_mean": 1.5522043234073024e-09, |
| "advantage_min": -0.14401802979409695, |
| "advantage_std": 0.12912732851691544, |
| "completion_length": 3530.9166870117188, |
| "epoch": 0.22742857142857142, |
| "grad_norm": 0.0025338924024254084, |
| "kl": 0.00028127431869506836, |
| "learning_rate": 7.804192891917571e-07, |
| "loss": 0.002, |
| "reward": -0.015164745040237904, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12912732968106866, |
| "rewards/cosine_scaled_reward": -0.1283108638599515, |
| "rewards/format_reward": 0.16666666977107525, |
| "step": 199 |
| }, |
| { |
| "advantage_max": 0.15909895114600658, |
| "advantage_mean": -5.355104984450243e-09, |
| "advantage_min": -0.22837194707244635, |
| "advantage_std": 0.15889625437557697, |
| "completion_length": 1886.9375457763672, |
| "epoch": 0.22857142857142856, |
| "grad_norm": 0.002250520745292306, |
| "kl": 0.00016075372695922852, |
| "learning_rate": 7.777151938545235e-07, |
| "loss": -0.003, |
| "reward": 0.1841872469522059, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15889626182615757, |
| "rewards/cosine_scaled_reward": 0.13623794727027416, |
| "rewards/format_reward": 0.812500013038516, |
| "step": 200 |
| }, |
| { |
| "advantage_max": 0.17722219973802567, |
| "advantage_mean": 3.1044086745701804e-10, |
| "advantage_min": -0.1915279608219862, |
| "advantage_std": 0.13609004858881235, |
| "completion_length": 2347.166732788086, |
| "epoch": 0.2297142857142857, |
| "grad_norm": 0.002754747634753585, |
| "kl": 0.00020887888967990875, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0038, |
| "reward": 0.28291497589088976, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13609005440957844, |
| "rewards/cosine_scaled_reward": 0.4535187867586501, |
| "rewards/format_reward": 0.770833345130086, |
| "step": 201 |
| }, |
| { |
| "advantage_max": 0.08778555504977703, |
| "advantage_mean": -4.346172172153828e-09, |
| "advantage_min": -0.08903190679848194, |
| "advantage_std": 0.06955086882226169, |
| "completion_length": 2056.6875228881836, |
| "epoch": 0.23085714285714284, |
| "grad_norm": 0.0014487183652818203, |
| "kl": 0.00016849488019943237, |
| "learning_rate": 7.72273839962904e-07, |
| "loss": 0.005, |
| "reward": 0.20257333759218454, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.06955087138339877, |
| "rewards/cosine_scaled_reward": 0.3097168318927288, |
| "rewards/format_reward": 0.5833333358168602, |
| "step": 202 |
| }, |
| { |
| "advantage_max": 0.13725088443607092, |
| "advantage_mean": 5.781961086998022e-09, |
| "advantage_min": -0.1490999348461628, |
| "advantage_std": 0.11772086331620812, |
| "completion_length": 3153.8333740234375, |
| "epoch": 0.232, |
| "grad_norm": 0.0027381619438529015, |
| "kl": 0.00029969215393066406, |
| "learning_rate": 7.695368466124296e-07, |
| "loss": 0.0089, |
| "reward": 0.05879632290452719, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11772086471319199, |
| "rewards/cosine_scaled_reward": 0.016048375517129898, |
| "rewards/format_reward": 0.3125000037252903, |
| "step": 203 |
| }, |
| { |
| "advantage_max": 0.1252095801755786, |
| "advantage_mean": -3.2596291082986895e-09, |
| "advantage_min": -0.11681158654391766, |
| "advantage_std": 0.09182075597345829, |
| "completion_length": 1734.6042251586914, |
| "epoch": 0.23314285714285715, |
| "grad_norm": 0.001834864029660821, |
| "kl": 0.00022155791521072388, |
| "learning_rate": 7.667891533457718e-07, |
| "loss": 0.0018, |
| "reward": 0.18190127734851558, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09182075783610344, |
| "rewards/cosine_scaled_reward": 0.15297963470220566, |
| "rewards/format_reward": 0.7708333395421505, |
| "step": 204 |
| }, |
| { |
| "advantage_max": 0.1340260272845626, |
| "advantage_mean": -6.984919392882816e-09, |
| "advantage_min": -0.30524480529129505, |
| "advantage_std": 0.1720650801435113, |
| "completion_length": 2263.625015258789, |
| "epoch": 0.2342857142857143, |
| "grad_norm": 0.0025743981823325157, |
| "kl": 0.00026476383209228516, |
| "learning_rate": 7.640308940816239e-07, |
| "loss": 0.0058, |
| "reward": 0.25796742155216634, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1720650834031403, |
| "rewards/cosine_scaled_reward": 0.3982603717595339, |
| "rewards/format_reward": 0.7291666865348816, |
| "step": 205 |
| }, |
| { |
| "advantage_max": 0.13997728214599192, |
| "advantage_mean": -1.5522043372850902e-09, |
| "advantage_min": -0.1517253816127777, |
| "advantage_std": 0.11635040352120996, |
| "completion_length": 2658.7291870117188, |
| "epoch": 0.23542857142857143, |
| "grad_norm": 0.0019164991099387407, |
| "kl": 0.0002377629280090332, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": 0.0035, |
| "reward": 0.050769580993801355, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1163504053838551, |
| "rewards/cosine_scaled_reward": -0.07947664987295866, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 206 |
| }, |
| { |
| "advantage_max": 0.11476221471093595, |
| "advantage_mean": -6.519258070880607e-09, |
| "advantage_min": -0.132195595651865, |
| "advantage_std": 0.0947426650673151, |
| "completion_length": 2846.520866394043, |
| "epoch": 0.23657142857142857, |
| "grad_norm": 0.0019112242152914405, |
| "kl": 0.00027942657470703125, |
| "learning_rate": 7.584832158039378e-07, |
| "loss": 0.0027, |
| "reward": 0.10473369807004929, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09474266786128283, |
| "rewards/cosine_scaled_reward": 0.028989043086767197, |
| "rewards/format_reward": 0.5625000149011612, |
| "step": 207 |
| }, |
| { |
| "advantage_max": 0.12081033829599619, |
| "advantage_mean": -2.716357666576741e-09, |
| "advantage_min": -0.1374441795051098, |
| "advantage_std": 0.11835672007873654, |
| "completion_length": 2765.4791946411133, |
| "epoch": 0.2377142857142857, |
| "grad_norm": 0.0027541995514184237, |
| "kl": 0.00021903729066252708, |
| "learning_rate": 7.556940671764124e-07, |
| "loss": 0.0065, |
| "reward": 0.08172197639942169, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11835672124288976, |
| "rewards/cosine_scaled_reward": 0.04407403990626335, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 208 |
| }, |
| { |
| "advantage_max": 0.1953953867778182, |
| "advantage_mean": -2.8715779060162205e-09, |
| "advantage_min": -0.1776261981576681, |
| "advantage_std": 0.1543159680441022, |
| "completion_length": 2137.229221343994, |
| "epoch": 0.23885714285714285, |
| "grad_norm": 0.002014850964769721, |
| "kl": 0.0002369508147239685, |
| "learning_rate": 7.528948933102438e-07, |
| "loss": 0.0048, |
| "reward": 0.121353481663391, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15431597619317472, |
| "rewards/cosine_scaled_reward": 0.05646992567926645, |
| "rewards/format_reward": 0.6041666753590107, |
| "step": 209 |
| }, |
| { |
| "advantage_max": 0.12167689856141806, |
| "advantage_mean": -4.113341334210929e-09, |
| "advantage_min": -0.1051677679643035, |
| "advantage_std": 0.09225608897395432, |
| "completion_length": 2772.7708740234375, |
| "epoch": 0.24, |
| "grad_norm": 0.00219483720138669, |
| "kl": 0.00022499263286590576, |
| "learning_rate": 7.500858306332172e-07, |
| "loss": 0.001, |
| "reward": 0.11764410836622119, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.0922560899052769, |
| "rewards/cosine_scaled_reward": 0.0887430626899004, |
| "rewards/format_reward": 0.5208333358168602, |
| "step": 210 |
| }, |
| { |
| "advantage_max": 0.11049975454807281, |
| "advantage_mean": 1.862645232497684e-09, |
| "advantage_min": -0.11649919580668211, |
| "advantage_std": 0.09677038621157408, |
| "completion_length": 2156.375045776367, |
| "epoch": 0.24114285714285713, |
| "grad_norm": 0.002211064798757434, |
| "kl": 0.0002203192561864853, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0023, |
| "reward": 0.11667975131422281, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09677039366215467, |
| "rewards/cosine_scaled_reward": 0.045283637940883636, |
| "rewards/format_reward": 0.6041666753590107, |
| "step": 211 |
| }, |
| { |
| "advantage_max": 0.17078783456236124, |
| "advantage_mean": -5.2774949826916995e-09, |
| "advantage_min": -0.13496317621320486, |
| "advantage_std": 0.12754983035847545, |
| "completion_length": 1858.0625381469727, |
| "epoch": 0.2422857142857143, |
| "grad_norm": 0.0025052272249013186, |
| "kl": 0.0001669749617576599, |
| "learning_rate": 7.444385869608921e-07, |
| "loss": 0.0025, |
| "reward": 0.1585842336062342, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12754983338527381, |
| "rewards/cosine_scaled_reward": 0.13444282207638025, |
| "rewards/format_reward": 0.6666666697710752, |
| "step": 212 |
| }, |
| { |
| "advantage_max": 0.14249009639024734, |
| "advantage_mean": 2.173086086076914e-09, |
| "advantage_min": -0.15096384286880493, |
| "advantage_std": 0.12109754607081413, |
| "completion_length": 2033.2083625793457, |
| "epoch": 0.24342857142857144, |
| "grad_norm": 0.00203386670909822, |
| "kl": 0.000254213809967041, |
| "learning_rate": 7.416006812042827e-07, |
| "loss": -0.0005, |
| "reward": 0.16817454434931278, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12109755538403988, |
| "rewards/cosine_scaled_reward": 0.15286272019147873, |
| "rewards/format_reward": 0.6875, |
| "step": 213 |
| }, |
| { |
| "advantage_max": 0.09318645251914859, |
| "advantage_mean": -3.647680116292129e-09, |
| "advantage_min": -0.11485875491052866, |
| "advantage_std": 0.08119010645896196, |
| "completion_length": 2441.00004196167, |
| "epoch": 0.24457142857142858, |
| "grad_norm": 0.0015373064670711756, |
| "kl": 0.0002943165600299835, |
| "learning_rate": 7.387534371007797e-07, |
| "loss": -0.0005, |
| "reward": 0.07720327051356435, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.08119010925292969, |
| "rewards/cosine_scaled_reward": -0.05541713163256645, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 214 |
| }, |
| { |
| "advantage_max": 0.18169015739113092, |
| "advantage_mean": -1.7850349531833842e-09, |
| "advantage_min": -0.10681417491286993, |
| "advantage_std": 0.11135896015912294, |
| "completion_length": 1973.0417098999023, |
| "epoch": 0.24571428571428572, |
| "grad_norm": 0.0019926519598811865, |
| "kl": 0.0001901760697364807, |
| "learning_rate": 7.358969934210438e-07, |
| "loss": 0.0096, |
| "reward": 0.0591527302749455, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.11135896574705839, |
| "rewards/cosine_scaled_reward": -0.16166850773151964, |
| "rewards/format_reward": 0.6666666679084301, |
| "step": 215 |
| }, |
| { |
| "advantage_max": 0.13990403385832906, |
| "advantage_mean": -1.0089328178475299e-08, |
| "advantage_min": -0.1367074535228312, |
| "advantage_std": 0.1053521609865129, |
| "completion_length": 1759.458396911621, |
| "epoch": 0.24685714285714286, |
| "grad_norm": 0.002091531176120043, |
| "kl": 0.00021722912788391113, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": 0.0054, |
| "reward": 0.17575624957680702, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10535216005519032, |
| "rewards/cosine_scaled_reward": 0.15388290956616402, |
| "rewards/format_reward": 0.7291666772216558, |
| "step": 216 |
| }, |
| { |
| "advantage_max": 0.23972546868026257, |
| "advantage_mean": -2.793967973646616e-09, |
| "advantage_min": -0.19562211446464062, |
| "advantage_std": 0.1821348867379129, |
| "completion_length": 2473.8542098999023, |
| "epoch": 0.248, |
| "grad_norm": 0.0031409154180437326, |
| "kl": 0.00028133392333984375, |
| "learning_rate": 7.301570646506027e-07, |
| "loss": 0.0158, |
| "reward": 0.16255547618493438, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1821348937228322, |
| "rewards/cosine_scaled_reward": 0.1796769928187132, |
| "rewards/format_reward": 0.6041666716337204, |
| "step": 217 |
| }, |
| { |
| "advantage_max": 0.17459031008183956, |
| "advantage_mean": -2.3283064087831207e-09, |
| "advantage_min": -0.1884671887382865, |
| "advantage_std": 0.14956898847594857, |
| "completion_length": 2652.125030517578, |
| "epoch": 0.24914285714285714, |
| "grad_norm": 0.0028272378258407116, |
| "kl": 0.00023734569549560547, |
| "learning_rate": 7.27273859315928e-07, |
| "loss": 0.0063, |
| "reward": 0.1610828833654523, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14956899639219046, |
| "rewards/cosine_scaled_reward": 0.1954907262697816, |
| "rewards/format_reward": 0.562500013038516, |
| "step": 218 |
| }, |
| { |
| "advantage_max": 0.19564510649070144, |
| "advantage_mean": -3.104408646814605e-09, |
| "advantage_min": -0.17972843209281564, |
| "advantage_std": 0.15118937706574798, |
| "completion_length": 2349.2917098999023, |
| "epoch": 0.2502857142857143, |
| "grad_norm": 0.0022383173927664757, |
| "kl": 0.00031820498406887054, |
| "learning_rate": 7.243820139034464e-07, |
| "loss": 0.0066, |
| "reward": 0.10900084767490625, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15118937892839313, |
| "rewards/cosine_scaled_reward": 0.06142610125243664, |
| "rewards/format_reward": 0.5208333432674408, |
| "step": 219 |
| }, |
| { |
| "advantage_max": 0.09967101691290736, |
| "advantage_mean": 6.596868329378225e-10, |
| "advantage_min": -0.09928332921117544, |
| "advantage_std": 0.08199074282310903, |
| "completion_length": 2653.062511444092, |
| "epoch": 0.25142857142857145, |
| "grad_norm": 0.001260359538719058, |
| "kl": 0.00023132562637329102, |
| "learning_rate": 7.214816693576234e-07, |
| "loss": 0.0026, |
| "reward": 0.004753962974064052, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.08199074515141547, |
| "rewards/cosine_scaled_reward": -0.1849276451393962, |
| "rewards/format_reward": 0.3958333395421505, |
| "step": 220 |
| }, |
| { |
| "advantage_max": 0.10617540590465069, |
| "advantage_mean": -1.0865430360995632e-09, |
| "advantage_min": -0.1566908685490489, |
| "advantage_std": 0.10643785918364301, |
| "completion_length": 1836.958366394043, |
| "epoch": 0.25257142857142856, |
| "grad_norm": 0.0013525058748200536, |
| "kl": 0.00019219331443309784, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": 0.0024, |
| "reward": 0.14161380444420502, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10643785947468132, |
| "rewards/cosine_scaled_reward": 0.0743873082101345, |
| "rewards/format_reward": 0.6875, |
| "step": 221 |
| }, |
| { |
| "advantage_max": 0.10208693100139499, |
| "advantage_mean": -1.2572854896086838e-08, |
| "advantage_min": -0.14001461677253246, |
| "advantage_std": 0.1002459516748786, |
| "completion_length": 2023.0625381469727, |
| "epoch": 0.2537142857142857, |
| "grad_norm": 0.001963146962225437, |
| "kl": 0.0002463310956954956, |
| "learning_rate": 7.156560487081051e-07, |
| "loss": -0.0009, |
| "reward": 0.20748503901995718, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10024595074355602, |
| "rewards/cosine_scaled_reward": 0.23565726913511753, |
| "rewards/format_reward": 0.7500000037252903, |
| "step": 222 |
| }, |
| { |
| "advantage_max": 0.11967136012390256, |
| "advantage_mean": 2.0954757928848267e-09, |
| "advantage_min": -0.1763377906754613, |
| "advantage_std": 0.12263105483725667, |
| "completion_length": 1949.6250534057617, |
| "epoch": 0.25485714285714284, |
| "grad_norm": 0.0019644785206764936, |
| "kl": 0.00023895502090454102, |
| "learning_rate": 7.127310565369415e-07, |
| "loss": 0.0067, |
| "reward": 0.12921895319595933, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12263105623424053, |
| "rewards/cosine_scaled_reward": 0.036362769082188606, |
| "rewards/format_reward": 0.6875000074505806, |
| "step": 223 |
| }, |
| { |
| "advantage_max": 0.22225173842161894, |
| "advantage_mean": -4.967053990334591e-09, |
| "advantage_min": -0.20200780779123306, |
| "advantage_std": 0.1708352784626186, |
| "completion_length": 2851.104217529297, |
| "epoch": 0.256, |
| "grad_norm": 0.003322938922792673, |
| "kl": 0.00026100873947143555, |
| "learning_rate": 7.097981330836616e-07, |
| "loss": 0.0085, |
| "reward": 0.10233315639197826, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.17083528079092503, |
| "rewards/cosine_scaled_reward": 0.020202322863042355, |
| "rewards/format_reward": 0.5625000093132257, |
| "step": 224 |
| }, |
| { |
| "advantage_max": 0.13286243984475732, |
| "advantage_mean": -4.811833334561477e-09, |
| "advantage_min": -0.19514462095685303, |
| "advantage_std": 0.1269650950562209, |
| "completion_length": 2449.312515258789, |
| "epoch": 0.2571428571428571, |
| "grad_norm": 0.002462556352838874, |
| "kl": 0.00022039934992790222, |
| "learning_rate": 7.068574212948169e-07, |
| "loss": 0.0073, |
| "reward": 0.07976344670169055, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12696509901434183, |
| "rewards/cosine_scaled_reward": -0.04560824343934655, |
| "rewards/format_reward": 0.5625000149011612, |
| "step": 225 |
| }, |
| { |
| "advantage_max": 0.14535690797492862, |
| "advantage_mean": -8.692344163896415e-09, |
| "advantage_min": -0.17608788143843412, |
| "advantage_std": 0.1196846547536552, |
| "completion_length": 2130.791690826416, |
| "epoch": 0.2582857142857143, |
| "grad_norm": 0.0015196395106613636, |
| "kl": 0.0002590194344520569, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": 0.0011, |
| "reward": 0.14173566875979304, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11968465615063906, |
| "rewards/cosine_scaled_reward": 0.10786362644284964, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 226 |
| }, |
| { |
| "advantage_max": 0.18973981589078903, |
| "advantage_mean": -4.579002760296547e-09, |
| "advantage_min": -0.2287753401324153, |
| "advantage_std": 0.17609892785549164, |
| "completion_length": 1823.8125228881836, |
| "epoch": 0.25942857142857145, |
| "grad_norm": 0.0025245463475584984, |
| "kl": 0.00020713824778795242, |
| "learning_rate": 7.009532063876148e-07, |
| "loss": 0.006, |
| "reward": 0.15795880928635597, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.17609893204644322, |
| "rewards/cosine_scaled_reward": 0.10740425251424313, |
| "rewards/format_reward": 0.7083333432674408, |
| "step": 227 |
| }, |
| { |
| "advantage_max": 0.08450271608307958, |
| "advantage_mean": -9.041590240399522e-09, |
| "advantage_min": -0.07050300342962146, |
| "advantage_std": 0.06420902267564088, |
| "completion_length": 2023.0833549499512, |
| "epoch": 0.26057142857142856, |
| "grad_norm": 0.0015921753365546465, |
| "kl": 0.00022837892174720764, |
| "learning_rate": 6.979899910323624e-07, |
| "loss": 0.0008, |
| "reward": 0.16322663193568587, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.06420902314130217, |
| "rewards/cosine_scaled_reward": 0.18748834542930126, |
| "rewards/format_reward": 0.583333333954215, |
| "step": 228 |
| }, |
| { |
| "advantage_max": 0.13741022581234574, |
| "advantage_mean": -3.1044086745701804e-10, |
| "advantage_min": -0.12077388912439346, |
| "advantage_std": 0.11520560039207339, |
| "completion_length": 3129.5000228881836, |
| "epoch": 0.26171428571428573, |
| "grad_norm": 0.0022859734017401934, |
| "kl": 0.0004267692565917969, |
| "learning_rate": 6.950195628537299e-07, |
| "loss": 0.0073, |
| "reward": 0.05897674150764942, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1152056036517024, |
| "rewards/cosine_scaled_reward": 0.060396708548069, |
| "rewards/format_reward": 0.2291666679084301, |
| "step": 229 |
| }, |
| { |
| "advantage_max": 0.15972233191132545, |
| "advantage_mean": -2.483526828633842e-09, |
| "advantage_min": -0.14318527560681105, |
| "advantage_std": 0.11432101391255856, |
| "completion_length": 2700.354217529297, |
| "epoch": 0.26285714285714284, |
| "grad_norm": 0.00235546356998384, |
| "kl": 0.00027105212211608887, |
| "learning_rate": 6.920420666261961e-07, |
| "loss": 0.0114, |
| "reward": 0.04795671720057726, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11432101810351014, |
| "rewards/cosine_scaled_reward": -0.06557212490588427, |
| "rewards/format_reward": 0.41666667349636555, |
| "step": 230 |
| }, |
| { |
| "advantage_max": 0.18535670265555382, |
| "advantage_mean": -4.81183344558378e-09, |
| "advantage_min": -0.16397515125572681, |
| "advantage_std": 0.14254886470735073, |
| "completion_length": 2433.5416946411133, |
| "epoch": 0.264, |
| "grad_norm": 0.0023599222768098116, |
| "kl": 0.00023667514324188232, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.005, |
| "reward": 0.08141399221494794, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.14254886843264103, |
| "rewards/cosine_scaled_reward": -0.03232991881668568, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 231 |
| }, |
| { |
| "advantage_max": 0.19113029213622212, |
| "advantage_mean": -6.596868273867074e-09, |
| "advantage_min": -0.16376679064705968, |
| "advantage_std": 0.1487897140905261, |
| "completion_length": 2886.979217529297, |
| "epoch": 0.2651428571428571, |
| "grad_norm": 0.0029496531933546066, |
| "kl": 0.0004017353057861328, |
| "learning_rate": 6.860664508377001e-07, |
| "loss": 0.0057, |
| "reward": 0.09052334149600938, |
| "reward_advantage_correlation": 0.9999999999999997, |
| "reward_std": 0.14878972386941314, |
| "rewards/cosine_scaled_reward": -0.04579928144812584, |
| "rewards/format_reward": 0.6250000093132257, |
| "step": 232 |
| }, |
| { |
| "advantage_max": 0.15348306251689792, |
| "advantage_mean": -4.0357312769412346e-09, |
| "advantage_min": -0.18462875578552485, |
| "advantage_std": 0.14140585623681545, |
| "completion_length": 1905.6042022705078, |
| "epoch": 0.2662857142857143, |
| "grad_norm": 0.001814075163565576, |
| "kl": 0.00020284950733184814, |
| "learning_rate": 6.83068622519821e-07, |
| "loss": 0.0081, |
| "reward": 0.14864619029685855, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1414058580994606, |
| "rewards/cosine_scaled_reward": 0.10695588774979115, |
| "rewards/format_reward": 0.666666679084301, |
| "step": 233 |
| }, |
| { |
| "advantage_max": 0.10880297655239701, |
| "advantage_mean": 5.432714833553121e-10, |
| "advantage_min": -0.13644719682633877, |
| "advantage_std": 0.11064268089830875, |
| "completion_length": 2632.7916870117188, |
| "epoch": 0.2674285714285714, |
| "grad_norm": 0.002043582499027252, |
| "kl": 0.00025378167629241943, |
| "learning_rate": 6.800643086250121e-07, |
| "loss": 0.0032, |
| "reward": 0.02333975490182638, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11064268462359905, |
| "rewards/cosine_scaled_reward": -0.14015551283955574, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 234 |
| }, |
| { |
| "advantage_max": 0.13019301602616906, |
| "advantage_mean": 6.984919448393967e-10, |
| "advantage_min": -0.10806654393672943, |
| "advantage_std": 0.09198249317705631, |
| "completion_length": 2275.854179382324, |
| "epoch": 0.26857142857142857, |
| "grad_norm": 0.0018223219085484743, |
| "kl": 0.00024427380412817, |
| "learning_rate": 6.770536555792944e-07, |
| "loss": -0.0012, |
| "reward": 0.0993248739396222, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09198249503970146, |
| "rewards/cosine_scaled_reward": 0.024362626485526562, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 235 |
| }, |
| { |
| "advantage_max": 0.16375997196882963, |
| "advantage_mean": -2.328306325516394e-09, |
| "advantage_min": -0.2442130297422409, |
| "advantage_std": 0.1719423239119351, |
| "completion_length": 2600.6667098999023, |
| "epoch": 0.26971428571428574, |
| "grad_norm": 0.002591744065284729, |
| "kl": 0.00027217157185077667, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": 0.0014, |
| "reward": 0.14142214879393578, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.17194233275949955, |
| "rewards/cosine_scaled_reward": 0.13721440639346838, |
| "rewards/format_reward": 0.5625000149011612, |
| "step": 236 |
| }, |
| { |
| "advantage_max": 0.17325403855647892, |
| "advantage_mean": 2.0372681319713593e-09, |
| "advantage_min": -0.1193100816453807, |
| "advantage_std": 0.11586322111543268, |
| "completion_length": 2441.2917137145996, |
| "epoch": 0.27085714285714285, |
| "grad_norm": 0.0020941535476595163, |
| "kl": 0.00025866925716400146, |
| "learning_rate": 6.710139192768694e-07, |
| "loss": 0.0041, |
| "reward": 0.08818818477448076, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1158632239094004, |
| "rewards/cosine_scaled_reward": 0.03216279484331608, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 237 |
| }, |
| { |
| "advantage_max": 0.18302309326827526, |
| "advantage_mean": -1.071020997583938e-08, |
| "advantage_min": -0.2006698572076857, |
| "advantage_std": 0.14664061879739165, |
| "completion_length": 2554.229202270508, |
| "epoch": 0.272, |
| "grad_norm": 0.002649980830028653, |
| "kl": 0.00035972893238067627, |
| "learning_rate": 6.679851303883891e-07, |
| "loss": 0.0081, |
| "reward": 0.1962011584546417, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14664062252268195, |
| "rewards/cosine_scaled_reward": 0.2361844995757565, |
| "rewards/format_reward": 0.6875000111758709, |
| "step": 238 |
| }, |
| { |
| "advantage_max": 0.0755655961111188, |
| "advantage_mean": -2.7163575277988627e-09, |
| "advantage_min": -0.14539830526337028, |
| "advantage_std": 0.09070903505198658, |
| "completion_length": 1671.4167175292969, |
| "epoch": 0.27314285714285713, |
| "grad_norm": 0.0017475574277341366, |
| "kl": 0.00013406574726104736, |
| "learning_rate": 6.649505910711058e-07, |
| "loss": 0.0024, |
| "reward": 0.26976251835003495, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09070903807878494, |
| "rewards/cosine_scaled_reward": 0.37741944566369057, |
| "rewards/format_reward": 0.8333333358168602, |
| "step": 239 |
| }, |
| { |
| "advantage_max": 0.11013911385089159, |
| "advantage_mean": -3.8805109126016646e-10, |
| "advantage_min": -0.11023932602256536, |
| "advantage_std": 0.09071210492402315, |
| "completion_length": 3068.5208435058594, |
| "epoch": 0.2742857142857143, |
| "grad_norm": 0.002247036434710026, |
| "kl": 0.0004382133483886719, |
| "learning_rate": 6.619104492241847e-07, |
| "loss": 0.0039, |
| "reward": -0.00012199021875858307, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09071210911497474, |
| "rewards/cosine_scaled_reward": -0.1677103042602539, |
| "rewards/format_reward": 0.3333333358168602, |
| "step": 240 |
| }, |
| { |
| "advantage_max": 0.12328928150236607, |
| "advantage_mean": 1.6298145749660264e-09, |
| "advantage_min": -0.13954242039471865, |
| "advantage_std": 0.11165554029867053, |
| "completion_length": 2856.5208587646484, |
| "epoch": 0.2754285714285714, |
| "grad_norm": 0.002248368225991726, |
| "kl": 0.00041091442108154297, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": 0.0093, |
| "reward": 0.020226968685165048, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11165554216131568, |
| "rewards/cosine_scaled_reward": -0.15902700275182724, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 241 |
| }, |
| { |
| "advantage_max": 0.11202648957259953, |
| "advantage_mean": -4.6178079728120824e-09, |
| "advantage_min": -0.17740579205565155, |
| "advantage_std": 0.10806333494838327, |
| "completion_length": 1915.7708587646484, |
| "epoch": 0.2765714285714286, |
| "grad_norm": 0.0017519504763185978, |
| "kl": 0.00040813907980918884, |
| "learning_rate": 6.558139508961654e-07, |
| "loss": 0.0026, |
| "reward": 0.09660546365194023, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.108063337742351, |
| "rewards/cosine_scaled_reward": -0.04916233662515879, |
| "rewards/format_reward": 0.666666679084301, |
| "step": 242 |
| }, |
| { |
| "advantage_max": 0.2148361522704363, |
| "advantage_mean": -1.2417634698280722e-09, |
| "advantage_min": -0.1866682404652238, |
| "advantage_std": 0.15860576275736094, |
| "completion_length": 2627.2708587646484, |
| "epoch": 0.2777142857142857, |
| "grad_norm": 0.0025715562514960766, |
| "kl": 0.00027126073837280273, |
| "learning_rate": 6.527578915497951e-07, |
| "loss": 0.0012, |
| "reward": 0.15241722203791142, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.15860575903207064, |
| "rewards/cosine_scaled_reward": 0.1767411855980754, |
| "rewards/format_reward": 0.5416666753590107, |
| "step": 243 |
| }, |
| { |
| "advantage_max": 0.14424416236579418, |
| "advantage_mean": -1.3969838619232178e-09, |
| "advantage_min": -0.08643147628754377, |
| "advantage_std": 0.09387495345436037, |
| "completion_length": 2834.2916984558105, |
| "epoch": 0.27885714285714286, |
| "grad_norm": 0.0018725207773968577, |
| "kl": 0.0003355741500854492, |
| "learning_rate": 6.496968239287603e-07, |
| "loss": 0.0001, |
| "reward": 0.18463631451595575, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09387495927512646, |
| "rewards/cosine_scaled_reward": 0.26374871004372835, |
| "rewards/format_reward": 0.5625000018626451, |
| "step": 244 |
| }, |
| { |
| "advantage_max": 0.1822696654126048, |
| "advantage_mean": -7.605800961263398e-09, |
| "advantage_min": -0.2544402740895748, |
| "advantage_std": 0.17538686329498887, |
| "completion_length": 2421.520866394043, |
| "epoch": 0.28, |
| "grad_norm": 0.003280170261859894, |
| "kl": 0.0003395378589630127, |
| "learning_rate": 6.466308972251785e-07, |
| "loss": 0.0128, |
| "reward": 0.17269339971244335, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1753868730738759, |
| "rewards/cosine_scaled_reward": 0.18853969313204288, |
| "rewards/format_reward": 0.6458333507180214, |
| "step": 245 |
| }, |
| { |
| "advantage_max": 0.1803152672946453, |
| "advantage_mean": 1.0865430083439875e-09, |
| "advantage_min": -0.19117824081331491, |
| "advantage_std": 0.1531025180593133, |
| "completion_length": 2599.7084045410156, |
| "epoch": 0.28114285714285714, |
| "grad_norm": 0.0027648189570754766, |
| "kl": 0.0003256797790527344, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": 0.0135, |
| "reward": 0.17906012770254165, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15310251899063587, |
| "rewards/cosine_scaled_reward": 0.19450474623590708, |
| "rewards/format_reward": 0.6666666753590107, |
| "step": 246 |
| }, |
| { |
| "advantage_max": 0.09243609569966793, |
| "advantage_mean": -1.0865430222217753e-09, |
| "advantage_min": -0.10062496736645699, |
| "advantage_std": 0.07642483478412032, |
| "completion_length": 2902.8333587646484, |
| "epoch": 0.2822857142857143, |
| "grad_norm": 0.0014275149442255497, |
| "kl": 0.0002752244472503662, |
| "learning_rate": 6.404850645156841e-07, |
| "loss": 0.0059, |
| "reward": 0.019425339065492153, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.07642483757808805, |
| "rewards/cosine_scaled_reward": -0.09852191805839539, |
| "rewards/format_reward": 0.3125000074505806, |
| "step": 247 |
| }, |
| { |
| "advantage_max": 0.11183958873152733, |
| "advantage_mean": -1.3038516377683607e-08, |
| "advantage_min": -0.13401627726852894, |
| "advantage_std": 0.10416306741535664, |
| "completion_length": 2012.2500381469727, |
| "epoch": 0.2834285714285714, |
| "grad_norm": 0.002708690706640482, |
| "kl": 0.00023761391639709473, |
| "learning_rate": 6.374054580489873e-07, |
| "loss": 0.0037, |
| "reward": 0.20697915088385344, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10416307393461466, |
| "rewards/cosine_scaled_reward": 0.2683180356398225, |
| "rewards/format_reward": 0.6875000037252903, |
| "step": 248 |
| }, |
| { |
| "advantage_max": 0.16352112963795662, |
| "advantage_mean": 1.5522041985072121e-10, |
| "advantage_min": -0.1368715576827526, |
| "advantage_std": 0.12225319631397724, |
| "completion_length": 1839.5000114440918, |
| "epoch": 0.2845714285714286, |
| "grad_norm": 0.001457585021853447, |
| "kl": 0.00012803077697753906, |
| "learning_rate": 6.343215915635761e-07, |
| "loss": -0.0022, |
| "reward": 0.13103453570511192, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12225319864228368, |
| "rewards/cosine_scaled_reward": 0.07532077515497804, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 249 |
| }, |
| { |
| "advantage_max": 0.18821298703551292, |
| "advantage_mean": -1.6298144778215118e-09, |
| "advantage_min": -0.16967704251874238, |
| "advantage_std": 0.14901624876074493, |
| "completion_length": 2353.687545776367, |
| "epoch": 0.2857142857142857, |
| "grad_norm": 0.0034939555916935205, |
| "kl": 0.0003637373447418213, |
| "learning_rate": 6.31233615362752e-07, |
| "loss": 0.0112, |
| "reward": 0.056937860790640116, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14901624782942235, |
| "rewards/cosine_scaled_reward": -0.114087900146842, |
| "rewards/format_reward": 0.5625, |
| "step": 250 |
| }, |
| { |
| "advantage_max": 0.1813768669962883, |
| "advantage_mean": -1.1408702066395549e-08, |
| "advantage_min": -0.21314455661922693, |
| "advantage_std": 0.16639887960627675, |
| "completion_length": 1951.5000610351562, |
| "epoch": 0.28685714285714287, |
| "grad_norm": 0.003103352850303054, |
| "kl": 0.00021637976169586182, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": 0.0052, |
| "reward": 0.17971518449485302, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.16639888333156705, |
| "rewards/cosine_scaled_reward": 0.1355207832530141, |
| "rewards/format_reward": 0.7916666697710752, |
| "step": 251 |
| }, |
| { |
| "advantage_max": 0.11473383381962776, |
| "advantage_mean": 2.7939677932353746e-09, |
| "advantage_min": -0.09810259565711021, |
| "advantage_std": 0.0875001561944373, |
| "completion_length": 2449.1458587646484, |
| "epoch": 0.288, |
| "grad_norm": 0.0016089569544419646, |
| "kl": 0.0003833770751953125, |
| "learning_rate": 6.25045936022246e-07, |
| "loss": 0.0088, |
| "reward": 0.09802764293272048, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.08750016003614292, |
| "rewards/cosine_scaled_reward": 0.019266456365585327, |
| "rewards/format_reward": 0.5416666697710752, |
| "step": 252 |
| }, |
| { |
| "advantage_max": 0.19016839936375618, |
| "advantage_mean": 5.432715388664633e-10, |
| "advantage_min": -0.12732800282537937, |
| "advantage_std": 0.12401359155774117, |
| "completion_length": 2734.625026702881, |
| "epoch": 0.28914285714285715, |
| "grad_norm": 0.002614011289551854, |
| "kl": 0.00036776065826416016, |
| "learning_rate": 6.219465344613258e-07, |
| "loss": 0.0097, |
| "reward": 0.05949016893282533, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12401359574869275, |
| "rewards/cosine_scaled_reward": -0.04276910796761513, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 253 |
| }, |
| { |
| "advantage_max": 0.12035822123289108, |
| "advantage_mean": -4.423782354323613e-09, |
| "advantage_min": -0.12394018657505512, |
| "advantage_std": 0.09628990339115262, |
| "completion_length": 2267.2916870117188, |
| "epoch": 0.29028571428571426, |
| "grad_norm": 0.0013566212728619576, |
| "kl": 0.00023874640464782715, |
| "learning_rate": 6.188436263278172e-07, |
| "loss": 0.0028, |
| "reward": 0.10561956372112036, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09628990711644292, |
| "rewards/cosine_scaled_reward": 0.020613186061382294, |
| "rewards/format_reward": 0.5833333358168602, |
| "step": 254 |
| }, |
| { |
| "advantage_max": 0.1810228805989027, |
| "advantage_mean": -1.396983917434369e-09, |
| "advantage_min": -0.1731141395866871, |
| "advantage_std": 0.16093693696893752, |
| "completion_length": 2984.1041984558105, |
| "epoch": 0.2914285714285714, |
| "grad_norm": 0.002712044632062316, |
| "kl": 0.00037491321563720703, |
| "learning_rate": 6.157373628530852e-07, |
| "loss": 0.0101, |
| "reward": 0.05883750435896218, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.16093694604933262, |
| "rewards/cosine_scaled_reward": -0.024436630308628082, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 255 |
| }, |
| { |
| "advantage_max": 0.21645413525402546, |
| "advantage_mean": -1.241763414316921e-09, |
| "advantage_min": -0.21925134025514126, |
| "advantage_std": 0.1742071988992393, |
| "completion_length": 2541.562545776367, |
| "epoch": 0.2925714285714286, |
| "grad_norm": 0.0028157387860119343, |
| "kl": 0.00041604042053222656, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": -0.0019, |
| "reward": 0.1251123258844018, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1742072026245296, |
| "rewards/cosine_scaled_reward": 0.08667805790901184, |
| "rewards/format_reward": 0.562500013038516, |
| "step": 256 |
| }, |
| { |
| "advantage_max": 0.12831580359488726, |
| "advantage_mean": -1.4745941502580795e-08, |
| "advantage_min": -0.2287419093772769, |
| "advantage_std": 0.13877611607313156, |
| "completion_length": 2855.9167098999023, |
| "epoch": 0.2937142857142857, |
| "grad_norm": 0.0024661049246788025, |
| "kl": 0.0003330707550048828, |
| "learning_rate": 6.095153756157051e-07, |
| "loss": 0.0093, |
| "reward": 0.20216338173486292, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13877612398937345, |
| "rewards/cosine_scaled_reward": 0.3188007604330778, |
| "rewards/format_reward": 0.562500013038516, |
| "step": 257 |
| }, |
| { |
| "advantage_max": 0.24111134372651577, |
| "advantage_mean": -4.0357312769412346e-09, |
| "advantage_min": -0.18471561698243022, |
| "advantage_std": 0.18118810467422009, |
| "completion_length": 3009.2291870117188, |
| "epoch": 0.2948571428571429, |
| "grad_norm": 0.003488308284431696, |
| "kl": 0.0004291534423828125, |
| "learning_rate": 6.06399955103937e-07, |
| "loss": 0.0158, |
| "reward": 0.06386373564600945, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.18118811072781682, |
| "rewards/cosine_scaled_reward": -0.018720313906669617, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 258 |
| }, |
| { |
| "advantage_max": 0.20813544653356075, |
| "advantage_mean": -2.5999424002609572e-09, |
| "advantage_min": -0.19101551175117493, |
| "advantage_std": 0.16957074729725718, |
| "completion_length": 2645.8542251586914, |
| "epoch": 0.296, |
| "grad_norm": 0.0034065325744450092, |
| "kl": 0.0004336535930633545, |
| "learning_rate": 6.032817857379256e-07, |
| "loss": 0.0119, |
| "reward": 0.11577623779885471, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.16957074729725718, |
| "rewards/cosine_scaled_reward": 0.10282952804118395, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 259 |
| }, |
| { |
| "advantage_max": 0.10018521640449762, |
| "advantage_mean": -5.393910020023984e-09, |
| "advantage_min": -0.11331065790727735, |
| "advantage_std": 0.08604301093146205, |
| "completion_length": 1918.2916831970215, |
| "epoch": 0.29714285714285715, |
| "grad_norm": 0.0016338881105184555, |
| "kl": 0.00025529414415359497, |
| "learning_rate": 6.001610194928464e-07, |
| "loss": 0.0048, |
| "reward": 0.20562266194610856, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.0860430154716596, |
| "rewards/cosine_scaled_reward": 0.29520507203415036, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 260 |
| }, |
| { |
| "advantage_max": 0.1361211899202317, |
| "advantage_mean": 5.626740046116296e-10, |
| "advantage_min": -0.14850289840251207, |
| "advantage_std": 0.11766799632459879, |
| "completion_length": 2685.2708892822266, |
| "epoch": 0.29828571428571427, |
| "grad_norm": 0.002024485031142831, |
| "kl": 0.0002980828285217285, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0036, |
| "reward": 0.0964075651136227, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.1176679995842278, |
| "rewards/cosine_scaled_reward": 0.014010767918080091, |
| "rewards/format_reward": 0.5416666697710752, |
| "step": 261 |
| }, |
| { |
| "advantage_max": 0.09950929321348667, |
| "advantage_mean": 2.3283065753165744e-10, |
| "advantage_min": -0.10815745778381824, |
| "advantage_std": 0.07817226415500045, |
| "completion_length": 2924.062530517578, |
| "epoch": 0.29942857142857143, |
| "grad_norm": 0.0013728238409385085, |
| "kl": 0.0004365351051092148, |
| "learning_rate": 5.939123048916173e-07, |
| "loss": 0.0032, |
| "reward": 0.004250659607350826, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.07817226415500045, |
| "rewards/cosine_scaled_reward": -0.1638176329433918, |
| "rewards/format_reward": 0.35416667722165585, |
| "step": 262 |
| }, |
| { |
| "advantage_max": 0.11774280667304993, |
| "advantage_mean": 2.4835269396561444e-09, |
| "advantage_min": -0.08933224296197295, |
| "advantage_std": 0.08471674006432295, |
| "completion_length": 2697.458354949951, |
| "epoch": 0.30057142857142854, |
| "grad_norm": 0.0009717949433252215, |
| "kl": 0.00025212764739990234, |
| "learning_rate": 5.907846610890011e-07, |
| "loss": 0.0048, |
| "reward": -0.008404992360738106, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.08471674332395196, |
| "rewards/cosine_scaled_reward": -0.18136566318571568, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 263 |
| }, |
| { |
| "advantage_max": 0.13900742027908564, |
| "advantage_mean": -2.6387472762401387e-09, |
| "advantage_min": -0.0985984280705452, |
| "advantage_std": 0.09653732646256685, |
| "completion_length": 2763.479202270508, |
| "epoch": 0.3017142857142857, |
| "grad_norm": 0.0014484527055174112, |
| "kl": 0.0003957897424697876, |
| "learning_rate": 5.87655029499542e-07, |
| "loss": 0.0013, |
| "reward": 0.06449721958779264, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.0965373320505023, |
| "rewards/cosine_scaled_reward": -0.039240069687366486, |
| "rewards/format_reward": 0.4583333395421505, |
| "step": 264 |
| }, |
| { |
| "advantage_max": 0.12348126340657473, |
| "advantage_mean": -3.2596290944209017e-09, |
| "advantage_min": -0.1179004842415452, |
| "advantage_std": 0.0924111008644104, |
| "completion_length": 1786.0208587646484, |
| "epoch": 0.3028571428571429, |
| "grad_norm": 0.0013441459741443396, |
| "kl": 0.00021648406982421875, |
| "learning_rate": 5.845235626570683e-07, |
| "loss": 0.0007, |
| "reward": 0.17772597214207053, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09241109946742654, |
| "rewards/cosine_scaled_reward": 0.15900977700948715, |
| "rewards/format_reward": 0.7291666772216558, |
| "step": 265 |
| }, |
| { |
| "advantage_max": 0.21225751377642155, |
| "advantage_mean": -4.2685618928395286e-10, |
| "advantage_min": -0.11814463697373867, |
| "advantage_std": 0.13529152376577258, |
| "completion_length": 3109.5208587646484, |
| "epoch": 0.304, |
| "grad_norm": 0.00254819099791348, |
| "kl": 0.00038570165634155273, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": 0.0041, |
| "reward": -0.01641088235192001, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13529152469709516, |
| "rewards/cosine_scaled_reward": -0.20564116793684661, |
| "rewards/format_reward": 0.31250000558793545, |
| "step": 266 |
| }, |
| { |
| "advantage_max": 0.12912985170260072, |
| "advantage_mean": -5.083469145628072e-09, |
| "advantage_min": -0.16517041064798832, |
| "advantage_std": 0.12337575666606426, |
| "completion_length": 2857.4791717529297, |
| "epoch": 0.30514285714285716, |
| "grad_norm": 0.0021744819823652506, |
| "kl": 0.0003535747528076172, |
| "learning_rate": 5.78255733788191e-07, |
| "loss": -0.0004, |
| "reward": 0.09505193377844989, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12337576039135456, |
| "rewards/cosine_scaled_reward": 0.08310575038194656, |
| "rewards/format_reward": 0.3958333395421505, |
| "step": 267 |
| }, |
| { |
| "advantage_max": 0.2863444034010172, |
| "advantage_mean": -3.725290464995368e-09, |
| "advantage_min": -0.21928295260295272, |
| "advantage_std": 0.21429739147424698, |
| "completion_length": 2655.7292289733887, |
| "epoch": 0.3062857142857143, |
| "grad_norm": 0.003988311160355806, |
| "kl": 0.0004082322120666504, |
| "learning_rate": 5.751196772469237e-07, |
| "loss": 0.0232, |
| "reward": 0.11551153496839106, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.21429740265011787, |
| "rewards/cosine_scaled_reward": 0.0790593889541924, |
| "rewards/format_reward": 0.5208333488553762, |
| "step": 268 |
| }, |
| { |
| "advantage_max": 0.08062019851058722, |
| "advantage_mean": -9.235615966440847e-09, |
| "advantage_min": -0.12000223528593779, |
| "advantage_std": 0.07978324650321156, |
| "completion_length": 2586.8125610351562, |
| "epoch": 0.30742857142857144, |
| "grad_norm": 0.00132320960983634, |
| "kl": 0.00033305585384368896, |
| "learning_rate": 5.71982396408026e-07, |
| "loss": -0.0053, |
| "reward": 0.16532681556418538, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.07978325278963894, |
| "rewards/cosine_scaled_reward": 0.19731012731790543, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 269 |
| }, |
| { |
| "advantage_max": 0.21284050540998578, |
| "advantage_mean": -1.319373665875645e-09, |
| "advantage_min": -0.21470065601170063, |
| "advantage_std": 0.1724660824984312, |
| "completion_length": 2639.0208587646484, |
| "epoch": 0.30857142857142855, |
| "grad_norm": 0.0030540579464286566, |
| "kl": 0.00039489567279815674, |
| "learning_rate": 5.688440441781398e-07, |
| "loss": 0.008, |
| "reward": 0.15868494706228375, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1724660899490118, |
| "rewards/cosine_scaled_reward": 0.16503717796877027, |
| "rewards/format_reward": 0.6041666828095913, |
| "step": 270 |
| }, |
| { |
| "advantage_max": 0.11745514534413815, |
| "advantage_mean": -1.4280279847511679e-08, |
| "advantage_min": -0.12490505632013083, |
| "advantage_std": 0.09618132305331528, |
| "completion_length": 1718.4166946411133, |
| "epoch": 0.3097142857142857, |
| "grad_norm": 0.0017048126319423318, |
| "kl": 0.0001882612705230713, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": 0.0016, |
| "reward": 0.24240022152662277, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.0961813268950209, |
| "rewards/cosine_scaled_reward": 0.2979451888240874, |
| "rewards/format_reward": 0.8333333358168602, |
| "step": 271 |
| }, |
| { |
| "advantage_max": 0.20761525630950928, |
| "advantage_mean": -3.259629080543114e-09, |
| "advantage_min": -0.22688710037618876, |
| "advantage_std": 0.1721926424652338, |
| "completion_length": 2622.7291870117188, |
| "epoch": 0.31085714285714283, |
| "grad_norm": 0.003913143649697304, |
| "kl": 0.0003896951675415039, |
| "learning_rate": 5.625647374256061e-07, |
| "loss": 0.0139, |
| "reward": 0.111437275307253, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.17219264013692737, |
| "rewards/cosine_scaled_reward": 0.05980448704212904, |
| "rewards/format_reward": 0.5416666809469461, |
| "step": 272 |
| }, |
| { |
| "advantage_max": 0.17210129369050264, |
| "advantage_mean": -1.0865430291606692e-08, |
| "advantage_min": -0.147225983440876, |
| "advantage_std": 0.1311533278785646, |
| "completion_length": 2594.666717529297, |
| "epoch": 0.312, |
| "grad_norm": 0.0020003090612590313, |
| "kl": 0.00030806660652160645, |
| "learning_rate": 5.594240889475106e-07, |
| "loss": 0.0103, |
| "reward": 0.0801794994622469, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1311533316038549, |
| "rewards/cosine_scaled_reward": 0.027412916533648968, |
| "rewards/format_reward": 0.4166666679084301, |
| "step": 273 |
| }, |
| { |
| "advantage_max": 0.12541158869862556, |
| "advantage_mean": -1.1020650489412809e-08, |
| "advantage_min": -0.1271469658240676, |
| "advantage_std": 0.10741374921053648, |
| "completion_length": 1575.7291717529297, |
| "epoch": 0.31314285714285717, |
| "grad_norm": 0.0014687005896121264, |
| "kl": 0.00015874579548835754, |
| "learning_rate": 5.562829811526154e-07, |
| "loss": 0.0037, |
| "reward": 0.2069433918222785, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10741375316865742, |
| "rewards/cosine_scaled_reward": 0.19209350552409887, |
| "rewards/format_reward": 0.8333333358168602, |
| "step": 274 |
| }, |
| { |
| "advantage_max": 0.11719317454844713, |
| "advantage_mean": -1.3969838202898543e-09, |
| "advantage_min": -0.12048850674182177, |
| "advantage_std": 0.09731898817699403, |
| "completion_length": 2238.8958435058594, |
| "epoch": 0.3142857142857143, |
| "grad_norm": 0.0018332276958972216, |
| "kl": 0.00023164600133895874, |
| "learning_rate": 5.531415671340826e-07, |
| "loss": 0.0027, |
| "reward": 0.2168528651818633, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09731898899190128, |
| "rewards/cosine_scaled_reward": 0.3284746464341879, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 275 |
| }, |
| { |
| "advantage_max": 0.10887271910905838, |
| "advantage_mean": -5.044664352915618e-09, |
| "advantage_min": -0.16490559931844473, |
| "advantage_std": 0.11456224136054516, |
| "completion_length": 2457.4792251586914, |
| "epoch": 0.31542857142857145, |
| "grad_norm": 0.001995307393372059, |
| "kl": 0.0003548562526702881, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0063, |
| "reward": 0.17041749227792025, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1145622415933758, |
| "rewards/cosine_scaled_reward": 0.22119942121207714, |
| "rewards/format_reward": 0.5625000149011612, |
| "step": 276 |
| }, |
| { |
| "advantage_max": 0.288712446577847, |
| "advantage_mean": -8.692344219407566e-09, |
| "advantage_min": -0.20587524212896824, |
| "advantage_std": 0.2012051260098815, |
| "completion_length": 2489.979217529297, |
| "epoch": 0.31657142857142856, |
| "grad_norm": 0.004219182766973972, |
| "kl": 0.0003845691680908203, |
| "learning_rate": 5.468584328659172e-07, |
| "loss": 0.0198, |
| "reward": 0.13046906306408346, |
| "reward_advantage_correlation": 0.9999999999999997, |
| "reward_std": 0.2012051409110427, |
| "rewards/cosine_scaled_reward": 0.10410384787246585, |
| "rewards/format_reward": 0.5625000093132257, |
| "step": 277 |
| }, |
| { |
| "advantage_max": 0.10111749917268753, |
| "advantage_mean": -2.832772877381373e-09, |
| "advantage_min": -0.13670605374500155, |
| "advantage_std": 0.09576660464517772, |
| "completion_length": 1926.6458740234375, |
| "epoch": 0.3177142857142857, |
| "grad_norm": 0.001758676953613758, |
| "kl": 0.00029021501541137695, |
| "learning_rate": 5.437170188473847e-07, |
| "loss": 0.0003, |
| "reward": 0.22084622830152512, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.0957666093017906, |
| "rewards/cosine_scaled_reward": 0.20693974336609244, |
| "rewards/format_reward": 0.8750000111758709, |
| "step": 278 |
| }, |
| { |
| "advantage_max": 0.13892194349318743, |
| "advantage_mean": -2.949188213086096e-09, |
| "advantage_min": -0.10274117905646563, |
| "advantage_std": 0.10129120200872421, |
| "completion_length": 3169.729179382324, |
| "epoch": 0.31885714285714284, |
| "grad_norm": 0.0020529532339423895, |
| "kl": 0.00042885541915893555, |
| "learning_rate": 5.405759110524894e-07, |
| "loss": 0.0019, |
| "reward": 0.004306883085519075, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10129120014607906, |
| "rewards/cosine_scaled_reward": -0.1018618680536747, |
| "rewards/format_reward": 0.22916666977107525, |
| "step": 279 |
| }, |
| { |
| "advantage_max": 0.22641071490943432, |
| "advantage_mean": -2.638747387262441e-09, |
| "advantage_min": -0.15394274424761534, |
| "advantage_std": 0.15549280680716038, |
| "completion_length": 1992.2291831970215, |
| "epoch": 0.32, |
| "grad_norm": 0.002533955965191126, |
| "kl": 0.00034427642822265625, |
| "learning_rate": 5.37435262574394e-07, |
| "loss": 0.0115, |
| "reward": 0.16187963518314064, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15549280843697488, |
| "rewards/cosine_scaled_reward": 0.13246607966721058, |
| "rewards/format_reward": 0.6875000111758709, |
| "step": 280 |
| }, |
| { |
| "advantage_max": 0.20242772391065955, |
| "advantage_mean": -2.2506962960022747e-09, |
| "advantage_min": -0.13997359201312065, |
| "advantage_std": 0.13463286077603698, |
| "completion_length": 3309.4583740234375, |
| "epoch": 0.3211428571428571, |
| "grad_norm": 0.0027612389530986547, |
| "kl": 0.0004780292510986328, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0086, |
| "reward": 0.004778874106705189, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1346328603103757, |
| "rewards/cosine_scaled_reward": -0.14123893855139613, |
| "rewards/format_reward": 0.31250000931322575, |
| "step": 281 |
| }, |
| { |
| "advantage_max": 0.12792781926691532, |
| "advantage_mean": -4.6566130951219975e-09, |
| "advantage_min": -0.14909182582050562, |
| "advantage_std": 0.11060419026762247, |
| "completion_length": 2272.062568664551, |
| "epoch": 0.3222857142857143, |
| "grad_norm": 0.0016208424931392074, |
| "kl": 0.0003352165222167969, |
| "learning_rate": 5.311559558218603e-07, |
| "loss": 0.0001, |
| "reward": 0.1578272543847561, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11060419538989663, |
| "rewards/cosine_scaled_reward": 0.15399221889674664, |
| "rewards/format_reward": 0.6250000149011612, |
| "step": 282 |
| }, |
| { |
| "advantage_max": 0.13227641116827726, |
| "advantage_mean": -7.140139812733537e-09, |
| "advantage_min": -0.14622067473828793, |
| "advantage_std": 0.10864142281934619, |
| "completion_length": 2319.791717529297, |
| "epoch": 0.32342857142857145, |
| "grad_norm": 0.0018907062476500869, |
| "kl": 0.0003216862678527832, |
| "learning_rate": 5.28017603591974e-07, |
| "loss": 0.0046, |
| "reward": 0.23123158095404506, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10864142375066876, |
| "rewards/cosine_scaled_reward": 0.33133327309042215, |
| "rewards/format_reward": 0.7083333432674408, |
| "step": 283 |
| }, |
| { |
| "advantage_max": 0.13931749551557004, |
| "advantage_mean": -6.131207229420621e-09, |
| "advantage_min": -0.17834768863394856, |
| "advantage_std": 0.11884868424385786, |
| "completion_length": 1949.2917175292969, |
| "epoch": 0.32457142857142857, |
| "grad_norm": 0.0016096375184133649, |
| "kl": 0.0001983940601348877, |
| "learning_rate": 5.248803227530763e-07, |
| "loss": 0.0042, |
| "reward": 0.20003115246072412, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11884868587367237, |
| "rewards/cosine_scaled_reward": 0.2159505933523178, |
| "rewards/format_reward": 0.7500000055879354, |
| "step": 284 |
| }, |
| { |
| "advantage_max": 0.10915980814024806, |
| "advantage_mean": -2.949188275536141e-09, |
| "advantage_min": -0.07292798534035683, |
| "advantage_std": 0.07839016616344452, |
| "completion_length": 1869.0625495910645, |
| "epoch": 0.32571428571428573, |
| "grad_norm": 0.0018345932476222515, |
| "kl": 0.00028606876730918884, |
| "learning_rate": 5.21744266211809e-07, |
| "loss": 0.001, |
| "reward": 0.10105163743719459, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.0783901670947671, |
| "rewards/cosine_scaled_reward": -0.0772322453558445, |
| "rewards/format_reward": 0.7500000111758709, |
| "step": 285 |
| }, |
| { |
| "advantage_max": 0.12339612538926303, |
| "advantage_mean": -7.1013350096127414e-09, |
| "advantage_min": -0.15304887667298317, |
| "advantage_std": 0.10974670597352087, |
| "completion_length": 2313.625015258789, |
| "epoch": 0.32685714285714285, |
| "grad_norm": 0.0016531402943655849, |
| "kl": 0.00037553906440734863, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": 0.0086, |
| "reward": 0.16036886721849442, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1097467066720128, |
| "rewards/cosine_scaled_reward": 0.17942035384476185, |
| "rewards/format_reward": 0.5833333358168602, |
| "step": 286 |
| }, |
| { |
| "advantage_max": 0.090579554438591, |
| "advantage_mean": -2.0178656315317234e-09, |
| "advantage_min": -0.12944842409342527, |
| "advantage_std": 0.09058350510895252, |
| "completion_length": 1710.7083435058594, |
| "epoch": 0.328, |
| "grad_norm": 0.001025793026201427, |
| "kl": 0.00019846856594085693, |
| "learning_rate": 5.154764373429315e-07, |
| "loss": -0.0008, |
| "reward": 0.13440879015251994, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.09058351023122668, |
| "rewards/cosine_scaled_reward": 0.10385258868336678, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 287 |
| }, |
| { |
| "advantage_max": 0.19185153394937515, |
| "advantage_mean": 1.241763414316921e-09, |
| "advantage_min": -0.1199220959097147, |
| "advantage_std": 0.1265643904916942, |
| "completion_length": 2892.2708435058594, |
| "epoch": 0.3291428571428571, |
| "grad_norm": 0.0027143056504428387, |
| "kl": 0.0004878044128417969, |
| "learning_rate": 5.123449705004581e-07, |
| "loss": 0.0028, |
| "reward": 0.05917328954092227, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12656439328566194, |
| "rewards/cosine_scaled_reward": -0.010442662052810192, |
| "rewards/format_reward": 0.3750000037252903, |
| "step": 288 |
| }, |
| { |
| "advantage_max": 0.10830804985016584, |
| "advantage_mean": -3.8805109958683914e-09, |
| "advantage_min": -0.12791373440995812, |
| "advantage_std": 0.09628125256858766, |
| "completion_length": 2157.4791984558105, |
| "epoch": 0.3302857142857143, |
| "grad_norm": 0.001589475548826158, |
| "kl": 0.00038611888885498047, |
| "learning_rate": 5.09215338910999e-07, |
| "loss": 0.0004, |
| "reward": 0.10157534619793296, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09628125361632556, |
| "rewards/cosine_scaled_reward": 0.007496317848563194, |
| "rewards/format_reward": 0.5833333414047956, |
| "step": 289 |
| }, |
| { |
| "advantage_max": 0.1997305415570736, |
| "advantage_mean": -7.295360621162317e-09, |
| "advantage_min": -0.1688457289710641, |
| "advantage_std": 0.14652833994477987, |
| "completion_length": 1426.9792022705078, |
| "epoch": 0.3314285714285714, |
| "grad_norm": 0.0017456391360610723, |
| "kl": 0.00028890371322631836, |
| "learning_rate": 5.060876951083828e-07, |
| "loss": 0.0068, |
| "reward": 0.17761991049337666, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.14652834553271532, |
| "rewards/cosine_scaled_reward": 0.08774526475463063, |
| "rewards/format_reward": 0.8750000074505806, |
| "step": 290 |
| }, |
| { |
| "advantage_max": 0.15469386614859104, |
| "advantage_mean": -1.5522043650406658e-09, |
| "advantage_min": -0.10938695259392262, |
| "advantage_std": 0.10772312432527542, |
| "completion_length": 2121.895851135254, |
| "epoch": 0.3325714285714286, |
| "grad_norm": 0.001207710593007505, |
| "kl": 0.00032941997051239014, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0004, |
| "reward": 0.16409676615148783, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1077231322415173, |
| "rewards/cosine_scaled_reward": 0.1403282443061471, |
| "rewards/format_reward": 0.6875000018626451, |
| "step": 291 |
| }, |
| { |
| "advantage_max": 0.17690535634756088, |
| "advantage_mean": -2.3283067140944524e-10, |
| "advantage_min": -0.1479609040543437, |
| "advantage_std": 0.14120537089183927, |
| "completion_length": 2716.2500228881836, |
| "epoch": 0.33371428571428574, |
| "grad_norm": 0.0023947900626808405, |
| "kl": 0.0004105567932128906, |
| "learning_rate": 4.998389805071536e-07, |
| "loss": 0.0028, |
| "reward": 0.04709340166300535, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.14120537508279085, |
| "rewards/cosine_scaled_reward": -0.08947830833494663, |
| "rewards/format_reward": 0.4583333395421505, |
| "step": 292 |
| }, |
| { |
| "advantage_max": 0.15446675289422274, |
| "advantage_mean": -4.579002739479865e-09, |
| "advantage_min": -0.15014554280787706, |
| "advantage_std": 0.1075568669475615, |
| "completion_length": 1999.520851135254, |
| "epoch": 0.33485714285714285, |
| "grad_norm": 0.001623099553398788, |
| "kl": 0.0004197433590888977, |
| "learning_rate": 4.967182142620745e-07, |
| "loss": -0.0022, |
| "reward": 0.12098718318156898, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10755686787888408, |
| "rewards/cosine_scaled_reward": 0.0010581477545201778, |
| "rewards/format_reward": 0.7083333432674408, |
| "step": 293 |
| }, |
| { |
| "advantage_max": 0.2192140589468181, |
| "advantage_mean": 4.656613428188905e-10, |
| "advantage_min": -0.1265512192621827, |
| "advantage_std": 0.13496136059984565, |
| "completion_length": 3130.3958740234375, |
| "epoch": 0.336, |
| "grad_norm": 0.002399858320131898, |
| "kl": 0.0006003789603710175, |
| "learning_rate": 4.93600044896063e-07, |
| "loss": 0.0063, |
| "reward": 0.03184024168876931, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13496137037873268, |
| "rewards/cosine_scaled_reward": -0.04238404519855976, |
| "rewards/format_reward": 0.27083333767950535, |
| "step": 294 |
| }, |
| { |
| "advantage_max": 0.16546836122870445, |
| "advantage_mean": -2.483527036800659e-09, |
| "advantage_min": -0.14341549389064312, |
| "advantage_std": 0.12139872647821903, |
| "completion_length": 2891.7291717529297, |
| "epoch": 0.33714285714285713, |
| "grad_norm": 0.0022274511866271496, |
| "kl": 0.00046318769454956055, |
| "learning_rate": 4.904846243842949e-07, |
| "loss": -0.002, |
| "reward": 0.055140421725809574, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12139872647821903, |
| "rewards/cosine_scaled_reward": -0.026432855054736137, |
| "rewards/format_reward": 0.37500000558793545, |
| "step": 295 |
| }, |
| { |
| "advantage_max": 0.09151355037465692, |
| "advantage_mean": -5.587935683615264e-09, |
| "advantage_min": -0.10618274100124836, |
| "advantage_std": 0.07723336713388562, |
| "completion_length": 2818.041702270508, |
| "epoch": 0.3382857142857143, |
| "grad_norm": 0.0014323138166218996, |
| "kl": 0.00042188167572021484, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": 0.0041, |
| "reward": 0.06925072055310011, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.0772333717904985, |
| "rewards/cosine_scaled_reward": -0.025438087061047554, |
| "rewards/format_reward": 0.4583333432674408, |
| "step": 296 |
| }, |
| { |
| "advantage_max": 0.21857766713947058, |
| "advantage_mean": 3.182018898373329e-09, |
| "advantage_min": -0.12679255288094282, |
| "advantage_std": 0.14345951098948717, |
| "completion_length": 3366.729217529297, |
| "epoch": 0.3394285714285714, |
| "grad_norm": 0.0024829749017953873, |
| "kl": 0.0004658699035644531, |
| "learning_rate": 4.842626371469149e-07, |
| "loss": 0.0039, |
| "reward": 0.00389005895704031, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14345951285213232, |
| "rewards/cosine_scaled_reward": -0.09262831043452024, |
| "rewards/format_reward": 0.2083333358168602, |
| "step": 297 |
| }, |
| { |
| "advantage_max": 0.2189861796796322, |
| "advantage_mean": -3.72529045111758e-09, |
| "advantage_min": -0.1656702347099781, |
| "advantage_std": 0.15777601953595877, |
| "completion_length": 2752.520851135254, |
| "epoch": 0.3405714285714286, |
| "grad_norm": 0.002729513682425022, |
| "kl": 0.00036172568798065186, |
| "learning_rate": 4.811563736721829e-07, |
| "loss": 0.008, |
| "reward": 0.08497396449092776, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1577760260552168, |
| "rewards/cosine_scaled_reward": 0.030580737628042698, |
| "rewards/format_reward": 0.43750000186264515, |
| "step": 298 |
| }, |
| { |
| "advantage_max": 0.11331136804074049, |
| "advantage_mean": -1.6298146165993899e-09, |
| "advantage_min": -0.16553817968815565, |
| "advantage_std": 0.1081388727761805, |
| "completion_length": 3006.666717529297, |
| "epoch": 0.3417142857142857, |
| "grad_norm": 0.0022456180304288864, |
| "kl": 0.0003933906555175781, |
| "learning_rate": 4.780534655386743e-07, |
| "loss": 0.0021, |
| "reward": 0.124224784784019, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10813887417316437, |
| "rewards/cosine_scaled_reward": 0.1376944463700056, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 299 |
| }, |
| { |
| "advantage_max": 0.13589740544557571, |
| "advantage_mean": 1.3581787534910905e-09, |
| "advantage_min": -0.09810729883611202, |
| "advantage_std": 0.0930751352570951, |
| "completion_length": 3335.3125610351562, |
| "epoch": 0.34285714285714286, |
| "grad_norm": 0.0022788026835769415, |
| "kl": 0.000484466552734375, |
| "learning_rate": 4.749540639777539e-07, |
| "loss": 0.0047, |
| "reward": -0.013445806922391057, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09307514037936926, |
| "rewards/cosine_scaled_reward": -0.1955646127462387, |
| "rewards/format_reward": 0.3125000037252903, |
| "step": 300 |
| }, |
| { |
| "advantage_max": 0.1622638087719679, |
| "advantage_mean": -3.958121053138086e-09, |
| "advantage_min": -0.1490333159454167, |
| "advantage_std": 0.12028014613315463, |
| "completion_length": 2307.145866394043, |
| "epoch": 0.344, |
| "grad_norm": 0.0024759138468652964, |
| "kl": 0.00041925907135009766, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.0037, |
| "reward": 0.09233828741707839, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12028014985844493, |
| "rewards/cosine_scaled_reward": -0.09192248748149723, |
| "rewards/format_reward": 0.729166679084301, |
| "step": 301 |
| }, |
| { |
| "advantage_max": 0.1092590931802988, |
| "advantage_mean": -7.838631854717448e-09, |
| "advantage_min": -0.11798757687211037, |
| "advantage_std": 0.08439132361672819, |
| "completion_length": 2324.6042251586914, |
| "epoch": 0.34514285714285714, |
| "grad_norm": 0.0013441102346405387, |
| "kl": 0.00034530647099018097, |
| "learning_rate": 4.68766384637248e-07, |
| "loss": -0.0028, |
| "reward": 0.16111605032347143, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08439132594503462, |
| "rewards/cosine_scaled_reward": 0.132624052464962, |
| "rewards/format_reward": 0.6875, |
| "step": 302 |
| }, |
| { |
| "advantage_max": 0.14658861979842186, |
| "advantage_mean": -8.071462456737954e-09, |
| "advantage_min": -0.17685645446181297, |
| "advantage_std": 0.14615025278180838, |
| "completion_length": 2429.604232788086, |
| "epoch": 0.3462857142857143, |
| "grad_norm": 0.002118069212883711, |
| "kl": 0.0004744231700897217, |
| "learning_rate": 4.656784084364238e-07, |
| "loss": 0.0035, |
| "reward": 0.1084291534498334, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.14615025976672769, |
| "rewards/cosine_scaled_reward": 0.027401255443692207, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 303 |
| }, |
| { |
| "advantage_max": 0.2133752703666687, |
| "advantage_mean": -7.761021464380846e-09, |
| "advantage_min": -0.13373715244233608, |
| "advantage_std": 0.13641660660505295, |
| "completion_length": 2490.8125610351562, |
| "epoch": 0.3474285714285714, |
| "grad_norm": 0.002510175807401538, |
| "kl": 0.00044634193181991577, |
| "learning_rate": 4.6259454195101267e-07, |
| "loss": 0.005, |
| "reward": 0.11259253711614292, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1364166084676981, |
| "rewards/cosine_scaled_reward": 0.05044803116470575, |
| "rewards/format_reward": 0.5625000037252903, |
| "step": 304 |
| }, |
| { |
| "advantage_max": 0.18745366763323545, |
| "advantage_mean": -2.3283065059276353e-09, |
| "advantage_min": -0.15571925230324268, |
| "advantage_std": 0.13849145593121648, |
| "completion_length": 2695.916732788086, |
| "epoch": 0.3485714285714286, |
| "grad_norm": 0.002323645632714033, |
| "kl": 0.00039571523666381836, |
| "learning_rate": 4.59514935484316e-07, |
| "loss": 0.0109, |
| "reward": 0.0692746420390904, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13849145593121648, |
| "rewards/cosine_scaled_reward": -0.05563849490135908, |
| "rewards/format_reward": 0.5208333432674408, |
| "step": 305 |
| }, |
| { |
| "advantage_max": 0.13452134793624282, |
| "advantage_mean": -4.3073669526993985e-09, |
| "advantage_min": -0.1587929087691009, |
| "advantage_std": 0.13013424794189632, |
| "completion_length": 2271.166690826416, |
| "epoch": 0.3497142857142857, |
| "grad_norm": 0.002338060177862644, |
| "kl": 0.0003482997417449951, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": 0.0155, |
| "reward": 0.12165643041953444, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.13013425352983177, |
| "rewards/cosine_scaled_reward": 0.07424017786979675, |
| "rewards/format_reward": 0.5625000037252903, |
| "step": 306 |
| }, |
| { |
| "advantage_max": 0.19390251953154802, |
| "advantage_mean": -3.1820189122511167e-09, |
| "advantage_min": -0.13394111022353172, |
| "advantage_std": 0.13033632142469287, |
| "completion_length": 2285.5625038146973, |
| "epoch": 0.35085714285714287, |
| "grad_norm": 0.0023057800717651844, |
| "kl": 0.00042870640754699707, |
| "learning_rate": 4.5336910277482155e-07, |
| "loss": 0.0001, |
| "reward": 0.11600270541384816, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13033632514998317, |
| "rewards/cosine_scaled_reward": 0.050759092438966036, |
| "rewards/format_reward": 0.5833333358168602, |
| "step": 307 |
| }, |
| { |
| "advantage_max": 0.20064522698521614, |
| "advantage_mean": 3.880514659604373e-11, |
| "advantage_min": -0.15068083815276623, |
| "advantage_std": 0.14386159926652908, |
| "completion_length": 3076.5209045410156, |
| "epoch": 0.352, |
| "grad_norm": 0.0026027862913906574, |
| "kl": 0.00039158761501312256, |
| "learning_rate": 4.503031760712397e-07, |
| "loss": 0.0015, |
| "reward": 0.04175692540593445, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.14386159414425492, |
| "rewards/cosine_scaled_reward": -0.0642844419926405, |
| "rewards/format_reward": 0.37500000558793545, |
| "step": 308 |
| }, |
| { |
| "advantage_max": 0.1422612089663744, |
| "advantage_mean": -7.450581041013038e-09, |
| "advantage_min": -0.17655340489000082, |
| "advantage_std": 0.12518154783174396, |
| "completion_length": 2657.562530517578, |
| "epoch": 0.35314285714285715, |
| "grad_norm": 0.0022041448391973972, |
| "kl": 0.0003338456153869629, |
| "learning_rate": 4.4724210845020494e-07, |
| "loss": 0.0006, |
| "reward": 0.1618395473342389, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.125181557610631, |
| "rewards/cosine_scaled_reward": 0.1995255146175623, |
| "rewards/format_reward": 0.5625000055879354, |
| "step": 309 |
| }, |
| { |
| "advantage_max": 0.1463564890436828, |
| "advantage_mean": 5.122274313040798e-09, |
| "advantage_min": -0.14603949431329966, |
| "advantage_std": 0.11013461998663843, |
| "completion_length": 2038.0625076293945, |
| "epoch": 0.35428571428571426, |
| "grad_norm": 0.0016478110337629914, |
| "kl": 0.0003361701965332031, |
| "learning_rate": 4.441860491038345e-07, |
| "loss": 0.0006, |
| "reward": 0.1367599029908888, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11013462860137224, |
| "rewards/cosine_scaled_reward": 0.08016027277335525, |
| "rewards/format_reward": 0.645833333954215, |
| "step": 310 |
| }, |
| { |
| "advantage_max": 0.10873270966112614, |
| "advantage_mean": 2.3283066447055134e-10, |
| "advantage_min": -0.15578097198158503, |
| "advantage_std": 0.10270903469063342, |
| "completion_length": 2404.8541946411133, |
| "epoch": 0.3554285714285714, |
| "grad_norm": 0.0017932639457285404, |
| "kl": 0.00038820505142211914, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": 0.0072, |
| "reward": 0.17740142671391368, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10270903422497213, |
| "rewards/cosine_scaled_reward": 0.21936094062402844, |
| "rewards/format_reward": 0.6041666772216558, |
| "step": 311 |
| }, |
| { |
| "advantage_max": 0.14976314548403025, |
| "advantage_mean": 1.241763553094799e-09, |
| "advantage_min": -0.1350155808031559, |
| "advantage_std": 0.11650370946153998, |
| "completion_length": 2204.1250076293945, |
| "epoch": 0.3565714285714286, |
| "grad_norm": 0.0020335179287940264, |
| "kl": 0.0004989905282855034, |
| "learning_rate": 4.3808955077581546e-07, |
| "loss": -0.0003, |
| "reward": 0.1353573240339756, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11650371551513672, |
| "rewards/cosine_scaled_reward": 0.1568785011768341, |
| "rewards/format_reward": 0.47916667722165585, |
| "step": 312 |
| }, |
| { |
| "advantage_max": 0.10151751572266221, |
| "advantage_mean": -5.2774946981970494e-09, |
| "advantage_min": -0.13054312393069267, |
| "advantage_std": 0.09196106740273535, |
| "completion_length": 2606.562530517578, |
| "epoch": 0.3577142857142857, |
| "grad_norm": 0.0014636931009590626, |
| "kl": 0.0004368424415588379, |
| "learning_rate": 4.350494089288943e-07, |
| "loss": 0.0038, |
| "reward": 0.07864985754713416, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.091961067635566, |
| "rewards/cosine_scaled_reward": 0.012477612122893333, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 313 |
| }, |
| { |
| "advantage_max": 0.083519974257797, |
| "advantage_mean": -2.79396782099095e-09, |
| "advantage_min": -0.0853169858455658, |
| "advantage_std": 0.06732387357624248, |
| "completion_length": 2245.500015258789, |
| "epoch": 0.3588571428571429, |
| "grad_norm": 0.0009150411933660507, |
| "kl": 0.00030519068241119385, |
| "learning_rate": 4.3201486961161093e-07, |
| "loss": 0.0013, |
| "reward": 0.10207456815987825, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.06732387689407915, |
| "rewards/cosine_scaled_reward": 0.03957906365394592, |
| "rewards/format_reward": 0.520833333954215, |
| "step": 314 |
| }, |
| { |
| "advantage_max": 0.09536193497478962, |
| "advantage_mean": 2.3283069916502086e-10, |
| "advantage_min": -0.17288233432918787, |
| "advantage_std": 0.0984175750054419, |
| "completion_length": 2790.8750228881836, |
| "epoch": 0.36, |
| "grad_norm": 0.0022720075212419033, |
| "kl": 0.000449448823928833, |
| "learning_rate": 4.2898608072313045e-07, |
| "loss": 0.0093, |
| "reward": 0.10023409640416503, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09841757919639349, |
| "rewards/cosine_scaled_reward": 0.08823532424867153, |
| "rewards/format_reward": 0.416666679084301, |
| "step": 315 |
| }, |
| { |
| "advantage_max": 0.11639646254479885, |
| "advantage_mean": -3.4148496252939253e-09, |
| "advantage_min": -0.12197889108210802, |
| "advantage_std": 0.09412986086681485, |
| "completion_length": 3332.479217529297, |
| "epoch": 0.36114285714285715, |
| "grad_norm": 0.0020200808066874743, |
| "kl": 0.000614166259765625, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": 0.0068, |
| "reward": 0.023105132393538952, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09412986552342772, |
| "rewards/cosine_scaled_reward": -0.06621896661818027, |
| "rewards/format_reward": 0.2708333395421505, |
| "step": 316 |
| }, |
| { |
| "advantage_max": 0.22891795448958874, |
| "advantage_mean": 1.7074248265247505e-09, |
| "advantage_min": -0.16849522665143013, |
| "advantage_std": 0.1543532907962799, |
| "completion_length": 2679.187545776367, |
| "epoch": 0.36228571428571427, |
| "grad_norm": 0.0023186705075204372, |
| "kl": 0.0004814229905605316, |
| "learning_rate": 4.2294634442070553e-07, |
| "loss": 0.0081, |
| "reward": 0.04336748970672488, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15435329405590892, |
| "rewards/cosine_scaled_reward": -0.07002291013486683, |
| "rewards/format_reward": 0.39583333767950535, |
| "step": 317 |
| }, |
| { |
| "advantage_max": 0.13123337179422379, |
| "advantage_mean": -8.498318770921998e-09, |
| "advantage_min": -0.12308492953889072, |
| "advantage_std": 0.10156573518179357, |
| "completion_length": 1147.7916870117188, |
| "epoch": 0.36342857142857143, |
| "grad_norm": 0.001232507056556642, |
| "kl": 0.00012452714145183563, |
| "learning_rate": 4.1993569137498776e-07, |
| "loss": 0.0005, |
| "reward": 0.17573032714426517, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10156573657877743, |
| "rewards/cosine_scaled_reward": 0.07146302983164787, |
| "rewards/format_reward": 0.895833333954215, |
| "step": 318 |
| }, |
| { |
| "advantage_max": 0.17452884558588266, |
| "advantage_mean": -2.6387474566513802e-09, |
| "advantage_min": -0.12432311568409204, |
| "advantage_std": 0.11903674202039838, |
| "completion_length": 2531.812515258789, |
| "epoch": 0.36457142857142855, |
| "grad_norm": 0.0020693091209977865, |
| "kl": 0.00045609474182128906, |
| "learning_rate": 4.1693137748017915e-07, |
| "loss": 0.0077, |
| "reward": 0.024991515558212996, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11903674621134996, |
| "rewards/cosine_scaled_reward": -0.14697773708030581, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 319 |
| }, |
| { |
| "advantage_max": 0.09645581245422363, |
| "advantage_mean": -1.474593960826276e-09, |
| "advantage_min": -0.11469449661672115, |
| "advantage_std": 0.08255432173609734, |
| "completion_length": 1697.708351135254, |
| "epoch": 0.3657142857142857, |
| "grad_norm": 0.001524286693893373, |
| "kl": 0.0003675222396850586, |
| "learning_rate": 4.1393354916230005e-07, |
| "loss": 0.0017, |
| "reward": 0.1566449678502977, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08255432453006506, |
| "rewards/cosine_scaled_reward": 0.06804804364219308, |
| "rewards/format_reward": 0.7916666679084301, |
| "step": 320 |
| }, |
| { |
| "advantage_max": 0.10896373726427555, |
| "advantage_mean": -5.122274382429737e-09, |
| "advantage_min": -0.1531135831028223, |
| "advantage_std": 0.10200135898776352, |
| "completion_length": 1350.6041793823242, |
| "epoch": 0.3668571428571429, |
| "grad_norm": 0.0013724055606871843, |
| "kl": 0.00025459565222263336, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": -0.0041, |
| "reward": 0.24308783560991287, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10200135805644095, |
| "rewards/cosine_scaled_reward": 0.2970201913267374, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 321 |
| }, |
| { |
| "advantage_max": 0.15791441453620791, |
| "advantage_mean": -3.104407841902912e-10, |
| "advantage_min": -0.15344735700637102, |
| "advantage_std": 0.13148926093708724, |
| "completion_length": 2771.4166946411133, |
| "epoch": 0.368, |
| "grad_norm": 0.0031580019276589155, |
| "kl": 0.0005682110786437988, |
| "learning_rate": 4.079579333738039e-07, |
| "loss": 0.0113, |
| "reward": 0.04155511595308781, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13148926885332912, |
| "rewards/cosine_scaled_reward": -0.06613871618174016, |
| "rewards/format_reward": 0.3750000111758709, |
| "step": 322 |
| }, |
| { |
| "advantage_max": 0.11292254738509655, |
| "advantage_mean": -5.044664325160042e-09, |
| "advantage_min": -0.12851296365261078, |
| "advantage_std": 0.09898313414305449, |
| "completion_length": 2861.937530517578, |
| "epoch": 0.36914285714285716, |
| "grad_norm": 0.0017387475818395615, |
| "kl": 0.0004943609237670898, |
| "learning_rate": 4.0498043714627006e-07, |
| "loss": 0.0023, |
| "reward": 0.08781374106183648, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.09898313879966736, |
| "rewards/cosine_scaled_reward": 0.020912078442052007, |
| "rewards/format_reward": 0.4791666753590107, |
| "step": 323 |
| }, |
| { |
| "advantage_max": 0.17608331004157662, |
| "advantage_mean": -2.173086106893596e-09, |
| "advantage_min": -0.12420807220041752, |
| "advantage_std": 0.11417877301573753, |
| "completion_length": 2464.000045776367, |
| "epoch": 0.3702857142857143, |
| "grad_norm": 0.002251465106382966, |
| "kl": 0.0004966855049133301, |
| "learning_rate": 4.020100089676376e-07, |
| "loss": 0.001, |
| "reward": 0.08936168113723397, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11417877511121333, |
| "rewards/cosine_scaled_reward": -0.007908736355602741, |
| "rewards/format_reward": 0.5416666734963655, |
| "step": 324 |
| }, |
| { |
| "advantage_max": 0.20293579250574112, |
| "advantage_mean": -6.67447851154801e-09, |
| "advantage_min": -0.20778802502900362, |
| "advantage_std": 0.1762455804273486, |
| "completion_length": 2415.187545776367, |
| "epoch": 0.37142857142857144, |
| "grad_norm": 0.003066692966967821, |
| "kl": 0.0004252195358276367, |
| "learning_rate": 3.9904679361238526e-07, |
| "loss": 0.0136, |
| "reward": 0.19227174390107393, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.17624558601528406, |
| "rewards/cosine_scaled_reward": 0.22559675807133317, |
| "rewards/format_reward": 0.6875000111758709, |
| "step": 325 |
| }, |
| { |
| "advantage_max": 0.11844811588525772, |
| "advantage_mean": -4.268561761000544e-09, |
| "advantage_min": -0.13017160259187222, |
| "advantage_std": 0.09890975127927959, |
| "completion_length": 2067.6667137145996, |
| "epoch": 0.37257142857142855, |
| "grad_norm": 0.0022512541618198156, |
| "kl": 0.00033466145396232605, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": 0.0074, |
| "reward": 0.12452584411948919, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09890975733287632, |
| "rewards/cosine_scaled_reward": 0.0662637110799551, |
| "rewards/format_reward": 0.6041666679084301, |
| "step": 326 |
| }, |
| { |
| "advantage_max": 0.17955493042245507, |
| "advantage_mean": -3.725290298461914e-09, |
| "advantage_min": -0.16282919980585575, |
| "advantage_std": 0.1376048857346177, |
| "completion_length": 2571.145835876465, |
| "epoch": 0.3737142857142857, |
| "grad_norm": 0.0025611212477087975, |
| "kl": 0.00043479073792696, |
| "learning_rate": 3.931425787051832e-07, |
| "loss": 0.0029, |
| "reward": 0.13137949211522937, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.137604889459908, |
| "rewards/cosine_scaled_reward": 0.12837360659614205, |
| "rewards/format_reward": 0.5208333376795053, |
| "step": 327 |
| }, |
| { |
| "advantage_max": 0.18027144204825163, |
| "advantage_mean": -4.11334142441655e-09, |
| "advantage_min": -0.12120586633682251, |
| "advantage_std": 0.12321445951238275, |
| "completion_length": 3299.437530517578, |
| "epoch": 0.37485714285714283, |
| "grad_norm": 0.002698281779885292, |
| "kl": 0.0005347728729248047, |
| "learning_rate": 3.902018669163384e-07, |
| "loss": 0.0104, |
| "reward": -0.015781979076564312, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12321445951238275, |
| "rewards/cosine_scaled_reward": -0.19141755625605583, |
| "rewards/format_reward": 0.2916666716337204, |
| "step": 328 |
| }, |
| { |
| "advantage_max": 0.11980468919500709, |
| "advantage_mean": -3.1044086051812414e-09, |
| "advantage_min": -0.15748842991888523, |
| "advantage_std": 0.10885418800171465, |
| "completion_length": 1634.270839691162, |
| "epoch": 0.376, |
| "grad_norm": 0.0012264562537893653, |
| "kl": 0.00037041306495666504, |
| "learning_rate": 3.872689434630585e-07, |
| "loss": -0.0013, |
| "reward": 0.1498857717961073, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10885419067926705, |
| "rewards/cosine_scaled_reward": 0.09750008094124496, |
| "rewards/format_reward": 0.6875000074505806, |
| "step": 329 |
| }, |
| { |
| "advantage_max": 0.18279112502932549, |
| "advantage_mean": -9.7788870612181e-09, |
| "advantage_min": -0.14392871782183647, |
| "advantage_std": 0.13134588208049536, |
| "completion_length": 2363.208351135254, |
| "epoch": 0.37714285714285717, |
| "grad_norm": 0.002868218347430229, |
| "kl": 0.0007173418998718262, |
| "learning_rate": 3.843439512918949e-07, |
| "loss": 0.0023, |
| "reward": 0.07145864237099886, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1313458839431405, |
| "rewards/cosine_scaled_reward": -0.0505201262421906, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 330 |
| }, |
| { |
| "advantage_max": 0.18987913988530636, |
| "advantage_mean": -5.743155867543592e-09, |
| "advantage_min": -0.1331134121865034, |
| "advantage_std": 0.12819427531212568, |
| "completion_length": 2273.166679382324, |
| "epoch": 0.3782857142857143, |
| "grad_norm": 0.0021069832146167755, |
| "kl": 0.0004393383860588074, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": 0.0078, |
| "reward": 0.04232434229925275, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12819427764043212, |
| "rewards/cosine_scaled_reward": -0.10524179972708225, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 331 |
| }, |
| { |
| "advantage_max": 0.20149581134319305, |
| "advantage_mean": -7.916241953620506e-09, |
| "advantage_min": -0.14159746747463942, |
| "advantage_std": 0.1383329126983881, |
| "completion_length": 2499.6458587646484, |
| "epoch": 0.37942857142857145, |
| "grad_norm": 0.002372437622398138, |
| "kl": 0.0004247426986694336, |
| "learning_rate": 3.785183306423767e-07, |
| "loss": 0.0009, |
| "reward": 0.12115885165985674, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13833291921764612, |
| "rewards/cosine_scaled_reward": 0.05477744806557894, |
| "rewards/format_reward": 0.6041666809469461, |
| "step": 332 |
| }, |
| { |
| "advantage_max": 0.13082721852697432, |
| "advantage_mean": -6.635673649446616e-09, |
| "advantage_min": -0.18306226492859423, |
| "advantage_std": 0.12717127020005137, |
| "completion_length": 2043.333366394043, |
| "epoch": 0.38057142857142856, |
| "grad_norm": 0.0020115238148719072, |
| "kl": 0.00046539306640625, |
| "learning_rate": 3.7561798609655373e-07, |
| "loss": 0.0014, |
| "reward": 0.1636866086628288, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1271712731104344, |
| "rewards/cosine_scaled_reward": 0.10506386123597622, |
| "rewards/format_reward": 0.75, |
| "step": 333 |
| }, |
| { |
| "advantage_max": 0.12227540975436568, |
| "advantage_mean": -1.6298146374160716e-09, |
| "advantage_min": -0.12114514503628016, |
| "advantage_std": 0.09305540984496474, |
| "completion_length": 2965.1875610351562, |
| "epoch": 0.38171428571428573, |
| "grad_norm": 0.0016468079993501306, |
| "kl": 0.000513911247253418, |
| "learning_rate": 3.72726140684072e-07, |
| "loss": 0.0028, |
| "reward": 0.00035559339448809624, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09305540984496474, |
| "rewards/cosine_scaled_reward": -0.18864441104233265, |
| "rewards/format_reward": 0.3750000111758709, |
| "step": 334 |
| }, |
| { |
| "advantage_max": 0.17929319106042385, |
| "advantage_mean": -1.3426567503638243e-08, |
| "advantage_min": -0.24936186987906694, |
| "advantage_std": 0.17879032995551825, |
| "completion_length": 2200.229202270508, |
| "epoch": 0.38285714285714284, |
| "grad_norm": 0.0025713045615702868, |
| "kl": 0.00046062469482421875, |
| "learning_rate": 3.6984293534939737e-07, |
| "loss": 0.0018, |
| "reward": 0.21270178398117423, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.17879033274948597, |
| "rewards/cosine_scaled_reward": 0.2734536435455084, |
| "rewards/format_reward": 0.7083333395421505, |
| "step": 335 |
| }, |
| { |
| "advantage_max": 0.13005139166489244, |
| "advantage_mean": -1.8626451769865326e-09, |
| "advantage_min": -0.15239684004336596, |
| "advantage_std": 0.10806715162470937, |
| "completion_length": 2616.2916946411133, |
| "epoch": 0.384, |
| "grad_norm": 0.0015858953120186925, |
| "kl": 0.0004538893699645996, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": 0.0022, |
| "reward": 0.09085274318931624, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10806715488433838, |
| "rewards/cosine_scaled_reward": 0.01898368075489998, |
| "rewards/format_reward": 0.5000000018626451, |
| "step": 336 |
| }, |
| { |
| "advantage_max": 0.14148719515651464, |
| "advantage_mean": -1.2417634698280722e-09, |
| "advantage_min": -0.1784699847921729, |
| "advantage_std": 0.13605968561023474, |
| "completion_length": 2599.1250381469727, |
| "epoch": 0.3851428571428571, |
| "grad_norm": 0.0025568308774381876, |
| "kl": 0.00046902894973754883, |
| "learning_rate": 3.641030065789562e-07, |
| "loss": 0.011, |
| "reward": 0.0828043669462204, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13605968886986375, |
| "rewards/cosine_scaled_reward": -0.0166609063744545, |
| "rewards/format_reward": 0.5208333432674408, |
| "step": 337 |
| }, |
| { |
| "advantage_max": 0.16837644949555397, |
| "advantage_mean": -8.692344150018627e-09, |
| "advantage_min": -0.17715413495898247, |
| "advantage_std": 0.1369879769627005, |
| "completion_length": 1891.7292098999023, |
| "epoch": 0.3862857142857143, |
| "grad_norm": 0.0030996922869235277, |
| "kl": 0.0004836171865463257, |
| "learning_rate": 3.612465628992203e-07, |
| "loss": 0.0105, |
| "reward": 0.17488606134429574, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13698797672986984, |
| "rewards/cosine_scaled_reward": 0.17192834429442883, |
| "rewards/format_reward": 0.687500013038516, |
| "step": 338 |
| }, |
| { |
| "advantage_max": 0.10435130982659757, |
| "advantage_mean": -2.4447217861212067e-09, |
| "advantage_min": -0.10210681799799204, |
| "advantage_std": 0.07953963615000248, |
| "completion_length": 2805.750030517578, |
| "epoch": 0.38742857142857146, |
| "grad_norm": 0.0016661995323374867, |
| "kl": 0.00048545002937316895, |
| "learning_rate": 3.5839931879571725e-07, |
| "loss": -0.0011, |
| "reward": 0.06721553253009915, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.07953964336775243, |
| "rewards/cosine_scaled_reward": -0.009967771358788013, |
| "rewards/format_reward": 0.41666667349636555, |
| "step": 339 |
| }, |
| { |
| "advantage_max": 0.14381090085953474, |
| "advantage_mean": -5.975986885897733e-09, |
| "advantage_min": -0.13768143858760595, |
| "advantage_std": 0.11206724308431149, |
| "completion_length": 2374.875030517578, |
| "epoch": 0.38857142857142857, |
| "grad_norm": 0.001165210036560893, |
| "kl": 0.0003544166684150696, |
| "learning_rate": 3.555614130391079e-07, |
| "loss": -0.0003, |
| "reward": 0.13663293584249914, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1120672500692308, |
| "rewards/cosine_scaled_reward": 0.07016189396381378, |
| "rewards/format_reward": 0.6666666679084301, |
| "step": 340 |
| }, |
| { |
| "advantage_max": 0.11593026760965586, |
| "advantage_mean": -6.053596623978308e-09, |
| "advantage_min": -0.1835379470139742, |
| "advantage_std": 0.11309912154683843, |
| "completion_length": 2168.2917404174805, |
| "epoch": 0.38971428571428574, |
| "grad_norm": 0.0020617349073290825, |
| "kl": 0.00035562366247177124, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": 0.0072, |
| "reward": 0.19327943865209818, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11309912716387771, |
| "rewards/cosine_scaled_reward": 0.20469791069626808, |
| "rewards/format_reward": 0.7291666753590107, |
| "step": 341 |
| }, |
| { |
| "advantage_max": 0.2113343793898821, |
| "advantage_mean": -3.880510857090513e-09, |
| "advantage_min": -0.21403949242085218, |
| "advantage_std": 0.19739758502691984, |
| "completion_length": 2652.5416870117188, |
| "epoch": 0.39085714285714285, |
| "grad_norm": 0.0037038561422377825, |
| "kl": 0.00047537684440612793, |
| "learning_rate": 3.4991416936678276e-07, |
| "loss": 0.0136, |
| "reward": 0.12849632510915399, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.19739759154617786, |
| "rewards/cosine_scaled_reward": 0.13792963325977325, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 342 |
| }, |
| { |
| "advantage_max": 0.1500543192960322, |
| "advantage_mean": -5.665545727007171e-09, |
| "advantage_min": -0.19857817236334085, |
| "advantage_std": 0.14241656986996531, |
| "completion_length": 2764.666717529297, |
| "epoch": 0.392, |
| "grad_norm": 0.002483693417161703, |
| "kl": 0.00038484111428260803, |
| "learning_rate": 3.471051066897562e-07, |
| "loss": 0.0049, |
| "reward": 0.17116584605537355, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.14241657592356205, |
| "rewards/cosine_scaled_reward": 0.21320676431059837, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 343 |
| }, |
| { |
| "advantage_max": 0.11467262031510472, |
| "advantage_mean": -8.343098146373906e-09, |
| "advantage_min": -0.09952630288898945, |
| "advantage_std": 0.09163640858605504, |
| "completion_length": 2078.5417098999023, |
| "epoch": 0.3931428571428571, |
| "grad_norm": 0.0015788535820320249, |
| "kl": 0.00035351328551769257, |
| "learning_rate": 3.4430593282358777e-07, |
| "loss": 0.0011, |
| "reward": 0.20408116653561592, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.09163640951737761, |
| "rewards/cosine_scaled_reward": 0.2693120799958706, |
| "rewards/format_reward": 0.6666666679084301, |
| "step": 344 |
| }, |
| { |
| "advantage_max": 0.13983573578298092, |
| "advantage_mean": 3.1820189816400557e-09, |
| "advantage_min": -0.1527677569538355, |
| "advantage_std": 0.1231717630289495, |
| "completion_length": 2897.875030517578, |
| "epoch": 0.3942857142857143, |
| "grad_norm": 0.002323306631296873, |
| "kl": 0.0005290508270263672, |
| "learning_rate": 3.4151678419606233e-07, |
| "loss": -0.0001, |
| "reward": 0.05550580471754074, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1231717630289495, |
| "rewards/cosine_scaled_reward": 0.00512329675257206, |
| "rewards/format_reward": 0.3125000037252903, |
| "step": 345 |
| }, |
| { |
| "advantage_max": 0.22859193058684468, |
| "advantage_mean": 1.3969839035565812e-09, |
| "advantage_min": -0.13696812302805483, |
| "advantage_std": 0.13573638605885208, |
| "completion_length": 2998.479217529297, |
| "epoch": 0.3954285714285714, |
| "grad_norm": 0.002322471234947443, |
| "kl": 0.0005121231079101562, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": 0.0006, |
| "reward": 0.04644642909988761, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13573638536036015, |
| "rewards/cosine_scaled_reward": -0.10306344844866544, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 346 |
| }, |
| { |
| "advantage_max": 0.1349148927256465, |
| "advantage_mean": -8.381903310317185e-09, |
| "advantage_min": -0.10294443322345614, |
| "advantage_std": 0.09874543640762568, |
| "completion_length": 3018.1458740234375, |
| "epoch": 0.3965714285714286, |
| "grad_norm": 0.002016145968809724, |
| "kl": 0.0004055500030517578, |
| "learning_rate": 3.359691059183761e-07, |
| "loss": 0.004, |
| "reward": 0.033684686524793506, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.0987454392015934, |
| "rewards/cosine_scaled_reward": -0.11986671015620232, |
| "rewards/format_reward": 0.4375000037252903, |
| "step": 347 |
| }, |
| { |
| "advantage_max": 0.09889293229207397, |
| "advantage_mean": 2.0954757998237206e-09, |
| "advantage_min": -0.1360151378903538, |
| "advantage_std": 0.10096699907444417, |
| "completion_length": 2521.062530517578, |
| "epoch": 0.3977142857142857, |
| "grad_norm": 0.0023632138036191463, |
| "kl": 0.0003896355628967285, |
| "learning_rate": 3.3321084665422803e-07, |
| "loss": 0.0126, |
| "reward": 0.0665872145909816, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10096699860878289, |
| "rewards/cosine_scaled_reward": -0.06292372569441795, |
| "rewards/format_reward": 0.5208333358168602, |
| "step": 348 |
| }, |
| { |
| "advantage_max": 0.13384045055136085, |
| "advantage_mean": 1.5522043372850902e-09, |
| "advantage_min": -0.09279891615733504, |
| "advantage_std": 0.08720876974985003, |
| "completion_length": 3144.2291870117188, |
| "epoch": 0.39885714285714285, |
| "grad_norm": 0.0023430907167494297, |
| "kl": 0.0006818771362304688, |
| "learning_rate": 3.3046315338757026e-07, |
| "loss": 0.0019, |
| "reward": 0.03250998561270535, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08720877300947905, |
| "rewards/cosine_scaled_reward": -0.08978106081485748, |
| "rewards/format_reward": 0.3750000037252903, |
| "step": 349 |
| }, |
| { |
| "advantage_max": 0.2537369290366769, |
| "advantage_mean": -1.0399769351243648e-08, |
| "advantage_min": -0.2440725015476346, |
| "advantage_std": 0.2099163606762886, |
| "completion_length": 2217.2917404174805, |
| "epoch": 0.4, |
| "grad_norm": 0.0038065649569034576, |
| "kl": 0.0004671439528465271, |
| "learning_rate": 3.2772616003709616e-07, |
| "loss": 0.0138, |
| "reward": 0.1614842200651765, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.20991637371480465, |
| "rewards/cosine_scaled_reward": 0.16261385567486286, |
| "rewards/format_reward": 0.6250000093132257, |
| "step": 350 |
| }, |
| { |
| "advantage_max": 0.10746736731380224, |
| "advantage_mean": -4.346172130520465e-09, |
| "advantage_min": -0.12306162435561419, |
| "advantage_std": 0.09336511231958866, |
| "completion_length": 2644.937530517578, |
| "epoch": 0.40114285714285713, |
| "grad_norm": 0.0012864568270742893, |
| "kl": 0.0005724728107452393, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": -0.0015, |
| "reward": 0.07907040324062109, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.0933651146478951, |
| "rewards/cosine_scaled_reward": -0.024259530007839203, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 351 |
| }, |
| { |
| "advantage_max": 0.14927358692511916, |
| "advantage_mean": -1.5522045454519073e-10, |
| "advantage_min": -0.13300226628780365, |
| "advantage_std": 0.11855996306985617, |
| "completion_length": 2226.562545776367, |
| "epoch": 0.4022857142857143, |
| "grad_norm": 0.0017897867364808917, |
| "kl": 0.0004187921294942498, |
| "learning_rate": 3.222848061454764e-07, |
| "loss": 0.0041, |
| "reward": 0.1405172348022461, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11855996120721102, |
| "rewards/cosine_scaled_reward": 0.09164215251803398, |
| "rewards/format_reward": 0.6458333358168602, |
| "step": 352 |
| }, |
| { |
| "advantage_max": 0.15562463272362947, |
| "advantage_mean": -3.1044089521259366e-10, |
| "advantage_min": -0.1712775742635131, |
| "advantage_std": 0.13150706025771797, |
| "completion_length": 2046.375020980835, |
| "epoch": 0.4034285714285714, |
| "grad_norm": 0.003071481129154563, |
| "kl": 0.00032861530780792236, |
| "learning_rate": 3.195807108082429e-07, |
| "loss": 0.0127, |
| "reward": 0.1798698278144002, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1315070646815002, |
| "rewards/cosine_scaled_reward": 0.20370216853916645, |
| "rewards/format_reward": 0.6458333432674408, |
| "step": 353 |
| }, |
| { |
| "advantage_max": 0.14535299316048622, |
| "advantage_mean": -7.140139923755839e-09, |
| "advantage_min": -0.1506730942055583, |
| "advantage_std": 0.132601466961205, |
| "completion_length": 2020.5625381469727, |
| "epoch": 0.4045714285714286, |
| "grad_norm": 0.0018860672134906054, |
| "kl": 0.00036281999200582504, |
| "learning_rate": 3.168878457820915e-07, |
| "loss": 0.0052, |
| "reward": 0.1788836452178657, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.13260147930122912, |
| "rewards/cosine_scaled_reward": 0.1834542783908546, |
| "rewards/format_reward": 0.6875000018626451, |
| "step": 354 |
| }, |
| { |
| "advantage_max": 0.13799833692610264, |
| "advantage_mean": -2.2506962266133357e-09, |
| "advantage_min": -0.11328241974115372, |
| "advantage_std": 0.10098979715257883, |
| "completion_length": 2142.145881652832, |
| "epoch": 0.4057142857142857, |
| "grad_norm": 0.0024661533534526825, |
| "kl": 0.00044939108192920685, |
| "learning_rate": 3.142063423134644e-07, |
| "loss": 0.0106, |
| "reward": 0.17950351699255407, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10098980064503849, |
| "rewards/cosine_scaled_reward": 0.22795349592342973, |
| "rewards/format_reward": 0.6041666697710752, |
| "step": 355 |
| }, |
| { |
| "advantage_max": 0.17581096477806568, |
| "advantage_mean": -9.934107980669182e-09, |
| "advantage_min": -0.17233179230242968, |
| "advantage_std": 0.1403276165947318, |
| "completion_length": 2595.979202270508, |
| "epoch": 0.40685714285714286, |
| "grad_norm": 0.002081536455079913, |
| "kl": 0.0004221200942993164, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": -0.0018, |
| "reward": 0.12342565413564444, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1403276203200221, |
| "rewards/cosine_scaled_reward": 0.03239230625331402, |
| "rewards/format_reward": 0.6666666716337204, |
| "step": 356 |
| }, |
| { |
| "advantage_max": 0.1474649435840547, |
| "advantage_mean": -7.528191278693974e-09, |
| "advantage_min": -0.11082311253994703, |
| "advantage_std": 0.10918143065646291, |
| "completion_length": 2855.250045776367, |
| "epoch": 0.408, |
| "grad_norm": 0.002035984070971608, |
| "kl": 0.00044214725494384766, |
| "learning_rate": 3.0887794225945143e-07, |
| "loss": 0.006, |
| "reward": 0.0833205720409751, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10918143903836608, |
| "rewards/cosine_scaled_reward": 0.027413238771259785, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 357 |
| }, |
| { |
| "advantage_max": 0.17143051512539387, |
| "advantage_mean": -1.6453365836444078e-08, |
| "advantage_min": -0.17683418467640877, |
| "advantage_std": 0.14035541797056794, |
| "completion_length": 2261.770896911621, |
| "epoch": 0.40914285714285714, |
| "grad_norm": 0.0024120674934238195, |
| "kl": 0.0003642141819000244, |
| "learning_rate": 3.062313053727671e-07, |
| "loss": 0.006, |
| "reward": 0.26533588115125895, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14035542216151953, |
| "rewards/cosine_scaled_reward": 0.36936704453546554, |
| "rewards/format_reward": 0.8333333395421505, |
| "step": 358 |
| }, |
| { |
| "advantage_max": 0.16622212389484048, |
| "advantage_mean": -3.414849708560652e-09, |
| "advantage_min": -0.1195356696844101, |
| "advantage_std": 0.11384606640785933, |
| "completion_length": 2067.2083587646484, |
| "epoch": 0.4102857142857143, |
| "grad_norm": 0.0015450696228072047, |
| "kl": 0.0002728961408138275, |
| "learning_rate": 3.0359654942835247e-07, |
| "loss": 0.003, |
| "reward": 0.10825431568082422, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11384607246145606, |
| "rewards/cosine_scaled_reward": 0.0183385512791574, |
| "rewards/format_reward": 0.6041666772216558, |
| "step": 359 |
| }, |
| { |
| "advantage_max": 0.1626054784283042, |
| "advantage_mean": -1.226241415352991e-08, |
| "advantage_min": -0.18612738978117704, |
| "advantage_std": 0.13540870044380426, |
| "completion_length": 2324.1875610351562, |
| "epoch": 0.4114285714285714, |
| "grad_norm": 0.002945123938843608, |
| "kl": 0.0004897117614746094, |
| "learning_rate": 3.0097380284049523e-07, |
| "loss": 0.008, |
| "reward": 0.22260520420968533, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13540870510041714, |
| "rewards/cosine_scaled_reward": 0.3045365456491709, |
| "rewards/format_reward": 0.7083333488553762, |
| "step": 360 |
| }, |
| { |
| "advantage_max": 0.15933177806437016, |
| "advantage_mean": -4.501392453226671e-09, |
| "advantage_min": -0.17589706368744373, |
| "advantage_std": 0.14223455544561148, |
| "completion_length": 2791.5208740234375, |
| "epoch": 0.4125714285714286, |
| "grad_norm": 0.0035066171549260616, |
| "kl": 0.0005368292331695557, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": 0.0078, |
| "reward": 0.14737363997846842, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14223456801846623, |
| "rewards/cosine_scaled_reward": 0.1759349994827062, |
| "rewards/format_reward": 0.5208333432674408, |
| "step": 361 |
| }, |
| { |
| "advantage_max": 0.10177448485046625, |
| "advantage_mean": -1.1175871172941498e-08, |
| "advantage_min": -0.147513457108289, |
| "advantage_std": 0.09820342680905014, |
| "completion_length": 1387.0000305175781, |
| "epoch": 0.4137142857142857, |
| "grad_norm": 0.0026847573462873697, |
| "kl": 0.0003243088722229004, |
| "learning_rate": 2.9576484845877793e-07, |
| "loss": 0.0035, |
| "reward": 0.2018571854569018, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09820342832244933, |
| "rewards/cosine_scaled_reward": 0.1488652601838112, |
| "rewards/format_reward": 0.8958333432674408, |
| "step": 362 |
| }, |
| { |
| "advantage_max": 0.13114572037011385, |
| "advantage_mean": -4.656612970221907e-09, |
| "advantage_min": -0.1494987541809678, |
| "advantage_std": 0.11584013933315873, |
| "completion_length": 1757.8333473205566, |
| "epoch": 0.41485714285714287, |
| "grad_norm": 0.0014682364417240024, |
| "kl": 0.00039479881525039673, |
| "learning_rate": 2.931788945420058e-07, |
| "loss": 0.0004, |
| "reward": 0.18233315646648407, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11584014166146517, |
| "rewards/cosine_scaled_reward": 0.1738392524421215, |
| "rewards/format_reward": 0.7291666716337204, |
| "step": 363 |
| }, |
| { |
| "advantage_max": 0.14291757624596357, |
| "advantage_mean": -4.190951891080985e-09, |
| "advantage_min": -0.11483692191541195, |
| "advantage_std": 0.1075741620734334, |
| "completion_length": 2885.9166717529297, |
| "epoch": 0.416, |
| "grad_norm": 0.0023508663289248943, |
| "kl": 0.0006248950958251953, |
| "learning_rate": 2.9060545772359305e-07, |
| "loss": 0.0046, |
| "reward": 0.06019208254292607, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10757415881380439, |
| "rewards/cosine_scaled_reward": -0.020222272723913193, |
| "rewards/format_reward": 0.39583333767950535, |
| "step": 364 |
| }, |
| { |
| "advantage_max": 0.1367309088818729, |
| "advantage_mean": 2.7939678071131624e-09, |
| "advantage_min": -0.17309968266636133, |
| "advantage_std": 0.125831242185086, |
| "completion_length": 2852.395854949951, |
| "epoch": 0.41714285714285715, |
| "grad_norm": 0.0022203971166163683, |
| "kl": 0.0005035400390625, |
| "learning_rate": 2.8804466342921987e-07, |
| "loss": 0.0045, |
| "reward": 0.05301872221753001, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12583124404773116, |
| "rewards/cosine_scaled_reward": -0.022073883563280106, |
| "rewards/format_reward": 0.3541666828095913, |
| "step": 365 |
| }, |
| { |
| "advantage_max": 0.08324602991342545, |
| "advantage_mean": 9.313225052265395e-10, |
| "advantage_min": -0.15099230967462063, |
| "advantage_std": 0.0951183415018022, |
| "completion_length": 1617.1875267028809, |
| "epoch": 0.41828571428571426, |
| "grad_norm": 0.0008901763940230012, |
| "kl": 0.0003096461296081543, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": -0.0006, |
| "reward": 0.23938876390457153, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09511834103614092, |
| "rewards/cosine_scaled_reward": 0.3382144197821617, |
| "rewards/format_reward": 0.7291666716337204, |
| "step": 366 |
| }, |
| { |
| "advantage_max": 0.2165116430260241, |
| "advantage_mean": -4.811833376194841e-09, |
| "advantage_min": -0.23684036545455456, |
| "advantage_std": 0.17490943847224116, |
| "completion_length": 2510.854202270508, |
| "epoch": 0.41942857142857143, |
| "grad_norm": 0.002953734714537859, |
| "kl": 0.00034871697425842285, |
| "learning_rate": 2.829615010283344e-07, |
| "loss": 0.0067, |
| "reward": 0.14961381210014224, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.17490944173187017, |
| "rewards/cosine_scaled_reward": 0.12919975304976106, |
| "rewards/format_reward": 0.6250000093132257, |
| "step": 367 |
| }, |
| { |
| "advantage_max": 0.17113121692091227, |
| "advantage_mean": -1.5522044899407561e-09, |
| "advantage_min": -0.0952818775549531, |
| "advantage_std": 0.10084449546411633, |
| "completion_length": 2847.395854949951, |
| "epoch": 0.4205714285714286, |
| "grad_norm": 0.004048696719110012, |
| "kl": 0.0005242824554443359, |
| "learning_rate": 2.8043938066798645e-07, |
| "loss": 0.0112, |
| "reward": 0.06646438379539177, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10084450151771307, |
| "rewards/cosine_scaled_reward": 0.03680053818970919, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 368 |
| }, |
| { |
| "advantage_max": 0.16597222117707133, |
| "advantage_mean": -3.1432137274911565e-09, |
| "advantage_min": -0.12302078539505601, |
| "advantage_std": 0.11180994007736444, |
| "completion_length": 2262.1666831970215, |
| "epoch": 0.4217142857142857, |
| "grad_norm": 0.0016202060505747795, |
| "kl": 0.00044993311166763306, |
| "learning_rate": 2.7793039831193133e-07, |
| "loss": 0.0003, |
| "reward": 0.13039299566298723, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1118099435698241, |
| "rewards/cosine_scaled_reward": 0.07433954149018973, |
| "rewards/format_reward": 0.6250000074505806, |
| "step": 369 |
| }, |
| { |
| "advantage_max": 0.15608058404177427, |
| "advantage_mean": 2.716357583310014e-09, |
| "advantage_min": -0.1327444650232792, |
| "advantage_std": 0.1269805277697742, |
| "completion_length": 3054.958396911621, |
| "epoch": 0.4228571428571429, |
| "grad_norm": 0.0026487144641578197, |
| "kl": 0.0006656236946582794, |
| "learning_rate": 2.7543467624442956e-07, |
| "loss": 0.0055, |
| "reward": 0.11274721287190914, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12698053661733866, |
| "rewards/cosine_scaled_reward": 0.14613067544996738, |
| "rewards/format_reward": 0.3750000037252903, |
| "step": 370 |
| }, |
| { |
| "advantage_max": 0.10794967133551836, |
| "advantage_mean": -2.173086016687975e-09, |
| "advantage_min": -0.05772289913147688, |
| "advantage_std": 0.06405279028695077, |
| "completion_length": 1609.895881652832, |
| "epoch": 0.424, |
| "grad_norm": 0.0015763145638629794, |
| "kl": 0.0003489851951599121, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0069, |
| "reward": 0.13482660567387938, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.06405279482714832, |
| "rewards/cosine_scaled_reward": 0.043605952989310026, |
| "rewards/format_reward": 0.7083333395421505, |
| "step": 371 |
| }, |
| { |
| "advantage_max": 0.17068721819669008, |
| "advantage_mean": -8.226682959855403e-09, |
| "advantage_min": -0.1903815222904086, |
| "advantage_std": 0.1448813541792333, |
| "completion_length": 2810.4583854675293, |
| "epoch": 0.42514285714285716, |
| "grad_norm": 0.0029529884923249483, |
| "kl": 0.0004370957612991333, |
| "learning_rate": 2.7048349887476037e-07, |
| "loss": 0.0065, |
| "reward": 0.1317643583752215, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14488135650753975, |
| "rewards/cosine_scaled_reward": 0.15046954539138824, |
| "rewards/format_reward": 0.47916667349636555, |
| "step": 372 |
| }, |
| { |
| "advantage_max": 0.12174444226548076, |
| "advantage_mean": -8.071462387349015e-09, |
| "advantage_min": -0.0992058515548706, |
| "advantage_std": 0.08604467427358031, |
| "completion_length": 1773.1250305175781, |
| "epoch": 0.42628571428571427, |
| "grad_norm": 0.001331821666099131, |
| "kl": 0.00028437376022338867, |
| "learning_rate": 2.6802828488599294e-07, |
| "loss": 0.0018, |
| "reward": 0.13919993431773037, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08604467613622546, |
| "rewards/cosine_scaled_reward": 0.0785412099212408, |
| "rewards/format_reward": 0.6666666716337204, |
| "step": 373 |
| }, |
| { |
| "advantage_max": 0.13440924789756536, |
| "advantage_mean": -6.208817238118058e-09, |
| "advantage_min": -0.19716255459934473, |
| "advantage_std": 0.13642028719186783, |
| "completion_length": 1887.979232788086, |
| "epoch": 0.42742857142857144, |
| "grad_norm": 0.002169976709410548, |
| "kl": 0.00032141804695129395, |
| "learning_rate": 2.655868138008171e-07, |
| "loss": 0.0044, |
| "reward": 0.17201172886416316, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1364202918484807, |
| "rewards/cosine_scaled_reward": 0.1637210724875331, |
| "rewards/format_reward": 0.6875000186264515, |
| "step": 374 |
| }, |
| { |
| "advantage_max": 0.12574104312807322, |
| "advantage_mean": -1.3193737075090084e-09, |
| "advantage_min": -0.16399358585476875, |
| "advantage_std": 0.12628946546465158, |
| "completion_length": 2755.687530517578, |
| "epoch": 0.42857142857142855, |
| "grad_norm": 0.0029920157976448536, |
| "kl": 0.0005879402160644531, |
| "learning_rate": 2.631592046130896e-07, |
| "loss": 0.0073, |
| "reward": 0.11453350447118282, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12628946779295802, |
| "rewards/cosine_scaled_reward": 0.14050849340856075, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 375 |
| }, |
| { |
| "advantage_max": 0.15478361072018743, |
| "advantage_mean": 3.8805110513795427e-10, |
| "advantage_min": -0.08757854904979467, |
| "advantage_std": 0.09614620194770396, |
| "completion_length": 2132.7084045410156, |
| "epoch": 0.4297142857142857, |
| "grad_norm": 0.001617670408450067, |
| "kl": 0.00033554062247276306, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": 0.001, |
| "reward": 0.11974846536759287, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09614620392676443, |
| "rewards/cosine_scaled_reward": 0.0007210008334368467, |
| "rewards/format_reward": 0.7083333414047956, |
| "step": 376 |
| }, |
| { |
| "advantage_max": 0.2501893350854516, |
| "advantage_mean": -1.552203920951456e-10, |
| "advantage_min": -0.15648298431187868, |
| "advantage_std": 0.16026539681479335, |
| "completion_length": 3177.5834350585938, |
| "epoch": 0.4308571428571429, |
| "grad_norm": 0.002923794789239764, |
| "kl": 0.0006504058837890625, |
| "learning_rate": 2.583460445215911e-07, |
| "loss": 0.0023, |
| "reward": 0.07427594714681618, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.16026540519669652, |
| "rewards/cosine_scaled_reward": -0.039964438416063786, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 377 |
| }, |
| { |
| "advantage_max": 0.18498766515403986, |
| "advantage_mean": -1.4435500295117976e-08, |
| "advantage_min": -0.18669047579169273, |
| "advantage_std": 0.14398845378309488, |
| "completion_length": 1991.2083549499512, |
| "epoch": 0.432, |
| "grad_norm": 0.0019245728617534041, |
| "kl": 0.0003337450325489044, |
| "learning_rate": 2.5596072820445254e-07, |
| "loss": 0.0059, |
| "reward": 0.14536084234714508, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14398845750838518, |
| "rewards/cosine_scaled_reward": 0.11685117019806057, |
| "rewards/format_reward": 0.6250000111758709, |
| "step": 378 |
| }, |
| { |
| "advantage_max": 0.2584569351747632, |
| "advantage_mean": -4.423782500040385e-09, |
| "advantage_min": -0.13952340185642242, |
| "advantage_std": 0.1552307652309537, |
| "completion_length": 2956.687530517578, |
| "epoch": 0.43314285714285716, |
| "grad_norm": 0.002495537744835019, |
| "kl": 0.0005144476890563965, |
| "learning_rate": 2.5358974294659373e-07, |
| "loss": 0.0077, |
| "reward": 0.023960275422723498, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1552307764068246, |
| "rewards/cosine_scaled_reward": -0.10624610353261232, |
| "rewards/format_reward": 0.35416667349636555, |
| "step": 379 |
| }, |
| { |
| "advantage_max": 0.19467550422996283, |
| "advantage_mean": -3.880510787701574e-09, |
| "advantage_min": -0.14030754379928112, |
| "advantage_std": 0.14547208277508616, |
| "completion_length": 2132.416748046875, |
| "epoch": 0.4342857142857143, |
| "grad_norm": 0.0023121978156268597, |
| "kl": 0.0005660057067871094, |
| "learning_rate": 2.512332043064913e-07, |
| "loss": -0.0004, |
| "reward": 0.13492502574808896, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14547208370640874, |
| "rewards/cosine_scaled_reward": 0.034852080047130585, |
| "rewards/format_reward": 0.7291666772216558, |
| "step": 380 |
| }, |
| { |
| "advantage_max": 0.1496616357471794, |
| "advantage_mean": -3.0267984368892442e-09, |
| "advantage_min": -0.18097604904323816, |
| "advantage_std": 0.13761152140796185, |
| "completion_length": 2775.8541984558105, |
| "epoch": 0.43542857142857144, |
| "grad_norm": 0.002602294785901904, |
| "kl": 0.0006687045097351074, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0076, |
| "reward": 0.17733385832980275, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13761152466759086, |
| "rewards/cosine_scaled_reward": 0.233534662052989, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 381 |
| }, |
| { |
| "advantage_max": 0.11240688525140285, |
| "advantage_mean": 9.313225607376907e-10, |
| "advantage_min": -0.13521632878109813, |
| "advantage_std": 0.10715258843265474, |
| "completion_length": 1760.1875305175781, |
| "epoch": 0.43657142857142855, |
| "grad_norm": 0.0019332170486450195, |
| "kl": 0.0004322826862335205, |
| "learning_rate": 2.465639255873246e-07, |
| "loss": 0.0041, |
| "reward": 0.12760076066479087, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10715258959680796, |
| "rewards/cosine_scaled_reward": -0.028475773753598332, |
| "rewards/format_reward": 0.8125000074505806, |
| "step": 382 |
| }, |
| { |
| "advantage_max": 0.21556610194966197, |
| "advantage_mean": -2.405916715852996e-09, |
| "advantage_min": -0.14847189001739025, |
| "advantage_std": 0.14524834416806698, |
| "completion_length": 2493.6041946411133, |
| "epoch": 0.4377142857142857, |
| "grad_norm": 0.0023037714418023825, |
| "kl": 0.0005925819277763367, |
| "learning_rate": 2.4425141308231765e-07, |
| "loss": 0.0052, |
| "reward": 0.12728559458628297, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14524835254997015, |
| "rewards/cosine_scaled_reward": 0.11582583468407393, |
| "rewards/format_reward": 0.5208333376795053, |
| "step": 383 |
| }, |
| { |
| "advantage_max": 0.1525868414901197, |
| "advantage_mean": 5.432715943776145e-10, |
| "advantage_min": -0.22444791439920664, |
| "advantage_std": 0.15500889671966434, |
| "completion_length": 2127.1250381469727, |
| "epoch": 0.43885714285714283, |
| "grad_norm": 0.003150203498080373, |
| "kl": 0.0004943348467350006, |
| "learning_rate": 2.4195380233209006e-07, |
| "loss": 0.005, |
| "reward": 0.27685068640857935, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.15500890417024493, |
| "rewards/cosine_scaled_reward": 0.45512693375349045, |
| "rewards/format_reward": 0.7291666753590107, |
| "step": 384 |
| }, |
| { |
| "advantage_max": 0.2526491954922676, |
| "advantage_mean": -3.259629136054265e-09, |
| "advantage_min": -0.16856408398598433, |
| "advantage_std": 0.17220128513872623, |
| "completion_length": 2676.3333892822266, |
| "epoch": 0.44, |
| "grad_norm": 0.0027297367341816425, |
| "kl": 0.0005096197128295898, |
| "learning_rate": 2.3967120531894857e-07, |
| "loss": 0.0084, |
| "reward": 0.08310869638808072, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.17220128886401653, |
| "rewards/cosine_scaled_reward": -0.046377929858863354, |
| "rewards/format_reward": 0.583333345130086, |
| "step": 385 |
| }, |
| { |
| "advantage_max": 0.077841951046139, |
| "advantage_mean": -2.1730861277102775e-09, |
| "advantage_min": -0.13375738728791475, |
| "advantage_std": 0.08136873878538609, |
| "completion_length": 2538.1458740234375, |
| "epoch": 0.44114285714285717, |
| "grad_norm": 0.0015308266738429666, |
| "kl": 0.0005868077278137207, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": 0.0024, |
| "reward": 0.08394389343447983, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08136874157935381, |
| "rewards/cosine_scaled_reward": -0.011836465448141098, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 386 |
| }, |
| { |
| "advantage_max": 0.16614294797182083, |
| "advantage_mean": -3.1044086884479682e-09, |
| "advantage_min": -0.12347709108144045, |
| "advantage_std": 0.1082666483707726, |
| "completion_length": 2941.6042098999023, |
| "epoch": 0.4422857142857143, |
| "grad_norm": 0.0023464104160666466, |
| "kl": 0.0006274953484535217, |
| "learning_rate": 2.3515149676898552e-07, |
| "loss": 0.0088, |
| "reward": 0.06053544546011835, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10826664976775646, |
| "rewards/cosine_scaled_reward": -0.009756912477314472, |
| "rewards/format_reward": 0.3750000037252903, |
| "step": 387 |
| }, |
| { |
| "advantage_max": 0.11136521841399372, |
| "advantage_mean": 4.3461721582760404e-09, |
| "advantage_min": -0.10910094575956464, |
| "advantage_std": 0.08894198993220925, |
| "completion_length": 2469.625015258789, |
| "epoch": 0.44342857142857145, |
| "grad_norm": 0.0017324852524325252, |
| "kl": 0.000505693256855011, |
| "learning_rate": 2.3291460551638237e-07, |
| "loss": 0.0039, |
| "reward": 0.07375043304637074, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.08894199575297534, |
| "rewards/cosine_scaled_reward": 0.010048863710835576, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 388 |
| }, |
| { |
| "advantage_max": 0.1969348695129156, |
| "advantage_mean": 3.880510079934396e-10, |
| "advantage_min": -0.140401273034513, |
| "advantage_std": 0.13098299829289317, |
| "completion_length": 2594.0000534057617, |
| "epoch": 0.44457142857142856, |
| "grad_norm": 0.002368423156440258, |
| "kl": 0.0004792213439941406, |
| "learning_rate": 2.306931685585657e-07, |
| "loss": 0.0015, |
| "reward": 0.04259849968366325, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1309830006211996, |
| "rewards/cosine_scaled_reward": -0.13581918645650148, |
| "rewards/format_reward": 0.5208333432674408, |
| "step": 389 |
| }, |
| { |
| "advantage_max": 0.11635044636204839, |
| "advantage_mean": -6.5192582443529545e-09, |
| "advantage_min": -0.16502864565700293, |
| "advantage_std": 0.1230549905449152, |
| "completion_length": 2135.9791870117188, |
| "epoch": 0.44571428571428573, |
| "grad_norm": 0.0020190367940813303, |
| "kl": 0.00028385967016220093, |
| "learning_rate": 2.2848729416523859e-07, |
| "loss": 0.0016, |
| "reward": 0.1558728562667966, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12305499846115708, |
| "rewards/cosine_scaled_reward": 0.1262847138568759, |
| "rewards/format_reward": 0.666666679084301, |
| "step": 390 |
| }, |
| { |
| "advantage_max": 0.20053921593353152, |
| "advantage_mean": -4.579002690907608e-09, |
| "advantage_min": -0.20975689589977264, |
| "advantage_std": 0.17516077309846878, |
| "completion_length": 2303.3959159851074, |
| "epoch": 0.44685714285714284, |
| "grad_norm": 0.0026405281387269497, |
| "kl": 0.0004959553480148315, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": 0.0077, |
| "reward": 0.19714019040111452, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1751607726328075, |
| "rewards/cosine_scaled_reward": 0.23575006239116192, |
| "rewards/format_reward": 0.6875000037252903, |
| "step": 391 |
| }, |
| { |
| "advantage_max": 0.10962881101295352, |
| "advantage_mean": -5.743155964688107e-09, |
| "advantage_min": -0.13240550691261888, |
| "advantage_std": 0.0982381934300065, |
| "completion_length": 1689.0208702087402, |
| "epoch": 0.448, |
| "grad_norm": 0.001231748261488974, |
| "kl": 0.00031384825706481934, |
| "learning_rate": 2.2412266235313973e-07, |
| "loss": -0.0001, |
| "reward": 0.16504641436040401, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09823819668963552, |
| "rewards/cosine_scaled_reward": 0.06864389590919018, |
| "rewards/format_reward": 0.8333333358168602, |
| "step": 392 |
| }, |
| { |
| "advantage_max": 0.21035106386989355, |
| "advantage_mean": -6.364037963280111e-09, |
| "advantage_min": -0.136757155880332, |
| "advantage_std": 0.136241948697716, |
| "completion_length": 2200.5625381469727, |
| "epoch": 0.4491428571428571, |
| "grad_norm": 0.0022684852592647076, |
| "kl": 0.0004363059997558594, |
| "learning_rate": 2.2196411766036487e-07, |
| "loss": 0.0052, |
| "reward": 0.11575184087269008, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.13624195754528046, |
| "rewards/cosine_scaled_reward": 0.007750899763777852, |
| "rewards/format_reward": 0.6666666679084301, |
| "step": 393 |
| }, |
| { |
| "advantage_max": 0.28035250771790743, |
| "advantage_mean": -1.4745940649096845e-09, |
| "advantage_min": -0.1694905385375023, |
| "advantage_std": 0.18372618919238448, |
| "completion_length": 2991.666717529297, |
| "epoch": 0.4502857142857143, |
| "grad_norm": 0.0034687870647758245, |
| "kl": 0.0006244778633117676, |
| "learning_rate": 2.1982156097370557e-07, |
| "loss": 0.0111, |
| "reward": 0.03293494783429196, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.18372619384899735, |
| "rewards/cosine_scaled_reward": -0.0796794897178188, |
| "rewards/format_reward": 0.35416666977107525, |
| "step": 394 |
| }, |
| { |
| "advantage_max": 0.1525165536440909, |
| "advantage_mean": -7.974449859129984e-09, |
| "advantage_min": -0.09334612678503618, |
| "advantage_std": 0.0937539076549001, |
| "completion_length": 1960.2083435058594, |
| "epoch": 0.4514285714285714, |
| "grad_norm": 0.001557901268824935, |
| "kl": 0.0003572404384613037, |
| "learning_rate": 2.1769509671835223e-07, |
| "loss": 0.0093, |
| "reward": 0.07403434929437935, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09375390998320654, |
| "rewards/cosine_scaled_reward": -0.06747329549398273, |
| "rewards/format_reward": 0.5625000018626451, |
| "step": 395 |
| }, |
| { |
| "advantage_max": 0.1718710558488965, |
| "advantage_mean": -3.1044083970144243e-10, |
| "advantage_min": -0.20201640482991934, |
| "advantage_std": 0.14323359774425626, |
| "completion_length": 2671.5208892822266, |
| "epoch": 0.45257142857142857, |
| "grad_norm": 0.0023834805469959974, |
| "kl": 0.0004636496305465698, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": 0.0118, |
| "reward": 0.1488352312007919, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1432336075231433, |
| "rewards/cosine_scaled_reward": 0.13565819896757603, |
| "rewards/format_reward": 0.6041666828095913, |
| "step": 396 |
| }, |
| { |
| "advantage_max": 0.1686799516901374, |
| "advantage_mean": -6.829699236710063e-09, |
| "advantage_min": -0.1410632198676467, |
| "advantage_std": 0.1290680062957108, |
| "completion_length": 2122.1250534057617, |
| "epoch": 0.45371428571428574, |
| "grad_norm": 0.002183465054258704, |
| "kl": 0.00043454766273498535, |
| "learning_rate": 2.134908592756607e-07, |
| "loss": 0.0065, |
| "reward": 0.16060965787619352, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12906800862401724, |
| "rewards/cosine_scaled_reward": 0.10085129458457232, |
| "rewards/format_reward": 0.7500000111758709, |
| "step": 397 |
| }, |
| { |
| "advantage_max": 0.16326016373932362, |
| "advantage_mean": -4.656613053488634e-09, |
| "advantage_min": -0.14716396015137434, |
| "advantage_std": 0.12772608175873756, |
| "completion_length": 2271.208381652832, |
| "epoch": 0.45485714285714285, |
| "grad_norm": 0.0022896837908774614, |
| "kl": 0.00048645585775375366, |
| "learning_rate": 2.1141329099692406e-07, |
| "loss": 0.0046, |
| "reward": 0.12706264690496027, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12772608385421336, |
| "rewards/cosine_scaled_reward": 0.041912979912012815, |
| "rewards/format_reward": 0.6666666809469461, |
| "step": 398 |
| }, |
| { |
| "advantage_max": 0.16285632457584143, |
| "advantage_mean": -5.355104866489047e-09, |
| "advantage_min": -0.16017363499850035, |
| "advantage_std": 0.12993760779500008, |
| "completion_length": 1922.708396911621, |
| "epoch": 0.456, |
| "grad_norm": 0.002509958343580365, |
| "kl": 0.0004304051399230957, |
| "learning_rate": 2.0935222495670968e-07, |
| "loss": 0.0073, |
| "reward": 0.18827908392995596, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1299376110546291, |
| "rewards/cosine_scaled_reward": 0.18050863035023212, |
| "rewards/format_reward": 0.7500000074505806, |
| "step": 399 |
| }, |
| { |
| "advantage_max": 0.17384717427194118, |
| "advantage_mean": -1.862645149230957e-09, |
| "advantage_min": -0.17707005143165588, |
| "advantage_std": 0.14918375061824918, |
| "completion_length": 1479.0833930969238, |
| "epoch": 0.45714285714285713, |
| "grad_norm": 0.002644736086949706, |
| "kl": 0.00038166344165802, |
| "learning_rate": 2.0730776160846853e-07, |
| "loss": 0.0081, |
| "reward": 0.2606083396822214, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.14918375457637012, |
| "rewards/cosine_scaled_reward": 0.32361013628542423, |
| "rewards/format_reward": 0.8958333358168602, |
| "step": 400 |
| }, |
| { |
| "advantage_max": 0.12657041382044554, |
| "advantage_mean": -1.901450281949213e-09, |
| "advantage_min": -0.11912647541612387, |
| "advantage_std": 0.10337408259510994, |
| "completion_length": 2812.875030517578, |
| "epoch": 0.4582857142857143, |
| "grad_norm": 0.0014676002319902182, |
| "kl": 0.0006059408187866211, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0035, |
| "reward": 0.019788147183135152, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10337408352643251, |
| "rewards/cosine_scaled_reward": -0.10925112292170525, |
| "rewards/format_reward": 0.3333333358168602, |
| "step": 401 |
| }, |
| { |
| "advantage_max": 0.12366076093167067, |
| "advantage_mean": -2.0178656801039807e-09, |
| "advantage_min": -0.1171658206731081, |
| "advantage_std": 0.09674888919107616, |
| "completion_length": 2147.979179382324, |
| "epoch": 0.4594285714285714, |
| "grad_norm": 0.001693824538961053, |
| "kl": 0.00046128034591674805, |
| "learning_rate": 2.032690407508949e-07, |
| "loss": 0.0004, |
| "reward": 0.09151852503418922, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09674889012239873, |
| "rewards/cosine_scaled_reward": -0.03256989782676101, |
| "rewards/format_reward": 0.6041666716337204, |
| "step": 402 |
| }, |
| { |
| "advantage_max": 0.08860299317166209, |
| "advantage_mean": -5.277494774524882e-09, |
| "advantage_min": -0.13536688731983304, |
| "advantage_std": 0.08131288702134043, |
| "completion_length": 1687.0208473205566, |
| "epoch": 0.4605714285714286, |
| "grad_norm": 0.001448645954951644, |
| "kl": 0.0003199884667992592, |
| "learning_rate": 2.0127498008311922e-07, |
| "loss": -0.0013, |
| "reward": 0.17468450870364904, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08131289132870734, |
| "rewards/cosine_scaled_reward": 0.08974489662796259, |
| "rewards/format_reward": 0.8541666716337204, |
| "step": 403 |
| }, |
| { |
| "advantage_max": 0.1484815194271505, |
| "advantage_mean": -2.949188129819369e-09, |
| "advantage_min": -0.14216615236364305, |
| "advantage_std": 0.11517497175373137, |
| "completion_length": 2319.9791984558105, |
| "epoch": 0.4617142857142857, |
| "grad_norm": 0.002166890539228916, |
| "kl": 0.00048048049211502075, |
| "learning_rate": 1.9929791578083655e-07, |
| "loss": 0.0118, |
| "reward": 0.09523998136864975, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11517497431486845, |
| "rewards/cosine_scaled_reward": 0.029446275904774666, |
| "rewards/format_reward": 0.5000000074505806, |
| "step": 404 |
| }, |
| { |
| "advantage_max": 0.18142827786505222, |
| "advantage_mean": -8.537124618346326e-10, |
| "advantage_min": -0.14328955672681332, |
| "advantage_std": 0.12373507604934275, |
| "completion_length": 2082.166679382324, |
| "epoch": 0.46285714285714286, |
| "grad_norm": 0.0020297281444072723, |
| "kl": 0.0005799531936645508, |
| "learning_rate": 1.9733794420337213e-07, |
| "loss": 0.0031, |
| "reward": 0.1628161850385368, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12373507721349597, |
| "rewards/cosine_scaled_reward": 0.19531266274861991, |
| "rewards/format_reward": 0.5625000018626451, |
| "step": 405 |
| }, |
| { |
| "advantage_max": 0.18078680709004402, |
| "advantage_mean": -8.071462574699151e-09, |
| "advantage_min": -0.18061870522797108, |
| "advantage_std": 0.14956936822272837, |
| "completion_length": 1885.7500534057617, |
| "epoch": 0.464, |
| "grad_norm": 0.002431582659482956, |
| "kl": 0.0005524754524230957, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": 0.003, |
| "reward": 0.1681446279399097, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14956937148235738, |
| "rewards/cosine_scaled_reward": 0.10558001510798931, |
| "rewards/format_reward": 0.7708333432674408, |
| "step": 406 |
| }, |
| { |
| "advantage_max": 0.13299659185577184, |
| "advantage_mean": -9.924406620294424e-09, |
| "advantage_min": -0.1531422910047695, |
| "advantage_std": 0.12285241187782958, |
| "completion_length": 2295.9166870117188, |
| "epoch": 0.46514285714285714, |
| "grad_norm": 0.0032989894971251488, |
| "kl": 0.0004259645938873291, |
| "learning_rate": 1.934696604901642e-07, |
| "loss": 0.0124, |
| "reward": 0.12349864930001786, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12285241659265012, |
| "rewards/cosine_scaled_reward": 0.05015348456799984, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 407 |
| }, |
| { |
| "advantage_max": 0.16066291369497776, |
| "advantage_mean": -6.053597137456457e-09, |
| "advantage_min": -0.15431132726371288, |
| "advantage_std": 0.13571413152385503, |
| "completion_length": 2232.2708473205566, |
| "epoch": 0.4662857142857143, |
| "grad_norm": 0.002392456866800785, |
| "kl": 0.0004053637385368347, |
| "learning_rate": 1.915615368891117e-07, |
| "loss": 0.0115, |
| "reward": 0.1337297521531582, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.13571413629688323, |
| "rewards/cosine_scaled_reward": 0.1021328023634851, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 408 |
| }, |
| { |
| "advantage_max": 0.16096539422869682, |
| "advantage_mean": -6.984920142283357e-10, |
| "advantage_min": -0.13622304517775774, |
| "advantage_std": 0.13016468053683639, |
| "completion_length": 3345.8333740234375, |
| "epoch": 0.4674285714285714, |
| "grad_norm": 0.0023022436071187258, |
| "kl": 0.0006246566772460938, |
| "learning_rate": 1.8967088307307e-07, |
| "loss": 0.0106, |
| "reward": 0.05926420073956251, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.1301646833308041, |
| "rewards/cosine_scaled_reward": 0.00888601504266262, |
| "rewards/format_reward": 0.3333333395421505, |
| "step": 409 |
| }, |
| { |
| "advantage_max": 0.16869010031223297, |
| "advantage_mean": -1.2029583600081661e-08, |
| "advantage_min": -0.14472126122564077, |
| "advantage_std": 0.1374293458648026, |
| "completion_length": 2234.250057220459, |
| "epoch": 0.4685714285714286, |
| "grad_norm": 0.0026337350718677044, |
| "kl": 0.00045359134674072266, |
| "learning_rate": 1.8779779118983867e-07, |
| "loss": 0.0022, |
| "reward": 0.15104275988414884, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.13742934702895582, |
| "rewards/cosine_scaled_reward": 0.11242910474538803, |
| "rewards/format_reward": 0.6666666716337204, |
| "step": 410 |
| }, |
| { |
| "advantage_max": 0.14711505104787648, |
| "advantage_mean": 2.890980579928204e-09, |
| "advantage_min": -0.1517161400988698, |
| "advantage_std": 0.12216609879396856, |
| "completion_length": 2739.750026702881, |
| "epoch": 0.4697142857142857, |
| "grad_norm": 0.0020477049984037876, |
| "kl": 0.0005456209182739258, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.0069, |
| "reward": 0.0763370256536291, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12216610834002495, |
| "rewards/cosine_scaled_reward": 0.004631456453353167, |
| "rewards/format_reward": 0.4375000037252903, |
| "step": 411 |
| }, |
| { |
| "advantage_max": 0.1390043180435896, |
| "advantage_mean": -7.450580596923828e-09, |
| "advantage_min": -0.16988389380276203, |
| "advantage_std": 0.12418814655393362, |
| "completion_length": 2551.3125534057617, |
| "epoch": 0.47085714285714286, |
| "grad_norm": 0.002181727671995759, |
| "kl": 0.0006371736526489258, |
| "learning_rate": 1.8410465752883758e-07, |
| "loss": 0.0008, |
| "reward": 0.1608741357922554, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12418815260753036, |
| "rewards/cosine_scaled_reward": 0.1822793409228325, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 412 |
| }, |
| { |
| "advantage_max": 0.18585596792399883, |
| "advantage_mean": 3.2596291082986895e-09, |
| "advantage_min": -0.20336279086768627, |
| "advantage_std": 0.15997914131730795, |
| "completion_length": 2291.4375534057617, |
| "epoch": 0.472, |
| "grad_norm": 0.0022269051987677813, |
| "kl": 0.0003896765410900116, |
| "learning_rate": 1.822847957491922e-07, |
| "loss": 0.0019, |
| "reward": 0.13341048173606396, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15997914737090468, |
| "rewards/cosine_scaled_reward": 0.10117785283364356, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 413 |
| }, |
| { |
| "advantage_max": 0.23772612866014242, |
| "advantage_mean": -2.79396782099095e-09, |
| "advantage_min": -0.20109373703598976, |
| "advantage_std": 0.18670698534697294, |
| "completion_length": 2825.8750610351562, |
| "epoch": 0.47314285714285714, |
| "grad_norm": 0.0031252303160727024, |
| "kl": 0.00036650896072387695, |
| "learning_rate": 1.804828558898332e-07, |
| "loss": 0.0021, |
| "reward": 0.09702186938375235, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.18670698441565037, |
| "rewards/cosine_scaled_reward": 0.015147236525081098, |
| "rewards/format_reward": 0.5416666753590107, |
| "step": 414 |
| }, |
| { |
| "advantage_max": 0.1667441390454769, |
| "advantage_mean": 1.3969839451899446e-09, |
| "advantage_min": -0.10517374519258738, |
| "advantage_std": 0.11341422703117132, |
| "completion_length": 3224.6458435058594, |
| "epoch": 0.4742857142857143, |
| "grad_norm": 0.00234918761998415, |
| "kl": 0.0006085634231567383, |
| "learning_rate": 1.7869892577476722e-07, |
| "loss": 0.0023, |
| "reward": -0.02207179879769683, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11341423215344548, |
| "rewards/cosine_scaled_reward": -0.16945138771552593, |
| "rewards/format_reward": 0.2083333358168602, |
| "step": 415 |
| }, |
| { |
| "advantage_max": 0.19994694832712412, |
| "advantage_mean": -7.217750140620094e-09, |
| "advantage_min": -0.160794363822788, |
| "advantage_std": 0.13797596981748939, |
| "completion_length": 1336.5208473205566, |
| "epoch": 0.4754285714285714, |
| "grad_norm": 0.0012100542662665248, |
| "kl": 0.00020557385869324207, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": -0.0004, |
| "reward": 0.16311359032988548, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.13797597540542483, |
| "rewards/cosine_scaled_reward": 0.020437828032299876, |
| "rewards/format_reward": 0.9166666679084301, |
| "step": 416 |
| }, |
| { |
| "advantage_max": 0.06850483757443726, |
| "advantage_mean": -1.8626452047421083e-09, |
| "advantage_min": -0.09264840371906757, |
| "advantage_std": 0.06786012463271618, |
| "completion_length": 2991.604202270508, |
| "epoch": 0.4765714285714286, |
| "grad_norm": 0.001323464559391141, |
| "kl": 0.0007146112620830536, |
| "learning_rate": 1.7518544168045524e-07, |
| "loss": -0.0018, |
| "reward": 0.07695972826331854, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.06786012649536133, |
| "rewards/cosine_scaled_reward": 0.037449623458087444, |
| "rewards/format_reward": 0.375, |
| "step": 417 |
| }, |
| { |
| "advantage_max": 0.08946323348209262, |
| "advantage_mean": 1.5522044760629683e-10, |
| "advantage_min": -0.08228053990751505, |
| "advantage_std": 0.06771091069094837, |
| "completion_length": 2153.7708587646484, |
| "epoch": 0.4777142857142857, |
| "grad_norm": 0.0015419662231579423, |
| "kl": 0.000576341524720192, |
| "learning_rate": 1.7345605894346726e-07, |
| "loss": 0.0038, |
| "reward": 0.12136844790074974, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.06771091185510159, |
| "rewards/cosine_scaled_reward": 0.05540352314710617, |
| "rewards/format_reward": 0.6041666679084301, |
| "step": 418 |
| }, |
| { |
| "advantage_max": 0.09467934165149927, |
| "advantage_mean": -8.071462193059986e-09, |
| "advantage_min": -0.13740856852382421, |
| "advantage_std": 0.09112847782671452, |
| "completion_length": 2377.7708587646484, |
| "epoch": 0.47885714285714287, |
| "grad_norm": 0.0011218616273254156, |
| "kl": 0.00043688714504241943, |
| "learning_rate": 1.7174502842694212e-07, |
| "loss": 0.0027, |
| "reward": 0.10573733225464821, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09112848108634353, |
| "rewards/cosine_scaled_reward": 0.051418233662843704, |
| "rewards/format_reward": 0.520833333954215, |
| "step": 419 |
| }, |
| { |
| "advantage_max": 0.13071657810360193, |
| "advantage_mean": -3.1044085913034536e-09, |
| "advantage_min": -0.1421688578557223, |
| "advantage_std": 0.1184294882696122, |
| "completion_length": 1442.6667213439941, |
| "epoch": 0.48, |
| "grad_norm": 0.001580077805556357, |
| "kl": 0.0002828836441040039, |
| "learning_rate": 1.7005243352409333e-07, |
| "loss": 0.0055, |
| "reward": 0.0912556970724836, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11842949106357992, |
| "rewards/cosine_scaled_reward": -0.1258711563423276, |
| "rewards/format_reward": 0.7916666772216558, |
| "step": 420 |
| }, |
| { |
| "advantage_max": 0.23604473238810897, |
| "advantage_mean": -3.880510801579362e-09, |
| "advantage_min": -0.19979441072791815, |
| "advantage_std": 0.17005935590714216, |
| "completion_length": 2882.000030517578, |
| "epoch": 0.48114285714285715, |
| "grad_norm": 0.0032496661879122257, |
| "kl": 0.0005581378936767578, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": 0.0113, |
| "reward": 0.06888013612478971, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.17005936661735177, |
| "rewards/cosine_scaled_reward": 0.015190824400633574, |
| "rewards/format_reward": 0.37500001303851604, |
| "step": 421 |
| }, |
| { |
| "advantage_max": 0.13127990905195475, |
| "advantage_mean": -2.3283065475609988e-09, |
| "advantage_min": -0.08840413391590118, |
| "advantage_std": 0.08595326798968017, |
| "completion_length": 2885.041717529297, |
| "epoch": 0.48228571428571426, |
| "grad_norm": 0.001612317399121821, |
| "kl": 0.0006029903888702393, |
| "learning_rate": 1.6672287963562852e-07, |
| "loss": 0.0004, |
| "reward": 0.02790595730766654, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08595326961949468, |
| "rewards/cosine_scaled_reward": -0.08409846760332584, |
| "rewards/format_reward": 0.33333334140479565, |
| "step": 422 |
| }, |
| { |
| "advantage_max": 0.18713722238317132, |
| "advantage_mean": -4.423782389018083e-09, |
| "advantage_min": -0.15640555322170258, |
| "advantage_std": 0.1455209826817736, |
| "completion_length": 3003.3958892822266, |
| "epoch": 0.48342857142857143, |
| "grad_norm": 0.002777996240183711, |
| "kl": 0.0006363391876220703, |
| "learning_rate": 1.6508608292777203e-07, |
| "loss": 0.0054, |
| "reward": 0.07326357485726476, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14552099234424531, |
| "rewards/cosine_scaled_reward": 0.03013175167143345, |
| "rewards/format_reward": 0.37500000931322575, |
| "step": 423 |
| }, |
| { |
| "advantage_max": 0.19879988790489733, |
| "advantage_mean": -1.552203920951456e-10, |
| "advantage_min": -0.15926985908299685, |
| "advantage_std": 0.15668703592382371, |
| "completion_length": 2868.604217529297, |
| "epoch": 0.4845714285714286, |
| "grad_norm": 0.0034800267312675714, |
| "kl": 0.0005161762237548828, |
| "learning_rate": 1.6346804638120098e-07, |
| "loss": 0.0092, |
| "reward": 0.057857689214870334, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.15668703895062208, |
| "rewards/cosine_scaled_reward": -0.038518927060067654, |
| "rewards/format_reward": 0.4166666753590107, |
| "step": 424 |
| }, |
| { |
| "advantage_max": 0.16558178514242172, |
| "advantage_mean": -1.3038516322172455e-08, |
| "advantage_min": -0.19483254104852676, |
| "advantage_std": 0.13980256451759487, |
| "completion_length": 1757.3750305175781, |
| "epoch": 0.4857142857142857, |
| "grad_norm": 0.0022031443659216166, |
| "kl": 0.00030015595257282257, |
| "learning_rate": 1.6186884885673413e-07, |
| "loss": 0.0071, |
| "reward": 0.28972805850207806, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1398025705711916, |
| "rewards/cosine_scaled_reward": 0.43010698398575187, |
| "rewards/format_reward": 0.8541666753590107, |
| "step": 425 |
| }, |
| { |
| "advantage_max": 0.12429485190659761, |
| "advantage_mean": 2.949188268597247e-09, |
| "advantage_min": -0.12924165464937687, |
| "advantage_std": 0.10551139246672392, |
| "completion_length": 2043.2083625793457, |
| "epoch": 0.4868571428571429, |
| "grad_norm": 0.0011740243062376976, |
| "kl": 0.00042065978050231934, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": -0.0014, |
| "reward": 0.13536178693175316, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10551139432936907, |
| "rewards/cosine_scaled_reward": 0.11793831922113895, |
| "rewards/format_reward": 0.5625000149011612, |
| "step": 426 |
| }, |
| { |
| "advantage_max": 0.09893368650227785, |
| "advantage_mean": -6.674478740531509e-09, |
| "advantage_min": -0.11052933987230062, |
| "advantage_std": 0.08068146975710988, |
| "completion_length": 3011.9583435058594, |
| "epoch": 0.488, |
| "grad_norm": 0.0013815592974424362, |
| "kl": 0.00067901611328125, |
| "learning_rate": 1.5872728172265146e-07, |
| "loss": 0.001, |
| "reward": 0.04093674477189779, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08068147208541632, |
| "rewards/cosine_scaled_reward": -0.035097976215183735, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 427 |
| }, |
| { |
| "advantage_max": 0.22580352891236544, |
| "advantage_mean": -3.531264877731921e-09, |
| "advantage_min": -0.23063123784959316, |
| "advantage_std": 0.18543015886098146, |
| "completion_length": 2163.0625381469727, |
| "epoch": 0.48914285714285716, |
| "grad_norm": 0.0028671131003648043, |
| "kl": 0.0004477202892303467, |
| "learning_rate": 1.5718506522858572e-07, |
| "loss": 0.0143, |
| "reward": 0.1508345203474164, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1854301649145782, |
| "rewards/cosine_scaled_reward": 0.12118915654718876, |
| "rewards/format_reward": 0.6458333414047956, |
| "step": 428 |
| }, |
| { |
| "advantage_max": 0.11988749261945486, |
| "advantage_mean": -2.7939678903798892e-09, |
| "advantage_min": -0.14586800010874867, |
| "advantage_std": 0.11273562628775835, |
| "completion_length": 1969.0833625793457, |
| "epoch": 0.49028571428571427, |
| "grad_norm": 0.0015224118251353502, |
| "kl": 0.000494047999382019, |
| "learning_rate": 1.5566199398026147e-07, |
| "loss": 0.0048, |
| "reward": 0.10409643454477191, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1127356318756938, |
| "rewards/cosine_scaled_reward": -0.03846575319766998, |
| "rewards/format_reward": 0.6875000074505806, |
| "step": 429 |
| }, |
| { |
| "advantage_max": 0.1451888126321137, |
| "advantage_mean": -4.423782340445825e-09, |
| "advantage_min": -0.1432971404865384, |
| "advantage_std": 0.10751870181411505, |
| "completion_length": 2194.375030517578, |
| "epoch": 0.49142857142857144, |
| "grad_norm": 0.0018724583787843585, |
| "kl": 0.0004582032561302185, |
| "learning_rate": 1.5415814221002265e-07, |
| "loss": 0.0069, |
| "reward": 0.13761766906827688, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10751870553940535, |
| "rewards/cosine_scaled_reward": 0.09085702430456877, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 430 |
| }, |
| { |
| "advantage_max": 0.15991243068128824, |
| "advantage_mean": -2.949188282475035e-09, |
| "advantage_min": -0.15987397450953722, |
| "advantage_std": 0.126675630453974, |
| "completion_length": 2091.020851135254, |
| "epoch": 0.49257142857142855, |
| "grad_norm": 0.0018920974107459188, |
| "kl": 0.0006821155548095703, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": 0.0019, |
| "reward": 0.08629386406391859, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12667563511058688, |
| "rewards/cosine_scaled_reward": -0.03934409748762846, |
| "rewards/format_reward": 0.5833333414047956, |
| "step": 431 |
| }, |
| { |
| "advantage_max": 0.22554702498018742, |
| "advantage_mean": -4.3461720750093136e-09, |
| "advantage_min": -0.2308659916743636, |
| "advantage_std": 0.18240982363931835, |
| "completion_length": 2816.7083892822266, |
| "epoch": 0.4937142857142857, |
| "grad_norm": 0.003146486124023795, |
| "kl": 0.00060272216796875, |
| "learning_rate": 1.5120838934595337e-07, |
| "loss": 0.0022, |
| "reward": 0.1152828261256218, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.18240981781855226, |
| "rewards/cosine_scaled_reward": 0.09906804747879505, |
| "rewards/format_reward": 0.47916668094694614, |
| "step": 432 |
| }, |
| { |
| "advantage_max": 0.12062860745936632, |
| "advantage_mean": -5.452117637569365e-09, |
| "advantage_min": -0.1215288108214736, |
| "advantage_std": 0.09663968626409769, |
| "completion_length": 2868.7916679382324, |
| "epoch": 0.4948571428571429, |
| "grad_norm": 0.0015883512096479535, |
| "kl": 0.0004881918430328369, |
| "learning_rate": 1.4976263201891613e-07, |
| "loss": 0.0015, |
| "reward": 0.06807709392160177, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09663968672975898, |
| "rewards/cosine_scaled_reward": 0.023668689653277397, |
| "rewards/format_reward": 0.3541666753590107, |
| "step": 433 |
| }, |
| { |
| "advantage_max": 0.14515302376821637, |
| "advantage_mean": -2.0954758137015084e-09, |
| "advantage_min": -0.09941379074007273, |
| "advantage_std": 0.0925137703306973, |
| "completion_length": 2770.3542098999023, |
| "epoch": 0.496, |
| "grad_norm": 0.0017540472326800227, |
| "kl": 0.000606052577495575, |
| "learning_rate": 1.483363816965435e-07, |
| "loss": 0.0031, |
| "reward": -0.016186986584216356, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09251376986503601, |
| "rewards/cosine_scaled_reward": -0.22496083891019225, |
| "rewards/format_reward": 0.35416667349636555, |
| "step": 434 |
| }, |
| { |
| "advantage_max": 0.11189865134656429, |
| "advantage_mean": -4.03573130469681e-09, |
| "advantage_min": -0.08333402825519443, |
| "advantage_std": 0.076182265591342, |
| "completion_length": 2093.437530517578, |
| "epoch": 0.49714285714285716, |
| "grad_norm": 0.0017215419793501496, |
| "kl": 0.0007574558258056641, |
| "learning_rate": 1.469297078922642e-07, |
| "loss": 0.0039, |
| "reward": 0.052155050449073315, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.07618226658087224, |
| "rewards/cosine_scaled_reward": -0.11770580988377333, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 435 |
| }, |
| { |
| "advantage_max": 0.17040221765637398, |
| "advantage_mean": -1.552203920951456e-10, |
| "advantage_min": -0.15949644800275564, |
| "advantage_std": 0.12988637061789632, |
| "completion_length": 1972.1458778381348, |
| "epoch": 0.4982857142857143, |
| "grad_norm": 0.0017543588764965534, |
| "kl": 0.0003309226594865322, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": 0.002, |
| "reward": 0.14922187570482492, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12988637015223503, |
| "rewards/cosine_scaled_reward": 0.1184953460469842, |
| "rewards/format_reward": 0.6458333358168602, |
| "step": 436 |
| }, |
| { |
| "advantage_max": 0.06821496458724141, |
| "advantage_mean": 5.820767028097418e-10, |
| "advantage_min": -0.08236566046252847, |
| "advantage_std": 0.05960237327963114, |
| "completion_length": 2729.250015258789, |
| "epoch": 0.49942857142857144, |
| "grad_norm": 0.0011862348765134811, |
| "kl": 0.0005130767822265625, |
| "learning_rate": 1.4417536311769885e-07, |
| "loss": 0.0057, |
| "reward": 0.05111697223037481, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.059602373745292425, |
| "rewards/cosine_scaled_reward": -0.08793694153428078, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 437 |
| }, |
| { |
| "advantage_max": 0.17163235694169998, |
| "advantage_mean": -3.104409923571083e-10, |
| "advantage_min": -0.21905045211315155, |
| "advantage_std": 0.15708328178152442, |
| "completion_length": 2668.937545776367, |
| "epoch": 0.5005714285714286, |
| "grad_norm": 0.0025618094950914383, |
| "kl": 0.0005084872245788574, |
| "learning_rate": 1.4282782639029128e-07, |
| "loss": -0.003, |
| "reward": 0.08607154805213213, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15708328364416957, |
| "rewards/cosine_scaled_reward": -0.00721331313252449, |
| "rewards/format_reward": 0.5208333488553762, |
| "step": 438 |
| }, |
| { |
| "advantage_max": 0.1785668469965458, |
| "advantage_mean": -3.4148494171271082e-09, |
| "advantage_min": -0.18488147668540478, |
| "advantage_std": 0.1533331573009491, |
| "completion_length": 2452.5208740234375, |
| "epoch": 0.5017142857142857, |
| "grad_norm": 0.0026862856466323137, |
| "kl": 0.0005292594432830811, |
| "learning_rate": 1.4150013466019114e-07, |
| "loss": 0.0009, |
| "reward": 0.07428326783701777, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.15333315962925553, |
| "rewards/cosine_scaled_reward": -0.029012007638812065, |
| "rewards/format_reward": 0.5000000167638063, |
| "step": 439 |
| }, |
| { |
| "advantage_max": 0.14857979817315936, |
| "advantage_mean": 6.208816794028849e-10, |
| "advantage_min": -0.09883663896471262, |
| "advantage_std": 0.09249451011419296, |
| "completion_length": 2944.875030517578, |
| "epoch": 0.5028571428571429, |
| "grad_norm": 0.001710064709186554, |
| "kl": 0.0007870197296142578, |
| "learning_rate": 1.4019235263722034e-07, |
| "loss": 0.0032, |
| "reward": -0.016045190238628493, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09249451290816069, |
| "rewards/cosine_scaled_reward": -0.17368111293762922, |
| "rewards/format_reward": 0.2500000037252903, |
| "step": 440 |
| }, |
| { |
| "advantage_max": 0.1465267646126449, |
| "advantage_mean": 2.3283067834833915e-10, |
| "advantage_min": -0.12699715700000525, |
| "advantage_std": 0.11076962715014815, |
| "completion_length": 2901.958354949951, |
| "epoch": 0.504, |
| "grad_norm": 0.0018061235314235091, |
| "kl": 0.0007097125053405762, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": 0.0036, |
| "reward": 0.06491660978645086, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11076962575316429, |
| "rewards/cosine_scaled_reward": 0.0063933562487363815, |
| "rewards/format_reward": 0.3750000037252903, |
| "step": 441 |
| }, |
| { |
| "advantage_max": 0.17049752548336983, |
| "advantage_mean": 3.8805109126016646e-09, |
| "advantage_min": -0.11352705024182796, |
| "advantage_std": 0.118473204318434, |
| "completion_length": 2633.6250076293945, |
| "epoch": 0.5051428571428571, |
| "grad_norm": 0.001908893813379109, |
| "kl": 0.0005713105201721191, |
| "learning_rate": 1.3763677169699217e-07, |
| "loss": 0.0013, |
| "reward": 0.07302698490093462, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1184732080437243, |
| "rewards/cosine_scaled_reward": -0.023679533042013645, |
| "rewards/format_reward": 0.4791666679084301, |
| "step": 442 |
| }, |
| { |
| "advantage_max": 0.15183980716392398, |
| "advantage_mean": -3.0267986173004857e-09, |
| "advantage_min": -0.1316932663321495, |
| "advantage_std": 0.11646312335506082, |
| "completion_length": 2931.5000534057617, |
| "epoch": 0.5062857142857143, |
| "grad_norm": 0.0020145962480455637, |
| "kl": 0.00047835707664489746, |
| "learning_rate": 1.3638909733514452e-07, |
| "loss": 0.0068, |
| "reward": 0.06958311138441786, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11646311962977052, |
| "rewards/cosine_scaled_reward": -0.002327965572476387, |
| "rewards/format_reward": 0.4166666753590107, |
| "step": 443 |
| }, |
| { |
| "advantage_max": 0.11418756144121289, |
| "advantage_mean": 3.88050980237864e-11, |
| "advantage_min": -0.09134344570338726, |
| "advantage_std": 0.08608054695650935, |
| "completion_length": 2848.3750076293945, |
| "epoch": 0.5074285714285715, |
| "grad_norm": 0.0012016425607725978, |
| "kl": 0.0005184710025787354, |
| "learning_rate": 1.351615817851748e-07, |
| "loss": -0.0004, |
| "reward": -0.03174476232379675, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08608055114746094, |
| "rewards/cosine_scaled_reward": -0.21938861906528473, |
| "rewards/format_reward": 0.25000000558793545, |
| "step": 444 |
| }, |
| { |
| "advantage_max": 0.1748210177756846, |
| "advantage_mean": -2.8715780586718864e-09, |
| "advantage_min": -0.14708452578634024, |
| "advantage_std": 0.12430504383519292, |
| "completion_length": 2911.7917098999023, |
| "epoch": 0.5085714285714286, |
| "grad_norm": 0.0020126677118241787, |
| "kl": 0.0006580352783203125, |
| "learning_rate": 1.3395428487445914e-07, |
| "loss": 0.0046, |
| "reward": 0.10539004136808217, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1243050447665155, |
| "rewards/cosine_scaled_reward": 0.07351269014179707, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 445 |
| }, |
| { |
| "advantage_max": 0.2272405456751585, |
| "advantage_mean": -3.1820188706177532e-09, |
| "advantage_min": -0.20155336987227201, |
| "advantage_std": 0.17020691372454166, |
| "completion_length": 2772.3750381469727, |
| "epoch": 0.5097142857142857, |
| "grad_norm": 0.0035501238889992237, |
| "kl": 0.0005533397197723389, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": 0.0091, |
| "reward": 0.06983472930733114, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1702069230377674, |
| "rewards/cosine_scaled_reward": -0.054740124847739935, |
| "rewards/format_reward": 0.5208333488553762, |
| "step": 446 |
| }, |
| { |
| "advantage_max": 0.1513710436411202, |
| "advantage_mean": -2.63874733175129e-09, |
| "advantage_min": -0.11578180687502027, |
| "advantage_std": 0.12062532501295209, |
| "completion_length": 1653.7708473205566, |
| "epoch": 0.5108571428571429, |
| "grad_norm": 0.002002675784751773, |
| "kl": 0.00044733285903930664, |
| "learning_rate": 1.316005813502869e-07, |
| "loss": 0.0057, |
| "reward": 0.07071918109431863, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12062532640993595, |
| "rewards/cosine_scaled_reward": -0.19742719386704266, |
| "rewards/format_reward": 0.8125000055879354, |
| "step": 447 |
| }, |
| { |
| "advantage_max": 0.12252536416053772, |
| "advantage_mean": -9.002785267275826e-09, |
| "advantage_min": -0.112825533375144, |
| "advantage_std": 0.08908456144854426, |
| "completion_length": 1953.8333549499512, |
| "epoch": 0.512, |
| "grad_norm": 0.0019004471832886338, |
| "kl": 0.000409543514251709, |
| "learning_rate": 1.3045428945301953e-07, |
| "loss": 0.005, |
| "reward": 0.09030471183359623, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08908456075005233, |
| "rewards/cosine_scaled_reward": -0.035713573917746544, |
| "rewards/format_reward": 0.6041666716337204, |
| "step": 448 |
| }, |
| { |
| "advantage_max": 0.0966623155400157, |
| "advantage_mean": -2.328305812038245e-10, |
| "advantage_min": -0.08400777820497751, |
| "advantage_std": 0.06731040589511395, |
| "completion_length": 2492.791675567627, |
| "epoch": 0.5131428571428571, |
| "grad_norm": 0.0013465734664350748, |
| "kl": 0.0005087852478027344, |
| "learning_rate": 1.2932844562179352e-07, |
| "loss": 0.0014, |
| "reward": 0.014899131376296282, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.06731040822342038, |
| "rewards/cosine_scaled_reward": -0.16597000509500504, |
| "rewards/format_reward": 0.4166666679084301, |
| "step": 449 |
| }, |
| { |
| "advantage_max": 0.06399380508810282, |
| "advantage_mean": 2.250696351513426e-09, |
| "advantage_min": -0.08506503142416477, |
| "advantage_std": 0.057404838502407074, |
| "completion_length": 2388.562515258789, |
| "epoch": 0.5142857142857142, |
| "grad_norm": 0.0008139196434058249, |
| "kl": 0.00040439143776893616, |
| "learning_rate": 1.2822310472864885e-07, |
| "loss": 0.0015, |
| "reward": 0.07429015543311834, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.05740483803674579, |
| "rewards/cosine_scaled_reward": 0.00132070854306221, |
| "rewards/format_reward": 0.4375, |
| "step": 450 |
| }, |
| { |
| "advantage_max": 0.07913245167583227, |
| "advantage_mean": -3.8805106350459084e-09, |
| "advantage_min": -0.06130435457453132, |
| "advantage_std": 0.05127688334323466, |
| "completion_length": 2407.750030517578, |
| "epoch": 0.5154285714285715, |
| "grad_norm": 0.0008824392571114004, |
| "kl": 0.000555187463760376, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": 0.0027, |
| "reward": 0.0942617341352161, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.051276884973049164, |
| "rewards/cosine_scaled_reward": 0.016264691948890686, |
| "rewards/format_reward": 0.5208333376795053, |
| "step": 451 |
| }, |
| { |
| "advantage_max": 0.20604290487244725, |
| "advantage_mean": -5.122274437940888e-09, |
| "advantage_min": -0.16741247940808535, |
| "advantage_std": 0.14271592535078526, |
| "completion_length": 2762.500045776367, |
| "epoch": 0.5165714285714286, |
| "grad_norm": 0.002616358455270529, |
| "kl": 0.0005500912666320801, |
| "learning_rate": 1.260741462457165e-07, |
| "loss": 0.0009, |
| "reward": 0.07724724570289254, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1427159309387207, |
| "rewards/cosine_scaled_reward": -0.0017735953442752361, |
| "rewards/format_reward": 0.4583333469927311, |
| "step": 452 |
| }, |
| { |
| "advantage_max": 0.18616215698421001, |
| "advantage_mean": -7.761021464380846e-09, |
| "advantage_min": -0.1861566216684878, |
| "advantage_std": 0.14951407350599766, |
| "completion_length": 2588.4792098999023, |
| "epoch": 0.5177142857142857, |
| "grad_norm": 0.002354600466787815, |
| "kl": 0.0005657672882080078, |
| "learning_rate": 1.2503063339313356e-07, |
| "loss": 0.0084, |
| "reward": 0.10979624767787755, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.14951407723128796, |
| "rewards/cosine_scaled_reward": 0.07057444495148957, |
| "rewards/format_reward": 0.5000000093132257, |
| "step": 453 |
| }, |
| { |
| "advantage_max": 0.08877705689519644, |
| "advantage_mean": 1.0865430083439875e-09, |
| "advantage_min": -0.11890319734811783, |
| "advantage_std": 0.0781020374270156, |
| "completion_length": 2190.5416870117188, |
| "epoch": 0.5188571428571429, |
| "grad_norm": 0.0014429461443796754, |
| "kl": 0.0005607306957244873, |
| "learning_rate": 1.2400783294793668e-07, |
| "loss": 0.0021, |
| "reward": 0.135482975281775, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.07810203928966075, |
| "rewards/cosine_scaled_reward": 0.047869518399238586, |
| "rewards/format_reward": 0.7083333432674408, |
| "step": 454 |
| }, |
| { |
| "advantage_max": 0.14416319783776999, |
| "advantage_mean": -7.761021270091817e-10, |
| "advantage_min": -0.10820258548483253, |
| "advantage_std": 0.10255820630118251, |
| "completion_length": 2773.6042098999023, |
| "epoch": 0.52, |
| "grad_norm": 0.002726923208683729, |
| "kl": 0.0006432235240936279, |
| "learning_rate": 1.2300579475997657e-07, |
| "loss": 0.0081, |
| "reward": 0.00850020069628954, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10255820862948895, |
| "rewards/cosine_scaled_reward": -0.19340587593615055, |
| "rewards/format_reward": 0.4375000037252903, |
| "step": 455 |
| }, |
| { |
| "advantage_max": 0.17035988252609968, |
| "advantage_mean": 5.044663679842909e-10, |
| "advantage_min": -0.18033531680703163, |
| "advantage_std": 0.1579930440057069, |
| "completion_length": 2987.7292289733887, |
| "epoch": 0.5211428571428571, |
| "grad_norm": 0.0037276751827448606, |
| "kl": 0.0005083084106445312, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": 0.0115, |
| "reward": 0.06464625336229801, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.157993049826473, |
| "rewards/cosine_scaled_reward": -0.017057785764336586, |
| "rewards/format_reward": 0.4166666753590107, |
| "step": 456 |
| }, |
| { |
| "advantage_max": 0.1135766888037324, |
| "advantage_mean": -2.6387474427735924e-09, |
| "advantage_min": -0.08013106137514114, |
| "advantage_std": 0.08444676687940955, |
| "completion_length": 3021.229217529297, |
| "epoch": 0.5222857142857142, |
| "grad_norm": 0.0019406946375966072, |
| "kl": 0.0007562637329101562, |
| "learning_rate": 1.2106419949317388e-07, |
| "loss": 0.0015, |
| "reward": 0.05151152703911066, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.08444677060469985, |
| "rewards/cosine_scaled_reward": -0.013899954035878181, |
| "rewards/format_reward": 0.3333333358168602, |
| "step": 457 |
| }, |
| { |
| "advantage_max": 0.18146846443414688, |
| "advantage_mean": -3.259629080543114e-09, |
| "advantage_min": -0.16966586094349623, |
| "advantage_std": 0.14272961462847888, |
| "completion_length": 1946.000015258789, |
| "epoch": 0.5234285714285715, |
| "grad_norm": 0.0026240902952849865, |
| "kl": 0.0003987550735473633, |
| "learning_rate": 1.2012473704494537e-07, |
| "loss": 0.0066, |
| "reward": 0.15159178618341684, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14272962091490626, |
| "rewards/cosine_scaled_reward": 0.11331283859908581, |
| "rewards/format_reward": 0.666666679084301, |
| "step": 458 |
| }, |
| { |
| "advantage_max": 0.10034113470464945, |
| "advantage_mean": -6.6744786780814636e-09, |
| "advantage_min": -0.12437078403308988, |
| "advantage_std": 0.09397356864064932, |
| "completion_length": 1233.9167022705078, |
| "epoch": 0.5245714285714286, |
| "grad_norm": 0.0009185225935652852, |
| "kl": 0.00022205710411071777, |
| "learning_rate": 1.1920622611056974e-07, |
| "loss": 0.0011, |
| "reward": 0.1982046803459525, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.0939735691063106, |
| "rewards/cosine_scaled_reward": 0.16758823953568935, |
| "rewards/format_reward": 0.8333333358168602, |
| "step": 459 |
| }, |
| { |
| "advantage_max": 0.2020870796404779, |
| "advantage_mean": -2.793967834868738e-09, |
| "advantage_min": -0.2301188837736845, |
| "advantage_std": 0.176668681204319, |
| "completion_length": 2900.1042098999023, |
| "epoch": 0.5257142857142857, |
| "grad_norm": 0.0030042824801057577, |
| "kl": 0.0005886554718017578, |
| "learning_rate": 1.1830871145697412e-07, |
| "loss": 0.0058, |
| "reward": 0.17208986543118954, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.17666867841035128, |
| "rewards/cosine_scaled_reward": 0.196685079485178, |
| "rewards/format_reward": 0.6250000093132257, |
| "step": 460 |
| }, |
| { |
| "advantage_max": 0.22937612980604172, |
| "advantage_mean": -1.396984056212247e-09, |
| "advantage_min": -0.20125835668295622, |
| "advantage_std": 0.1885735362302512, |
| "completion_length": 3077.375045776367, |
| "epoch": 0.5268571428571428, |
| "grad_norm": 0.003480825573205948, |
| "kl": 0.000635288655757904, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": 0.0097, |
| "reward": 0.08316627237945795, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.18857353436760604, |
| "rewards/cosine_scaled_reward": 0.027362531051039696, |
| "rewards/format_reward": 0.4375000037252903, |
| "step": 461 |
| }, |
| { |
| "advantage_max": 0.09577207872644067, |
| "advantage_mean": 5.5879356419819e-09, |
| "advantage_min": -0.10161272855475545, |
| "advantage_std": 0.08275535795837641, |
| "completion_length": 2929.8541870117188, |
| "epoch": 0.528, |
| "grad_norm": 0.0019017203012481332, |
| "kl": 0.0005735903978347778, |
| "learning_rate": 1.1657684494105386e-07, |
| "loss": -0.001, |
| "reward": -0.005534024443477392, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.08275536308065057, |
| "rewards/cosine_scaled_reward": -0.1532220784574747, |
| "rewards/format_reward": 0.27083333395421505, |
| "step": 462 |
| }, |
| { |
| "advantage_max": 0.13797848299145699, |
| "advantage_mean": 6.984919517782906e-10, |
| "advantage_min": -0.14528980944305658, |
| "advantage_std": 0.11408946011215448, |
| "completion_length": 2641.625045776367, |
| "epoch": 0.5291428571428571, |
| "grad_norm": 0.0021766142453998327, |
| "kl": 0.0004150867462158203, |
| "learning_rate": 1.1574257748745986e-07, |
| "loss": 0.0123, |
| "reward": 0.11324432399123907, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11408946430310607, |
| "rewards/cosine_scaled_reward": 0.09595046006143093, |
| "rewards/format_reward": 0.4791666753590107, |
| "step": 463 |
| }, |
| { |
| "advantage_max": 0.06949485652148724, |
| "advantage_mean": -2.1730859889323995e-09, |
| "advantage_min": -0.0649899085983634, |
| "advantage_std": 0.05675833718851209, |
| "completion_length": 1714.729190826416, |
| "epoch": 0.5302857142857142, |
| "grad_norm": 0.0009724340634420514, |
| "kl": 0.00033186376094818115, |
| "learning_rate": 1.1492947512799328e-07, |
| "loss": 0.0017, |
| "reward": 0.1716830674558878, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.05675833881832659, |
| "rewards/cosine_scaled_reward": 0.15365481562912464, |
| "rewards/format_reward": 0.7083333358168602, |
| "step": 464 |
| }, |
| { |
| "advantage_max": 0.18007674161344767, |
| "advantage_mean": -4.501392578126762e-09, |
| "advantage_min": -0.18894312204793096, |
| "advantage_std": 0.1457268726080656, |
| "completion_length": 2496.875068664551, |
| "epoch": 0.5314285714285715, |
| "grad_norm": 0.0030991239473223686, |
| "kl": 0.000622868537902832, |
| "learning_rate": 1.1413757749211602e-07, |
| "loss": 0.006, |
| "reward": 0.16800264199264348, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1457268742378801, |
| "rewards/cosine_scaled_reward": 0.12976265419274569, |
| "rewards/format_reward": 0.7291666809469461, |
| "step": 465 |
| }, |
| { |
| "advantage_max": 0.1637826063670218, |
| "advantage_mean": -6.053597040311942e-09, |
| "advantage_min": -0.17003454267978668, |
| "advantage_std": 0.13364151399582624, |
| "completion_length": 3074.7916870117188, |
| "epoch": 0.5325714285714286, |
| "grad_norm": 0.003030646126717329, |
| "kl": 0.0006162524223327637, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": 0.0044, |
| "reward": 0.0841610130155459, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.13364151632413268, |
| "rewards/cosine_scaled_reward": 0.05105864675715566, |
| "rewards/format_reward": 0.3958333395421505, |
| "step": 466 |
| }, |
| { |
| "advantage_max": 0.1294914805330336, |
| "advantage_mean": -5.820766091346741e-10, |
| "advantage_min": -0.11973803536966443, |
| "advantage_std": 0.0973717300221324, |
| "completion_length": 2759.3333854675293, |
| "epoch": 0.5337142857142857, |
| "grad_norm": 0.0016337429406121373, |
| "kl": 0.0006120204925537109, |
| "learning_rate": 1.1261754973965422e-07, |
| "loss": 0.0054, |
| "reward": 0.021781093149911612, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.09737173025496304, |
| "rewards/cosine_scaled_reward": -0.1436814023181796, |
| "rewards/format_reward": 0.41666667349636555, |
| "step": 467 |
| }, |
| { |
| "advantage_max": 0.19513892801478505, |
| "advantage_mean": -1.6298145125159813e-09, |
| "advantage_min": -0.17503061518073082, |
| "advantage_std": 0.15135146118700504, |
| "completion_length": 2915.2708740234375, |
| "epoch": 0.5348571428571428, |
| "grad_norm": 0.0029594493098556995, |
| "kl": 0.0007366985082626343, |
| "learning_rate": 1.1188949370707787e-07, |
| "loss": 0.0098, |
| "reward": 0.039023627527058125, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15135146211832762, |
| "rewards/cosine_scaled_reward": -0.07234426774084568, |
| "rewards/format_reward": 0.3750000074505806, |
| "step": 468 |
| }, |
| { |
| "advantage_max": 0.1406808183528483, |
| "advantage_mean": 3.1820189122511167e-09, |
| "advantage_min": -0.13096178881824017, |
| "advantage_std": 0.10715825203806162, |
| "completion_length": 2790.729190826416, |
| "epoch": 0.536, |
| "grad_norm": 0.0020550782792270184, |
| "kl": 0.0006768703460693359, |
| "learning_rate": 1.1118279056249653e-07, |
| "loss": 0.0022, |
| "reward": 0.005044038873165846, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10715826088562608, |
| "rewards/cosine_scaled_reward": -0.1510836249217391, |
| "rewards/format_reward": 0.3333333358168602, |
| "step": 469 |
| }, |
| { |
| "advantage_max": 0.15261405799537897, |
| "advantage_mean": -1.144750705686648e-09, |
| "advantage_min": -0.1515314057469368, |
| "advantage_std": 0.12770982459187508, |
| "completion_length": 2569.3125381469727, |
| "epoch": 0.5371428571428571, |
| "grad_norm": 0.003103189170360565, |
| "kl": 0.0005183219909667969, |
| "learning_rate": 1.1049747474962444e-07, |
| "loss": 0.0035, |
| "reward": 0.04346779244951904, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12770982831716537, |
| "rewards/cosine_scaled_reward": -0.14374101161956787, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 470 |
| }, |
| { |
| "advantage_max": 0.09641966479830444, |
| "advantage_mean": -6.208817557307178e-10, |
| "advantage_min": -0.13236830849200487, |
| "advantage_std": 0.09493508515879512, |
| "completion_length": 2782.6458435058594, |
| "epoch": 0.5382857142857143, |
| "grad_norm": 0.0012270222650840878, |
| "kl": 0.0005309581756591797, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": 0.0019, |
| "reward": 0.004960605408996344, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09493508981540799, |
| "rewards/cosine_scaled_reward": -0.1521736173890531, |
| "rewards/format_reward": 0.3333333432674408, |
| "step": 471 |
| }, |
| { |
| "advantage_max": 0.17413780465722084, |
| "advantage_mean": -3.88050980237864e-10, |
| "advantage_min": -0.1484542451798916, |
| "advantage_std": 0.12915962655097246, |
| "completion_length": 2722.8750228881836, |
| "epoch": 0.5394285714285715, |
| "grad_norm": 0.0021392928902059793, |
| "kl": 0.0006825923919677734, |
| "learning_rate": 1.0919113768029517e-07, |
| "loss": 0.0033, |
| "reward": 0.07151374779641628, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.12915962236002088, |
| "rewards/cosine_scaled_reward": -0.02726024203002453, |
| "rewards/format_reward": 0.4791666679084301, |
| "step": 472 |
| }, |
| { |
| "advantage_max": 0.145589595194906, |
| "advantage_mean": 4.850638404829688e-09, |
| "advantage_min": -0.12249492853879929, |
| "advantage_std": 0.11026488617062569, |
| "completion_length": 2914.354217529297, |
| "epoch": 0.5405714285714286, |
| "grad_norm": 0.0016195240896195173, |
| "kl": 0.0005763769149780273, |
| "learning_rate": 1.0857018009286381e-07, |
| "loss": 0.0003, |
| "reward": 0.09501276165246964, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11026489362120628, |
| "rewards/cosine_scaled_reward": 0.08362941443920135, |
| "rewards/format_reward": 0.39583333395421505, |
| "step": 473 |
| }, |
| { |
| "advantage_max": 0.15750652737915516, |
| "advantage_mean": -5.743156061832622e-09, |
| "advantage_min": -0.09204499330371618, |
| "advantage_std": 0.09810823854058981, |
| "completion_length": 2134.6458625793457, |
| "epoch": 0.5417142857142857, |
| "grad_norm": 0.0021665513049811125, |
| "kl": 0.0005587935447692871, |
| "learning_rate": 1.0797073717209013e-07, |
| "loss": 0.0067, |
| "reward": 0.19979873031843454, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09810824086889625, |
| "rewards/cosine_scaled_reward": 0.29946115519851446, |
| "rewards/format_reward": 0.5833333358168602, |
| "step": 474 |
| }, |
| { |
| "advantage_max": 0.16019965521991253, |
| "advantage_mean": -9.002785017475645e-09, |
| "advantage_min": -0.24789944384247065, |
| "advantage_std": 0.1666221539489925, |
| "completion_length": 2112.8542098999023, |
| "epoch": 0.5428571428571428, |
| "grad_norm": 0.002450009109452367, |
| "kl": 0.0004245880991220474, |
| "learning_rate": 1.0739283813397639e-07, |
| "loss": 0.0018, |
| "reward": 0.2531615113839507, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1666221613995731, |
| "rewards/cosine_scaled_reward": 0.35053614526987076, |
| "rewards/format_reward": 0.791666679084301, |
| "step": 475 |
| }, |
| { |
| "advantage_max": 0.1794994603842497, |
| "advantage_mean": 2.638747262362351e-09, |
| "advantage_min": -0.2670667041093111, |
| "advantage_std": 0.17754664039239287, |
| "completion_length": 2330.375045776367, |
| "epoch": 0.544, |
| "grad_norm": 0.0032068255823105574, |
| "kl": 0.0006053447723388672, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": 0.0134, |
| "reward": 0.21058788988739252, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.1775466427206993, |
| "rewards/cosine_scaled_reward": 0.27757735550403595, |
| "rewards/format_reward": 0.6875000260770321, |
| "step": 476 |
| }, |
| { |
| "advantage_max": 0.1589113175868988, |
| "advantage_mean": -4.811833376194841e-09, |
| "advantage_min": -0.15475624846294522, |
| "advantage_std": 0.12695908243767917, |
| "completion_length": 1449.8542022705078, |
| "epoch": 0.5451428571428572, |
| "grad_norm": 0.001828887383453548, |
| "kl": 0.0002796947956085205, |
| "learning_rate": 1.063017833182728e-07, |
| "loss": 0.0029, |
| "reward": 0.24288752442225814, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.12695908942259848, |
| "rewards/cosine_scaled_reward": 0.26963918656110764, |
| "rewards/format_reward": 0.8958333395421505, |
| "step": 477 |
| }, |
| { |
| "advantage_max": 0.11892331298440695, |
| "advantage_mean": -8.381903254806033e-09, |
| "advantage_min": -0.15956566762179136, |
| "advantage_std": 0.1133618257008493, |
| "completion_length": 2689.5416946411133, |
| "epoch": 0.5462857142857143, |
| "grad_norm": 0.0018854563822969794, |
| "kl": 0.0004665255546569824, |
| "learning_rate": 1.0578868071715544e-07, |
| "loss": 0.0045, |
| "reward": 0.13063816633075476, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11336182476952672, |
| "rewards/cosine_scaled_reward": 0.11580769601278007, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 478 |
| }, |
| { |
| "advantage_max": 0.18029004149138927, |
| "advantage_mean": -4.03573130469681e-09, |
| "advantage_min": -0.17780630104243755, |
| "advantage_std": 0.13413295801728964, |
| "completion_length": 2802.062530517578, |
| "epoch": 0.5474285714285714, |
| "grad_norm": 0.0028555947355926037, |
| "kl": 0.0007611513137817383, |
| "learning_rate": 1.0529722834905125e-07, |
| "loss": 0.0021, |
| "reward": 0.06550013413652778, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1341329631395638, |
| "rewards/cosine_scaled_reward": -0.07874849392101169, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 479 |
| }, |
| { |
| "advantage_max": 0.22729169484227896, |
| "advantage_mean": -2.5999422337275035e-09, |
| "advantage_min": -0.15585079044103622, |
| "advantage_std": 0.1480179699137807, |
| "completion_length": 2250.6667404174805, |
| "epoch": 0.5485714285714286, |
| "grad_norm": 0.002713319845497608, |
| "kl": 0.0004476308822631836, |
| "learning_rate": 1.0482745016665526e-07, |
| "loss": 0.0066, |
| "reward": 0.08439068030565977, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.14801797457039356, |
| "rewards/cosine_scaled_reward": -0.05194460041821003, |
| "rewards/format_reward": 0.604166679084301, |
| "step": 480 |
| }, |
| { |
| "advantage_max": 0.12421236839145422, |
| "advantage_mean": -8.304293031002885e-09, |
| "advantage_min": -0.1307937242090702, |
| "advantage_std": 0.0952663142234087, |
| "completion_length": 2587.9375915527344, |
| "epoch": 0.5497142857142857, |
| "grad_norm": 0.0014279150636866689, |
| "kl": 0.0005082488059997559, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0048, |
| "reward": 0.07169135846197605, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.09526631888002157, |
| "rewards/cosine_scaled_reward": -0.12012962490553036, |
| "rewards/format_reward": 0.6666666697710752, |
| "step": 481 |
| }, |
| { |
| "advantage_max": 0.18846415961161256, |
| "advantage_mean": -7.99385220517923e-09, |
| "advantage_min": -0.1828137980774045, |
| "advantage_std": 0.15363801596686244, |
| "completion_length": 2704.9375534057617, |
| "epoch": 0.5508571428571428, |
| "grad_norm": 0.002800821093842387, |
| "kl": 0.0006156265735626221, |
| "learning_rate": 1.0395300688680625e-07, |
| "loss": 0.0005, |
| "reward": 0.1636654119938612, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.15363801969215274, |
| "rewards/cosine_scaled_reward": 0.21197660156758502, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 482 |
| }, |
| { |
| "advantage_max": 0.19416179601103067, |
| "advantage_mean": -1.5522043372850902e-10, |
| "advantage_min": -0.1611415701918304, |
| "advantage_std": 0.12938955938443542, |
| "completion_length": 2665.7083740234375, |
| "epoch": 0.552, |
| "grad_norm": 0.0027008799370378256, |
| "kl": 0.0005914568901062012, |
| "learning_rate": 1.0354838440848501e-07, |
| "loss": 0.0043, |
| "reward": 0.07180615421384573, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1293895640410483, |
| "rewards/cosine_scaled_reward": -0.11217466462403536, |
| "rewards/format_reward": 0.6458333414047956, |
| "step": 483 |
| }, |
| { |
| "advantage_max": 0.15204123593866825, |
| "advantage_mean": -2.2506964417190467e-09, |
| "advantage_min": -0.14813785336446017, |
| "advantage_std": 0.11534715490415692, |
| "completion_length": 2195.687515258789, |
| "epoch": 0.5531428571428572, |
| "grad_norm": 0.0017237699357792735, |
| "kl": 0.00040522217750549316, |
| "learning_rate": 1.0316552135205837e-07, |
| "loss": 0.0034, |
| "reward": 0.08024531602859497, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11534715909510851, |
| "rewards/cosine_scaled_reward": -0.03545547462999821, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 484 |
| }, |
| { |
| "advantage_max": 0.11956683732569218, |
| "advantage_mean": -1.164153232147136e-09, |
| "advantage_min": -0.08647039532661438, |
| "advantage_std": 0.07989799580536783, |
| "completion_length": 1736.5625457763672, |
| "epoch": 0.5542857142857143, |
| "grad_norm": 0.0011961512500420213, |
| "kl": 0.00036829710006713867, |
| "learning_rate": 1.0280443637773163e-07, |
| "loss": -0.0012, |
| "reward": 0.1085349339991808, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.07989799627102911, |
| "rewards/cosine_scaled_reward": -0.05393883492797613, |
| "rewards/format_reward": 0.75, |
| "step": 485 |
| }, |
| { |
| "advantage_max": 0.1754322131164372, |
| "advantage_mean": 3.4924587180573674e-10, |
| "advantage_min": -0.13765499275177717, |
| "advantage_std": 0.11859225039370358, |
| "completion_length": 1778.7083854675293, |
| "epoch": 0.5554285714285714, |
| "grad_norm": 0.0013995830668136477, |
| "kl": 0.00038488954305648804, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": -0.0012, |
| "reward": 0.05783613526728004, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.11859225668013096, |
| "rewards/cosine_scaled_reward": -0.14235917665064335, |
| "rewards/format_reward": 0.6250000111758709, |
| "step": 486 |
| }, |
| { |
| "advantage_max": 0.1776252081617713, |
| "advantage_mean": -6.053596859900701e-09, |
| "advantage_min": -0.25969033129513264, |
| "advantage_std": 0.16778704058378935, |
| "completion_length": 1765.500020980835, |
| "epoch": 0.5565714285714286, |
| "grad_norm": 0.0018098040018230677, |
| "kl": 0.00029283761978149414, |
| "learning_rate": 1.0214767000817596e-07, |
| "loss": -0.0038, |
| "reward": 0.2465712195262313, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1677870382554829, |
| "rewards/cosine_scaled_reward": 0.3729119673371315, |
| "rewards/format_reward": 0.7083333488553762, |
| "step": 487 |
| }, |
| { |
| "advantage_max": 0.1596626602113247, |
| "advantage_mean": -3.065603493279667e-09, |
| "advantage_min": -0.1261517507955432, |
| "advantage_std": 0.10886070877313614, |
| "completion_length": 2018.1458587646484, |
| "epoch": 0.5577142857142857, |
| "grad_norm": 0.001423255424015224, |
| "kl": 0.00038205087184906006, |
| "learning_rate": 1.0185202062281336e-07, |
| "loss": 0.0011, |
| "reward": 0.05636691814288497, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.10886071575805545, |
| "rewards/cosine_scaled_reward": -0.1461393255740404, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 488 |
| }, |
| { |
| "advantage_max": 0.13344533974304795, |
| "advantage_mean": 3.880509941156518e-10, |
| "advantage_min": -0.12539346516132355, |
| "advantage_std": 0.10917034232988954, |
| "completion_length": 3137.8333587646484, |
| "epoch": 0.5588571428571428, |
| "grad_norm": 0.0024442316498607397, |
| "kl": 0.0006935596466064453, |
| "learning_rate": 1.0157821333772304e-07, |
| "loss": 0.0037, |
| "reward": -0.013123379554599524, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.10917035210877657, |
| "rewards/cosine_scaled_reward": -0.1439531659707427, |
| "rewards/format_reward": 0.20833334140479565, |
| "step": 489 |
| }, |
| { |
| "advantage_max": 0.12650098372250795, |
| "advantage_mean": 9.216203494810671e-11, |
| "advantage_min": -0.14339723202283494, |
| "advantage_std": 0.11344200430903584, |
| "completion_length": 2136.083354949951, |
| "epoch": 0.56, |
| "grad_norm": 0.0020219513680785894, |
| "kl": 0.0005998890846967697, |
| "learning_rate": 1.013262614978859e-07, |
| "loss": 0.0018, |
| "reward": 0.10040172806475312, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11344200803432614, |
| "rewards/cosine_scaled_reward": -0.045523665845394135, |
| "rewards/format_reward": 0.6875000111758709, |
| "step": 490 |
| }, |
| { |
| "advantage_max": 0.1827850081026554, |
| "advantage_mean": -3.104408563547878e-09, |
| "advantage_min": -0.20528748910874128, |
| "advantage_std": 0.15947058238089085, |
| "completion_length": 2341.541717529297, |
| "epoch": 0.5611428571428572, |
| "grad_norm": 0.003260035300627351, |
| "kl": 0.0004696100950241089, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": 0.0126, |
| "reward": 0.15225723420735449, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15947059262543917, |
| "rewards/cosine_scaled_reward": 0.11595133878290653, |
| "rewards/format_reward": 0.666666679084301, |
| "step": 491 |
| }, |
| { |
| "advantage_max": 0.13405136327492073, |
| "advantage_mean": 1.5425030610444201e-09, |
| "advantage_min": -0.11482641356997192, |
| "advantage_std": 0.10828206856967881, |
| "completion_length": 2551.937511444092, |
| "epoch": 0.5622857142857143, |
| "grad_norm": 0.0028170021250844, |
| "kl": 0.00046622753143310547, |
| "learning_rate": 1.0088797220727779e-07, |
| "loss": 0.006, |
| "reward": 0.06306978134671226, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.10828206315636635, |
| "rewards/cosine_scaled_reward": -0.03327612020075321, |
| "rewards/format_reward": 0.4375000037252903, |
| "step": 492 |
| }, |
| { |
| "advantage_max": 0.1396464416757226, |
| "advantage_mean": -2.328306464294272e-09, |
| "advantage_min": -0.15597060602158308, |
| "advantage_std": 0.12428847094997764, |
| "completion_length": 1993.7708892822266, |
| "epoch": 0.5634285714285714, |
| "grad_norm": 0.0031442081090062857, |
| "kl": 0.00047659873962402344, |
| "learning_rate": 1.0070165611810855e-07, |
| "loss": 0.0204, |
| "reward": 0.13567497371695936, |
| "reward_advantage_correlation": 0.9999999999999998, |
| "reward_std": 0.12428847677074373, |
| "rewards/cosine_scaled_reward": 0.05777975544333458, |
| "rewards/format_reward": 0.6875000037252903, |
| "step": 493 |
| }, |
| { |
| "advantage_max": 0.18611273635178804, |
| "advantage_mean": -1.1175870923141318e-08, |
| "advantage_min": -0.20461444603279233, |
| "advantage_std": 0.16241960739716887, |
| "completion_length": 1439.104190826416, |
| "epoch": 0.5645714285714286, |
| "grad_norm": 0.002649143571034074, |
| "kl": 0.000398978590965271, |
| "learning_rate": 1.005372381963547e-07, |
| "loss": -0.0075, |
| "reward": 0.24482827726751566, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.16241961810737848, |
| "rewards/cosine_scaled_reward": 0.22105430043302476, |
| "rewards/format_reward": 1.0, |
| "step": 494 |
| }, |
| { |
| "advantage_max": 0.14939172100275755, |
| "advantage_mean": -5.8983763567832526e-09, |
| "advantage_min": -0.13820822536945343, |
| "advantage_std": 0.11426809709519148, |
| "completion_length": 2485.166717529297, |
| "epoch": 0.5657142857142857, |
| "grad_norm": 0.0017962036654353142, |
| "kl": 0.00054946169257164, |
| "learning_rate": 1.0039472645551372e-07, |
| "loss": 0.0039, |
| "reward": 0.14726564195007086, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11426810221746564, |
| "rewards/cosine_scaled_reward": 0.16136360727250576, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 495 |
| }, |
| { |
| "advantage_max": 0.15256773307919502, |
| "advantage_mean": 1.0089328539297782e-09, |
| "advantage_min": -0.13196661323308945, |
| "advantage_std": 0.11284308601170778, |
| "completion_length": 1811.2916793823242, |
| "epoch": 0.5668571428571428, |
| "grad_norm": 0.0022594898473471403, |
| "kl": 0.00036126933991909027, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": 0.0059, |
| "reward": 0.20605642755981535, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.11284308694303036, |
| "rewards/cosine_scaled_reward": 0.2769278697669506, |
| "rewards/format_reward": 0.6666666716337204, |
| "step": 496 |
| }, |
| { |
| "advantage_max": 0.17324247024953365, |
| "advantage_mean": -1.6298146443549655e-09, |
| "advantage_min": -0.2051475211046636, |
| "advantage_std": 0.16091588605195284, |
| "completion_length": 2181.6042137145996, |
| "epoch": 0.568, |
| "grad_norm": 0.0028121285140514374, |
| "kl": 0.00037629157304763794, |
| "learning_rate": 1.0017544823184055e-07, |
| "loss": 0.0112, |
| "reward": 0.18671931326389313, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.1609158907085657, |
| "rewards/cosine_scaled_reward": 0.22045234218239784, |
| "rewards/format_reward": 0.6666666679084301, |
| "step": 497 |
| }, |
| { |
| "advantage_max": 0.1668732976540923, |
| "advantage_mean": -1.3193736866923267e-09, |
| "advantage_min": -0.1562135349959135, |
| "advantage_std": 0.12399712949991226, |
| "completion_length": 2174.375015258789, |
| "epoch": 0.5691428571428572, |
| "grad_norm": 0.0032206247560679913, |
| "kl": 0.0004447326064109802, |
| "learning_rate": 1.0009869243631952e-07, |
| "loss": 0.0153, |
| "reward": 0.09758738335222006, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.1239971297327429, |
| "rewards/cosine_scaled_reward": -0.015365742146968842, |
| "rewards/format_reward": 0.6041666753590107, |
| "step": 498 |
| }, |
| { |
| "advantage_max": 0.18602585699409246, |
| "advantage_mean": -4.811833487217143e-09, |
| "advantage_min": -0.23508271854370832, |
| "advantage_std": 0.17544544488191605, |
| "completion_length": 2213.7292098999023, |
| "epoch": 0.5702857142857143, |
| "grad_norm": 0.0020961996633559465, |
| "kl": 0.0003085378557443619, |
| "learning_rate": 1.000438641958131e-07, |
| "loss": 0.007, |
| "reward": 0.1805350393988192, |
| "reward_advantage_correlation": 0.9999999999999999, |
| "reward_std": 0.17544544488191605, |
| "rewards/cosine_scaled_reward": 0.1705985008738935, |
| "rewards/format_reward": 0.7291666753590107, |
| "step": 499 |
| }, |
| { |
| "advantage_max": 0.17333817295730114, |
| "advantage_mean": -8.925174717344664e-09, |
| "advantage_min": -0.2120303800329566, |
| "advantage_std": 0.1599614191800356, |
| "completion_length": 2608.6458892822266, |
| "epoch": 0.5714285714285714, |
| "grad_norm": 0.003356323577463627, |
| "kl": 0.0006328821182250977, |
| "learning_rate": 1.0001096618257236e-07, |
| "loss": 0.0124, |
| "reward": 0.13607118383515626, |
| "reward_advantage_correlation": 1.0, |
| "reward_std": 0.15996142383664846, |
| "rewards/cosine_scaled_reward": 0.14388170279562473, |
| "rewards/format_reward": 0.5208333507180214, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "step": 500, |
| "total_flos": 0.0, |
| "train_loss": 0.004779140234080842, |
| "train_runtime": 145838.7741, |
| "train_samples_per_second": 0.165, |
| "train_steps_per_second": 0.003 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|