{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage_max": 1.7736882269382477, "advantage_mean": 4.967053990334591e-09, "advantage_min": -1.0247227177023888, "advantage_std": 0.9998322650790215, "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.19998183846473694, "kl": 0.0, "lambda_div_used": 0.6, "learning_rate": 2e-08, "loss": -0.0, "reward": 0.06657012924551964, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06657012924551964, "reward_after_std": 0.805392861366272, "reward_before_mean": 0.4897647276520729, "reward_before_std": 0.8290339298546314, "reward_change_max": 0.0005614385008811951, "reward_change_mean": -0.4231945872306824, "reward_change_min": -0.8292400389909744, "reward_change_std": 0.33647667057812214, "reward_std": 0.8053928762674332, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "advantage_max": 1.7142265439033508, "advantage_mean": 2.7318796780306798e-08, "advantage_min": -1.0121877193450928, "advantage_std": 0.9997509345412254, "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.18245820701122284, "kl": 0.0, "lambda_div_used": 0.6, "learning_rate": 4e-08, "loss": -0.0, "reward": -0.11615866981446743, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11615866981446743, "reward_after_std": 0.4655082933604717, "reward_before_mean": 0.27539755403995514, "reward_before_std": 0.42092561535537243, "reward_change_max": 0.0013062208890914917, "reward_change_mean": -0.39155622851103544, "reward_change_min": -0.6376443430781364, "reward_change_std": 0.26012564916163683, "reward_std": 0.46550831012427807, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "advantage_max": 1.7450169622898102, "advantage_mean": 1.3659398945264911e-08, "advantage_min": -0.9959479197859764, "advantage_std": 0.999689593911171, "completion_length": 3374.3125, "epoch": 0.0034285714285714284, "grad_norm": 0.16747689247131348, "kl": 4.373490810394287e-05, "lambda_div_used": 0.6, "learning_rate": 6e-08, "loss": 0.0, "reward": -0.5193910151720047, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.5193910151720047, "reward_after_std": 0.48704322427511215, "reward_before_mean": -0.35822661221027374, "reward_before_std": 0.5342355072498322, "reward_change_max": 0.0013915002346038818, "reward_change_mean": -0.16116441413760185, "reward_change_min": -0.43814222142100334, "reward_change_std": 0.17781410180032253, "reward_std": 0.4870432298630476, "rewards/cosine_scaled_reward": -0.25202997773885727, "rewards/format_reward": 0.14583333767950535, "step": 3 }, { "advantage_max": 1.9348038583993912, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.7399652823805809, "advantage_std": 0.9998378455638885, "completion_length": 2311.27091217041, "epoch": 0.004571428571428572, "grad_norm": 0.2256404608488083, "kl": 3.388524055480957e-05, "lambda_div_used": 0.6, "learning_rate": 8e-08, "loss": 0.0, "reward": -0.05920893343864009, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05920893343864009, "reward_after_std": 0.8477489091455936, "reward_before_mean": 0.2724987119436264, "reward_before_std": 0.7898660115897655, "reward_change_max": 0.00016905367374420166, "reward_change_mean": -0.3317076303064823, "reward_change_min": -0.6719550713896751, "reward_change_std": 0.2557057347148657, "reward_std": 0.8477489277720451, "rewards/cosine_scaled_reward": -0.1658339835703373, "rewards/format_reward": 0.6041666734963655, "step": 4 }, { "advantage_max": 1.8208965361118317, "advantage_mean": -8.692344399818808e-09, "advantage_min": -0.8359091579914093, "advantage_std": 0.9997872859239578, "completion_length": 3321.4375610351562, "epoch": 0.005714285714285714, "grad_norm": 0.23304429650306702, "kl": 4.240870475769043e-05, "lambda_div_used": 0.6, "learning_rate": 1e-07, "loss": 0.0, "reward": -0.32904624473303556, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32904624473303556, "reward_after_std": 0.5964408218860626, "reward_before_mean": -0.0841533716302365, "reward_before_std": 0.6196324154734612, "reward_change_max": 0.001553364098072052, "reward_change_mean": -0.24489288963377476, "reward_change_min": -0.5599503479897976, "reward_change_std": 0.22373187262564898, "reward_std": 0.5964408367872238, "rewards/cosine_scaled_reward": -0.17749335523694754, "rewards/format_reward": 0.27083334140479565, "step": 5 }, { "advantage_max": 1.8378510475158691, "advantage_mean": 2.980232394200755e-08, "advantage_min": -0.7627209424972534, "advantage_std": 0.9998485594987869, "completion_length": 3113.937545776367, "epoch": 0.006857142857142857, "grad_norm": 0.21384815871715546, "kl": 4.3161213397979736e-05, "lambda_div_used": 0.6, "learning_rate": 1.2e-07, "loss": 0.0, "reward": -0.1830892115831375, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1830892115831375, "reward_after_std": 0.9384645372629166, "reward_before_mean": 0.061891574412584305, "reward_before_std": 0.9313972406089306, "reward_change_max": 0.0028055086731910706, "reward_change_mean": -0.24498078599572182, "reward_change_min": -0.5818257704377174, "reward_change_std": 0.22530303802341223, "reward_std": 0.9384645596146584, "rewards/cosine_scaled_reward": -0.10447087977081537, "rewards/format_reward": 0.2708333358168602, "step": 6 }, { "advantage_max": 1.848206102848053, "advantage_mean": 1.6142924885720333e-08, "advantage_min": -0.8735722899436951, "advantage_std": 0.9998387470841408, "completion_length": 2980.854217529297, "epoch": 0.008, "grad_norm": 0.1532345712184906, "kl": 2.2076070308685303e-05, "lambda_div_used": 0.6, "learning_rate": 1.4e-07, "loss": 0.0, "reward": -0.11802996881306171, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11802996881306171, "reward_after_std": 0.7807512283325195, "reward_before_mean": 0.2068807277828455, "reward_before_std": 0.7968633286654949, "reward_change_max": 5.9023499488830566e-05, "reward_change_mean": -0.3249106667935848, "reward_change_min": -0.6799479350447655, "reward_change_std": 0.2789893364533782, "reward_std": 0.7807512618601322, "rewards/cosine_scaled_reward": -0.13614298962056637, "rewards/format_reward": 0.4791666828095913, "step": 7 }, { "advantage_max": 1.768284872174263, "advantage_mean": 6.208817571184966e-09, "advantage_min": -1.073442094027996, "advantage_std": 0.9998049214482307, "completion_length": 2740.937530517578, "epoch": 0.009142857142857144, "grad_norm": 0.1699635237455368, "kl": 1.948140561580658e-05, "lambda_div_used": 0.6, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 0.18696115911006927, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.18696115911006927, "reward_after_std": 0.7428050264716148, "reward_before_mean": 0.6922222673892975, "reward_before_std": 0.7346657477319241, "reward_change_max": 0.0013648942112922668, "reward_change_mean": -0.5052611185237765, "reward_change_min": -0.8160489983856678, "reward_change_std": 0.34187921043485403, "reward_std": 0.7428050450980663, "rewards/cosine_scaled_reward": 0.096111124381423, "rewards/format_reward": 0.5000000111758709, "step": 8 }, { "advantage_max": 1.7886784225702286, "advantage_mean": 1.6142924996742636e-08, "advantage_min": -0.8845790177583694, "advantage_std": 0.9997691288590431, "completion_length": 3379.9583740234375, "epoch": 0.010285714285714285, "grad_norm": 0.17148180305957794, "kl": 4.947185516357422e-05, "lambda_div_used": 0.6, "learning_rate": 1.8e-07, "loss": 0.0, "reward": -0.3192979171872139, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.3192979171872139, "reward_after_std": 0.5886461585760117, "reward_before_mean": -0.06322952476330101, "reward_before_std": 0.6233885241672397, "reward_change_max": 0.0002186223864555359, "reward_change_mean": -0.2560683786869049, "reward_change_min": -0.5929627306759357, "reward_change_std": 0.2404519086703658, "reward_std": 0.5886461641639471, "rewards/cosine_scaled_reward": -0.14619810320436954, "rewards/format_reward": 0.2291666716337204, "step": 9 }, { "advantage_max": 1.7605973780155182, "advantage_mean": 2.2662183574162498e-08, "advantage_min": -1.0062192007899284, "advantage_std": 0.9998086839914322, "completion_length": 2689.1041679382324, "epoch": 0.011428571428571429, "grad_norm": 0.2057720124721527, "kl": 2.2359192371368408e-05, "lambda_div_used": 0.6, "learning_rate": 2e-07, "loss": 0.0, "reward": -0.14903598907403648, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.14903598907403648, "reward_after_std": 0.6891336962580681, "reward_before_mean": 0.17712077498435974, "reward_before_std": 0.7159077972173691, "reward_change_max": 0.0, "reward_change_mean": -0.32615675404667854, "reward_change_min": -0.6679622866213322, "reward_change_std": 0.27471973933279514, "reward_std": 0.689133707433939, "rewards/cosine_scaled_reward": -0.11977295717224479, "rewards/format_reward": 0.4166666716337204, "step": 10 }, { "advantage_max": 1.8891745954751968, "advantage_mean": 7.69893364616081e-08, "advantage_min": -0.9090164303779602, "advantage_std": 0.999754011631012, "completion_length": 3391.7916870117188, "epoch": 0.012571428571428572, "grad_norm": 0.17078208923339844, "kl": 2.7604401111602783e-05, "lambda_div_used": 0.6, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": -0.4651691932231188, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4651691932231188, "reward_after_std": 0.5533009208738804, "reward_before_mean": -0.29569249910855433, "reward_before_std": 0.5365621503442526, "reward_change_max": 0.0, "reward_change_mean": -0.1694766730070114, "reward_change_min": -0.3405461013317108, "reward_change_std": 0.1386844478547573, "reward_std": 0.5533009320497513, "rewards/cosine_scaled_reward": -0.2103462554514408, "rewards/format_reward": 0.12500000186264515, "step": 11 }, { "advantage_max": 1.815945327281952, "advantage_mean": 2.235174295650566e-08, "advantage_min": -0.7791703343391418, "advantage_std": 0.9998359605669975, "completion_length": 2334.854202270508, "epoch": 0.013714285714285714, "grad_norm": 0.24145646393299103, "kl": 3.232434391975403e-05, "lambda_div_used": 0.6, "learning_rate": 2.4e-07, "loss": 0.0, "reward": -0.039457873441278934, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.039457873441278934, "reward_after_std": 0.8034725449979305, "reward_before_mean": 0.32744147814810276, "reward_before_std": 0.8328801915049553, "reward_change_max": 0.0006123185157775879, "reward_change_mean": -0.36689933901652694, "reward_change_min": -0.8374556675553322, "reward_change_std": 0.3241370841860771, "reward_std": 0.8034725710749626, "rewards/cosine_scaled_reward": -0.19044593471335247, "rewards/format_reward": 0.7083333358168602, "step": 12 }, { "advantage_max": 1.7793785631656647, "advantage_mean": 2.1265199309783434e-08, "advantage_min": -1.123232178390026, "advantage_std": 0.9998079463839531, "completion_length": 2883.3959045410156, "epoch": 0.014857142857142857, "grad_norm": 0.30072519183158875, "kl": 2.9988586902618408e-05, "lambda_div_used": 0.6, "learning_rate": 2.6e-07, "loss": 0.0, "reward": 0.07007572869770229, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07007572869770229, "reward_after_std": 0.6831374354660511, "reward_before_mean": 0.5178708247840405, "reward_before_std": 0.6717508621513844, "reward_change_max": 0.00019785761833190918, "reward_change_mean": -0.44779507629573345, "reward_change_min": -0.7652952261269093, "reward_change_std": 0.3106790967285633, "reward_std": 0.6831374652683735, "rewards/cosine_scaled_reward": 0.019352062605321407, "rewards/format_reward": 0.47916667349636555, "step": 13 }, { "advantage_max": 1.8502720147371292, "advantage_mean": 4.2840839431512734e-08, "advantage_min": -0.8646775856614113, "advantage_std": 0.9998083263635635, "completion_length": 2823.9583892822266, "epoch": 0.016, "grad_norm": 0.2543714642524719, "kl": 2.5618821382522583e-05, "lambda_div_used": 0.6, "learning_rate": 2.8e-07, "loss": 0.0, "reward": -0.055498819798231125, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.055498819798231125, "reward_after_std": 0.8212512582540512, "reward_before_mean": 0.2946751266717911, "reward_before_std": 0.8422470949590206, "reward_change_max": 0.001776367425918579, "reward_change_mean": -0.3501739474013448, "reward_change_min": -0.8158514685928822, "reward_change_std": 0.3071507504209876, "reward_std": 0.8212512955069542, "rewards/cosine_scaled_reward": -0.04016243852674961, "rewards/format_reward": 0.37500000931322575, "step": 14 }, { "advantage_max": 1.868079349398613, "advantage_mean": 1.8316010486074674e-08, "advantage_min": -0.7876945361495018, "advantage_std": 0.999783918261528, "completion_length": 2822.9791717529297, "epoch": 0.017142857142857144, "grad_norm": 0.17979443073272705, "kl": 2.6114284992218018e-05, "lambda_div_used": 0.6, "learning_rate": 3e-07, "loss": 0.0, "reward": -0.01977388933300972, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.01977388933300972, "reward_after_std": 0.612056341022253, "reward_before_mean": 0.38737083226442337, "reward_before_std": 0.5165360234677792, "reward_change_max": 0.0003376305103302002, "reward_change_mean": -0.4071447290480137, "reward_change_min": -0.7115199901163578, "reward_change_std": 0.2606779634952545, "reward_std": 0.6120563521981239, "rewards/cosine_scaled_reward": 0.006185416132211685, "rewards/format_reward": 0.3750000037252903, "step": 15 }, { "advantage_max": 1.8392015546560287, "advantage_mean": -4.967053768289986e-09, "advantage_min": -0.8386228755116463, "advantage_std": 0.9997298642992973, "completion_length": 3462.1041870117188, "epoch": 0.018285714285714287, "grad_norm": 0.2112903892993927, "kl": 4.178285598754883e-05, "lambda_div_used": 0.6, "learning_rate": 3.2e-07, "loss": 0.0, "reward": -0.5464260056614876, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5464260056614876, "reward_after_std": 0.5875924732536077, "reward_before_mean": -0.43014006270095706, "reward_before_std": 0.5928587820380926, "reward_change_max": 0.0021633952856063843, "reward_change_mean": -0.11628596065565944, "reward_change_min": -0.28673963621258736, "reward_change_std": 0.11779441172257066, "reward_std": 0.5875924807041883, "rewards/cosine_scaled_reward": -0.2567367013543844, "rewards/format_reward": 0.0833333358168602, "step": 16 }, { "advantage_max": 1.7946833074092865, "advantage_mean": 7.761021159069514e-09, "advantage_min": -0.884327657520771, "advantage_std": 0.9998291656374931, "completion_length": 2296.708396911621, "epoch": 0.019428571428571427, "grad_norm": 0.28919804096221924, "kl": 3.8780272006988525e-05, "lambda_div_used": 0.6, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": 0.03589238924905658, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.03589238924905658, "reward_after_std": 0.7403586134314537, "reward_before_mean": 0.45668516401201487, "reward_before_std": 0.7449082098901272, "reward_change_max": 0.001710943877696991, "reward_change_mean": -0.4207927817478776, "reward_change_min": -0.8583872355520725, "reward_change_std": 0.32544669695198536, "reward_std": 0.7403586395084858, "rewards/cosine_scaled_reward": -0.0737407635897398, "rewards/format_reward": 0.6041666679084301, "step": 17 }, { "advantage_max": 1.8921531289815903, "advantage_mean": 2.048909714114089e-08, "advantage_min": -0.7638939619064331, "advantage_std": 0.9998648390173912, "completion_length": 3076.6250610351562, "epoch": 0.02057142857142857, "grad_norm": 0.15994614362716675, "kl": 2.356991171836853e-05, "lambda_div_used": 0.6, "learning_rate": 3.6e-07, "loss": 0.0, "reward": -0.0722283124923706, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.0722283124923706, "reward_after_std": 0.9180725961923599, "reward_before_mean": 0.24214321840554476, "reward_before_std": 0.9020998664200306, "reward_change_max": 0.0006986036896705627, "reward_change_mean": -0.3143715038895607, "reward_change_min": -0.6540146470069885, "reward_change_std": 0.26200826931744814, "reward_std": 0.9180726371705532, "rewards/cosine_scaled_reward": -0.08726172894239426, "rewards/format_reward": 0.41666667349636555, "step": 18 }, { "advantage_max": 1.7503979355096817, "advantage_mean": 1.4901161193847656e-08, "advantage_min": -0.8893851488828659, "advantage_std": 0.9998177886009216, "completion_length": 2895.6458740234375, "epoch": 0.021714285714285714, "grad_norm": 0.17977163195610046, "kl": 2.0965933799743652e-05, "lambda_div_used": 0.6, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 0.20064589567482471, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20064589567482471, "reward_after_std": 0.769149724394083, "reward_before_mean": 0.7063506981357932, "reward_before_std": 0.7390439650043845, "reward_change_max": 0.0026439353823661804, "reward_change_mean": -0.5057047791779041, "reward_change_min": -0.9240194708108902, "reward_change_std": 0.3898439407348633, "reward_std": 0.7691497392952442, "rewards/cosine_scaled_reward": 0.13442532037151977, "rewards/format_reward": 0.4375000037252903, "step": 19 }, { "advantage_max": 1.7905155718326569, "advantage_mean": 3.1044089521259366e-09, "advantage_min": -1.0227502509951591, "advantage_std": 0.9998354762792587, "completion_length": 2309.041717529297, "epoch": 0.022857142857142857, "grad_norm": 0.23434168100357056, "kl": 1.6529113054275513e-05, "lambda_div_used": 0.6, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.10582668473944068, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10582668473944068, "reward_after_std": 0.8228973932564259, "reward_before_mean": 0.5439623333513737, "reward_before_std": 0.8307750299572945, "reward_change_max": 0.0010927170515060425, "reward_change_mean": -0.43813565373420715, "reward_change_min": -0.8274941109120846, "reward_change_std": 0.34579705353826284, "reward_std": 0.8228974267840385, "rewards/cosine_scaled_reward": -0.050935512874275446, "rewards/format_reward": 0.6458333469927311, "step": 20 }, { "advantage_max": 1.8194524645805359, "advantage_mean": 5.525847429632691e-08, "advantage_min": -0.9087987467646599, "advantage_std": 0.9997514858841896, "completion_length": 2732.5833740234375, "epoch": 0.024, "grad_norm": 0.25368764996528625, "kl": 3.4362077713012695e-05, "lambda_div_used": 0.6, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": -0.10419294983148575, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.10419294983148575, "reward_after_std": 0.7973214015364647, "reward_before_mean": 0.2189039383083582, "reward_before_std": 0.7819975260645151, "reward_change_max": 0.0013872236013412476, "reward_change_mean": -0.3230968825519085, "reward_change_min": -0.6084227226674557, "reward_change_std": 0.25168567057698965, "reward_std": 0.7973214238882065, "rewards/cosine_scaled_reward": -0.08846470632124692, "rewards/format_reward": 0.39583334513008595, "step": 21 }, { "advantage_max": 1.8503218442201614, "advantage_mean": -1.9247333726823967e-08, "advantage_min": -0.8788245841860771, "advantage_std": 0.9998400658369064, "completion_length": 1828.4166870117188, "epoch": 0.025142857142857144, "grad_norm": 0.28431108593940735, "kl": 3.190571442246437e-05, "lambda_div_used": 0.6, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.19262014399282634, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19262014399282634, "reward_after_std": 0.735573273152113, "reward_before_mean": 0.6972342263907194, "reward_before_std": 0.671034948900342, "reward_change_max": 0.0, "reward_change_mean": -0.5046140514314175, "reward_change_min": -0.9077365770936012, "reward_change_std": 0.35123981535434723, "reward_std": 0.7355732768774033, "rewards/cosine_scaled_reward": -0.04721624404191971, "rewards/format_reward": 0.7916666753590107, "step": 22 }, { "advantage_max": 1.824779137969017, "advantage_mean": 1.2417634476236117e-08, "advantage_min": -0.8828606568276882, "advantage_std": 0.9998202919960022, "completion_length": 2416.9792098999023, "epoch": 0.026285714285714287, "grad_norm": 0.21872000396251678, "kl": 3.149360418319702e-05, "lambda_div_used": 0.6, "learning_rate": 4.6e-07, "loss": 0.0, "reward": -0.10515248589217663, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.10515248589217663, "reward_after_std": 0.7565237432718277, "reward_before_mean": 0.22834152821451426, "reward_before_std": 0.7646547462791204, "reward_change_max": 0.0011374279856681824, "reward_change_mean": -0.3334940178319812, "reward_change_min": -0.7022706530988216, "reward_change_std": 0.28038450982421637, "reward_std": 0.7565237805247307, "rewards/cosine_scaled_reward": -0.13582924474030733, "rewards/format_reward": 0.5000000074505806, "step": 23 }, { "advantage_max": 1.7469995766878128, "advantage_mean": 2.9491878938969762e-09, "advantage_min": -1.066501371562481, "advantage_std": 0.9998574405908585, "completion_length": 2877.500045776367, "epoch": 0.027428571428571427, "grad_norm": 0.24179069697856903, "kl": 3.087148070335388e-05, "lambda_div_used": 0.6, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.08253479516133666, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.08253479516133666, "reward_after_std": 0.9543181359767914, "reward_before_mean": 0.48664891347289085, "reward_before_std": 1.0185184814035892, "reward_change_max": 0.0005891844630241394, "reward_change_mean": -0.4041141299530864, "reward_change_min": -0.8701234757900238, "reward_change_std": 0.36574244499206543, "reward_std": 0.954318180680275, "rewards/cosine_scaled_reward": 0.0037411183584481478, "rewards/format_reward": 0.4791666828095913, "step": 24 }, { "advantage_max": 1.7304434180259705, "advantage_mean": 6.332993629509787e-08, "advantage_min": -1.050755836069584, "advantage_std": 0.9998001903295517, "completion_length": 2825.6458740234375, "epoch": 0.02857142857142857, "grad_norm": 0.2251826375722885, "kl": 3.547966480255127e-05, "lambda_div_used": 0.6, "learning_rate": 5e-07, "loss": 0.0, "reward": -0.13675972539931536, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.13675972539931536, "reward_after_std": 0.7334093153476715, "reward_before_mean": 0.18887527519837022, "reward_before_std": 0.7841606214642525, "reward_change_max": 0.0004205778241157532, "reward_change_mean": -0.3256350150331855, "reward_change_min": -0.6350610516965389, "reward_change_std": 0.2870927806943655, "reward_std": 0.7334093227982521, "rewards/cosine_scaled_reward": -0.08264568448066711, "rewards/format_reward": 0.3541666716337204, "step": 25 }, { "advantage_max": 1.8000003397464752, "advantage_mean": 5.5879355587151736e-08, "advantage_min": -0.9525649920105934, "advantage_std": 0.9997768253087997, "completion_length": 2798.562515258789, "epoch": 0.029714285714285714, "grad_norm": 0.16408225893974304, "kl": 3.130175173282623e-05, "lambda_div_used": 0.6, "learning_rate": 5.2e-07, "loss": 0.0, "reward": 0.019447185564786196, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.019447185564786196, "reward_after_std": 0.5030231699347496, "reward_before_mean": 0.4766043610870838, "reward_before_std": 0.41719701141119003, "reward_change_max": 0.00043053925037384033, "reward_change_mean": -0.45715717040002346, "reward_change_min": -0.6959591060876846, "reward_change_std": 0.2762450519949198, "reward_std": 0.5030231848359108, "rewards/cosine_scaled_reward": -0.03253114968538284, "rewards/format_reward": 0.5416666679084301, "step": 26 }, { "advantage_max": 1.779951274394989, "advantage_mean": -6.208815128694312e-10, "advantage_min": -0.8513502217829227, "advantage_std": 0.999822735786438, "completion_length": 3026.875045776367, "epoch": 0.030857142857142857, "grad_norm": 0.1849300116300583, "kl": 3.397837281227112e-05, "lambda_div_used": 0.6, "learning_rate": 5.4e-07, "loss": 0.0, "reward": -0.10913681925740093, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.10913681925740093, "reward_after_std": 0.7921761274337769, "reward_before_mean": 0.22155018523335457, "reward_before_std": 0.8362809754908085, "reward_change_max": 0.00031948089599609375, "reward_change_mean": -0.3306870339438319, "reward_change_min": -0.7002861388027668, "reward_change_std": 0.2832873035222292, "reward_std": 0.7921761721372604, "rewards/cosine_scaled_reward": -0.08714157156646252, "rewards/format_reward": 0.39583333767950535, "step": 27 }, { "advantage_max": 1.7814081907272339, "advantage_mean": 1.2417630257388623e-09, "advantage_min": -0.9797869324684143, "advantage_std": 0.9997996240854263, "completion_length": 2828.3542098999023, "epoch": 0.032, "grad_norm": 0.19132548570632935, "kl": 2.2899359464645386e-05, "lambda_div_used": 0.6, "learning_rate": 5.6e-07, "loss": 0.0, "reward": -0.09824594110250473, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09824594110250473, "reward_after_std": 0.680554173886776, "reward_before_mean": 0.2582579329609871, "reward_before_std": 0.6859657093882561, "reward_change_max": 0.0011489912867546082, "reward_change_mean": -0.3565038787201047, "reward_change_min": -0.7232412360608578, "reward_change_std": 0.2864949721843004, "reward_std": 0.6805541850626469, "rewards/cosine_scaled_reward": -0.0687877181917429, "rewards/format_reward": 0.39583334513008595, "step": 28 }, { "advantage_max": 1.8605255335569382, "advantage_mean": 7.202228036184977e-08, "advantage_min": -0.8301983177661896, "advantage_std": 0.9997770339250565, "completion_length": 3273.7083587646484, "epoch": 0.03314285714285714, "grad_norm": 0.18308158218860626, "kl": 1.9058585166931152e-05, "lambda_div_used": 0.6, "learning_rate": 5.8e-07, "loss": 0.0, "reward": -0.47089114785194397, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.47089114785194397, "reward_after_std": 0.6084134839475155, "reward_before_mean": -0.31675157556310296, "reward_before_std": 0.6089425943791866, "reward_change_max": 0.0014341697096824646, "reward_change_mean": -0.1541395653039217, "reward_change_min": -0.35405242815613747, "reward_change_std": 0.14612348517403007, "reward_std": 0.608413502573967, "rewards/cosine_scaled_reward": -0.26254245825111866, "rewards/format_reward": 0.20833333767950535, "step": 29 }, { "advantage_max": 1.8005549013614655, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.9672626964747906, "advantage_std": 0.9998631924390793, "completion_length": 2901.6458892822266, "epoch": 0.03428571428571429, "grad_norm": 0.16869448125362396, "kl": 2.5976449251174927e-05, "lambda_div_used": 0.6, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.08703623432666063, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.08703623432666063, "reward_after_std": 0.962010782212019, "reward_before_mean": 0.49391913414001465, "reward_before_std": 1.0108840502798557, "reward_change_max": 0.00033224374055862427, "reward_change_mean": -0.40688287653028965, "reward_change_min": -0.8312213607132435, "reward_change_std": 0.3514333504717797, "reward_std": 0.962010845541954, "rewards/cosine_scaled_reward": -0.013457119464874268, "rewards/format_reward": 0.5208333469927311, "step": 30 }, { "advantage_max": 1.791318416595459, "advantage_mean": 1.2417632477834672e-09, "advantage_min": -0.8292011320590973, "advantage_std": 0.9998136609792709, "completion_length": 2891.5833740234375, "epoch": 0.03542857142857143, "grad_norm": 0.20195633172988892, "kl": 3.875233232975006e-05, "lambda_div_used": 0.6, "learning_rate": 6.2e-07, "loss": 0.0, "reward": -0.1924915760755539, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1924915760755539, "reward_after_std": 0.7389788702130318, "reward_before_mean": 0.10049068834632635, "reward_before_std": 0.7730084210634232, "reward_change_max": 0.0006143301725387573, "reward_change_mean": -0.29298229329288006, "reward_change_min": -0.633475948125124, "reward_change_std": 0.26671546418219805, "reward_std": 0.7389788739383221, "rewards/cosine_scaled_reward": -0.1268379855901003, "rewards/format_reward": 0.3541666679084301, "step": 31 }, { "advantage_max": 1.7208815962076187, "advantage_mean": 2.4835269396561444e-08, "advantage_min": -1.1677290424704552, "advantage_std": 0.9997886344790459, "completion_length": 3198.979202270508, "epoch": 0.036571428571428574, "grad_norm": 0.18060676753520966, "kl": 2.734363079071045e-05, "lambda_div_used": 0.6, "learning_rate": 6.4e-07, "loss": 0.0, "reward": -0.02893495187163353, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.02893495187163353, "reward_after_std": 0.6704957820475101, "reward_before_mean": 0.37428003549575806, "reward_before_std": 0.7028882801532745, "reward_change_max": 0.0005309507250785828, "reward_change_mean": -0.40321500319987535, "reward_change_min": -0.6910530626773834, "reward_change_std": 0.29959324561059475, "reward_std": 0.6704958230257034, "rewards/cosine_scaled_reward": 0.020473352633416653, "rewards/format_reward": 0.33333334885537624, "step": 32 }, { "advantage_max": 1.7768725603818893, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.8517647087574005, "advantage_std": 0.9997695460915565, "completion_length": 3366.2708740234375, "epoch": 0.037714285714285714, "grad_norm": 0.1567767709493637, "kl": 3.699958324432373e-05, "lambda_div_used": 0.6, "learning_rate": 6.6e-07, "loss": 0.0, "reward": -0.2785487826913595, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2785487826913595, "reward_after_std": 0.658524302765727, "reward_before_mean": -0.01756126992404461, "reward_before_std": 0.6902536172419786, "reward_change_max": 0.001083478331565857, "reward_change_mean": -0.2609875090420246, "reward_change_min": -0.6648754775524139, "reward_change_std": 0.2532107653096318, "reward_std": 0.6585243381559849, "rewards/cosine_scaled_reward": -0.13378064148128033, "rewards/format_reward": 0.2500000037252903, "step": 33 }, { "advantage_max": 1.742632418870926, "advantage_mean": -3.725290742551124e-09, "advantage_min": -1.0178842395544052, "advantage_std": 0.9998548254370689, "completion_length": 2442.2708740234375, "epoch": 0.038857142857142854, "grad_norm": 0.26763319969177246, "kl": 3.138929605484009e-05, "lambda_div_used": 0.6, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": 0.2606486789882183, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2606486789882183, "reward_after_std": 0.9495027400553226, "reward_before_mean": 0.7638577073812485, "reward_before_std": 0.9792901389300823, "reward_change_max": 0.0011056289076805115, "reward_change_mean": -0.503209053305909, "reward_change_min": -0.9927790202200413, "reward_change_std": 0.382378701120615, "reward_std": 0.9495027586817741, "rewards/cosine_scaled_reward": 0.09026219043880701, "rewards/format_reward": 0.5833333358168602, "step": 34 }, { "advantage_max": 1.7523992359638214, "advantage_mean": 1.3659398279131096e-08, "advantage_min": -1.0934018939733505, "advantage_std": 0.9998025223612785, "completion_length": 2960.7083740234375, "epoch": 0.04, "grad_norm": 0.2008289247751236, "kl": 3.193691372871399e-05, "lambda_div_used": 0.6, "learning_rate": 7e-07, "loss": 0.0, "reward": -0.1078006811439991, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1078006811439991, "reward_after_std": 0.7199197486042976, "reward_before_mean": 0.23942278884351254, "reward_before_std": 0.7542877979576588, "reward_change_max": 0.0, "reward_change_mean": -0.34722347371280193, "reward_change_min": -0.6703316308557987, "reward_change_std": 0.29183086566627026, "reward_std": 0.7199197895824909, "rewards/cosine_scaled_reward": -0.0782052765134722, "rewards/format_reward": 0.3958333395421505, "step": 35 }, { "advantage_max": 1.8357672542333603, "advantage_mean": 5.58793539218172e-09, "advantage_min": -0.892547219991684, "advantage_std": 0.9997570440173149, "completion_length": 3353.3958740234375, "epoch": 0.04114285714285714, "grad_norm": 0.1734616905450821, "kl": 4.410743713378906e-05, "lambda_div_used": 0.6, "learning_rate": 7.2e-07, "loss": 0.0, "reward": -0.41738917178008705, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.41738917178008705, "reward_after_std": 0.5450928043574095, "reward_before_mean": -0.21742304414510727, "reward_before_std": 0.5365421585738659, "reward_change_max": 0.0034711062908172607, "reward_change_mean": -0.1999661261215806, "reward_change_min": -0.4123280607163906, "reward_change_std": 0.16749785374850035, "reward_std": 0.5450928211212158, "rewards/cosine_scaled_reward": -0.20246152579784393, "rewards/format_reward": 0.1875000074505806, "step": 36 }, { "advantage_max": 1.7173901945352554, "advantage_mean": 2.0489096086429015e-08, "advantage_min": -1.0507243163883686, "advantage_std": 0.9997305124998093, "completion_length": 3307.6666870117188, "epoch": 0.04228571428571429, "grad_norm": 0.16524305939674377, "kl": 2.351030707359314e-05, "lambda_div_used": 0.6, "learning_rate": 7.4e-07, "loss": 0.0, "reward": -0.45867132768034935, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.45867132768034935, "reward_after_std": 0.3930114023387432, "reward_before_mean": -0.24169572815299034, "reward_before_std": 0.4065242074429989, "reward_change_max": 0.00047623366117477417, "reward_change_mean": -0.21697559859603643, "reward_change_min": -0.3991989456117153, "reward_change_std": 0.16736689116805792, "reward_std": 0.3930114172399044, "rewards/cosine_scaled_reward": -0.22501453291624784, "rewards/format_reward": 0.2083333358168602, "step": 37 }, { "advantage_max": 1.7465081810951233, "advantage_mean": 2.6077033643545633e-08, "advantage_min": -0.9350218996405602, "advantage_std": 0.9997123703360558, "completion_length": 3334.8333435058594, "epoch": 0.04342857142857143, "grad_norm": 0.19420531392097473, "kl": 3.1050294637680054e-05, "lambda_div_used": 0.6, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": -0.4392921030521393, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4392921030521393, "reward_after_std": 0.4309826772660017, "reward_before_mean": -0.22022299468517303, "reward_before_std": 0.4509555771946907, "reward_change_max": 0.001096479594707489, "reward_change_mean": -0.21906912024132907, "reward_change_min": -0.43532417714595795, "reward_change_std": 0.180083560757339, "reward_std": 0.43098269030451775, "rewards/cosine_scaled_reward": -0.16219483315944672, "rewards/format_reward": 0.1041666716337204, "step": 38 }, { "advantage_max": 1.809293732047081, "advantage_mean": 2.2351744011217534e-08, "advantage_min": -0.9606939628720284, "advantage_std": 0.9998093545436859, "completion_length": 2840.895866394043, "epoch": 0.044571428571428574, "grad_norm": 0.2235952764749527, "kl": 2.9481947422027588e-05, "lambda_div_used": 0.6, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": 0.018166373018175364, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.018166373018175364, "reward_after_std": 0.7264602929353714, "reward_before_mean": 0.42779126949608326, "reward_before_std": 0.6920085242018104, "reward_change_max": 0.002504482865333557, "reward_change_mean": -0.409624888561666, "reward_change_min": -0.7010375969111919, "reward_change_std": 0.29587725829333067, "reward_std": 0.726460300385952, "rewards/cosine_scaled_reward": -0.01527103316038847, "rewards/format_reward": 0.45833334513008595, "step": 39 }, { "advantage_max": 1.8042074739933014, "advantage_mean": -7.450580263856921e-09, "advantage_min": -1.0113056749105453, "advantage_std": 0.9997925981879234, "completion_length": 2649.5625228881836, "epoch": 0.045714285714285714, "grad_norm": 0.21060492098331451, "kl": 4.390254616737366e-05, "lambda_div_used": 0.6, "learning_rate": 8e-07, "loss": 0.0, "reward": -0.13332401355728507, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.13332401355728507, "reward_after_std": 0.5301584340631962, "reward_before_mean": 0.23096877778880298, "reward_before_std": 0.4900096170604229, "reward_change_max": 0.001264527440071106, "reward_change_mean": -0.3642928283661604, "reward_change_min": -0.6233471073210239, "reward_change_std": 0.25746062211692333, "reward_std": 0.5301584452390671, "rewards/cosine_scaled_reward": -0.12409894913434982, "rewards/format_reward": 0.4791666716337204, "step": 40 }, { "advantage_max": 1.8728558719158173, "advantage_mean": 3.104408707876871e-08, "advantage_min": -0.8173846006393433, "advantage_std": 0.9997896775603294, "completion_length": 3017.5416870117188, "epoch": 0.046857142857142854, "grad_norm": 0.16901332139968872, "kl": 3.6947429180145264e-05, "lambda_div_used": 0.6, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": -0.24943608665489592, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.24943608665489592, "reward_after_std": 0.6227637939155102, "reward_before_mean": 0.025789468549191952, "reward_before_std": 0.586168160662055, "reward_change_max": 0.0007337778806686401, "reward_change_mean": -0.2752255443483591, "reward_change_min": -0.5554584003984928, "reward_change_std": 0.19899009726941586, "reward_std": 0.6227638311684132, "rewards/cosine_scaled_reward": -0.1850219412590377, "rewards/format_reward": 0.3958333358168602, "step": 41 }, { "advantage_max": 1.8519198596477509, "advantage_mean": -6.208818792430293e-09, "advantage_min": -0.8044792786240578, "advantage_std": 0.9996784925460815, "completion_length": 2845.4583435058594, "epoch": 0.048, "grad_norm": 0.3022547662258148, "kl": 4.138052463531494e-05, "lambda_div_used": 0.6, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": -0.5167930386960506, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5167930386960506, "reward_after_std": 0.36160215362906456, "reward_before_mean": -0.33381566777825356, "reward_before_std": 0.3440868891775608, "reward_change_max": 0.0013512372970581055, "reward_change_mean": -0.18297737976536155, "reward_change_min": -0.3526080325245857, "reward_change_std": 0.14170236326754093, "reward_std": 0.36160216107964516, "rewards/cosine_scaled_reward": -0.3335745017975569, "rewards/format_reward": 0.3333333358168602, "step": 42 }, { "advantage_max": 1.7674841284751892, "advantage_mean": 4.221995697495373e-08, "advantage_min": -0.9594441875815392, "advantage_std": 0.9998220577836037, "completion_length": 2943.750045776367, "epoch": 0.04914285714285714, "grad_norm": 0.19866588711738586, "kl": 4.484504461288452e-05, "lambda_div_used": 0.6, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": -0.17242285422980785, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.17242285422980785, "reward_after_std": 0.7026234902441502, "reward_before_mean": 0.13483278080821037, "reward_before_std": 0.7139367610216141, "reward_change_max": 0.0017592236399650574, "reward_change_mean": -0.3072556145489216, "reward_change_min": -0.6003884114325047, "reward_change_std": 0.24329772219061852, "reward_std": 0.7026234939694405, "rewards/cosine_scaled_reward": -0.07841694308444858, "rewards/format_reward": 0.2916666679084301, "step": 43 }, { "advantage_max": 1.7240224331617355, "advantage_mean": 3.7873785108111235e-08, "advantage_min": -1.0484469085931778, "advantage_std": 0.9998211860656738, "completion_length": 2802.4583740234375, "epoch": 0.05028571428571429, "grad_norm": 0.236885666847229, "kl": 0.00011239200830459595, "lambda_div_used": 0.6, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": 0.020315666333772242, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.020315666333772242, "reward_after_std": 0.7511612996459007, "reward_before_mean": 0.43610192835330963, "reward_before_std": 0.8198062926530838, "reward_change_max": 0.0011220425367355347, "reward_change_mean": -0.41578628378920257, "reward_change_min": -0.8347551226615906, "reward_change_std": 0.3477799710817635, "reward_std": 0.7511613145470619, "rewards/cosine_scaled_reward": -0.02153235487639904, "rewards/format_reward": 0.4791666828095913, "step": 44 }, { "advantage_max": 1.8154965788125992, "advantage_mean": 1.3348956495740083e-08, "advantage_min": -0.9399640262126923, "advantage_std": 0.9997976571321487, "completion_length": 3300.5000610351562, "epoch": 0.05142857142857143, "grad_norm": 0.14654569327831268, "kl": 4.844926297664642e-05, "lambda_div_used": 0.6, "learning_rate": 9e-07, "loss": 0.0, "reward": -0.027277782559394836, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.027277782559394836, "reward_after_std": 0.7254571802914143, "reward_before_mean": 0.3635289091616869, "reward_before_std": 0.7658616602420807, "reward_change_max": 0.0018416717648506165, "reward_change_mean": -0.39080664864741266, "reward_change_min": -0.7719079181551933, "reward_change_std": 0.31446282705292106, "reward_std": 0.7254571970552206, "rewards/cosine_scaled_reward": -0.005735563114285469, "rewards/format_reward": 0.37500000558793545, "step": 45 }, { "advantage_max": 1.7028233408927917, "advantage_mean": 7.512669064624333e-08, "advantage_min": -1.0628773123025894, "advantage_std": 0.9997040554881096, "completion_length": 3280.1458435058594, "epoch": 0.052571428571428575, "grad_norm": 0.1873319298028946, "kl": 8.474476635456085e-05, "lambda_div_used": 0.6, "learning_rate": 9.2e-07, "loss": 0.0, "reward": -0.529501348733902, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.529501348733902, "reward_after_std": 0.3474753201007843, "reward_before_mean": -0.3440132327377796, "reward_before_std": 0.36735135316848755, "reward_change_max": 0.0011699274182319641, "reward_change_mean": -0.18548810062929988, "reward_change_min": -0.37992362678050995, "reward_change_std": 0.15657844487577677, "reward_std": 0.3474753201007843, "rewards/cosine_scaled_reward": -0.24492328986525536, "rewards/format_reward": 0.14583333395421505, "step": 46 }, { "advantage_max": 1.727543666958809, "advantage_mean": -1.4901161637936866e-08, "advantage_min": -0.9817303344607353, "advantage_std": 0.9998201057314873, "completion_length": 2743.1042098999023, "epoch": 0.053714285714285714, "grad_norm": 0.20242835581302643, "kl": 4.671327769756317e-05, "lambda_div_used": 0.6, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": 0.17383363377302885, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.17383363377302885, "reward_after_std": 0.8287919256836176, "reward_before_mean": 0.6626354195177555, "reward_before_std": 0.8962942529469728, "reward_change_max": 0.0017872899770736694, "reward_change_mean": -0.48880180856212974, "reward_change_min": -0.9589711390435696, "reward_change_std": 0.40080668311566114, "reward_std": 0.8287919294089079, "rewards/cosine_scaled_reward": 0.060484373942017555, "rewards/format_reward": 0.5416666716337204, "step": 47 }, { "advantage_max": 1.8530944287776947, "advantage_mean": 1.2417634698280722e-08, "advantage_min": -0.8744976595044136, "advantage_std": 0.9998347014188766, "completion_length": 2797.2291870117188, "epoch": 0.054857142857142854, "grad_norm": 0.220950186252594, "kl": 0.0001786835491657257, "lambda_div_used": 0.6, "learning_rate": 9.6e-07, "loss": 0.0, "reward": -0.050193486735224724, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.050193486735224724, "reward_after_std": 0.9647956900298595, "reward_before_mean": 0.25988880917429924, "reward_before_std": 0.9481890201568604, "reward_change_max": 0.0005988627672195435, "reward_change_mean": -0.3100823136046529, "reward_change_min": -0.6051475256681442, "reward_change_std": 0.23567491210997105, "reward_std": 0.9647957049310207, "rewards/cosine_scaled_reward": -0.05755559680983424, "rewards/format_reward": 0.3750000037252903, "step": 48 }, { "advantage_max": 1.8145934343338013, "advantage_mean": 2.9026220982331097e-08, "advantage_min": -0.9026520848274231, "advantage_std": 0.9998468682169914, "completion_length": 2319.1875381469727, "epoch": 0.056, "grad_norm": 0.20816369354724884, "kl": 0.00010545924305915833, "lambda_div_used": 0.6, "learning_rate": 9.8e-07, "loss": 0.0, "reward": 0.14474806562066078, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14474806562066078, "reward_after_std": 0.8051043152809143, "reward_before_mean": 0.6113051008433104, "reward_before_std": 0.7825077921152115, "reward_change_max": 0.0, "reward_change_mean": -0.46655698027461767, "reward_change_min": -0.8942185193300247, "reward_change_std": 0.33377677015960217, "reward_std": 0.8051043301820755, "rewards/cosine_scaled_reward": -0.006847476586699486, "rewards/format_reward": 0.6250000093132257, "step": 49 }, { "advantage_max": 1.889391914010048, "advantage_mean": -1.676380800841315e-08, "advantage_min": -0.8229124620556831, "advantage_std": 0.9997017830610275, "completion_length": 3018.937511444092, "epoch": 0.05714285714285714, "grad_norm": 0.15827570855617523, "kl": 0.00010543933603912592, "lambda_div_used": 0.6, "learning_rate": 1e-06, "loss": 0.0, "reward": -0.03166187182068825, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03166187182068825, "reward_after_std": 0.5302102379500866, "reward_before_mean": 0.39034452475607395, "reward_before_std": 0.4515397949144244, "reward_change_max": 0.0, "reward_change_mean": -0.4220064301043749, "reward_change_min": -0.6827008239924908, "reward_change_std": 0.27064335346221924, "reward_std": 0.5302102454006672, "rewards/cosine_scaled_reward": 0.02850560611113906, "rewards/format_reward": 0.33333333395421505, "step": 50 }, { "advantage_max": 1.7789610773324966, "advantage_mean": 3.166496753692627e-08, "advantage_min": -0.9067145064473152, "advantage_std": 0.9997837692499161, "completion_length": 2335.083351135254, "epoch": 0.05828571428571429, "grad_norm": 0.24217447638511658, "kl": 0.00023105740547180176, "lambda_div_used": 0.6, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": -0.08883536350913346, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08883536350913346, "reward_after_std": 0.6687220390886068, "reward_before_mean": 0.2738766521215439, "reward_before_std": 0.6639172211289406, "reward_change_max": 0.0006502941250801086, "reward_change_mean": -0.36271203216165304, "reward_change_min": -0.7275662198662758, "reward_change_std": 0.2778544398024678, "reward_std": 0.6687220819294453, "rewards/cosine_scaled_reward": -0.09222834184765816, "rewards/format_reward": 0.4583333358168602, "step": 51 }, { "advantage_max": 1.73988875746727, "advantage_mean": 1.707424765462484e-08, "advantage_min": -1.0492620393633842, "advantage_std": 0.9997946247458458, "completion_length": 2937.270835876465, "epoch": 0.05942857142857143, "grad_norm": 0.19648152589797974, "kl": 0.00017854571342468262, "lambda_div_used": 0.6, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": 0.03505727555602789, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03505727555602789, "reward_after_std": 0.8051403667777777, "reward_before_mean": 0.4398728273808956, "reward_before_std": 0.8179160077124834, "reward_change_max": 0.0011621415615081787, "reward_change_mean": -0.4048155304044485, "reward_change_min": -0.7212616428732872, "reward_change_std": 0.30738101061433554, "reward_std": 0.8051404003053904, "rewards/cosine_scaled_reward": 0.011603066697716713, "rewards/format_reward": 0.41666667722165585, "step": 52 }, { "advantage_max": 1.6888093948364258, "advantage_mean": 8.692344288796505e-09, "advantage_min": -1.1568487137556076, "advantage_std": 0.9998311847448349, "completion_length": 2733.479263305664, "epoch": 0.060571428571428575, "grad_norm": 0.19831141829490662, "kl": 0.00014853477478027344, "lambda_div_used": 0.6, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": 0.11825260240584612, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11825260240584612, "reward_after_std": 0.7896803952753544, "reward_before_mean": 0.5833198018372059, "reward_before_std": 0.8498901575803757, "reward_change_max": 0.0, "reward_change_mean": -0.46506719663739204, "reward_change_min": -0.9060896001756191, "reward_change_std": 0.37568887136876583, "reward_std": 0.7896804176270962, "rewards/cosine_scaled_reward": -0.010423431638628244, "rewards/format_reward": 0.604166679084301, "step": 53 }, { "advantage_max": 1.8639821112155914, "advantage_mean": 1.2417634698280722e-08, "advantage_min": -0.8045392781496048, "advantage_std": 0.9998516067862511, "completion_length": 2762.8959350585938, "epoch": 0.061714285714285715, "grad_norm": 0.16911473870277405, "kl": 8.419156074523926e-05, "lambda_div_used": 0.6, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.3884635865688324, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3884635865688324, "reward_after_std": 0.857769999653101, "reward_before_mean": 0.974736912176013, "reward_before_std": 0.769689017906785, "reward_change_max": 0.0013243556022644043, "reward_change_mean": -0.5862732660025358, "reward_change_min": -1.0673403665423393, "reward_change_std": 0.4096502447500825, "reward_std": 0.8577700182795525, "rewards/cosine_scaled_reward": 0.19570175930857658, "rewards/format_reward": 0.5833333414047956, "step": 54 }, { "advantage_max": 1.8454468548297882, "advantage_mean": 1.024454790443663e-08, "advantage_min": -0.8249358907341957, "advantage_std": 0.9998388141393661, "completion_length": 2930.6667251586914, "epoch": 0.06285714285714286, "grad_norm": 0.17054994404315948, "kl": 0.00012383796274662018, "lambda_div_used": 0.6, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.030168408062309027, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.030168408062309027, "reward_after_std": 0.8526755161583424, "reward_before_mean": 0.41612496972084045, "reward_before_std": 0.8229849487543106, "reward_change_max": 0.0006914958357810974, "reward_change_mean": -0.38595652766525745, "reward_change_min": -0.6950371004641056, "reward_change_std": 0.288301445543766, "reward_std": 0.8526755459606647, "rewards/cosine_scaled_reward": 0.020562471821904182, "rewards/format_reward": 0.3750000074505806, "step": 55 }, { "advantage_max": 1.8175309002399445, "advantage_mean": 3.725290476097598e-08, "advantage_min": -0.8730312213301659, "advantage_std": 0.9997734501957893, "completion_length": 2981.937545776367, "epoch": 0.064, "grad_norm": 0.16157382726669312, "kl": 6.651878356933594e-05, "lambda_div_used": 0.6, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": -0.13588544633239508, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13588544633239508, "reward_after_std": 0.5623712055385113, "reward_before_mean": 0.22082094359211624, "reward_before_std": 0.5218833908438683, "reward_change_max": 0.00018957257270812988, "reward_change_mean": -0.3567063990049064, "reward_change_min": -0.647631298750639, "reward_change_std": 0.25431928131729364, "reward_std": 0.5623712316155434, "rewards/cosine_scaled_reward": -0.09792286064475775, "rewards/format_reward": 0.4166666716337204, "step": 56 }, { "advantage_max": 1.9298505038022995, "advantage_mean": 7.947286273513043e-08, "advantage_min": -0.7080076560378075, "advantage_std": 0.9997479617595673, "completion_length": 3066.875030517578, "epoch": 0.06514285714285714, "grad_norm": 0.1452140212059021, "kl": 6.233155727386475e-05, "lambda_div_used": 0.6, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": -0.2912606264776514, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2912606264776514, "reward_after_std": 0.747662709094584, "reward_before_mean": -0.06645372789353132, "reward_before_std": 0.7131164316087961, "reward_change_max": 0.0018123239278793335, "reward_change_mean": -0.22480689152143896, "reward_change_min": -0.4902408979833126, "reward_change_std": 0.18422507378272712, "reward_std": 0.7476627435535192, "rewards/cosine_scaled_reward": -0.1998935411684215, "rewards/format_reward": 0.33333334140479565, "step": 57 }, { "advantage_max": 1.8031867444515228, "advantage_mean": 8.692344177774203e-09, "advantage_min": -1.000834882259369, "advantage_std": 0.9998651817440987, "completion_length": 2100.375030517578, "epoch": 0.06628571428571428, "grad_norm": 0.2042514830827713, "kl": 0.0006186068058013916, "lambda_div_used": 0.6, "learning_rate": 9.992983438818915e-07, "loss": 0.0, "reward": 0.3083833637647331, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3083833637647331, "reward_after_std": 0.9442720264196396, "reward_before_mean": 0.8355601169168949, "reward_before_std": 0.92253103479743, "reward_change_max": 0.0006568878889083862, "reward_change_mean": -0.5271767731755972, "reward_change_min": -0.9465693905949593, "reward_change_std": 0.38795287534594536, "reward_std": 0.9442720338702202, "rewards/cosine_scaled_reward": 0.053196728229522705, "rewards/format_reward": 0.7291666716337204, "step": 58 }, { "advantage_max": 1.7461483031511307, "advantage_mean": 3.0267983897047657e-08, "advantage_min": -1.000838428735733, "advantage_std": 0.9997114017605782, "completion_length": 2875.6666717529297, "epoch": 0.06742857142857143, "grad_norm": 0.16439005732536316, "kl": 3.9380043745040894e-05, "lambda_div_used": 0.6, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": -0.19200942106544971, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.19200942106544971, "reward_after_std": 0.550924139097333, "reward_before_mean": 0.14425736293196678, "reward_before_std": 0.5736105926334858, "reward_change_max": 0.0010673105716705322, "reward_change_mean": -0.33626677468419075, "reward_change_min": -0.6364939995110035, "reward_change_std": 0.2666931441053748, "reward_std": 0.5509241484105587, "rewards/cosine_scaled_reward": -0.08412131667137146, "rewards/format_reward": 0.3125, "step": 59 }, { "advantage_max": 1.8866003900766373, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.8034778423607349, "advantage_std": 0.9998001158237457, "completion_length": 3011.6666870117188, "epoch": 0.06857142857142857, "grad_norm": 0.1728099286556244, "kl": 8.880347013473511e-05, "lambda_div_used": 0.6, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": -0.2994096493348479, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2994096493348479, "reward_after_std": 0.6945554278790951, "reward_before_mean": -0.06937994388863444, "reward_before_std": 0.6422449797391891, "reward_change_max": 0.0005838647484779358, "reward_change_mean": -0.23002970311790705, "reward_change_min": -0.4234812743961811, "reward_change_std": 0.16400552168488503, "reward_std": 0.6945554576814175, "rewards/cosine_scaled_reward": -0.1909399749711156, "rewards/format_reward": 0.31250000186264515, "step": 60 }, { "advantage_max": 1.8871331065893173, "advantage_mean": 1.738468902168222e-08, "advantage_min": -0.7573140487074852, "advantage_std": 0.9997683763504028, "completion_length": 3093.854217529297, "epoch": 0.06971428571428571, "grad_norm": 0.15688744187355042, "kl": 0.00017762184143066406, "lambda_div_used": 0.6, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": -0.11309731751680374, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11309731751680374, "reward_after_std": 0.5219951793551445, "reward_before_mean": 0.26111954916268587, "reward_before_std": 0.4263201951980591, "reward_change_max": 0.00035150349140167236, "reward_change_mean": -0.3742168480530381, "reward_change_min": -0.591254610568285, "reward_change_std": 0.23080396559089422, "reward_std": 0.5219952017068863, "rewards/cosine_scaled_reward": -0.10902357054874301, "rewards/format_reward": 0.4791666753590107, "step": 61 }, { "advantage_max": 1.852844849228859, "advantage_mean": -1.459072043741294e-08, "advantage_min": -0.884965643286705, "advantage_std": 0.9998951032757759, "completion_length": 2501.291748046875, "epoch": 0.07085714285714285, "grad_norm": 0.21886730194091797, "kl": 0.0006071189418435097, "lambda_div_used": 0.6, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": 0.2706692605279386, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2706692605279386, "reward_after_std": 1.1487024202942848, "reward_before_mean": 0.7268138364888728, "reward_before_std": 1.1312339007854462, "reward_change_max": 0.0002567693591117859, "reward_change_mean": -0.45614459551870823, "reward_change_min": -0.9497322998940945, "reward_change_std": 0.35773101449012756, "reward_std": 1.1487024575471878, "rewards/cosine_scaled_reward": 0.05090692872181535, "rewards/format_reward": 0.6250000055879354, "step": 62 }, { "advantage_max": 1.7459149807691574, "advantage_mean": -7.450581207546492e-09, "advantage_min": -0.9959862679243088, "advantage_std": 0.9998425021767616, "completion_length": 2272.5208892822266, "epoch": 0.072, "grad_norm": 0.2128225415945053, "kl": 0.0005265623331069946, "lambda_div_used": 0.6, "learning_rate": 9.981479793771866e-07, "loss": 0.0, "reward": 0.3422083929181099, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3422083929181099, "reward_after_std": 0.9376309607177973, "reward_before_mean": 0.9017110355198383, "reward_before_std": 0.9917434807866812, "reward_change_max": 0.00043205171823501587, "reward_change_mean": -0.559502637013793, "reward_change_min": -1.0579553097486496, "reward_change_std": 0.43955489341169596, "reward_std": 0.9376309644430876, "rewards/cosine_scaled_reward": 0.07585549168288708, "rewards/format_reward": 0.7500000149011612, "step": 63 }, { "advantage_max": 1.7108050882816315, "advantage_mean": -3.104408607956799e-08, "advantage_min": -1.0726191624999046, "advantage_std": 0.999819926917553, "completion_length": 2815.2083892822266, "epoch": 0.07314285714285715, "grad_norm": 0.18735727667808533, "kl": 0.00014835596084594727, "lambda_div_used": 0.6, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": 0.1047421507537365, "reward_advantage_correlation": 0.9999999999999994, "reward_after_mean": 0.1047421507537365, "reward_after_std": 0.6933653056621552, "reward_before_mean": 0.5733803249895573, "reward_before_std": 0.703533660620451, "reward_change_max": 0.0035287141799926758, "reward_change_mean": -0.46863821521401405, "reward_change_min": -0.8130603022873402, "reward_change_std": 0.33382952213287354, "reward_std": 0.6933653354644775, "rewards/cosine_scaled_reward": 0.057523492723703384, "rewards/format_reward": 0.4583333469927311, "step": 64 }, { "advantage_max": 1.901139497756958, "advantage_mean": -1.4901161193847656e-08, "advantage_min": -0.7846154049038887, "advantage_std": 0.9998325034976006, "completion_length": 2725.645881652832, "epoch": 0.07428571428571429, "grad_norm": 0.20273591578006744, "kl": 0.00024253875017166138, "lambda_div_used": 0.6, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": -0.02569293975830078, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.02569293975830078, "reward_after_std": 0.7628384791314602, "reward_before_mean": 0.34538572560995817, "reward_before_std": 0.7073428109288216, "reward_change_max": 0.0, "reward_change_mean": -0.371078678406775, "reward_change_min": -0.7337826155126095, "reward_change_std": 0.275781849399209, "reward_std": 0.7628384865820408, "rewards/cosine_scaled_reward": -0.07730714417994022, "rewards/format_reward": 0.5000000055879354, "step": 65 }, { "advantage_max": 1.8033942729234695, "advantage_mean": 8.071463331038586e-09, "advantage_min": -0.8609968945384026, "advantage_std": 0.9998007789254189, "completion_length": 2064.6250076293945, "epoch": 0.07542857142857143, "grad_norm": 0.24309836328029633, "kl": 0.0003443807363510132, "lambda_div_used": 0.6, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "reward": 0.10103908181190491, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10103908181190491, "reward_after_std": 0.6284786872565746, "reward_before_mean": 0.5767052434384823, "reward_before_std": 0.5687633650377393, "reward_change_max": 0.0008361414074897766, "reward_change_mean": -0.47566617419943213, "reward_change_min": -0.8255947642028332, "reward_change_std": 0.3140420475974679, "reward_std": 0.6284786984324455, "rewards/cosine_scaled_reward": 0.038352612406015396, "rewards/format_reward": 0.5, "step": 66 }, { "advantage_max": 1.7674315869808197, "advantage_mean": 1.614292477469803e-08, "advantage_min": -0.938122421503067, "advantage_std": 0.9997090101242065, "completion_length": 3382.8541870117188, "epoch": 0.07657142857142857, "grad_norm": 0.13286292552947998, "kl": 0.0003947615623474121, "lambda_div_used": 0.6, "learning_rate": 9.968344786479415e-07, "loss": 0.0, "reward": -0.4611010178923607, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.4611010178923607, "reward_after_std": 0.46187240816652775, "reward_before_mean": -0.26146249906742014, "reward_before_std": 0.4861889239400625, "reward_change_max": 0.0009711086750030518, "reward_change_mean": -0.19963854388333857, "reward_change_min": -0.45800918713212013, "reward_change_std": 0.1788791799917817, "reward_std": 0.4618724286556244, "rewards/cosine_scaled_reward": -0.2348979152739048, "rewards/format_reward": 0.2083333358168602, "step": 67 }, { "advantage_max": 1.8469493240118027, "advantage_mean": 2.483527050678447e-09, "advantage_min": -0.8409870713949203, "advantage_std": 0.9998388886451721, "completion_length": 2142.208381652832, "epoch": 0.07771428571428571, "grad_norm": 0.2571074366569519, "kl": 0.0017017126083374023, "lambda_div_used": 0.6, "learning_rate": 9.964516155915151e-07, "loss": 0.0001, "reward": 0.0018315929919481277, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0018315929919481277, "reward_after_std": 0.8450445830821991, "reward_before_mean": 0.3761431612074375, "reward_before_std": 0.8380554802715778, "reward_change_max": 0.0017983242869377136, "reward_change_mean": -0.37431156635284424, "reward_change_min": -0.8194547779858112, "reward_change_std": 0.30843253154307604, "reward_std": 0.8450446091592312, "rewards/cosine_scaled_reward": -0.11401176685467362, "rewards/format_reward": 0.6041666716337204, "step": 68 }, { "advantage_max": 1.8372850269079208, "advantage_mean": 4.0667754774847964e-08, "advantage_min": -0.8896334916353226, "advantage_std": 0.9997574761509895, "completion_length": 2471.770881652832, "epoch": 0.07885714285714286, "grad_norm": 0.2565942108631134, "kl": 0.001112222671508789, "lambda_div_used": 0.6, "learning_rate": 9.960469931131936e-07, "loss": 0.0, "reward": -0.27644870735821314, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.27644870735821314, "reward_after_std": 0.5243825484067202, "reward_before_mean": 0.007322388701140881, "reward_before_std": 0.5041719619184732, "reward_change_max": 0.0011881962418556213, "reward_change_mean": -0.2837710939347744, "reward_change_min": -0.5716921575367451, "reward_change_std": 0.21774671506136656, "reward_std": 0.5243825595825911, "rewards/cosine_scaled_reward": -0.23592214786913246, "rewards/format_reward": 0.4791666679084301, "step": 69 }, { "advantage_max": 1.8310918658971786, "advantage_mean": 4.346172166602713e-08, "advantage_min": -0.7979481443762779, "advantage_std": 0.9997915923595428, "completion_length": 3018.1458587646484, "epoch": 0.08, "grad_norm": 0.16229884326457977, "kl": 0.0007503684610128403, "lambda_div_used": 0.6, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": -0.2695963028818369, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2695963028818369, "reward_after_std": 0.6270924657583237, "reward_before_mean": -0.003460092470049858, "reward_before_std": 0.6113432552665472, "reward_change_max": 0.0, "reward_change_mean": -0.2661362048238516, "reward_change_min": -0.5505039319396019, "reward_change_std": 0.22055453341454268, "reward_std": 0.6270924992859364, "rewards/cosine_scaled_reward": -0.15798005042597651, "rewards/format_reward": 0.31250000186264515, "step": 70 }, { "advantage_max": 1.70066799223423, "advantage_mean": 5.091230215192866e-08, "advantage_min": -0.9549814835190773, "advantage_std": 0.9997783005237579, "completion_length": 2823.1666946411133, "epoch": 0.08114285714285714, "grad_norm": 0.24718300998210907, "kl": 0.0011773109436035156, "lambda_div_used": 0.6, "learning_rate": 9.951725498333448e-07, "loss": 0.0, "reward": -0.15892398823052645, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15892398823052645, "reward_after_std": 0.6443887799978256, "reward_before_mean": 0.17340507730841637, "reward_before_std": 0.6603858508169651, "reward_change_max": 0.0036379098892211914, "reward_change_mean": -0.33232905343174934, "reward_change_min": -0.6937352381646633, "reward_change_std": 0.27722177281975746, "reward_std": 0.6443887986242771, "rewards/cosine_scaled_reward": -0.06954746786504984, "rewards/format_reward": 0.3125, "step": 71 }, { "advantage_max": 1.7955728471279144, "advantage_mean": 2.6930746410691597e-08, "advantage_min": -0.9999164938926697, "advantage_std": 0.9997884854674339, "completion_length": 2870.666717529297, "epoch": 0.08228571428571428, "grad_norm": 0.20824484527111053, "kl": 0.0013428330421447754, "lambda_div_used": 0.6, "learning_rate": 9.947027716509488e-07, "loss": 0.0001, "reward": -0.32331820391118526, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.32331820391118526, "reward_after_std": 0.5827282182872295, "reward_before_mean": -0.07646113401278853, "reward_before_std": 0.5897139385342598, "reward_change_max": 0.0016376525163650513, "reward_change_mean": -0.2468570563942194, "reward_change_min": -0.4943818226456642, "reward_change_std": 0.20523380488157272, "reward_std": 0.5827282220125198, "rewards/cosine_scaled_reward": -0.21531390957534313, "rewards/format_reward": 0.354166679084301, "step": 72 }, { "advantage_max": 1.773019254207611, "advantage_mean": 5.494803217986899e-08, "advantage_min": -0.9967968463897705, "advantage_std": 0.999751403927803, "completion_length": 3529.7916870117188, "epoch": 0.08342857142857144, "grad_norm": 0.14700362086296082, "kl": 0.00020443648099899292, "lambda_div_used": 0.6, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "reward": -0.40370890498161316, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.40370890498161316, "reward_after_std": 0.5996265262365341, "reward_before_mean": -0.20144187100231647, "reward_before_std": 0.6425512656569481, "reward_change_max": 0.0006123930215835571, "reward_change_mean": -0.2022670367732644, "reward_change_min": -0.49701910093426704, "reward_change_std": 0.2075851233676076, "reward_std": 0.5996265448629856, "rewards/cosine_scaled_reward": -0.15280426386743784, "rewards/format_reward": 0.1041666679084301, "step": 73 }, { "advantage_max": 1.9407240599393845, "advantage_mean": 1.3659398390153399e-08, "advantage_min": -0.7461787275969982, "advantage_std": 0.9998097345232964, "completion_length": 3243.229217529297, "epoch": 0.08457142857142858, "grad_norm": 0.17531763017177582, "kl": 0.0008713230490684509, "lambda_div_used": 0.6, "learning_rate": 9.93698216681727e-07, "loss": 0.0, "reward": -0.09746089670807123, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09746089670807123, "reward_after_std": 0.8146995007991791, "reward_before_mean": 0.2181630413979292, "reward_before_std": 0.7365536075085402, "reward_change_max": 0.0005424246191978455, "reward_change_mean": -0.31562393717467785, "reward_change_min": -0.53596480935812, "reward_change_std": 0.21479328256100416, "reward_std": 0.8146995343267918, "rewards/cosine_scaled_reward": -0.015918486984446645, "rewards/format_reward": 0.2500000037252903, "step": 74 }, { "advantage_max": 1.8912398666143417, "advantage_mean": 9.93410831373609e-09, "advantage_min": -0.7490442171692848, "advantage_std": 0.9998228922486305, "completion_length": 2986.625045776367, "epoch": 0.08571428571428572, "grad_norm": 0.15843676030635834, "kl": 0.0008603427559137344, "lambda_div_used": 0.6, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "reward": 0.004519036039710045, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.004519036039710045, "reward_after_std": 0.7009322084486485, "reward_before_mean": 0.40569755621254444, "reward_before_std": 0.6009742096066475, "reward_change_max": 0.00026772916316986084, "reward_change_mean": -0.40117852203547955, "reward_change_min": -0.6780325099825859, "reward_change_std": 0.2633136510848999, "reward_std": 0.7009322196245193, "rewards/cosine_scaled_reward": 0.0153487678617239, "rewards/format_reward": 0.3750000037252903, "step": 75 }, { "advantage_max": 1.8082723319530487, "advantage_mean": 3.0423204899765466e-08, "advantage_min": -0.9330036044120789, "advantage_std": 0.9997623041272163, "completion_length": 3001.8541870117188, "epoch": 0.08685714285714285, "grad_norm": 0.20225301384925842, "kl": 0.00020218640565872192, "lambda_div_used": 0.6, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": -0.1935147661715746, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1935147661715746, "reward_after_std": 0.45791828259825706, "reward_before_mean": 0.1532026305794716, "reward_before_std": 0.4037418030202389, "reward_change_max": 0.0013406574726104736, "reward_change_mean": -0.34671736136078835, "reward_change_min": -0.5875682011246681, "reward_change_std": 0.2296199956908822, "reward_std": 0.45791828632354736, "rewards/cosine_scaled_reward": -0.1317320466041565, "rewards/format_reward": 0.41666667349636555, "step": 76 }, { "advantage_max": 1.8107152730226517, "advantage_mean": 3.849466734262563e-08, "advantage_min": -0.9610567763447762, "advantage_std": 0.9997978210449219, "completion_length": 3046.5416870117188, "epoch": 0.088, "grad_norm": 0.16308292746543884, "kl": 0.0002619922161102295, "lambda_div_used": 0.6, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": -0.2058363500982523, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2058363500982523, "reward_after_std": 0.5894700028002262, "reward_before_mean": 0.10926695168018341, "reward_before_std": 0.5918228011578321, "reward_change_max": 0.0014843419194221497, "reward_change_mean": -0.3151032840833068, "reward_change_min": -0.5777905434370041, "reward_change_std": 0.23432088736444712, "reward_std": 0.5894700065255165, "rewards/cosine_scaled_reward": -0.13286652602255344, "rewards/format_reward": 0.37500001676380634, "step": 77 }, { "advantage_max": 1.810091182589531, "advantage_mean": 3.04232042891428e-08, "advantage_min": -1.0055750012397766, "advantage_std": 0.9997722059488297, "completion_length": 3185.8750610351562, "epoch": 0.08914285714285715, "grad_norm": 0.17335101962089539, "kl": 0.00017173215746879578, "lambda_div_used": 0.6, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "reward": -0.12524622678756714, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.12524622678756714, "reward_after_std": 0.6566737592220306, "reward_before_mean": 0.2210826389491558, "reward_before_std": 0.6558897197246552, "reward_change_max": 0.0002913177013397217, "reward_change_mean": -0.3463288554921746, "reward_change_min": -0.6442321985960007, "reward_change_std": 0.26683663809672, "reward_std": 0.6566737834364176, "rewards/cosine_scaled_reward": -0.06654200842604041, "rewards/format_reward": 0.3541666753590107, "step": 78 }, { "advantage_max": 1.8899707645177841, "advantage_mean": 1.0244548626081595e-08, "advantage_min": -0.825028270483017, "advantage_std": 0.9998388290405273, "completion_length": 2240.5625228881836, "epoch": 0.09028571428571429, "grad_norm": 0.24236978590488434, "kl": 0.0011141449213027954, "lambda_div_used": 0.6, "learning_rate": 9.908088623197048e-07, "loss": 0.0, "reward": 0.07283397391438484, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07283397391438484, "reward_after_std": 0.7937607131898403, "reward_before_mean": 0.4887027923250571, "reward_before_std": 0.7088694777339697, "reward_change_max": 0.00013114511966705322, "reward_change_mean": -0.4158688234165311, "reward_change_min": -0.7517045177519321, "reward_change_std": 0.2835696069523692, "reward_std": 0.7937607355415821, "rewards/cosine_scaled_reward": -0.05773193761706352, "rewards/format_reward": 0.6041666697710752, "step": 79 }, { "advantage_max": 1.8568548262119293, "advantage_mean": -1.614292521878724e-08, "advantage_min": -0.837759755551815, "advantage_std": 0.9997959807515144, "completion_length": 3219.875030517578, "epoch": 0.09142857142857143, "grad_norm": 0.1928330361843109, "kl": 0.0007643923163414001, "lambda_div_used": 0.6, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": -0.28942321287468076, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.28942321287468076, "reward_after_std": 0.7306128852069378, "reward_before_mean": -0.057574108242988586, "reward_before_std": 0.7140483744442463, "reward_change_max": 0.0010956153273582458, "reward_change_mean": -0.2318491260521114, "reward_change_min": -0.5317598357796669, "reward_change_std": 0.19940282963216305, "reward_std": 0.7306128889322281, "rewards/cosine_scaled_reward": -0.15378706716001034, "rewards/format_reward": 0.2500000037252903, "step": 80 }, { "advantage_max": 1.7959967106580734, "advantage_mean": -1.1796753740522803e-08, "advantage_min": -0.9497917145490646, "advantage_std": 0.9997836574912071, "completion_length": 3129.7083435058594, "epoch": 0.09257142857142857, "grad_norm": 0.27863961458206177, "kl": 0.002306640148162842, "lambda_div_used": 0.6, "learning_rate": 9.895025252503755e-07, "loss": 0.0001, "reward": -0.2474798383191228, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2474798383191228, "reward_after_std": 0.5966678149998188, "reward_before_mean": 0.03739765891805291, "reward_before_std": 0.5835473164916039, "reward_change_max": 0.001976579427719116, "reward_change_mean": -0.2848775088787079, "reward_change_min": -0.5357042364776134, "reward_change_std": 0.21301922667771578, "reward_std": 0.5966678410768509, "rewards/cosine_scaled_reward": -0.1479678377509117, "rewards/format_reward": 0.3333333358168602, "step": 81 }, { "advantage_max": 1.8437489867210388, "advantage_mean": 4.967054767490708e-09, "advantage_min": -0.8466271758079529, "advantage_std": 0.9998117461800575, "completion_length": 2898.0208587646484, "epoch": 0.09371428571428571, "grad_norm": 0.18406398594379425, "kl": 0.0025584548711776733, "lambda_div_used": 0.6, "learning_rate": 9.888172094375033e-07, "loss": 0.0001, "reward": -0.016393298865295947, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.016393298865295947, "reward_after_std": 0.7960129491984844, "reward_before_mean": 0.35885727778077126, "reward_before_std": 0.7870577052235603, "reward_change_max": 0.0004786178469657898, "reward_change_mean": -0.37525059189647436, "reward_change_min": -0.7115802094340324, "reward_change_std": 0.2880655792541802, "reward_std": 0.7960129864513874, "rewards/cosine_scaled_reward": 0.0023453044705092907, "rewards/format_reward": 0.35416666977107525, "step": 82 }, { "advantage_max": 1.7741630524396896, "advantage_mean": 1.9868215961338365e-08, "advantage_min": -1.0308396220207214, "advantage_std": 0.9997642710804939, "completion_length": 2784.2916870117188, "epoch": 0.09485714285714286, "grad_norm": 0.2609867751598358, "kl": 0.0011954903602600098, "lambda_div_used": 0.6, "learning_rate": 9.881105062929221e-07, "loss": 0.0, "reward": -0.2998163793236017, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2998163793236017, "reward_after_std": 0.4641554616391659, "reward_before_mean": -0.013990622013807297, "reward_before_std": 0.4607525132596493, "reward_change_max": 0.0005547106266021729, "reward_change_mean": -0.2858257554471493, "reward_change_min": -0.5067390538752079, "reward_change_std": 0.2103829812258482, "reward_std": 0.4641554690897465, "rewards/cosine_scaled_reward": -0.1632453203201294, "rewards/format_reward": 0.31250000186264515, "step": 83 }, { "advantage_max": 1.7930245250463486, "advantage_mean": 3.104408841103634e-09, "advantage_min": -0.8693301230669022, "advantage_std": 0.9997679516673088, "completion_length": 3040.5000534057617, "epoch": 0.096, "grad_norm": 0.1649995744228363, "kl": 0.0007286667823791504, "lambda_div_used": 0.6, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": -0.07544983178377151, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07544983178377151, "reward_after_std": 0.7895570639520884, "reward_before_mean": 0.27497592754662037, "reward_before_std": 0.830137187615037, "reward_change_max": 0.0006811842322349548, "reward_change_mean": -0.35042573790997267, "reward_change_min": -0.7961104810237885, "reward_change_std": 0.3109002844430506, "reward_std": 0.7895570639520884, "rewards/cosine_scaled_reward": -0.060428719967603683, "rewards/format_reward": 0.39583334140479565, "step": 84 }, { "advantage_max": 1.888422742486, "advantage_mean": 6.208817349140361e-09, "advantage_min": -0.79921755194664, "advantage_std": 0.9998757466673851, "completion_length": 3134.6250610351562, "epoch": 0.09714285714285714, "grad_norm": 0.168262779712677, "kl": 0.00031495094299316406, "lambda_div_used": 0.6, "learning_rate": 9.866330768241983e-07, "loss": 0.0, "reward": -0.060326272854581475, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.060326272854581475, "reward_after_std": 1.0223791375756264, "reward_before_mean": 0.23499422008171678, "reward_before_std": 0.9955884180963039, "reward_change_max": 0.0014316290616989136, "reward_change_mean": -0.2953204959630966, "reward_change_min": -0.6588696278631687, "reward_change_std": 0.255947170779109, "reward_std": 1.0223791673779488, "rewards/cosine_scaled_reward": -0.0804195562377572, "rewards/format_reward": 0.39583333767950535, "step": 85 }, { "advantage_max": 1.8527948707342148, "advantage_mean": 1.9868215850316062e-08, "advantage_min": -0.7902801856398582, "advantage_std": 0.9998051598668098, "completion_length": 2867.5625, "epoch": 0.09828571428571428, "grad_norm": 0.17173554003238678, "kl": 0.0012168288230895996, "lambda_div_used": 0.6, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "reward": -0.15069702547043562, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15069702547043562, "reward_after_std": 0.7066328749060631, "reward_before_mean": 0.1656637266278267, "reward_before_std": 0.6682115383446217, "reward_change_max": 0.0008284449577331543, "reward_change_mean": -0.31636073999106884, "reward_change_min": -0.6082081608474255, "reward_change_std": 0.24262152425944805, "reward_std": 0.706632886081934, "rewards/cosine_scaled_reward": -0.14633481635246426, "rewards/format_reward": 0.4583333358168602, "step": 86 }, { "advantage_max": 1.8305644243955612, "advantage_mean": -2.173085600354341e-09, "advantage_min": -0.8854763805866241, "advantage_std": 0.9998243600130081, "completion_length": 2582.4375534057617, "epoch": 0.09942857142857142, "grad_norm": 0.23738721013069153, "kl": 0.002800785005092621, "lambda_div_used": 0.6, "learning_rate": 9.850705248720068e-07, "loss": 0.0001, "reward": 0.013825018191710114, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.013825018191710114, "reward_after_std": 0.6578258685767651, "reward_before_mean": 0.4317212179303169, "reward_before_std": 0.6047324761748314, "reward_change_max": 0.0030826181173324585, "reward_change_mean": -0.41789623629301786, "reward_change_min": -0.7545126043260098, "reward_change_std": 0.2849441980943084, "reward_std": 0.6578258872032166, "rewards/cosine_scaled_reward": -0.05497271195054054, "rewards/format_reward": 0.541666679084301, "step": 87 }, { "advantage_max": 1.8509272336959839, "advantage_mean": -5.4637592616924024e-08, "advantage_min": -0.884626179933548, "advantage_std": 0.99986432492733, "completion_length": 2470.08341217041, "epoch": 0.10057142857142858, "grad_norm": 0.2240888774394989, "kl": 0.0013819336891174316, "lambda_div_used": 0.6, "learning_rate": 9.8425742251254e-07, "loss": 0.0001, "reward": 0.37665122747421265, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.37665122747421265, "reward_after_std": 0.867752481251955, "reward_before_mean": 0.9529164787381887, "reward_before_std": 0.7872258014976978, "reward_change_max": 0.0012845396995544434, "reward_change_mean": -0.5762653043493629, "reward_change_min": -0.9866040423512459, "reward_change_std": 0.3834084654226899, "reward_std": 0.8677525036036968, "rewards/cosine_scaled_reward": 0.1535415492253378, "rewards/format_reward": 0.6458333376795053, "step": 88 }, { "advantage_max": 1.7756440043449402, "advantage_mean": 3.880510621168121e-09, "advantage_min": -0.9730030819773674, "advantage_std": 0.9998276829719543, "completion_length": 3276.979217529297, "epoch": 0.10171428571428572, "grad_norm": 0.22678594291210175, "kl": 0.0013582706451416016, "lambda_div_used": 0.6, "learning_rate": 9.83423155058946e-07, "loss": 0.0001, "reward": -0.16686965757980943, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.16686965757980943, "reward_after_std": 0.7788497358560562, "reward_before_mean": 0.127953777089715, "reward_before_std": 0.8071900904178619, "reward_change_max": 0.001987569034099579, "reward_change_mean": -0.29482342256233096, "reward_change_min": -0.643274899572134, "reward_change_std": 0.2588015152141452, "reward_std": 0.7788497470319271, "rewards/cosine_scaled_reward": -0.08185644680634141, "rewards/format_reward": 0.2916666753590107, "step": 89 }, { "advantage_max": 1.9292222708463669, "advantage_mean": 2.6077032533322608e-08, "advantage_min": -0.7419086024165154, "advantage_std": 0.9997998923063278, "completion_length": 2387.083366394043, "epoch": 0.10285714285714286, "grad_norm": 0.37300607562065125, "kl": 0.0021688640117645264, "lambda_div_used": 0.6, "learning_rate": 9.825677631722435e-07, "loss": 0.0001, "reward": -0.13474958203732967, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13474958203732967, "reward_after_std": 0.7872820869088173, "reward_before_mean": 0.16399362310767174, "reward_before_std": 0.7284187152981758, "reward_change_max": 0.0013339966535568237, "reward_change_mean": -0.2987432088702917, "reward_change_min": -0.5778307020664215, "reward_change_std": 0.22088597994297743, "reward_std": 0.78728212043643, "rewards/cosine_scaled_reward": -0.178419857596964, "rewards/format_reward": 0.5208333395421505, "step": 90 }, { "advantage_max": 1.7450472861528397, "advantage_mean": 2.4059167547108018e-08, "advantage_min": -1.000647023320198, "advantage_std": 0.9997962266206741, "completion_length": 3159.437530517578, "epoch": 0.104, "grad_norm": 0.1630987524986267, "kl": 0.0008542463183403015, "lambda_div_used": 0.6, "learning_rate": 9.816912885430258e-07, "loss": 0.0, "reward": -0.18008676916360855, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.18008676916360855, "reward_after_std": 0.6187229715287685, "reward_before_mean": 0.14918549545109272, "reward_before_std": 0.6627273000776768, "reward_change_max": 0.0004763081669807434, "reward_change_mean": -0.3292722823098302, "reward_change_min": -0.6484544016420841, "reward_change_std": 0.2739113047719002, "reward_std": 0.6187229789793491, "rewards/cosine_scaled_reward": -0.11290724948048592, "rewards/format_reward": 0.37500001676380634, "step": 91 }, { "advantage_max": 1.8468924760818481, "advantage_mean": -3.104408841103634e-09, "advantage_min": -0.9322610720992088, "advantage_std": 0.9997585564851761, "completion_length": 2569.8333892822266, "epoch": 0.10514285714285715, "grad_norm": 0.25294139981269836, "kl": 0.0018187016248703003, "lambda_div_used": 0.6, "learning_rate": 9.807937738894303e-07, "loss": 0.0001, "reward": -0.22114436700940132, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.22114436700940132, "reward_after_std": 0.5883230771869421, "reward_before_mean": 0.08094780705869198, "reward_before_std": 0.558818282559514, "reward_change_max": 1.0579824447631836e-05, "reward_change_mean": -0.302092173602432, "reward_change_min": -0.6010795347392559, "reward_change_std": 0.22603166941553354, "reward_std": 0.5883230995386839, "rewards/cosine_scaled_reward": -0.1886927718296647, "rewards/format_reward": 0.45833334513008595, "step": 92 }, { "advantage_max": 1.7818111181259155, "advantage_mean": 3.787378544117814e-08, "advantage_min": -0.8869654014706612, "advantage_std": 0.999727226793766, "completion_length": 3536.9791870117188, "epoch": 0.10628571428571429, "grad_norm": 0.1902371197938919, "kl": 0.0012578368186950684, "lambda_div_used": 0.6, "learning_rate": 9.798752629550546e-07, "loss": 0.0001, "reward": -0.6223237328231335, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.6223237328231335, "reward_after_std": 0.3858645409345627, "reward_before_mean": -0.4991328977048397, "reward_before_std": 0.409189336001873, "reward_change_max": 0.003185272216796875, "reward_change_mean": -0.1231908411718905, "reward_change_min": -0.2996739409863949, "reward_change_std": 0.12465427769348025, "reward_std": 0.3858645521104336, "rewards/cosine_scaled_reward": -0.2703997865319252, "rewards/format_reward": 0.0416666679084301, "step": 93 }, { "advantage_max": 1.889383926987648, "advantage_mean": -1.4901161082825354e-08, "advantage_min": -0.879096981137991, "advantage_std": 0.999787226319313, "completion_length": 3138.395866394043, "epoch": 0.10742857142857143, "grad_norm": 0.22059182822704315, "kl": 0.001993924379348755, "lambda_div_used": 0.6, "learning_rate": 9.78935800506826e-07, "loss": 0.0001, "reward": -0.1963297836482525, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1963297836482525, "reward_after_std": 0.5589430667459965, "reward_before_mean": 0.12045633129309863, "reward_before_std": 0.4760522823780775, "reward_change_max": 0.0003600567579269409, "reward_change_mean": -0.3167861073743552, "reward_change_min": -0.4744899459183216, "reward_change_std": 0.20194733957760036, "reward_std": 0.5589430965483189, "rewards/cosine_scaled_reward": -0.06477185152471066, "rewards/format_reward": 0.2500000037252903, "step": 94 }, { "advantage_max": 1.7795635610818863, "advantage_mean": 5.4948036121160726e-08, "advantage_min": -0.9644187763333321, "advantage_std": 0.9996979087591171, "completion_length": 3409.187530517578, "epoch": 0.10857142857142857, "grad_norm": 0.19662852585315704, "kl": 0.00038304924964904785, "lambda_div_used": 0.6, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "reward": -0.594087558798492, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.594087558798492, "reward_after_std": 0.36136773228645325, "reward_before_mean": -0.44723618403077126, "reward_before_std": 0.3845880515873432, "reward_change_max": 0.0, "reward_change_mean": -0.14685136824846268, "reward_change_min": -0.3474042974412441, "reward_change_std": 0.13843790628015995, "reward_std": 0.36136773973703384, "rewards/cosine_scaled_reward": -0.2757014315575361, "rewards/format_reward": 0.10416666977107525, "step": 95 }, { "advantage_max": 1.834953397512436, "advantage_mean": 9.623667196478891e-09, "advantage_min": -0.9561122506856918, "advantage_std": 0.999815046787262, "completion_length": 2898.5, "epoch": 0.10971428571428571, "grad_norm": 0.19801871478557587, "kl": 0.0019068419933319092, "lambda_div_used": 0.6, "learning_rate": 9.769942052400235e-07, "loss": 0.0001, "reward": -0.05706032179296017, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05706032179296017, "reward_after_std": 0.7747535556554794, "reward_before_mean": 0.2972187213599682, "reward_before_std": 0.755788192152977, "reward_change_max": 0.0006535649299621582, "reward_change_mean": -0.3542790412902832, "reward_change_min": -0.6666507758200169, "reward_change_std": 0.2610515356063843, "reward_std": 0.7747535966336727, "rewards/cosine_scaled_reward": -0.02847396954894066, "rewards/format_reward": 0.3541666753590107, "step": 96 }, { "advantage_max": 1.7371751815080643, "advantage_mean": -6.208815683805824e-10, "advantage_min": -1.0732538476586342, "advantage_std": 0.9997835084795952, "completion_length": 3111.3334045410156, "epoch": 0.11085714285714286, "grad_norm": 0.21298745274543762, "kl": 0.000819966197013855, "lambda_div_used": 0.6, "learning_rate": 9.759921670520634e-07, "loss": 0.0, "reward": -0.12749477475881577, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.12749477475881577, "reward_after_std": 0.6070926785469055, "reward_before_mean": 0.2340460466220975, "reward_before_std": 0.6491605900228024, "reward_change_max": 0.0008164122700691223, "reward_change_mean": -0.36154082510620356, "reward_change_min": -0.658284705132246, "reward_change_std": 0.2894742302596569, "reward_std": 0.6070926897227764, "rewards/cosine_scaled_reward": -0.04964364320039749, "rewards/format_reward": 0.33333334513008595, "step": 97 }, { "advantage_max": 1.8331523686647415, "advantage_mean": 6.33299377383878e-08, "advantage_min": -0.9393685981631279, "advantage_std": 0.9997893199324608, "completion_length": 2636.229232788086, "epoch": 0.112, "grad_norm": 0.19141905009746552, "kl": 0.0006305575370788574, "lambda_div_used": 0.6, "learning_rate": 9.749693666068663e-07, "loss": 0.0, "reward": -0.04186771437525749, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.04186771437525749, "reward_after_std": 0.6713929902762175, "reward_before_mean": 0.34054063144139946, "reward_before_std": 0.6036591455340385, "reward_change_max": 0.0006186738610267639, "reward_change_mean": -0.3824083199724555, "reward_change_min": -0.640322033315897, "reward_change_std": 0.25615993700921535, "reward_std": 0.6713930163532495, "rewards/cosine_scaled_reward": -0.09014636965002865, "rewards/format_reward": 0.5208333488553762, "step": 98 }, { "advantage_max": 1.785209521651268, "advantage_mean": 4.0357311326122414e-08, "advantage_min": -0.9890889748930931, "advantage_std": 0.9997483119368553, "completion_length": 2831.1041717529297, "epoch": 0.11314285714285714, "grad_norm": 0.18398259580135345, "kl": 0.0009243488311767578, "lambda_div_used": 0.6, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "reward": -0.15700703021138906, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15700703021138906, "reward_after_std": 0.5350757166743279, "reward_before_mean": 0.19414901733398438, "reward_before_std": 0.488203302025795, "reward_change_max": 0.0009380653500556946, "reward_change_mean": -0.3511560335755348, "reward_change_min": -0.6369931064546108, "reward_change_std": 0.2508567860350013, "reward_std": 0.535075731575489, "rewards/cosine_scaled_reward": -0.048758842051029205, "rewards/format_reward": 0.2916666679084301, "step": 99 }, { "advantage_max": 1.725820317864418, "advantage_mean": 3.104408063947517e-09, "advantage_min": -1.057399995625019, "advantage_std": 0.9998147934675217, "completion_length": 2560.8958740234375, "epoch": 0.11428571428571428, "grad_norm": 0.18937556445598602, "kl": 0.003207683563232422, "lambda_div_used": 0.6, "learning_rate": 9.728616793536587e-07, "loss": 0.0001, "reward": 0.13002754002809525, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.13002754002809525, "reward_after_std": 0.7728552669286728, "reward_before_mean": 0.6003823187202215, "reward_before_std": 0.7967247776687145, "reward_change_max": 0.0010514110326766968, "reward_change_mean": -0.47035478707402945, "reward_change_min": -0.9081873521208763, "reward_change_std": 0.3621816970407963, "reward_std": 0.7728552781045437, "rewards/cosine_scaled_reward": 0.0501911542378366, "rewards/format_reward": 0.5000000037252903, "step": 100 }, { "advantage_max": 1.7652879804372787, "advantage_mean": 4.346172111091562e-08, "advantage_min": -0.9487873390316963, "advantage_std": 0.9997652173042297, "completion_length": 3085.9375610351562, "epoch": 0.11542857142857142, "grad_norm": 0.179105743765831, "kl": 0.0010350942611694336, "lambda_div_used": 0.6, "learning_rate": 9.717768952713511e-07, "loss": 0.0, "reward": -0.2374136783182621, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2374136783182621, "reward_after_std": 0.5234212130308151, "reward_before_mean": 0.0739324577152729, "reward_before_std": 0.5113976262509823, "reward_change_max": 0.0017821341753005981, "reward_change_mean": -0.31134614115580916, "reward_change_min": -0.5999995283782482, "reward_change_std": 0.23880406375974417, "reward_std": 0.5234212279319763, "rewards/cosine_scaled_reward": -0.1088671050965786, "rewards/format_reward": 0.29166667349636555, "step": 101 }, { "advantage_max": 1.736981987953186, "advantage_mean": -6.208817349140361e-09, "advantage_min": -1.1442973241209984, "advantage_std": 0.9998510330915451, "completion_length": 2414.479263305664, "epoch": 0.11657142857142858, "grad_norm": 0.27132582664489746, "kl": 0.003232717514038086, "lambda_div_used": 0.6, "learning_rate": 9.706715543782064e-07, "loss": 0.0001, "reward": 0.18853361695073545, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18853361695073545, "reward_after_std": 0.879979819059372, "reward_before_mean": 0.6658473797142506, "reward_before_std": 0.907788585871458, "reward_change_max": 0.00014173239469528198, "reward_change_mean": -0.47731374204158783, "reward_change_min": -0.8779811263084412, "reward_change_std": 0.3685195595026016, "reward_std": 0.8799798376858234, "rewards/cosine_scaled_reward": -0.00040966711821965873, "rewards/format_reward": 0.6666666828095913, "step": 102 }, { "advantage_max": 1.8154405057430267, "advantage_mean": 2.669791498988161e-08, "advantage_min": -0.9068902283906937, "advantage_std": 0.9998216927051544, "completion_length": 2932.8333740234375, "epoch": 0.11771428571428572, "grad_norm": 0.25712406635284424, "kl": 0.0015873908996582031, "lambda_div_used": 0.6, "learning_rate": 9.695457105469804e-07, "loss": 0.0001, "reward": -0.15092550683766603, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.15092550683766603, "reward_after_std": 0.7477401718497276, "reward_before_mean": 0.1566409319639206, "reward_before_std": 0.7541992217302322, "reward_change_max": 0.0016285479068756104, "reward_change_mean": -0.3075664332136512, "reward_change_min": -0.6556785479187965, "reward_change_std": 0.2573690786957741, "reward_std": 0.7477401979267597, "rewards/cosine_scaled_reward": -0.09876288007944822, "rewards/format_reward": 0.3541666753590107, "step": 103 }, { "advantage_max": 1.8285162150859833, "advantage_mean": 3.321717334525687e-08, "advantage_min": -0.8637420684099197, "advantage_std": 0.9997657388448715, "completion_length": 2773.8541717529297, "epoch": 0.11885714285714286, "grad_norm": 0.658698320388794, "kl": 0.025522232055664062, "lambda_div_used": 0.6, "learning_rate": 9.683994186497132e-07, "loss": 0.001, "reward": -0.2826954470947385, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2826954470947385, "reward_after_std": 0.6313493195921183, "reward_before_mean": -0.023373490199446678, "reward_before_std": 0.6353482659906149, "reward_change_max": 0.0010068267583847046, "reward_change_mean": -0.2593219568952918, "reward_change_min": -0.5421112589538097, "reward_change_std": 0.21279537491500378, "reward_std": 0.6313493568450212, "rewards/cosine_scaled_reward": -0.1991867497563362, "rewards/format_reward": 0.37500000186264515, "step": 104 }, { "advantage_max": 1.783625602722168, "advantage_mean": 3.3527614018424856e-08, "advantage_min": -0.8666596114635468, "advantage_std": 0.9997932240366936, "completion_length": 2725.9375381469727, "epoch": 0.12, "grad_norm": 0.18521331250667572, "kl": 0.0012688636779785156, "lambda_div_used": 0.6, "learning_rate": 9.672327345550543e-07, "loss": 0.0001, "reward": -0.015197938308119774, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.015197938308119774, "reward_after_std": 0.8849128354340792, "reward_before_mean": 0.3449369762092829, "reward_before_std": 0.8991417735815048, "reward_change_max": 0.0005147978663444519, "reward_change_mean": -0.3601348986849189, "reward_change_min": -0.7783688493072987, "reward_change_std": 0.3188676042482257, "reward_std": 0.8849128670990467, "rewards/cosine_scaled_reward": -0.025448182597756386, "rewards/format_reward": 0.39583333395421505, "step": 105 }, { "advantage_max": 1.8432470560073853, "advantage_mean": -2.2972623803241277e-08, "advantage_min": -0.8303454779088497, "advantage_std": 0.9998474791646004, "completion_length": 2239.6459045410156, "epoch": 0.12114285714285715, "grad_norm": 0.22106949985027313, "kl": 0.0016461610794067383, "lambda_div_used": 0.6, "learning_rate": 9.66045715125541e-07, "loss": 0.0001, "reward": 0.34969139844179153, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.34969139844179153, "reward_after_std": 0.8614957369863987, "reward_before_mean": 0.9187089614570141, "reward_before_std": 0.8012212971225381, "reward_change_max": 0.001362532377243042, "reward_change_mean": -0.5690175807103515, "reward_change_min": -1.0385963395237923, "reward_change_std": 0.417306674644351, "reward_std": 0.8614957556128502, "rewards/cosine_scaled_reward": 0.146854467689991, "rewards/format_reward": 0.6250000093132257, "step": 106 }, { "advantage_max": 1.7680507004261017, "advantage_mean": 2.301142987271021e-08, "advantage_min": -0.9694222062826157, "advantage_std": 0.99971604347229, "completion_length": 2895.312530517578, "epoch": 0.12228571428571429, "grad_norm": 0.2039802074432373, "kl": 0.001500844955444336, "lambda_div_used": 0.6, "learning_rate": 9.648384182148252e-07, "loss": 0.0001, "reward": -0.15957804769277573, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15957804769277573, "reward_after_std": 0.6128820450976491, "reward_before_mean": 0.18090857192873955, "reward_before_std": 0.6410299837589264, "reward_change_max": 0.0003634542226791382, "reward_change_mean": -0.3404866079799831, "reward_change_min": -0.6619577445089817, "reward_change_std": 0.2745391200296581, "reward_std": 0.6128820804879069, "rewards/cosine_scaled_reward": -0.14912904612720013, "rewards/format_reward": 0.479166679084301, "step": 107 }, { "advantage_max": 1.790600210428238, "advantage_mean": 5.960464632970286e-08, "advantage_min": -0.9298330098390579, "advantage_std": 0.9997816309332848, "completion_length": 2844.6459045410156, "epoch": 0.12342857142857143, "grad_norm": 0.21611450612545013, "kl": 0.0016851872205734253, "lambda_div_used": 0.6, "learning_rate": 9.636109026648554e-07, "loss": 0.0001, "reward": -0.1583491563796997, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1583491563796997, "reward_after_std": 0.6582544650882483, "reward_before_mean": 0.16968258982524276, "reward_before_std": 0.6924717854708433, "reward_change_max": 0.00043252110481262207, "reward_change_mean": -0.3280317420139909, "reward_change_min": -0.6706159263849258, "reward_change_std": 0.2807430122047663, "reward_std": 0.6582544837146997, "rewards/cosine_scaled_reward": -0.09224203368648887, "rewards/format_reward": 0.35416666977107525, "step": 108 }, { "advantage_max": 1.8136216551065445, "advantage_mean": 4.656613178388724e-09, "advantage_min": -0.9915246963500977, "advantage_std": 0.9997651129961014, "completion_length": 3080.9583435058594, "epoch": 0.12457142857142857, "grad_norm": 0.17112916707992554, "kl": 0.0004984140396118164, "lambda_div_used": 0.6, "learning_rate": 9.623632283030077e-07, "loss": 0.0, "reward": -0.17264318838715553, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.17264318838715553, "reward_after_std": 0.5608428791165352, "reward_before_mean": 0.16347171552479267, "reward_before_std": 0.5202469453215599, "reward_change_max": 0.0016485974192619324, "reward_change_mean": -0.33611492812633514, "reward_change_min": -0.5799317136406898, "reward_change_std": 0.22663516830652952, "reward_std": 0.5608428884297609, "rewards/cosine_scaled_reward": -0.09534747712314129, "rewards/format_reward": 0.35416667722165585, "step": 109 }, { "advantage_max": 1.8396689295768738, "advantage_mean": -5.587935614226325e-09, "advantage_min": -0.8279101625084877, "advantage_std": 0.9998233169317245, "completion_length": 2519.7916717529297, "epoch": 0.12571428571428572, "grad_norm": 0.2504572570323944, "kl": 0.0009617358446121216, "lambda_div_used": 0.6, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "reward": -0.07773779518902302, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.07773779518902302, "reward_after_std": 0.7135216481983662, "reward_before_mean": 0.27838256349787116, "reward_before_std": 0.696380527690053, "reward_change_max": 0.0, "reward_change_mean": -0.35612036008387804, "reward_change_min": -0.7108714915812016, "reward_change_std": 0.27885529957711697, "reward_std": 0.7135216817259789, "rewards/cosine_scaled_reward": -0.12122538778930902, "rewards/format_reward": 0.5208333376795053, "step": 110 }, { "advantage_max": 1.7565907686948776, "advantage_mean": -8.692344177774203e-09, "advantage_min": -0.9142092913389206, "advantage_std": 0.9998379349708557, "completion_length": 2963.8750610351562, "epoch": 0.12685714285714286, "grad_norm": 0.19190770387649536, "kl": 0.0017622709274291992, "lambda_div_used": 0.6, "learning_rate": 9.598076473627796e-07, "loss": 0.0001, "reward": -0.037100352346897125, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.037100352346897125, "reward_after_std": 0.8680760376155376, "reward_before_mean": 0.32125273160636425, "reward_before_std": 0.9434991367161274, "reward_change_max": 0.0022219568490982056, "reward_change_mean": -0.3583531128242612, "reward_change_min": -0.8165842406451702, "reward_change_std": 0.3486758843064308, "reward_std": 0.8680760562419891, "rewards/cosine_scaled_reward": -0.02687364211305976, "rewards/format_reward": 0.3750000074505806, "step": 111 }, { "advantage_max": 1.7827106267213821, "advantage_mean": 2.7318795670083773e-08, "advantage_min": -1.0682056844234467, "advantage_std": 0.9998344331979752, "completion_length": 3133.1459045410156, "epoch": 0.128, "grad_norm": 0.15433692932128906, "kl": 0.0007636398077011108, "lambda_div_used": 0.6, "learning_rate": 9.58499865339809e-07, "loss": 0.0, "reward": -0.010375543497502804, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.010375543497502804, "reward_after_std": 0.8647609390318394, "reward_before_mean": 0.35546717746183276, "reward_before_std": 0.8730155974626541, "reward_change_max": 0.0010883510112762451, "reward_change_mean": -0.3658427279442549, "reward_change_min": -0.6690595299005508, "reward_change_std": 0.28574431873857975, "reward_std": 0.8647609725594521, "rewards/cosine_scaled_reward": -0.009766413364559412, "rewards/format_reward": 0.37500000931322575, "step": 112 }, { "advantage_max": 1.755420058965683, "advantage_mean": 1.0710210274211818e-08, "advantage_min": -0.9944314882159233, "advantage_std": 0.9998151361942291, "completion_length": 2773.0833740234375, "epoch": 0.12914285714285714, "grad_norm": 0.23051810264587402, "kl": 0.0017020702362060547, "lambda_div_used": 0.6, "learning_rate": 9.571721736097088e-07, "loss": 0.0001, "reward": 0.07544285524636507, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07544285524636507, "reward_after_std": 0.7961066942662001, "reward_before_mean": 0.5120955022866838, "reward_before_std": 0.8482234226539731, "reward_change_max": 0.0005150362849235535, "reward_change_mean": -0.4366526734083891, "reward_change_min": -0.8510825112462044, "reward_change_std": 0.35694314260035753, "reward_std": 0.796106742694974, "rewards/cosine_scaled_reward": 0.02688109502196312, "rewards/format_reward": 0.4583333469927311, "step": 113 }, { "advantage_max": 1.8586469739675522, "advantage_mean": 2.421438782818086e-08, "advantage_min": -0.8431475088000298, "advantage_std": 0.9998027980327606, "completion_length": 2531.2083587646484, "epoch": 0.13028571428571428, "grad_norm": 0.22355306148529053, "kl": 0.0019383430480957031, "lambda_div_used": 0.6, "learning_rate": 9.55824636882301e-07, "loss": 0.0001, "reward": -0.20113424118608236, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.20113424118608236, "reward_after_std": 0.6546772718429565, "reward_before_mean": 0.09727955237030983, "reward_before_std": 0.6197393368929625, "reward_change_max": 0.0007202774286270142, "reward_change_mean": -0.29841379821300507, "reward_change_min": -0.5926534906029701, "reward_change_std": 0.22649162262678146, "reward_std": 0.6546772830188274, "rewards/cosine_scaled_reward": -0.24302690103650093, "rewards/format_reward": 0.5833333414047956, "step": 114 }, { "advantage_max": 1.7997616976499557, "advantage_mean": 4.159907551759545e-08, "advantage_min": -1.0052898675203323, "advantage_std": 0.999777115881443, "completion_length": 2741.7500381469727, "epoch": 0.13142857142857142, "grad_norm": 0.22047275304794312, "kl": 0.0026127099990844727, "lambda_div_used": 0.6, "learning_rate": 9.54457320834625e-07, "loss": 0.0001, "reward": -0.11932978220283985, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11932978220283985, "reward_after_std": 0.6451930701732635, "reward_before_mean": 0.22971507906913757, "reward_before_std": 0.6189654469490051, "reward_change_max": 0.0001480579376220703, "reward_change_mean": -0.3490448575466871, "reward_change_min": -0.656039223074913, "reward_change_std": 0.2516454681754112, "reward_std": 0.6451930776238441, "rewards/cosine_scaled_reward": -0.07264245301485062, "rewards/format_reward": 0.375, "step": 115 }, { "advantage_max": 1.828330248594284, "advantage_mean": 1.179675357398935e-08, "advantage_min": -0.9229530841112137, "advantage_std": 0.999760314822197, "completion_length": 3289.166717529297, "epoch": 0.13257142857142856, "grad_norm": 0.16027842462062836, "kl": 0.0012557506561279297, "lambda_div_used": 0.6, "learning_rate": 9.530702921077358e-07, "loss": 0.0001, "reward": -0.35777913220226765, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.35777913220226765, "reward_after_std": 0.5930831618607044, "reward_before_mean": -0.1336807757616043, "reward_before_std": 0.5846659392118454, "reward_change_max": 0.0006017833948135376, "reward_change_mean": -0.22409836295992136, "reward_change_min": -0.5018119886517525, "reward_change_std": 0.18498014891520143, "reward_std": 0.5930831879377365, "rewards/cosine_scaled_reward": -0.1501737218350172, "rewards/format_reward": 0.16666667349636555, "step": 116 }, { "advantage_max": 1.7930647432804108, "advantage_mean": -2.4835270062695258e-08, "advantage_min": -0.9088431373238564, "advantage_std": 0.999772883951664, "completion_length": 3074.291702270508, "epoch": 0.1337142857142857, "grad_norm": 0.19902725517749786, "kl": 0.001920938491821289, "lambda_div_used": 0.6, "learning_rate": 9.516636183034564e-07, "loss": 0.0001, "reward": -0.41275100101483986, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.41275100101483986, "reward_after_std": 0.6596652008593082, "reward_before_mean": -0.2349538952112198, "reward_before_std": 0.6833183541893959, "reward_change_max": 0.004510059952735901, "reward_change_mean": -0.177797120064497, "reward_change_min": -0.4685539975762367, "reward_change_std": 0.19438769109547138, "reward_std": 0.6596652120351791, "rewards/cosine_scaled_reward": -0.25289361737668514, "rewards/format_reward": 0.27083334513008595, "step": 117 }, { "advantage_max": 1.7604702562093735, "advantage_mean": -1.3038516932795119e-08, "advantage_min": -0.8611695393919945, "advantage_std": 0.9998533576726913, "completion_length": 3151.0833740234375, "epoch": 0.13485714285714287, "grad_norm": 0.15056875348091125, "kl": 0.0013856887817382812, "lambda_div_used": 0.6, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": 0.09464032435789704, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09464032435789704, "reward_after_std": 0.9480334632098675, "reward_before_mean": 0.5092401094734669, "reward_before_std": 1.007173541933298, "reward_change_max": 0.0013965964317321777, "reward_change_mean": -0.4145997939631343, "reward_change_min": -0.9419260174036026, "reward_change_std": 0.389494770206511, "reward_std": 0.9480334855616093, "rewards/cosine_scaled_reward": 0.05670338403433561, "rewards/format_reward": 0.39583333767950535, "step": 118 }, { "advantage_max": 1.849794253706932, "advantage_mean": 4.8428775767384025e-08, "advantage_min": -0.9064371511340141, "advantage_std": 0.9998284727334976, "completion_length": 2609.3333587646484, "epoch": 0.136, "grad_norm": 0.22057241201400757, "kl": 0.00370180606842041, "lambda_div_used": 0.6, "learning_rate": 9.487916106540465e-07, "loss": 0.0001, "reward": 0.03942138887941837, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03942138887941837, "reward_after_std": 0.7630004845559597, "reward_before_mean": 0.4473726401338354, "reward_before_std": 0.7058235108852386, "reward_change_max": 0.0013832449913024902, "reward_change_mean": -0.40795122273266315, "reward_change_min": -0.7008799575269222, "reward_change_std": 0.2818867303431034, "reward_std": 0.7630005031824112, "rewards/cosine_scaled_reward": -0.02631368301808834, "rewards/format_reward": 0.5000000093132257, "step": 119 }, { "advantage_max": 1.8697483986616135, "advantage_mean": 1.2107193081423162e-08, "advantage_min": -0.9596693366765976, "advantage_std": 0.999811477959156, "completion_length": 2261.687515258789, "epoch": 0.13714285714285715, "grad_norm": 0.2415115386247635, "kl": 0.0018281936645507812, "lambda_div_used": 0.6, "learning_rate": 9.473264167865171e-07, "loss": 0.0001, "reward": -0.050280665047466755, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.050280665047466755, "reward_after_std": 0.7319730669260025, "reward_before_mean": 0.31586154247634113, "reward_before_std": 0.709919760003686, "reward_change_max": 0.0, "reward_change_mean": -0.3661421984434128, "reward_change_min": -0.6760508343577385, "reward_change_std": 0.2769299987703562, "reward_std": 0.7319730892777443, "rewards/cosine_scaled_reward": -0.12331923271995038, "rewards/format_reward": 0.5625000111758709, "step": 120 }, { "advantage_max": 1.8391786068677902, "advantage_mean": -2.7939677238464355e-09, "advantage_min": -0.8033071458339691, "advantage_std": 0.9997827708721161, "completion_length": 1701.333366394043, "epoch": 0.1382857142857143, "grad_norm": 0.22365814447402954, "kl": 0.0038230419158935547, "lambda_div_used": 0.6, "learning_rate": 9.458418577899774e-07, "loss": 0.0002, "reward": 0.16879624186549336, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16879624186549336, "reward_after_std": 0.64923981949687, "reward_before_mean": 0.6796352444216609, "reward_before_std": 0.5928017403930426, "reward_change_max": 0.0005354881286621094, "reward_change_mean": -0.5108390115201473, "reward_change_min": -0.8865525871515274, "reward_change_std": 0.3496230151504278, "reward_std": 0.6492398418486118, "rewards/cosine_scaled_reward": -0.03518239036202431, "rewards/format_reward": 0.7500000074505806, "step": 121 }, { "advantage_max": 1.7351529896259308, "advantage_mean": 3.818422625312401e-08, "advantage_min": -0.9226409643888474, "advantage_std": 0.9997973814606667, "completion_length": 3060.3333740234375, "epoch": 0.13942857142857143, "grad_norm": 0.18619462847709656, "kl": 0.0013076066970825195, "lambda_div_used": 0.6, "learning_rate": 9.443380060197385e-07, "loss": 0.0001, "reward": -0.013334386050701141, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.013334386050701141, "reward_after_std": 0.8127669915556908, "reward_before_mean": 0.3716919384896755, "reward_before_std": 0.8905900437384844, "reward_change_max": 0.0006670430302619934, "reward_change_mean": -0.3850262966006994, "reward_change_min": -0.8647193722426891, "reward_change_std": 0.3571057040244341, "reward_std": 0.8127670250833035, "rewards/cosine_scaled_reward": -0.0016540400683879852, "rewards/format_reward": 0.3750000037252903, "step": 122 }, { "advantage_max": 1.8506823033094406, "advantage_mean": -7.450579819767711e-09, "advantage_min": -0.8011151403188705, "advantage_std": 0.9998099207878113, "completion_length": 2766.8334045410156, "epoch": 0.14057142857142857, "grad_norm": 0.17895764112472534, "kl": 0.0018435120582580566, "lambda_div_used": 0.6, "learning_rate": 9.428149347714143e-07, "loss": 0.0001, "reward": -0.023045064299367368, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.023045064299367368, "reward_after_std": 0.6246655620634556, "reward_before_mean": 0.3796454730909318, "reward_before_std": 0.5567172318696976, "reward_change_max": 0.0005916282534599304, "reward_change_mean": -0.40269055776298046, "reward_change_min": -0.757130891084671, "reward_change_std": 0.27373781334608793, "reward_std": 0.6246655881404877, "rewards/cosine_scaled_reward": -0.06017725728452206, "rewards/format_reward": 0.5000000037252903, "step": 123 }, { "advantage_max": 1.891770288348198, "advantage_mean": -2.0489097307674342e-08, "advantage_min": -0.8343314006924629, "advantage_std": 0.9998250231146812, "completion_length": 2253.8125534057617, "epoch": 0.1417142857142857, "grad_norm": 0.21318885684013367, "kl": 0.006275177001953125, "lambda_div_used": 0.6, "learning_rate": 9.412727182773486e-07, "loss": 0.0003, "reward": 0.07496419548988342, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07496419548988342, "reward_after_std": 0.7322327829897404, "reward_before_mean": 0.5071105966344476, "reward_before_std": 0.6644273046404123, "reward_change_max": 0.0, "reward_change_mean": -0.43214639462530613, "reward_change_min": -0.7557884342968464, "reward_change_std": 0.290377726778388, "reward_std": 0.7322328351438046, "rewards/cosine_scaled_reward": -0.017278052866458893, "rewards/format_reward": 0.5416666716337204, "step": 124 }, { "advantage_max": 1.7362398356199265, "advantage_mean": 3.3217173234234565e-08, "advantage_min": -1.0559132620692253, "advantage_std": 0.9997788593173027, "completion_length": 2715.3125076293945, "epoch": 0.14285714285714285, "grad_norm": 0.1941165179014206, "kl": 0.0016214847564697266, "lambda_div_used": 0.6, "learning_rate": 9.397114317029974e-07, "loss": 0.0001, "reward": 0.020387548953294754, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.020387548953294754, "reward_after_std": 0.6161116883158684, "reward_before_mean": 0.4553663581609726, "reward_before_std": 0.5809119660407305, "reward_change_max": 0.0006070658564567566, "reward_change_mean": -0.4349788036197424, "reward_change_min": -0.7705775275826454, "reward_change_std": 0.31106082256883383, "reward_std": 0.6161117069423199, "rewards/cosine_scaled_reward": 0.0401831679046154, "rewards/format_reward": 0.3750000037252903, "step": 125 }, { "advantage_max": 1.8594186007976532, "advantage_mean": 1.2417635364414537e-08, "advantage_min": -0.8952602446079254, "advantage_std": 0.9997757375240326, "completion_length": 2842.9583587646484, "epoch": 0.144, "grad_norm": 0.17380203306674957, "kl": 0.0012089014053344727, "lambda_div_used": 0.6, "learning_rate": 9.381311511432658e-07, "loss": 0.0, "reward": -0.0640541547909379, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0640541547909379, "reward_after_std": 0.6584181226789951, "reward_before_mean": 0.310584613122046, "reward_before_std": 0.6167250759899616, "reward_change_max": 4.930049180984497e-05, "reward_change_mean": -0.37463877256959677, "reward_change_min": -0.6681321784853935, "reward_change_std": 0.258643782697618, "reward_std": 0.6584181413054466, "rewards/cosine_scaled_reward": -0.06345769576728344, "rewards/format_reward": 0.4375000037252903, "step": 126 }, { "advantage_max": 1.7131058424711227, "advantage_mean": -1.8626452602532595e-09, "advantage_min": -1.0991980135440826, "advantage_std": 0.9997560158371925, "completion_length": 2911.8958892822266, "epoch": 0.14514285714285713, "grad_norm": 0.18738742172718048, "kl": 0.0014913082122802734, "lambda_div_used": 0.6, "learning_rate": 9.36531953618799e-07, "loss": 0.0001, "reward": -0.37144492409424856, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.37144492409424856, "reward_after_std": 0.4604080505669117, "reward_before_mean": -0.11855738663871307, "reward_before_std": 0.4947989024221897, "reward_change_max": 0.001102253794670105, "reward_change_mean": -0.2528875581920147, "reward_change_min": -0.5120432004332542, "reward_change_std": 0.21158719062805176, "reward_std": 0.4604080617427826, "rewards/cosine_scaled_reward": -0.25719536282122135, "rewards/format_reward": 0.39583334885537624, "step": 127 }, { "advantage_max": 1.7644283175468445, "advantage_mean": 3.725290298461914e-09, "advantage_min": -0.978776179254055, "advantage_std": 0.9997963681817055, "completion_length": 2819.916702270508, "epoch": 0.1462857142857143, "grad_norm": 0.17408989369869232, "kl": 0.0031111836433410645, "lambda_div_used": 0.6, "learning_rate": 9.34913917072228e-07, "loss": 0.0001, "reward": 0.17869412526488304, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17869412526488304, "reward_after_std": 0.796859186142683, "reward_before_mean": 0.6695016473531723, "reward_before_std": 0.8020838983356953, "reward_change_max": 0.0008061006665229797, "reward_change_mean": -0.49080754444003105, "reward_change_min": -0.8655724823474884, "reward_change_std": 0.3641576422378421, "reward_std": 0.7968592196702957, "rewards/cosine_scaled_reward": 0.11600081622600555, "rewards/format_reward": 0.43750000558793545, "step": 128 }, { "advantage_max": 1.8531613051891327, "advantage_mean": 6.146729142342267e-08, "advantage_min": -0.9235868975520134, "advantage_std": 0.9997715502977371, "completion_length": 3387.125030517578, "epoch": 0.14742857142857144, "grad_norm": 0.14450214803218842, "kl": 0.0020759105682373047, "lambda_div_used": 0.6, "learning_rate": 9.332771203643714e-07, "loss": 0.0001, "reward": -0.349605955183506, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.349605955183506, "reward_after_std": 0.7235031314194202, "reward_before_mean": -0.1519492152146995, "reward_before_std": 0.7243870310485363, "reward_change_max": 0.00021888315677642822, "reward_change_mean": -0.1976567441597581, "reward_change_min": -0.4697972945868969, "reward_change_std": 0.1836132798343897, "reward_std": 0.7235031351447105, "rewards/cosine_scaled_reward": -0.159307939640712, "rewards/format_reward": 0.1666666679084301, "step": 129 }, { "advantage_max": 1.775284931063652, "advantage_mean": 5.3395829702207465e-08, "advantage_min": -0.9660038575530052, "advantage_std": 0.9997857511043549, "completion_length": 2631.7500076293945, "epoch": 0.14857142857142858, "grad_norm": 0.19728270173072815, "kl": 0.001338362693786621, "lambda_div_used": 0.6, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": -0.13717409409582615, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13717409409582615, "reward_after_std": 0.6975272782146931, "reward_before_mean": 0.1896309917792678, "reward_before_std": 0.6852379832416773, "reward_change_max": 0.000896163284778595, "reward_change_mean": -0.326805068179965, "reward_change_min": -0.6144087947905064, "reward_change_std": 0.2421941850334406, "reward_std": 0.69752730242908, "rewards/cosine_scaled_reward": -0.11351784318685532, "rewards/format_reward": 0.41666667349636555, "step": 130 }, { "advantage_max": 1.7923941612243652, "advantage_mean": -2.2972623803241277e-08, "advantage_min": -0.9002665691077709, "advantage_std": 0.9998530149459839, "completion_length": 2729.0417404174805, "epoch": 0.14971428571428572, "grad_norm": 0.21225287020206451, "kl": 0.0025815963745117188, "lambda_div_used": 0.6, "learning_rate": 9.299475664759068e-07, "loss": 0.0001, "reward": 0.23085041716694832, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.23085041716694832, "reward_after_std": 0.8556590303778648, "reward_before_mean": 0.7364845387637615, "reward_before_std": 0.8376536886207759, "reward_change_max": 0.0, "reward_change_mean": -0.5056341355666518, "reward_change_min": -0.8724869564175606, "reward_change_std": 0.37287682946771383, "reward_std": 0.8556590564548969, "rewards/cosine_scaled_reward": 0.1286589317023754, "rewards/format_reward": 0.47916667349636555, "step": 131 }, { "advantage_max": 1.8120517283678055, "advantage_mean": 1.3038516932795119e-08, "advantage_min": -0.886481486260891, "advantage_std": 0.9997847452759743, "completion_length": 2587.0833587646484, "epoch": 0.15085714285714286, "grad_norm": 0.18867288529872894, "kl": 0.001291036605834961, "lambda_div_used": 0.6, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, "reward": -0.0929887518286705, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.0929887518286705, "reward_after_std": 0.6387858018279076, "reward_before_mean": 0.2717039901763201, "reward_before_std": 0.6333675868809223, "reward_change_max": 0.0013419762253761292, "reward_change_mean": -0.3646927550435066, "reward_change_min": -0.7294860184192657, "reward_change_std": 0.2695095627568662, "reward_std": 0.6387858055531979, "rewards/cosine_scaled_reward": -0.07248133979737759, "rewards/format_reward": 0.4166666679084301, "step": 132 }, { "advantage_max": 1.8003092557191849, "advantage_mean": 3.476937815438674e-08, "advantage_min": -0.9143991321325302, "advantage_std": 0.9997724890708923, "completion_length": 3191.979217529297, "epoch": 0.152, "grad_norm": 0.19934667646884918, "kl": 0.002385854721069336, "lambda_div_used": 0.6, "learning_rate": 9.265439410565328e-07, "loss": 0.0001, "reward": -0.41168433986604214, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.41168433986604214, "reward_after_std": 0.5410910733044147, "reward_before_mean": -0.2069263681769371, "reward_before_std": 0.535653218626976, "reward_change_max": 0.00023845583200454712, "reward_change_mean": -0.2047579661011696, "reward_change_min": -0.42901289463043213, "reward_change_std": 0.17053746804594994, "reward_std": 0.541091077029705, "rewards/cosine_scaled_reward": -0.2388798501342535, "rewards/format_reward": 0.27083333767950535, "step": 133 }, { "advantage_max": 1.8175580650568008, "advantage_mean": 5.4016709771786964e-08, "advantage_min": -0.931298740208149, "advantage_std": 0.9997761398553848, "completion_length": 2434.8958892822266, "epoch": 0.15314285714285714, "grad_norm": 0.23071566224098206, "kl": 0.0030927658081054688, "lambda_div_used": 0.6, "learning_rate": 9.248145583195447e-07, "loss": 0.0001, "reward": 0.06319417338818312, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06319417338818312, "reward_after_std": 0.620172493159771, "reward_before_mean": 0.5191458743065596, "reward_before_std": 0.5582793261855841, "reward_change_max": 0.0, "reward_change_mean": -0.4559516739100218, "reward_change_min": -0.7631092332303524, "reward_change_std": 0.2891054190695286, "reward_std": 0.6201725006103516, "rewards/cosine_scaled_reward": -0.021677076816558838, "rewards/format_reward": 0.5625000074505806, "step": 134 }, { "advantage_max": 1.8472566604614258, "advantage_mean": 3.10440866346795e-08, "advantage_min": -0.7691843509674072, "advantage_std": 0.9998567774891853, "completion_length": 1922.6458549499512, "epoch": 0.15428571428571428, "grad_norm": 0.2417742908000946, "kl": 0.0037393569946289062, "lambda_div_used": 0.6, "learning_rate": 9.230669076497687e-07, "loss": 0.0001, "reward": 0.40880877897143364, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.40880877897143364, "reward_after_std": 0.8995259329676628, "reward_before_mean": 0.997649796307087, "reward_before_std": 0.7795259989798069, "reward_change_max": 0.0006740763783454895, "reward_change_mean": -0.5888410690240562, "reward_change_min": -1.0147056505084038, "reward_change_std": 0.39061957970261574, "reward_std": 0.8995259515941143, "rewards/cosine_scaled_reward": 0.1654915688559413, "rewards/format_reward": 0.6666666716337204, "step": 135 }, { "advantage_max": 1.7726252377033234, "advantage_mean": 1.2728075149404106e-08, "advantage_min": -0.8942654505372047, "advantage_std": 0.999861553311348, "completion_length": 2828.6875610351562, "epoch": 0.15542857142857142, "grad_norm": 0.18230877816677094, "kl": 0.0026178359985351562, "lambda_div_used": 0.6, "learning_rate": 9.213010742252327e-07, "loss": 0.0001, "reward": 0.11790861561894417, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11790861561894417, "reward_after_std": 0.9995119795203209, "reward_before_mean": 0.5282619073987007, "reward_before_std": 1.0388812385499477, "reward_change_max": 0.00042983144521713257, "reward_change_mean": -0.4103532899171114, "reward_change_min": -0.8829576633870602, "reward_change_std": 0.35299498960375786, "reward_std": 0.9995119869709015, "rewards/cosine_scaled_reward": 0.04538094159215689, "rewards/format_reward": 0.4375000111758709, "step": 136 }, { "advantage_max": 1.7562323808670044, "advantage_mean": 1.1486312123665243e-08, "advantage_min": -0.9233672171831131, "advantage_std": 0.9998031109571457, "completion_length": 2931.354217529297, "epoch": 0.15657142857142858, "grad_norm": 0.20975585281848907, "kl": 0.002144336700439453, "lambda_div_used": 0.6, "learning_rate": 9.195171441101668e-07, "loss": 0.0001, "reward": -0.2550590895116329, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2550590895116329, "reward_after_std": 0.7029409743845463, "reward_before_mean": 0.0069596245884895325, "reward_before_std": 0.7332433424890041, "reward_change_max": 0.0006171539425849915, "reward_change_mean": -0.2620187192223966, "reward_change_min": -0.6090663559734821, "reward_change_std": 0.23964617308229208, "reward_std": 0.7029409967362881, "rewards/cosine_scaled_reward": -0.15277018956840038, "rewards/format_reward": 0.31250001303851604, "step": 137 }, { "advantage_max": 1.860204964876175, "advantage_mean": -1.552203920951456e-09, "advantage_min": -0.8499182164669037, "advantage_std": 0.9998690113425255, "completion_length": 2530.3542098999023, "epoch": 0.15771428571428572, "grad_norm": 0.25806108117103577, "kl": 0.0020530223846435547, "lambda_div_used": 0.6, "learning_rate": 9.177152042508077e-07, "loss": 0.0001, "reward": 0.12369046686217189, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12369046686217189, "reward_after_std": 0.9462379738688469, "reward_before_mean": 0.5434232788684312, "reward_before_std": 0.9309011660516262, "reward_change_max": 0.0009085908532142639, "reward_change_mean": -0.41973283141851425, "reward_change_min": -0.8432405553758144, "reward_change_std": 0.32539301738142967, "reward_std": 0.9462379962205887, "rewards/cosine_scaled_reward": -0.040788375306874514, "rewards/format_reward": 0.6250000111758709, "step": 138 }, { "advantage_max": 1.8141157180070877, "advantage_mean": 5.556891616298465e-08, "advantage_min": -0.9482329413294792, "advantage_std": 0.9998308569192886, "completion_length": 3186.166717529297, "epoch": 0.15885714285714286, "grad_norm": 0.1826111525297165, "kl": 0.00322723388671875, "lambda_div_used": 0.6, "learning_rate": 9.158953424711624e-07, "loss": 0.0001, "reward": -0.2945699542760849, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2945699542760849, "reward_after_std": 0.7745798341929913, "reward_before_mean": -0.07500334223732352, "reward_before_std": 0.7766986936330795, "reward_change_max": 0.0012882798910140991, "reward_change_mean": -0.21956659480929375, "reward_change_min": -0.441211748868227, "reward_change_std": 0.19897049106657505, "reward_std": 0.7745798826217651, "rewards/cosine_scaled_reward": -0.193751678802073, "rewards/format_reward": 0.3125000074505806, "step": 139 }, { "advantage_max": 1.8250874429941177, "advantage_mean": -2.6697914323747796e-08, "advantage_min": -0.9064978882670403, "advantage_std": 0.9998640343546867, "completion_length": 2811.4375610351562, "epoch": 0.16, "grad_norm": 0.30310213565826416, "kl": 0.005560874938964844, "lambda_div_used": 0.6, "learning_rate": 9.140576474687263e-07, "loss": 0.0002, "reward": 0.05557395005598664, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.05557395005598664, "reward_after_std": 0.861349169164896, "reward_before_mean": 0.45549297146499157, "reward_before_std": 0.827749565243721, "reward_change_max": 0.0005374252796173096, "reward_change_mean": -0.3999190628528595, "reward_change_min": -0.7423599883913994, "reward_change_std": 0.31112305261194706, "reward_std": 0.8613492026925087, "rewards/cosine_scaled_reward": 0.008996479329653084, "rewards/format_reward": 0.4375000149011612, "step": 140 }, { "advantage_max": 1.8470164686441422, "advantage_mean": 1.3659397946064189e-08, "advantage_min": -0.8646907731890678, "advantage_std": 0.9998833239078522, "completion_length": 2427.791717529297, "epoch": 0.16114285714285714, "grad_norm": 0.2331998348236084, "kl": 0.00403594970703125, "lambda_div_used": 0.6, "learning_rate": 9.122022088101613e-07, "loss": 0.0002, "reward": 0.1419997257180512, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1419997257180512, "reward_after_std": 1.0160457417368889, "reward_before_mean": 0.5559970289468765, "reward_before_std": 1.0078918114304543, "reward_change_max": 0.0005534589290618896, "reward_change_mean": -0.41399733535945415, "reward_change_min": -0.8622670099139214, "reward_change_std": 0.3442181684076786, "reward_std": 1.01604575663805, "rewards/cosine_scaled_reward": -0.013668144005350769, "rewards/format_reward": 0.5833333469927311, "step": 141 }, { "advantage_max": 1.7220842242240906, "advantage_mean": 2.3593506925934093e-08, "advantage_min": -1.0945493131875992, "advantage_std": 0.9998142942786217, "completion_length": 2786.6250762939453, "epoch": 0.16228571428571428, "grad_norm": 0.20720338821411133, "kl": 0.003205537796020508, "lambda_div_used": 0.6, "learning_rate": 9.103291169269299e-07, "loss": 0.0001, "reward": 0.00605709757655859, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.00605709757655859, "reward_after_std": 0.7766943480819464, "reward_before_mean": 0.41078952327370644, "reward_before_std": 0.8526837788522243, "reward_change_max": 5.05298376083374e-05, "reward_change_mean": -0.40473244059830904, "reward_change_min": -0.8016922771930695, "reward_change_std": 0.34947027266025543, "reward_std": 0.7766943946480751, "rewards/cosine_scaled_reward": -0.05502191558480263, "rewards/format_reward": 0.5208333469927311, "step": 142 }, { "advantage_max": 1.7795784920454025, "advantage_mean": -7.140139590688932e-09, "advantage_min": -0.9856926649808884, "advantage_std": 0.999704547226429, "completion_length": 2434.5416870117188, "epoch": 0.16342857142857142, "grad_norm": 0.33381637930870056, "kl": 0.004268646240234375, "lambda_div_used": 0.6, "learning_rate": 9.084384631108882e-07, "loss": 0.0002, "reward": -0.1253709946759045, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1253709946759045, "reward_after_std": 0.5376052083447576, "reward_before_mean": 0.24621488712728024, "reward_before_std": 0.5251807440072298, "reward_change_max": 0.0004133358597755432, "reward_change_mean": -0.3715858841314912, "reward_change_min": -0.6634500622749329, "reward_change_std": 0.2629025443457067, "reward_std": 0.537605220451951, "rewards/cosine_scaled_reward": -0.13730923272669315, "rewards/format_reward": 0.5208333432674408, "step": 143 }, { "advantage_max": 1.8112081289291382, "advantage_mean": 4.0357312713901194e-08, "advantage_min": -0.8791662603616714, "advantage_std": 0.9998252764344215, "completion_length": 2935.9167098999023, "epoch": 0.16457142857142856, "grad_norm": 0.2265225052833557, "kl": 0.0033435821533203125, "lambda_div_used": 0.6, "learning_rate": 9.065303395098358e-07, "loss": 0.0001, "reward": -0.005481253378093243, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.005481253378093243, "reward_after_std": 0.7782573513686657, "reward_before_mean": 0.3812122130766511, "reward_before_std": 0.767244616523385, "reward_change_max": 3.51443886756897e-05, "reward_change_mean": -0.38669345434755087, "reward_change_min": -0.7510642148554325, "reward_change_std": 0.30695721600204706, "reward_std": 0.7782573662698269, "rewards/cosine_scaled_reward": 0.003106111893430352, "rewards/format_reward": 0.3750000074505806, "step": 144 }, { "advantage_max": 1.8799023926258087, "advantage_mean": 4.967053790494447e-08, "advantage_min": -0.7879588454961777, "advantage_std": 0.9997758939862251, "completion_length": 2171.104179382324, "epoch": 0.1657142857142857, "grad_norm": 0.2688426971435547, "kl": 0.004181623458862305, "lambda_div_used": 0.6, "learning_rate": 9.046048391230247e-07, "loss": 0.0002, "reward": 0.1414489287417382, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1414489287417382, "reward_after_std": 0.6489234548062086, "reward_before_mean": 0.6365663278847933, "reward_before_std": 0.5827234704047441, "reward_change_max": 0.0009598508477210999, "reward_change_mean": -0.49511735793203115, "reward_change_min": -0.8301094174385071, "reward_change_std": 0.32179426960647106, "reward_std": 0.6489234566688538, "rewards/cosine_scaled_reward": 0.016199816949665546, "rewards/format_reward": 0.6041666716337204, "step": 145 }, { "advantage_max": 1.8417692929506302, "advantage_mean": 5.393909985329515e-09, "advantage_min": -0.9344904869794846, "advantage_std": 0.9997825846076012, "completion_length": 2455.4583587646484, "epoch": 0.16685714285714287, "grad_norm": 0.2010597586631775, "kl": 0.00258481502532959, "lambda_div_used": 0.6, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": 0.019809929188340902, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.019809929188340902, "reward_after_std": 0.6886911056935787, "reward_before_mean": 0.4365716055035591, "reward_before_std": 0.64860606379807, "reward_change_max": 0.0, "reward_change_mean": -0.4167616907507181, "reward_change_min": -0.7325540669262409, "reward_change_std": 0.2858335985802114, "reward_std": 0.6886911205947399, "rewards/cosine_scaled_reward": -0.08379753306508064, "rewards/format_reward": 0.6041666828095913, "step": 146 }, { "advantage_max": 1.7888115048408508, "advantage_mean": 4.035731260287889e-08, "advantage_min": -0.9190972521901131, "advantage_std": 0.9998320192098618, "completion_length": 2554.45841217041, "epoch": 0.168, "grad_norm": 0.38465583324432373, "kl": 0.004227638244628906, "lambda_div_used": 0.6, "learning_rate": 9.007020842191634e-07, "loss": 0.0002, "reward": 0.04151174915023148, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04151174915023148, "reward_after_std": 0.8868411891162395, "reward_before_mean": 0.4299083799123764, "reward_before_std": 0.9076585825532675, "reward_change_max": 2.2679567337036133e-05, "reward_change_mean": -0.3883966105058789, "reward_change_min": -0.809001874178648, "reward_change_std": 0.32688095327466726, "reward_std": 0.886841207742691, "rewards/cosine_scaled_reward": -0.024629172403365374, "rewards/format_reward": 0.47916667349636555, "step": 147 }, { "advantage_max": 1.8225539922714233, "advantage_mean": 1.1175871450497255e-08, "advantage_min": -0.8709155693650246, "advantage_std": 0.999858982861042, "completion_length": 1892.8125762939453, "epoch": 0.16914285714285715, "grad_norm": 0.23031729459762573, "kl": 0.0027790069580078125, "lambda_div_used": 0.6, "learning_rate": 8.987250199168808e-07, "loss": 0.0001, "reward": 0.13457794743590057, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.13457794743590057, "reward_after_std": 0.8694209642708302, "reward_before_mean": 0.5781211638823152, "reward_before_std": 0.8416643925011158, "reward_change_max": 0.0011159330606460571, "reward_change_mean": -0.44354322366416454, "reward_change_min": -0.8777766264975071, "reward_change_std": 0.33100760076195, "reward_std": 0.8694210052490234, "rewards/cosine_scaled_reward": -0.0755227617919445, "rewards/format_reward": 0.7291666716337204, "step": 148 }, { "advantage_max": 1.7685239017009735, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.9461576044559479, "advantage_std": 0.9998336955904961, "completion_length": 2667.6250610351562, "epoch": 0.1702857142857143, "grad_norm": 0.18191303312778473, "kl": 0.003093719482421875, "lambda_div_used": 0.6, "learning_rate": 8.967309592491052e-07, "loss": 0.0001, "reward": -0.005377430468797684, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.005377430468797684, "reward_after_std": 0.8236372619867325, "reward_before_mean": 0.37823933735489845, "reward_before_std": 0.8831204213202, "reward_change_max": 0.0030940771102905273, "reward_change_mean": -0.3836167808622122, "reward_change_min": -0.8212070725858212, "reward_change_std": 0.35686618462204933, "reward_std": 0.8236372880637646, "rewards/cosine_scaled_reward": -0.06088033691048622, "rewards/format_reward": 0.5000000037252903, "step": 149 }, { "advantage_max": 1.6964322626590729, "advantage_mean": 1.614292610696566e-08, "advantage_min": -1.0401954725384712, "advantage_std": 0.9998602271080017, "completion_length": 2585.541732788086, "epoch": 0.17142857142857143, "grad_norm": 0.20473583042621613, "kl": 0.004099845886230469, "lambda_div_used": 0.6, "learning_rate": 8.9471999940354e-07, "loss": 0.0002, "reward": 0.0429554358124733, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0429554358124733, "reward_after_std": 0.9450533464550972, "reward_before_mean": 0.43351098895072937, "reward_before_std": 1.051012322306633, "reward_change_max": 0.0030746981501579285, "reward_change_mean": -0.3905555624514818, "reward_change_min": -0.9020018689334393, "reward_change_std": 0.39117319881916046, "reward_std": 0.9450533874332905, "rewards/cosine_scaled_reward": -0.05407784227281809, "rewards/format_reward": 0.5416666753590107, "step": 150 }, { "advantage_max": 1.7657774835824966, "advantage_mean": -1.8626452158443385e-08, "advantage_min": -1.0298929959535599, "advantage_std": 0.9998343884944916, "completion_length": 2374.2083892822266, "epoch": 0.17257142857142857, "grad_norm": 0.21564297378063202, "kl": 0.004437446594238281, "lambda_div_used": 0.6, "learning_rate": 8.926922383915315e-07, "loss": 0.0002, "reward": 0.1870010308921337, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1870010308921337, "reward_after_std": 0.81415681168437, "reward_before_mean": 0.6816153451800346, "reward_before_std": 0.8424667343497276, "reward_change_max": 0.0006392896175384521, "reward_change_mean": -0.4946143254637718, "reward_change_min": -0.8876711316406727, "reward_change_std": 0.3735849279910326, "reward_std": 0.8141568228602409, "rewards/cosine_scaled_reward": 0.028307669796049595, "rewards/format_reward": 0.6250000149011612, "step": 151 }, { "advantage_max": 1.8536403626203537, "advantage_mean": 3.4769377377230626e-08, "advantage_min": -0.8554203286767006, "advantage_std": 0.9997300058603287, "completion_length": 2729.583354949951, "epoch": 0.1737142857142857, "grad_norm": 0.21470296382904053, "kl": 0.005055665969848633, "lambda_div_used": 0.6, "learning_rate": 8.906477750432903e-07, "loss": 0.0002, "reward": -0.314574146643281, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.314574146643281, "reward_after_std": 0.5909437406808138, "reward_before_mean": -0.06655025156214833, "reward_before_std": 0.5612615784630179, "reward_change_max": 0.0003990978002548218, "reward_change_mean": -0.24802388530224562, "reward_change_min": -0.44809194654226303, "reward_change_std": 0.17653987370431423, "reward_std": 0.5909437648952007, "rewards/cosine_scaled_reward": -0.21035847393795848, "rewards/format_reward": 0.35416667722165585, "step": 152 }, { "advantage_max": 1.8638170510530472, "advantage_mean": -1.4745941578908628e-08, "advantage_min": -0.8921016380190849, "advantage_std": 0.9998102560639381, "completion_length": 2721.5000381469727, "epoch": 0.17485714285714285, "grad_norm": 0.22840584814548492, "kl": 0.006916046142578125, "lambda_div_used": 0.6, "learning_rate": 8.88586709003076e-07, "loss": 0.0003, "reward": -0.3279695939272642, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.3279695939272642, "reward_after_std": 0.6396501064300537, "reward_before_mean": -0.09922650083899498, "reward_before_std": 0.6324530653655529, "reward_change_max": 0.0033247023820877075, "reward_change_mean": -0.22874312056228518, "reward_change_min": -0.46635738760232925, "reward_change_std": 0.1982931261882186, "reward_std": 0.6396501064300537, "rewards/cosine_scaled_reward": -0.23711324855685234, "rewards/format_reward": 0.3750000074505806, "step": 153 }, { "advantage_max": 1.7728360295295715, "advantage_mean": 4.594524849466097e-08, "advantage_min": -0.9388241320848465, "advantage_std": 0.9998360797762871, "completion_length": 3298.4584045410156, "epoch": 0.176, "grad_norm": 0.18289536237716675, "kl": 0.0028214454650878906, "lambda_div_used": 0.6, "learning_rate": 8.865091407243394e-07, "loss": 0.0001, "reward": -0.10140408016741276, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.10140408016741276, "reward_after_std": 0.8765006735920906, "reward_before_mean": 0.21658623684197664, "reward_before_std": 0.9439742900431156, "reward_change_max": 0.001234501600265503, "reward_change_mean": -0.3179903104901314, "reward_change_min": -0.7761300541460514, "reward_change_std": 0.3139498056843877, "reward_std": 0.8765007182955742, "rewards/cosine_scaled_reward": -0.05837355088442564, "rewards/format_reward": 0.33333334140479565, "step": 154 }, { "advantage_max": 1.8248510509729385, "advantage_mean": 7.76102093702491e-09, "advantage_min": -0.8832328766584396, "advantage_std": 0.9998307749629021, "completion_length": 2646.291717529297, "epoch": 0.17714285714285713, "grad_norm": 0.21617059409618378, "kl": 0.0036039352416992188, "lambda_div_used": 0.6, "learning_rate": 8.844151714648274e-07, "loss": 0.0001, "reward": 0.1411822196096182, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1411822196096182, "reward_after_std": 0.7428156174719334, "reward_before_mean": 0.6130893367808312, "reward_before_std": 0.6941012628376484, "reward_change_max": 0.0009594261646270752, "reward_change_mean": -0.47190713416785, "reward_change_min": -0.8309516459703445, "reward_change_std": 0.322382427752018, "reward_std": 0.7428156286478043, "rewards/cosine_scaled_reward": 0.06696132896468043, "rewards/format_reward": 0.47916666977107525, "step": 155 }, { "advantage_max": 1.722646802663803, "advantage_mean": 7.710575078423432e-08, "advantage_min": -1.010897733271122, "advantage_std": 0.9998083412647247, "completion_length": 2569.708366394043, "epoch": 0.1782857142857143, "grad_norm": 0.16158613562583923, "kl": 0.0024747848510742188, "lambda_div_used": 0.6, "learning_rate": 8.823049032816478e-07, "loss": 0.0001, "reward": 0.026534391567111015, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.026534391567111015, "reward_after_std": 0.7706789597868919, "reward_before_mean": 0.44334378745406866, "reward_before_std": 0.8541289437562227, "reward_change_max": 0.0033616721630096436, "reward_change_mean": -0.4168093828484416, "reward_change_min": -0.8273204155266285, "reward_change_std": 0.3676574770361185, "reward_std": 0.7706789746880531, "rewards/cosine_scaled_reward": -0.017911457223817706, "rewards/format_reward": 0.4791666716337204, "step": 156 }, { "advantage_max": 1.8470103442668915, "advantage_mean": -1.1486312068154092e-08, "advantage_min": -0.8701950237154961, "advantage_std": 0.9997879564762115, "completion_length": 2702.687530517578, "epoch": 0.17942857142857144, "grad_norm": 0.2701130211353302, "kl": 0.004548072814941406, "lambda_div_used": 0.6, "learning_rate": 8.801784390262943e-07, "loss": 0.0002, "reward": -0.20367415808141232, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.20367415808141232, "reward_after_std": 0.6155711524188519, "reward_before_mean": 0.10635744594037533, "reward_before_std": 0.6107806749641895, "reward_change_max": 0.0011181086301803589, "reward_change_mean": -0.31003160774707794, "reward_change_min": -0.5878064446151257, "reward_change_std": 0.24444873072206974, "reward_std": 0.6155711822211742, "rewards/cosine_scaled_reward": -0.17598796007223427, "rewards/format_reward": 0.4583333469927311, "step": 157 }, { "advantage_max": 1.7345101982355118, "advantage_mean": 9.934107647602275e-09, "advantage_min": -1.0180958062410355, "advantage_std": 0.9998627975583076, "completion_length": 2785.5834350585938, "epoch": 0.18057142857142858, "grad_norm": 0.2903349697589874, "kl": 0.003916740417480469, "lambda_div_used": 0.6, "learning_rate": 8.780358823396352e-07, "loss": 0.0002, "reward": 0.1706813657656312, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1706813657656312, "reward_after_std": 0.8751989975571632, "reward_before_mean": 0.6363602974452078, "reward_before_std": 0.9112723432481289, "reward_change_max": 0.004944115877151489, "reward_change_mean": -0.4656789507716894, "reward_change_min": -0.87837153673172, "reward_change_std": 0.37687744572758675, "reward_std": 0.8751990273594856, "rewards/cosine_scaled_reward": 0.09943014103919268, "rewards/format_reward": 0.43750000558793545, "step": 158 }, { "advantage_max": 1.8704514354467392, "advantage_mean": 8.940696871739817e-08, "advantage_min": -0.7633762657642365, "advantage_std": 0.9997115284204483, "completion_length": 2753.2291717529297, "epoch": 0.18171428571428572, "grad_norm": 0.14626438915729523, "kl": 0.004486083984375, "lambda_div_used": 0.6, "learning_rate": 8.758773376468604e-07, "loss": 0.0002, "reward": -0.38985342904925346, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.38985342904925346, "reward_after_std": 0.40392925404012203, "reward_before_mean": -0.14509601891040802, "reward_before_std": 0.36197624169290066, "reward_change_max": 0.001153610646724701, "reward_change_mean": -0.24475740175694227, "reward_change_min": -0.45411108434200287, "reward_change_std": 0.16509783919900656, "reward_std": 0.4039292652159929, "rewards/cosine_scaled_reward": -0.2600480148103088, "rewards/format_reward": 0.375, "step": 159 }, { "advantage_max": 1.8221765458583832, "advantage_mean": 3.911554902202852e-08, "advantage_min": -0.9147254899144173, "advantage_std": 0.999821625649929, "completion_length": 2687.145896911621, "epoch": 0.18285714285714286, "grad_norm": 0.2758440375328064, "kl": 0.006741523742675781, "lambda_div_used": 0.6, "learning_rate": 8.737029101523929e-07, "loss": 0.0003, "reward": -0.13885139022022486, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13885139022022486, "reward_after_std": 0.7137699276208878, "reward_before_mean": 0.18189829215407372, "reward_before_std": 0.6966813541948795, "reward_change_max": 0.004106998443603516, "reward_change_mean": -0.3207496590912342, "reward_change_min": -0.6812379769980907, "reward_change_std": 0.27391286846250296, "reward_std": 0.713769931346178, "rewards/cosine_scaled_reward": -0.1173841985873878, "rewards/format_reward": 0.41666667349636555, "step": 160 }, { "advantage_max": 1.7642860263586044, "advantage_mean": 2.5456150853919723e-08, "advantage_min": -0.9674390330910683, "advantage_std": 0.9998326897621155, "completion_length": 2567.875030517578, "epoch": 0.184, "grad_norm": 0.2193051278591156, "kl": 0.006969451904296875, "lambda_div_used": 0.6, "learning_rate": 8.715127058347614e-07, "loss": 0.0003, "reward": 0.04702313430607319, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.04702313430607319, "reward_after_std": 0.7199640087783337, "reward_before_mean": 0.4766552746295929, "reward_before_std": 0.709110327064991, "reward_change_max": 0.0007743611931800842, "reward_change_mean": -0.42963212728500366, "reward_change_min": -0.8557399734854698, "reward_change_std": 0.3297593742609024, "reward_std": 0.7199640311300755, "rewards/cosine_scaled_reward": -0.022089052945375443, "rewards/format_reward": 0.5208333432674408, "step": 161 }, { "advantage_max": 1.6750542670488358, "advantage_mean": 3.787378499708893e-08, "advantage_min": -1.1022668778896332, "advantage_std": 0.9998056441545486, "completion_length": 3112.0208435058594, "epoch": 0.18514285714285714, "grad_norm": 0.18071846663951874, "kl": 0.00931549072265625, "lambda_div_used": 0.6, "learning_rate": 8.693068314414344e-07, "loss": 0.0004, "reward": -0.08226488158106804, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.08226488158106804, "reward_after_std": 0.717128112912178, "reward_before_mean": 0.283852843567729, "reward_before_std": 0.7811130806803703, "reward_change_max": 0.0, "reward_change_mean": -0.36611773632466793, "reward_change_min": -0.7776030413806438, "reward_change_std": 0.3201664984226227, "reward_std": 0.7171281166374683, "rewards/cosine_scaled_reward": -0.0455735728610307, "rewards/format_reward": 0.3750000037252903, "step": 162 }, { "advantage_max": 1.8762730807065964, "advantage_mean": -3.725290076417309e-09, "advantage_min": -0.8003713823854923, "advantage_std": 0.9998403191566467, "completion_length": 2399.6041870117188, "epoch": 0.18628571428571428, "grad_norm": 0.21399880945682526, "kl": 0.004780769348144531, "lambda_div_used": 0.6, "learning_rate": 8.670853944836176e-07, "loss": 0.0002, "reward": 0.26743081398308277, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26743081398308277, "reward_after_std": 0.7890615798532963, "reward_before_mean": 0.7968443520367146, "reward_before_std": 0.6803100085817277, "reward_change_max": 0.0009098872542381287, "reward_change_mean": -0.5294135119765997, "reward_change_min": -0.8948076628148556, "reward_change_std": 0.34071714151650667, "reward_std": 0.7890615947544575, "rewards/cosine_scaled_reward": 0.08592214062809944, "rewards/format_reward": 0.6250000055879354, "step": 163 }, { "advantage_max": 1.8100349009037018, "advantage_mean": 2.545615029880821e-08, "advantage_min": -0.9734407514333725, "advantage_std": 0.9997857436537743, "completion_length": 2345.1667251586914, "epoch": 0.18742857142857142, "grad_norm": 0.2537487745285034, "kl": 0.004994392395019531, "lambda_div_used": 0.6, "learning_rate": 8.648485032310144e-07, "loss": 0.0002, "reward": 0.16789760813117027, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16789760813117027, "reward_after_std": 0.6567693762481213, "reward_before_mean": 0.676574507728219, "reward_before_std": 0.6022562142461538, "reward_change_max": 0.0014617964625358582, "reward_change_mean": -0.508676890283823, "reward_change_min": -0.8728748597204685, "reward_change_std": 0.3502998370677233, "reward_std": 0.6567693911492825, "rewards/cosine_scaled_reward": 0.03620391618460417, "rewards/format_reward": 0.6041666753590107, "step": 164 }, { "advantage_max": 1.8189441114664078, "advantage_mean": 1.6142925884921056e-08, "advantage_min": -1.040967583656311, "advantage_std": 0.9997537732124329, "completion_length": 2431.6250610351562, "epoch": 0.18857142857142858, "grad_norm": 0.24487285315990448, "kl": 0.005176544189453125, "lambda_div_used": 0.6, "learning_rate": 8.625962667065487e-07, "loss": 0.0002, "reward": -0.33520201966166496, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.33520201966166496, "reward_after_std": 0.4438677877187729, "reward_before_mean": -0.06665671244263649, "reward_before_std": 0.4154568985104561, "reward_change_max": 0.0014463141560554504, "reward_change_mean": -0.2685453128069639, "reward_change_min": -0.4497813992202282, "reward_change_std": 0.1854629199951887, "reward_std": 0.4438677951693535, "rewards/cosine_scaled_reward": -0.25207835622131824, "rewards/format_reward": 0.4375000111758709, "step": 165 }, { "advantage_max": 1.8155362904071808, "advantage_mean": 5.898376925772553e-09, "advantage_min": -0.8460855633020401, "advantage_std": 0.999803401529789, "completion_length": 2481.645881652832, "epoch": 0.18971428571428572, "grad_norm": 0.17104172706604004, "kl": 0.003920555114746094, "lambda_div_used": 0.6, "learning_rate": 8.603287946810513e-07, "loss": 0.0002, "reward": -0.06894381903111935, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.06894381903111935, "reward_after_std": 0.7122244127094746, "reward_before_mean": 0.29707395657896996, "reward_before_std": 0.7052637934684753, "reward_change_max": 0.0003136247396469116, "reward_change_mean": -0.3660177676938474, "reward_change_min": -0.7036398574709892, "reward_change_std": 0.28611435275524855, "reward_std": 0.7122244350612164, "rewards/cosine_scaled_reward": -0.11187970079481602, "rewards/format_reward": 0.5208333432674408, "step": 166 }, { "advantage_max": 1.822963908314705, "advantage_mean": -1.3659398390153399e-08, "advantage_min": -0.856605276465416, "advantage_std": 0.9998670220375061, "completion_length": 2174.6667404174805, "epoch": 0.19085714285714286, "grad_norm": 0.1990378051996231, "kl": 0.0035414695739746094, "lambda_div_used": 0.6, "learning_rate": 8.580461976679099e-07, "loss": 0.0001, "reward": 0.13649043417535722, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.13649043417535722, "reward_after_std": 0.9280563369393349, "reward_before_mean": 0.5691012926399708, "reward_before_std": 0.9118999019265175, "reward_change_max": 0.00039912760257720947, "reward_change_mean": -0.432610847055912, "reward_change_min": -0.8885245770215988, "reward_change_std": 0.34525672532618046, "reward_std": 0.9280563518404961, "rewards/cosine_scaled_reward": -0.09044937463477254, "rewards/format_reward": 0.7500000111758709, "step": 167 }, { "advantage_max": 1.8217055052518845, "advantage_mean": 1.5832484961952886e-08, "advantage_min": -0.9166247323155403, "advantage_std": 0.9998238533735275, "completion_length": 2714.3959197998047, "epoch": 0.192, "grad_norm": 0.2229907512664795, "kl": 0.0042667388916015625, "lambda_div_used": 0.6, "learning_rate": 8.557485869176825e-07, "loss": 0.0002, "reward": -0.14375681872479618, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.14375681872479618, "reward_after_std": 0.731827724725008, "reward_before_mean": 0.17551885545253754, "reward_before_std": 0.7568799871951342, "reward_change_max": 0.0009675100445747375, "reward_change_mean": -0.31927567534148693, "reward_change_min": -0.7318533398211002, "reward_change_std": 0.28689664974808693, "reward_std": 0.7318277433514595, "rewards/cosine_scaled_reward": -0.15182391228154302, "rewards/format_reward": 0.47916668094694614, "step": 168 }, { "advantage_max": 1.8260702639818192, "advantage_mean": -3.228585088166369e-08, "advantage_min": -0.9273155555129051, "advantage_std": 0.9998803958296776, "completion_length": 1702.958381652832, "epoch": 0.19314285714285714, "grad_norm": 0.20557190477848053, "kl": 0.0043087005615234375, "lambda_div_used": 0.6, "learning_rate": 8.534360744126753e-07, "loss": 0.0002, "reward": 0.6049684919416904, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6049684919416904, "reward_after_std": 0.9428669586777687, "reward_before_mean": 1.2980639263987541, "reward_before_std": 0.870427917689085, "reward_change_max": 0.0, "reward_change_mean": -0.6930954158306122, "reward_change_min": -1.1992580592632294, "reward_change_std": 0.47029072418808937, "reward_std": 0.9428669735789299, "rewards/cosine_scaled_reward": 0.2219486115500331, "rewards/format_reward": 0.854166679084301, "step": 169 }, { "advantage_max": 1.8410781174898148, "advantage_mean": -2.0178655718572358e-08, "advantage_min": -0.8170875944197178, "advantage_std": 0.999827153980732, "completion_length": 2252.854217529297, "epoch": 0.19428571428571428, "grad_norm": 0.24229690432548523, "kl": 0.0041675567626953125, "lambda_div_used": 0.6, "learning_rate": 8.511087728614862e-07, "loss": 0.0002, "reward": 0.09339876628291677, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09339876628291677, "reward_after_std": 0.6783297061920166, "reward_before_mean": 0.5516661715810187, "reward_before_std": 0.592248173430562, "reward_change_max": 0.0025824084877967834, "reward_change_mean": -0.458267405629158, "reward_change_min": -0.825487308204174, "reward_change_std": 0.3272961787879467, "reward_std": 0.6783297136425972, "rewards/cosine_scaled_reward": 0.015416416805237532, "rewards/format_reward": 0.5208333414047956, "step": 170 }, { "advantage_max": 1.8135200291872025, "advantage_mean": 9.002785184009099e-09, "advantage_min": -0.9001466482877731, "advantage_std": 0.9998125210404396, "completion_length": 2336.0208740234375, "epoch": 0.19542857142857142, "grad_norm": 0.22073253989219666, "kl": 0.0040073394775390625, "lambda_div_used": 0.6, "learning_rate": 8.487667956935087e-07, "loss": 0.0002, "reward": -0.02373066544532776, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.02373066544532776, "reward_after_std": 0.743452426046133, "reward_before_mean": 0.3574244696646929, "reward_before_std": 0.735035166144371, "reward_change_max": 0.004195690155029297, "reward_change_mean": -0.38115513836964965, "reward_change_min": -0.7801596075296402, "reward_change_std": 0.2982376590371132, "reward_std": 0.7434524372220039, "rewards/cosine_scaled_reward": -0.07128777727484703, "rewards/format_reward": 0.5000000111758709, "step": 171 }, { "advantage_max": 1.835044652223587, "advantage_mean": -2.3593505649177615e-08, "advantage_min": -0.9643918573856354, "advantage_std": 0.999810203909874, "completion_length": 2526.7083435058594, "epoch": 0.19657142857142856, "grad_norm": 0.24751603603363037, "kl": 0.005962371826171875, "lambda_div_used": 0.6, "learning_rate": 8.464102570534061e-07, "loss": 0.0002, "reward": 0.2530949041247368, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2530949041247368, "reward_after_std": 0.7152044381946325, "reward_before_mean": 0.7942209746688604, "reward_before_std": 0.642445981502533, "reward_change_max": 0.0018026381731033325, "reward_change_mean": -0.5411260761320591, "reward_change_min": -0.8798088431358337, "reward_change_std": 0.3599870717152953, "reward_std": 0.7152044512331486, "rewards/cosine_scaled_reward": 0.15752714965492487, "rewards/format_reward": 0.47916667349636555, "step": 172 }, { "advantage_max": 1.8556355088949203, "advantage_mean": -2.220446049250313e-16, "advantage_min": -0.8953605890274048, "advantage_std": 0.9997614771127701, "completion_length": 1534.0625343322754, "epoch": 0.1977142857142857, "grad_norm": 0.27481091022491455, "kl": 0.004489898681640625, "lambda_div_used": 0.6, "learning_rate": 8.440392717955475e-07, "loss": 0.0002, "reward": 0.0394736472517252, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0394736472517252, "reward_after_std": 0.588263314217329, "reward_before_mean": 0.48301520850509405, "reward_before_std": 0.5054080621339381, "reward_change_max": 0.0009268000721931458, "reward_change_mean": -0.4435415752232075, "reward_change_min": -0.653259627521038, "reward_change_std": 0.2701799310743809, "reward_std": 0.5882633421570063, "rewards/cosine_scaled_reward": -0.12307572877034545, "rewards/format_reward": 0.7291666679084301, "step": 173 }, { "advantage_max": 1.9079472124576569, "advantage_mean": -3.4148496252939253e-09, "advantage_min": -0.8583328798413277, "advantage_std": 0.9998527467250824, "completion_length": 1573.0208892822266, "epoch": 0.19885714285714284, "grad_norm": 0.3282460868358612, "kl": 0.015633583068847656, "lambda_div_used": 0.6, "learning_rate": 8.416539554784089e-07, "loss": 0.0006, "reward": 0.3546122731640935, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3546122731640935, "reward_after_std": 0.7721526771783829, "reward_before_mean": 0.9311862445902079, "reward_before_std": 0.6311989836394787, "reward_change_max": 0.0, "reward_change_mean": -0.5765739753842354, "reward_change_min": -0.9307240545749664, "reward_change_std": 0.3313662502914667, "reward_std": 0.7721527107059956, "rewards/cosine_scaled_reward": 0.03850978892296553, "rewards/format_reward": 0.8541666716337204, "step": 174 }, { "advantage_max": 1.8203274607658386, "advantage_mean": -1.3038516932795119e-08, "advantage_min": -0.8509106487035751, "advantage_std": 0.999811090528965, "completion_length": 2614.9583587646484, "epoch": 0.2, "grad_norm": 0.1870286613702774, "kl": 0.0050258636474609375, "lambda_div_used": 0.6, "learning_rate": 8.392544243589427e-07, "loss": 0.0002, "reward": 0.13974158838391304, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13974158838391304, "reward_after_std": 0.6508477814495564, "reward_before_mean": 0.6339031849056482, "reward_before_std": 0.5851920321583748, "reward_change_max": 0.0010417401790618896, "reward_change_mean": -0.49416152387857437, "reward_change_min": -0.8415251597762108, "reward_change_std": 0.3433941416442394, "reward_std": 0.6508478038012981, "rewards/cosine_scaled_reward": 0.06695156544446945, "rewards/format_reward": 0.5, "step": 175 }, { "advantage_max": 1.8568035066127777, "advantage_mean": 6.829699361610153e-09, "advantage_min": -0.8530491329729557, "advantage_std": 0.999842494726181, "completion_length": 1939.8750534057617, "epoch": 0.20114285714285715, "grad_norm": 0.22778016328811646, "kl": 0.0038318634033203125, "lambda_div_used": 0.6, "learning_rate": 8.368407953869103e-07, "loss": 0.0002, "reward": 0.2435381426475942, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2435381426475942, "reward_after_std": 0.804627712816, "reward_before_mean": 0.7599546858109534, "reward_before_std": 0.7316133556887507, "reward_change_max": 0.0, "reward_change_mean": -0.5164165645837784, "reward_change_min": -0.9660816714167595, "reward_change_std": 0.3689718898385763, "reward_std": 0.8046277277171612, "rewards/cosine_scaled_reward": 0.015394015703350306, "rewards/format_reward": 0.7291666716337204, "step": 176 }, { "advantage_max": 1.8439493477344513, "advantage_mean": 7.799827006493842e-09, "advantage_min": -0.9431622736155987, "advantage_std": 0.9998452067375183, "completion_length": 2418.000068664551, "epoch": 0.2022857142857143, "grad_norm": 0.21349747478961945, "kl": 0.004181861877441406, "lambda_div_used": 0.6, "learning_rate": 8.344131861991828e-07, "loss": 0.0002, "reward": 0.23039765562862158, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23039765562862158, "reward_after_std": 0.7614034824073315, "reward_before_mean": 0.7456597234122455, "reward_before_std": 0.6677466258406639, "reward_change_max": 0.0008978694677352905, "reward_change_mean": -0.5152620803564787, "reward_change_min": -0.8554310090839863, "reward_change_std": 0.3227596618235111, "reward_std": 0.7614035047590733, "rewards/cosine_scaled_reward": 0.07074652705341578, "rewards/format_reward": 0.6041666697710752, "step": 177 }, { "advantage_max": 1.8696839362382889, "advantage_mean": 1.4280280069556284e-08, "advantage_min": -0.8557280600070953, "advantage_std": 0.9998672232031822, "completion_length": 1876.7500305175781, "epoch": 0.20342857142857143, "grad_norm": 0.21730564534664154, "kl": 0.00606536865234375, "lambda_div_used": 0.6, "learning_rate": 8.319717151140072e-07, "loss": 0.0002, "reward": 0.17197159988427302, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17197159988427302, "reward_after_std": 0.9298111759126186, "reward_before_mean": 0.6148823341354728, "reward_before_std": 0.8897008560597897, "reward_change_max": 0.0001780092716217041, "reward_change_mean": -0.4429107364267111, "reward_change_min": -0.8260507620871067, "reward_change_std": 0.3189305029809475, "reward_std": 0.9298111908137798, "rewards/cosine_scaled_reward": -0.07797550270333886, "rewards/format_reward": 0.7708333414047956, "step": 178 }, { "advantage_max": 1.7686524093151093, "advantage_mean": 1.241763691872677e-09, "advantage_min": -1.0116918981075287, "advantage_std": 0.999799333512783, "completion_length": 2257.770866394043, "epoch": 0.20457142857142857, "grad_norm": 0.23837533593177795, "kl": 0.00466156005859375, "lambda_div_used": 0.6, "learning_rate": 8.295165011252396e-07, "loss": 0.0002, "reward": -0.06246851943433285, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.06246851943433285, "reward_after_std": 0.6824757643043995, "reward_before_mean": 0.3122526276856661, "reward_before_std": 0.6739065647125244, "reward_change_max": 0.000336281955242157, "reward_change_mean": -0.3747211927548051, "reward_change_min": -0.6586055941879749, "reward_change_std": 0.2646934259682894, "reward_std": 0.6824758015573025, "rewards/cosine_scaled_reward": -0.10429035313427448, "rewards/format_reward": 0.520833333954215, "step": 179 }, { "advantage_max": 1.8220677822828293, "advantage_mean": -2.980232316485143e-08, "advantage_min": -0.9002445340156555, "advantage_std": 0.9998351335525513, "completion_length": 1576.7083740234375, "epoch": 0.2057142857142857, "grad_norm": 0.259470134973526, "kl": 0.006572723388671875, "lambda_div_used": 0.6, "learning_rate": 8.270476638965461e-07, "loss": 0.0003, "reward": 0.44151231832802296, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.44151231832802296, "reward_after_std": 0.8216301538050175, "reward_before_mean": 1.0700802188366652, "reward_before_std": 0.7572079785168171, "reward_change_max": 0.0004591718316078186, "reward_change_mean": -0.628567922860384, "reward_change_min": -1.0779259391129017, "reward_change_std": 0.4193782526999712, "reward_std": 0.821630172431469, "rewards/cosine_scaled_reward": 0.12879010662436485, "rewards/format_reward": 0.8125000074505806, "step": 180 }, { "advantage_max": 1.8709893822669983, "advantage_mean": 3.197540909827268e-08, "advantage_min": -0.8805593922734261, "advantage_std": 0.999825045466423, "completion_length": 2620.750030517578, "epoch": 0.20685714285714285, "grad_norm": 0.19660595059394836, "kl": 0.00618743896484375, "lambda_div_used": 0.6, "learning_rate": 8.245653237555705e-07, "loss": 0.0002, "reward": 0.13136461284011602, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13136461284011602, "reward_after_std": 0.8224067687988281, "reward_before_mean": 0.5772053925320506, "reward_before_std": 0.7477911319583654, "reward_change_max": 0.0013022571802139282, "reward_change_mean": -0.44584078434854746, "reward_change_min": -0.7741091437637806, "reward_change_std": 0.3098747590556741, "reward_std": 0.8224067911505699, "rewards/cosine_scaled_reward": 0.038602693006396294, "rewards/format_reward": 0.5000000074505806, "step": 181 }, { "advantage_max": 1.8176920861005783, "advantage_mean": 2.204130183924846e-08, "advantage_min": -0.9655944332480431, "advantage_std": 0.9998245909810066, "completion_length": 1977.2291946411133, "epoch": 0.208, "grad_norm": 0.1666197031736374, "kl": 0.0028371810913085938, "lambda_div_used": 0.6, "learning_rate": 8.220696016880687e-07, "loss": 0.0001, "reward": 0.06960967741906643, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06960967741906643, "reward_after_std": 0.771950475871563, "reward_before_mean": 0.49913039430975914, "reward_before_std": 0.7607616037130356, "reward_change_max": 0.0009039789438247681, "reward_change_mean": -0.4295206815004349, "reward_change_min": -0.7781954184174538, "reward_change_std": 0.3130391491577029, "reward_std": 0.7719505093991756, "rewards/cosine_scaled_reward": -0.09418481879401952, "rewards/format_reward": 0.6875000074505806, "step": 182 }, { "advantage_max": 1.780486524105072, "advantage_mean": -4.967054045845742e-09, "advantage_min": -1.0759043172001839, "advantage_std": 0.9998541921377182, "completion_length": 1652.2709045410156, "epoch": 0.20914285714285713, "grad_norm": 0.24588677287101746, "kl": 0.006328582763671875, "lambda_div_used": 0.6, "learning_rate": 8.195606193320136e-07, "loss": 0.0003, "reward": 0.34637500741519034, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.34637500741519034, "reward_after_std": 0.828385803848505, "reward_before_mean": 0.9232957623898983, "reward_before_std": 0.8138195388019085, "reward_change_max": 0.0002490729093551636, "reward_change_mean": -0.5769207254052162, "reward_change_min": -0.9477719999849796, "reward_change_std": 0.38944604992866516, "reward_std": 0.8283858075737953, "rewards/cosine_scaled_reward": 0.04498119559139013, "rewards/format_reward": 0.8333333544433117, "step": 183 }, { "advantage_max": 1.8851233571767807, "advantage_mean": 1.8626449271863521e-09, "advantage_min": -0.9318333119153976, "advantage_std": 0.9998067617416382, "completion_length": 1994.8541831970215, "epoch": 0.2102857142857143, "grad_norm": 0.2858082056045532, "kl": 0.0061016082763671875, "lambda_div_used": 0.6, "learning_rate": 8.170384989716657e-07, "loss": 0.0002, "reward": -0.03480223537189886, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.03480223537189886, "reward_after_std": 0.575218565762043, "reward_before_mean": 0.36644532158970833, "reward_before_std": 0.47599741257727146, "reward_change_max": 0.0002749115228652954, "reward_change_mean": -0.40124754048883915, "reward_change_min": -0.6298751458525658, "reward_change_std": 0.2488370854407549, "reward_std": 0.5752185806632042, "rewards/cosine_scaled_reward": -0.15011069364845753, "rewards/format_reward": 0.6666666753590107, "step": 184 }, { "advantage_max": 1.793496459722519, "advantage_mean": 8.692344288796505e-09, "advantage_min": -1.0042466521263123, "advantage_std": 0.9997899830341339, "completion_length": 1939.1250534057617, "epoch": 0.21142857142857144, "grad_norm": 0.2225847840309143, "kl": 0.0044708251953125, "lambda_div_used": 0.6, "learning_rate": 8.145033635316128e-07, "loss": 0.0002, "reward": -0.03830873221158981, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03830873221158981, "reward_after_std": 0.541305024176836, "reward_before_mean": 0.3769722692668438, "reward_before_std": 0.5038091838359833, "reward_change_max": 0.0, "reward_change_mean": -0.41528095677495, "reward_change_min": -0.6677366159856319, "reward_change_std": 0.26928949914872646, "reward_std": 0.5413050428032875, "rewards/cosine_scaled_reward": -0.14484722539782524, "rewards/format_reward": 0.666666679084301, "step": 185 }, { "advantage_max": 1.8013376891613007, "advantage_mean": 3.97364305904091e-08, "advantage_min": -0.9621232002973557, "advantage_std": 0.999778263270855, "completion_length": 2184.291702270508, "epoch": 0.21257142857142858, "grad_norm": 0.21466225385665894, "kl": 0.005809783935546875, "lambda_div_used": 0.6, "learning_rate": 8.119553365707802e-07, "loss": 0.0002, "reward": -0.1045905682258308, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1045905682258308, "reward_after_std": 0.6125866509974003, "reward_before_mean": 0.26320910919457674, "reward_before_std": 0.6129262074828148, "reward_change_max": 0.0007858425378799438, "reward_change_mean": -0.3677996820770204, "reward_change_min": -0.6774780340492725, "reward_change_std": 0.2679179043043405, "reward_std": 0.6125866509974003, "rewards/cosine_scaled_reward": -0.14964544959366322, "rewards/format_reward": 0.562500013038516, "step": 186 }, { "advantage_max": 1.853604942560196, "advantage_mean": 1.0554989715583218e-08, "advantage_min": -0.9163635894656181, "advantage_std": 0.9997911527752876, "completion_length": 1584.1458587646484, "epoch": 0.21371428571428572, "grad_norm": 0.30856335163116455, "kl": 0.007468223571777344, "lambda_div_used": 0.6, "learning_rate": 8.093945422764069e-07, "loss": 0.0003, "reward": 0.07727741380222142, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07727741380222142, "reward_after_std": 0.5403100177645683, "reward_before_mean": 0.5557239428162575, "reward_before_std": 0.4463806804269552, "reward_change_max": 0.0005810186266899109, "reward_change_mean": -0.478446539491415, "reward_change_min": -0.7569740489125252, "reward_change_std": 0.28474703803658485, "reward_std": 0.5403100252151489, "rewards/cosine_scaled_reward": -0.13880470301955938, "rewards/format_reward": 0.8333333358168602, "step": 187 }, { "advantage_max": 1.826738864183426, "advantage_mean": 1.071021038523412e-08, "advantage_min": -0.9134683609008789, "advantage_std": 0.9997814372181892, "completion_length": 2628.9583587646484, "epoch": 0.21485714285714286, "grad_norm": 0.2049739956855774, "kl": 0.0063419342041015625, "lambda_div_used": 0.6, "learning_rate": 8.068211054579943e-07, "loss": 0.0003, "reward": -0.13351251278072596, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13351251278072596, "reward_after_std": 0.6621253341436386, "reward_before_mean": 0.20025232713669538, "reward_before_std": 0.6175753083080053, "reward_change_max": 0.0006356909871101379, "reward_change_mean": -0.3337648417800665, "reward_change_min": -0.5791598781943321, "reward_change_std": 0.2304175542667508, "reward_std": 0.6621253415942192, "rewards/cosine_scaled_reward": -0.10820717085152864, "rewards/format_reward": 0.4166666679084301, "step": 188 }, { "advantage_max": 1.8772830069065094, "advantage_mean": 1.1175871450497255e-08, "advantage_min": -0.7826530821621418, "advantage_std": 0.9998155608773232, "completion_length": 1632.5208854675293, "epoch": 0.216, "grad_norm": 0.24881505966186523, "kl": 0.0060138702392578125, "lambda_div_used": 0.6, "learning_rate": 8.04235151541222e-07, "loss": 0.0002, "reward": 0.09520456660538912, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.09520456660538912, "reward_after_std": 0.7265063393861055, "reward_before_mean": 0.540639003738761, "reward_before_std": 0.6537737995386124, "reward_change_max": 0.0, "reward_change_mean": -0.44543439242988825, "reward_change_min": -0.7373391389846802, "reward_change_std": 0.2890320150181651, "reward_std": 0.7265063729137182, "rewards/cosine_scaled_reward": -0.10468053352087736, "rewards/format_reward": 0.7500000037252903, "step": 189 }, { "advantage_max": 1.9335829019546509, "advantage_mean": 3.7252901874396116e-09, "advantage_min": -0.7080438584089279, "advantage_std": 0.9998445361852646, "completion_length": 1214.8333740234375, "epoch": 0.21714285714285714, "grad_norm": 0.2329777032136917, "kl": 0.004730224609375, "lambda_div_used": 0.6, "learning_rate": 8.01636806561836e-07, "loss": 0.0002, "reward": 0.33617127127945423, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.33617127127945423, "reward_after_std": 0.7745129093527794, "reward_before_mean": 0.9009418655186892, "reward_before_std": 0.6023400258272886, "reward_change_max": 0.0, "reward_change_mean": -0.5647705905139446, "reward_change_min": -0.8931130617856979, "reward_change_std": 0.3234950266778469, "reward_std": 0.7745129317045212, "rewards/cosine_scaled_reward": -0.01827908866107464, "rewards/format_reward": 0.9375, "step": 190 }, { "advantage_max": 1.9248203337192535, "advantage_mean": 1.2417640249395845e-09, "advantage_min": -0.7405965775251389, "advantage_std": 0.9998944103717804, "completion_length": 1295.3541984558105, "epoch": 0.21828571428571428, "grad_norm": 0.2606213688850403, "kl": 0.006084442138671875, "lambda_div_used": 0.6, "learning_rate": 7.990261971595048e-07, "loss": 0.0002, "reward": 0.4639074660371989, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4639074660371989, "reward_after_std": 1.0598181448876858, "reward_before_mean": 1.0392481312155724, "reward_before_std": 0.9292659349739552, "reward_change_max": 0.0, "reward_change_mean": -0.5753406658768654, "reward_change_min": -1.0478120222687721, "reward_change_std": 0.3785528726875782, "reward_std": 1.0598181709647179, "rewards/cosine_scaled_reward": 0.0821240646764636, "rewards/format_reward": 0.8750000111758709, "step": 191 }, { "advantage_max": 1.851755753159523, "advantage_mean": 4.346172255420555e-09, "advantage_min": -0.9270682707428932, "advantage_std": 0.9998274967074394, "completion_length": 2081.604217529297, "epoch": 0.21942857142857142, "grad_norm": 0.205403134226799, "kl": 0.005340576171875, "lambda_div_used": 0.6, "learning_rate": 7.964034505716476e-07, "loss": 0.0002, "reward": 0.044922725297510624, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.044922725297510624, "reward_after_std": 0.7124022357165813, "reward_before_mean": 0.46672992315143347, "reward_before_std": 0.6790667753666639, "reward_change_max": 0.00206892192363739, "reward_change_mean": -0.4218071922659874, "reward_change_min": -0.7962087951600552, "reward_change_std": 0.29674767330288887, "reward_std": 0.7124022506177425, "rewards/cosine_scaled_reward": -0.11038506031036377, "rewards/format_reward": 0.6875000242143869, "step": 192 }, { "advantage_max": 1.8514807373285294, "advantage_mean": 1.986821618338297e-08, "advantage_min": -0.8810537457466125, "advantage_std": 0.9998353496193886, "completion_length": 2794.187545776367, "epoch": 0.22057142857142858, "grad_norm": 0.2158997654914856, "kl": 0.00675201416015625, "lambda_div_used": 0.6, "learning_rate": 7.93768694627233e-07, "loss": 0.0003, "reward": -0.12474042293615639, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.12474042293615639, "reward_after_std": 0.8645276762545109, "reward_before_mean": 0.1677975022175815, "reward_before_std": 0.8510575741529465, "reward_change_max": 0.0004886612296104431, "reward_change_mean": -0.2925379406660795, "reward_change_min": -0.6505083031952381, "reward_change_std": 0.25192677415907383, "reward_std": 0.8645276948809624, "rewards/cosine_scaled_reward": -0.13485124241560698, "rewards/format_reward": 0.43750000186264515, "step": 193 }, { "advantage_max": 1.8288027048110962, "advantage_mean": -2.359350592673337e-08, "advantage_min": -0.8574704006314278, "advantage_std": 0.9998577386140823, "completion_length": 2348.520851135254, "epoch": 0.22171428571428572, "grad_norm": 0.19087940454483032, "kl": 0.005785942077636719, "lambda_div_used": 0.6, "learning_rate": 7.911220577405484e-07, "loss": 0.0002, "reward": 0.3952038553543389, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3952038553543389, "reward_after_std": 0.9202344082295895, "reward_before_mean": 0.9806380420923233, "reward_before_std": 0.8940357845276594, "reward_change_max": 0.0010172724723815918, "reward_change_mean": -0.5854342035017908, "reward_change_min": -1.0540791526436806, "reward_change_std": 0.43106903275474906, "reward_std": 0.9202344380319118, "rewards/cosine_scaled_reward": 0.13615235313773155, "rewards/format_reward": 0.7083333395421505, "step": 194 }, { "advantage_max": 1.8929940164089203, "advantage_mean": -8.381903393583912e-09, "advantage_min": -0.8231491893529892, "advantage_std": 0.999856561422348, "completion_length": 1668.1042175292969, "epoch": 0.22285714285714286, "grad_norm": 0.21075581014156342, "kl": 0.006229400634765625, "lambda_div_used": 0.6, "learning_rate": 7.884636689049422e-07, "loss": 0.0002, "reward": 0.21669870428740978, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21669870428740978, "reward_after_std": 0.8580862581729889, "reward_before_mean": 0.7015649350360036, "reward_before_std": 0.7897386755794287, "reward_change_max": 0.0, "reward_change_mean": -0.4848662167787552, "reward_change_min": -0.9058547914028168, "reward_change_std": 0.32796779833734035, "reward_std": 0.8580862581729889, "rewards/cosine_scaled_reward": -0.05546754505485296, "rewards/format_reward": 0.8125000055879354, "step": 195 }, { "advantage_max": 1.8233862221240997, "advantage_mean": 1.83160115962977e-08, "advantage_min": -1.0622686967253685, "advantage_std": 0.9998345524072647, "completion_length": 2790.8334350585938, "epoch": 0.224, "grad_norm": 0.22471381723880768, "kl": 0.0061893463134765625, "lambda_div_used": 0.6, "learning_rate": 7.857936576865356e-07, "loss": 0.0002, "reward": -0.0495893070474267, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.0495893070474267, "reward_after_std": 0.7146314792335033, "reward_before_mean": 0.323355401866138, "reward_before_std": 0.7049362137913704, "reward_change_max": 0.002869725227355957, "reward_change_mean": -0.37294470332562923, "reward_change_min": -0.6940531842410564, "reward_change_std": 0.28026892617344856, "reward_std": 0.7146315313875675, "rewards/cosine_scaled_reward": -0.10915563208982348, "rewards/format_reward": 0.5416666772216558, "step": 196 }, { "advantage_max": 1.844907984137535, "advantage_mean": -2.6697914323747796e-08, "advantage_min": -0.9053036347031593, "advantage_std": 0.9998667389154434, "completion_length": 1176.5000305175781, "epoch": 0.22514285714285714, "grad_norm": 0.2676119804382324, "kl": 0.006092071533203125, "lambda_div_used": 0.6, "learning_rate": 7.831121542179086e-07, "loss": 0.0002, "reward": 0.4123532408848405, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4123532408848405, "reward_after_std": 0.920974288135767, "reward_before_mean": 0.9987042904831469, "reward_before_std": 0.8744922038167715, "reward_change_max": 0.0, "reward_change_mean": -0.5863510705530643, "reward_change_min": -1.008712936192751, "reward_change_std": 0.3892252929508686, "reward_std": 0.9209743067622185, "rewards/cosine_scaled_reward": 0.06185212981654331, "rewards/format_reward": 0.8750000055879354, "step": 197 }, { "advantage_max": 1.8616387248039246, "advantage_mean": 5.27749466350258e-09, "advantage_min": -0.8514187633991241, "advantage_std": 0.9998800158500671, "completion_length": 1831.145896911621, "epoch": 0.22628571428571428, "grad_norm": 0.266026109457016, "kl": 0.0087127685546875, "lambda_div_used": 0.6, "learning_rate": 7.804192891917571e-07, "loss": 0.0003, "reward": 0.21275895088911057, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21275895088911057, "reward_after_std": 0.9901725947856903, "reward_before_mean": 0.6730580711737275, "reward_before_std": 0.9783835522830486, "reward_change_max": 0.0005161240696907043, "reward_change_mean": -0.46029909048229456, "reward_change_min": -0.8294646516442299, "reward_change_std": 0.3421561336144805, "reward_std": 0.9901726320385933, "rewards/cosine_scaled_reward": -0.007220972329378128, "rewards/format_reward": 0.6875000111758709, "step": 198 }, { "advantage_max": 1.869808241724968, "advantage_mean": -6.829699250587851e-09, "advantage_min": -0.9215875342488289, "advantage_std": 0.9998573064804077, "completion_length": 1522.3333892822266, "epoch": 0.22742857142857142, "grad_norm": 0.24699227511882782, "kl": 0.006649017333984375, "lambda_div_used": 0.6, "learning_rate": 7.777151938545235e-07, "loss": 0.0003, "reward": 0.2865274213254452, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2865274213254452, "reward_after_std": 0.829144075512886, "reward_before_mean": 0.8162419125437737, "reward_before_std": 0.7331322841346264, "reward_change_max": 0.0, "reward_change_mean": -0.5297144390642643, "reward_change_min": -0.86338210105896, "reward_change_std": 0.3267682734876871, "reward_std": 0.829144112765789, "rewards/cosine_scaled_reward": -0.06062907166779041, "rewards/format_reward": 0.9375000149011612, "step": 199 }, { "advantage_max": 1.8471790850162506, "advantage_mean": 8.692344288796505e-09, "advantage_min": -0.9032345339655876, "advantage_std": 0.99985521286726, "completion_length": 1256.3958740234375, "epoch": 0.22857142857142856, "grad_norm": 0.23767714202404022, "kl": 0.006656646728515625, "lambda_div_used": 0.6, "learning_rate": 7.75e-07, "loss": 0.0003, "reward": 0.37511107651516795, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.37511107651516795, "reward_after_std": 0.8456577584147453, "reward_before_mean": 0.95660699903965, "reward_before_std": 0.7694406695663929, "reward_change_max": 0.0009586066007614136, "reward_change_mean": -0.5814958810806274, "reward_change_min": -0.9831447154283524, "reward_change_std": 0.37729886546730995, "reward_std": 0.8456577733159065, "rewards/cosine_scaled_reward": 0.019970136578194797, "rewards/format_reward": 0.916666679084301, "step": 200 }, { "advantage_max": 1.8327146768569946, "advantage_mean": -1.738468857759301e-08, "advantage_min": -0.9730896577239037, "advantage_std": 0.9998484998941422, "completion_length": 1927.604232788086, "epoch": 0.2297142857142857, "grad_norm": 0.25266122817993164, "kl": 0.0050640106201171875, "lambda_div_used": 0.6, "learning_rate": 7.72273839962904e-07, "loss": 0.0002, "reward": 0.6098271571099758, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6098271571099758, "reward_after_std": 0.86701575294137, "reward_before_mean": 1.3185914978384972, "reward_before_std": 0.7631920166313648, "reward_change_max": 0.0, "reward_change_mean": -0.7087643798440695, "reward_change_min": -1.1464748308062553, "reward_change_std": 0.45284009352326393, "reward_std": 0.8670158125460148, "rewards/cosine_scaled_reward": 0.2842957489192486, "rewards/format_reward": 0.7500000149011612, "step": 201 }, { "advantage_max": 1.9493790417909622, "advantage_mean": 2.483526384544632e-09, "advantage_min": -0.7426571026444435, "advantage_std": 0.9998154044151306, "completion_length": 1572.9792175292969, "epoch": 0.23085714285714284, "grad_norm": 0.2120855450630188, "kl": 0.0058650970458984375, "lambda_div_used": 0.6, "learning_rate": 7.695368466124296e-07, "loss": 0.0002, "reward": 0.481074046343565, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.481074046343565, "reward_after_std": 0.6121453009545803, "reward_before_mean": 1.1622148640453815, "reward_before_std": 0.3702765265479684, "reward_change_max": 0.0005568414926528931, "reward_change_mean": -0.6811407832428813, "reward_change_min": -0.9480189867317677, "reward_change_std": 0.3608372900635004, "reward_std": 0.6121453046798706, "rewards/cosine_scaled_reward": 0.19569074362516403, "rewards/format_reward": 0.7708333395421505, "step": 202 }, { "advantage_max": 1.860578492283821, "advantage_mean": -3.1044056214568627e-10, "advantage_min": -0.9007564336061478, "advantage_std": 0.9998569637537003, "completion_length": 1608.2708892822266, "epoch": 0.232, "grad_norm": 0.2628607451915741, "kl": 0.008453369140625, "lambda_div_used": 0.6, "learning_rate": 7.667891533457718e-07, "loss": 0.0003, "reward": 0.25236531626433134, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.25236531626433134, "reward_after_std": 0.8142339959740639, "reward_before_mean": 0.7688097134232521, "reward_before_std": 0.7407113090157509, "reward_change_max": 0.0, "reward_change_mean": -0.516444344073534, "reward_change_min": -0.9059380665421486, "reward_change_std": 0.3518046382814646, "reward_std": 0.814234022051096, "rewards/cosine_scaled_reward": -0.021845156326889992, "rewards/format_reward": 0.8125000149011612, "step": 203 }, { "advantage_max": 1.856035828590393, "advantage_mean": -1.0554989660072067e-08, "advantage_min": -0.8498663008213043, "advantage_std": 0.9998530447483063, "completion_length": 1364.708381652832, "epoch": 0.23314285714285715, "grad_norm": 0.3182263970375061, "kl": 0.00948333740234375, "lambda_div_used": 0.6, "learning_rate": 7.640308940816239e-07, "loss": 0.0004, "reward": 0.45779778249561787, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.45779778249561787, "reward_after_std": 0.8125110119581223, "reward_before_mean": 1.0898098573088646, "reward_before_std": 0.7080512810498476, "reward_change_max": 0.0, "reward_change_mean": -0.632012028247118, "reward_change_min": -1.0780591741204262, "reward_change_std": 0.4018330071121454, "reward_std": 0.8125110454857349, "rewards/cosine_scaled_reward": 0.07615489140152931, "rewards/format_reward": 0.9375000074505806, "step": 204 }, { "advantage_max": 1.831876128911972, "advantage_mean": -1.241764135961887e-09, "advantage_min": -0.8645824491977692, "advantage_std": 0.9998550340533257, "completion_length": 1722.0000381469727, "epoch": 0.2342857142857143, "grad_norm": 0.2667372226715088, "kl": 0.0058498382568359375, "lambda_div_used": 0.6, "learning_rate": 7.612622032536507e-07, "loss": 0.0002, "reward": 0.5514694144949317, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5514694144949317, "reward_after_std": 0.8580240793526173, "reward_before_mean": 1.2323165144771338, "reward_before_std": 0.7656144704669714, "reward_change_max": 0.0, "reward_change_mean": -0.6808471139520407, "reward_change_min": -1.1130521781742573, "reward_change_std": 0.45496470108628273, "reward_std": 0.8580241277813911, "rewards/cosine_scaled_reward": 0.22032492235302925, "rewards/format_reward": 0.7916666753590107, "step": 205 }, { "advantage_max": 1.7926378697156906, "advantage_mean": 2.1730859334212482e-09, "advantage_min": -0.8720656037330627, "advantage_std": 0.9998156204819679, "completion_length": 2273.604217529297, "epoch": 0.23542857142857143, "grad_norm": 0.2223757952451706, "kl": 0.0063800811767578125, "lambda_div_used": 0.6, "learning_rate": 7.584832158039378e-07, "loss": 0.0003, "reward": -0.14681187830865383, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.14681187830865383, "reward_after_std": 0.6314323507249355, "reward_before_mean": 0.18663795664906502, "reward_before_std": 0.5911546461284161, "reward_change_max": 0.0013406351208686829, "reward_change_mean": -0.33344983868300915, "reward_change_min": -0.6240692362189293, "reward_change_std": 0.24192245677113533, "reward_std": 0.6314323656260967, "rewards/cosine_scaled_reward": -0.19834769575390965, "rewards/format_reward": 0.5833333358168602, "step": 206 }, { "advantage_max": 1.8526040315628052, "advantage_mean": -1.676380612103401e-08, "advantage_min": -0.8606942147016525, "advantage_std": 0.9998470023274422, "completion_length": 1725.4167022705078, "epoch": 0.23657142857142857, "grad_norm": 0.32430991530418396, "kl": 0.00893402099609375, "lambda_div_used": 0.6, "learning_rate": 7.556940671764124e-07, "loss": 0.0004, "reward": 0.19386290735565126, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19386290735565126, "reward_after_std": 0.7678088247776031, "reward_before_mean": 0.6845569107681513, "reward_before_std": 0.7046758942306042, "reward_change_max": 0.000944972038269043, "reward_change_mean": -0.4906939994543791, "reward_change_min": -0.8508261777460575, "reward_change_std": 0.31916314363479614, "reward_std": 0.767808835953474, "rewards/cosine_scaled_reward": -0.05355490278452635, "rewards/format_reward": 0.791666679084301, "step": 207 }, { "advantage_max": 1.839926853775978, "advantage_mean": -4.03573130469681e-09, "advantage_min": -0.9513271749019623, "advantage_std": 0.9998404234647751, "completion_length": 1415.0625228881836, "epoch": 0.2377142857142857, "grad_norm": 0.19833095371723175, "kl": 0.006793975830078125, "lambda_div_used": 0.6, "learning_rate": 7.528948933102438e-07, "loss": 0.0003, "reward": 0.408255933172768, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.408255933172768, "reward_after_std": 0.7480129301548004, "reward_before_mean": 1.027973834425211, "reward_before_std": 0.6748812645673752, "reward_change_max": 0.0, "reward_change_mean": -0.6197178699076176, "reward_change_min": -0.9957869723439217, "reward_change_std": 0.3788326345384121, "reward_std": 0.7480129413306713, "rewards/cosine_scaled_reward": 0.04523690603673458, "rewards/format_reward": 0.9375000074505806, "step": 208 }, { "advantage_max": 1.8184748589992523, "advantage_mean": 1.2107194080623884e-08, "advantage_min": -1.0376386195421219, "advantage_std": 0.9998557269573212, "completion_length": 1503.3958702087402, "epoch": 0.23885714285714285, "grad_norm": 0.32442209124565125, "kl": 0.00881195068359375, "lambda_div_used": 0.6, "learning_rate": 7.500858306332172e-07, "loss": 0.0004, "reward": 0.2773610055446625, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2773610055446625, "reward_after_std": 0.744460966438055, "reward_before_mean": 0.8234799510391895, "reward_before_std": 0.6716284602880478, "reward_change_max": 0.003235235810279846, "reward_change_mean": -0.546118900179863, "reward_change_min": -0.8975829593837261, "reward_change_std": 0.3473086543381214, "reward_std": 0.7444609776139259, "rewards/cosine_scaled_reward": 0.01590662496164441, "rewards/format_reward": 0.7916666753590107, "step": 209 }, { "advantage_max": 1.7693659961223602, "advantage_mean": -3.1044084525255755e-09, "advantage_min": -1.111761063337326, "advantage_std": 0.9998294115066528, "completion_length": 1683.0000305175781, "epoch": 0.24, "grad_norm": 0.1759955734014511, "kl": 0.006259918212890625, "lambda_div_used": 0.6, "learning_rate": 7.472670160550848e-07, "loss": 0.0003, "reward": 0.21515829270356335, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.21515829270356335, "reward_after_std": 0.7110634557902813, "reward_before_mean": 0.7379409018903971, "reward_before_std": 0.675159614533186, "reward_change_max": 0.0, "reward_change_mean": -0.5227825716137886, "reward_change_min": -0.8667676635086536, "reward_change_std": 0.33463000506162643, "reward_std": 0.7110634669661522, "rewards/cosine_scaled_reward": -0.03727956488728523, "rewards/format_reward": 0.8125000149011612, "step": 210 }, { "advantage_max": 1.945738285779953, "advantage_mean": 2.793967829317623e-08, "advantage_min": -0.7639892995357513, "advantage_std": 0.999823622405529, "completion_length": 1397.020866394043, "epoch": 0.24114285714285713, "grad_norm": 0.2246849238872528, "kl": 0.007781982421875, "lambda_div_used": 0.6, "learning_rate": 7.444385869608921e-07, "loss": 0.0003, "reward": 0.29541015811264515, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.29541015811264515, "reward_after_std": 0.6922118216753006, "reward_before_mean": 0.8543685399927199, "reward_before_std": 0.5085874684154987, "reward_change_max": 0.0004105120897293091, "reward_change_mean": -0.5589583879336715, "reward_change_min": -0.812763299793005, "reward_change_std": 0.3149664308875799, "reward_std": 0.6922118328511715, "rewards/cosine_scaled_reward": 0.031350934877991676, "rewards/format_reward": 0.7916666716337204, "step": 211 }, { "advantage_max": 1.8842217326164246, "advantage_mean": -1.7384689132704523e-08, "advantage_min": -0.9026188924908638, "advantage_std": 0.999860368669033, "completion_length": 1130.6667022705078, "epoch": 0.2422857142857143, "grad_norm": 0.2639113962650299, "kl": 0.007846832275390625, "lambda_div_used": 0.6, "learning_rate": 7.416006812042827e-07, "loss": 0.0003, "reward": 0.506486542057246, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.506486542057246, "reward_after_std": 0.8082983270287514, "reward_before_mean": 1.1632536239922047, "reward_before_std": 0.6659324653446674, "reward_change_max": 0.0, "reward_change_mean": -0.656767075881362, "reward_change_min": -0.9595914296805859, "reward_change_std": 0.38042050041258335, "reward_std": 0.8082983493804932, "rewards/cosine_scaled_reward": 0.1337101130047813, "rewards/format_reward": 0.8958333395421505, "step": 212 }, { "advantage_max": 1.9224452078342438, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.7717320919036865, "advantage_std": 0.9998671188950539, "completion_length": 1562.6042022705078, "epoch": 0.24342857142857144, "grad_norm": 0.3185400366783142, "kl": 0.010986328125, "lambda_div_used": 0.6, "learning_rate": 7.387534371007797e-07, "loss": 0.0004, "reward": 0.3678184101881925, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3678184101881925, "reward_after_std": 0.9026115350425243, "reward_before_mean": 0.9258643062785268, "reward_before_std": 0.7626326903700829, "reward_change_max": 0.0, "reward_change_mean": -0.5580459162592888, "reward_change_min": -0.9071638658642769, "reward_change_std": 0.3532382473349571, "reward_std": 0.9026115611195564, "rewards/cosine_scaled_reward": 0.046265478784334846, "rewards/format_reward": 0.8333333395421505, "step": 213 }, { "advantage_max": 1.8441757261753082, "advantage_mean": -1.2883295541499251e-08, "advantage_min": -0.9523207470774651, "advantage_std": 0.9998089149594307, "completion_length": 1889.5625228881836, "epoch": 0.24457142857142858, "grad_norm": 0.3396517336368561, "kl": 0.008754730224609375, "lambda_div_used": 0.6, "learning_rate": 7.358969934210438e-07, "loss": 0.0004, "reward": 0.17065197927877307, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17065197927877307, "reward_after_std": 0.6326207704842091, "reward_before_mean": 0.6843567192554474, "reward_before_std": 0.5557381771504879, "reward_change_max": 5.299597978591919e-05, "reward_change_mean": -0.5137047506868839, "reward_change_min": -0.8281576447188854, "reward_change_std": 0.3360777208581567, "reward_std": 0.63262078166008, "rewards/cosine_scaled_reward": -0.04323832131922245, "rewards/format_reward": 0.7708333432674408, "step": 214 }, { "advantage_max": 1.8047952502965927, "advantage_mean": -6.984919156960423e-09, "advantage_min": -0.949731320142746, "advantage_std": 0.9998000115156174, "completion_length": 1343.3750305175781, "epoch": 0.24571428571428572, "grad_norm": 0.28229522705078125, "kl": 0.006084442138671875, "lambda_div_used": 0.6, "learning_rate": 7.330314893841101e-07, "loss": 0.0002, "reward": 0.11037498325458728, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11037498325458728, "reward_after_std": 0.5453722551465034, "reward_before_mean": 0.6039579696953297, "reward_before_std": 0.4900472089648247, "reward_change_max": 0.0009583085775375366, "reward_change_mean": -0.49358299002051353, "reward_change_min": -0.7654079273343086, "reward_change_std": 0.29697378166019917, "reward_std": 0.5453722700476646, "rewards/cosine_scaled_reward": -0.12510435469448566, "rewards/format_reward": 0.8541666716337204, "step": 215 }, { "advantage_max": 1.8079268336296082, "advantage_mean": -4.3461718668424965e-09, "advantage_min": -0.9888656884431839, "advantage_std": 0.9998346269130707, "completion_length": 1141.7291831970215, "epoch": 0.24685714285714286, "grad_norm": 0.2945360243320465, "kl": 0.008228302001953125, "lambda_div_used": 0.6, "learning_rate": 7.301570646506027e-07, "loss": 0.0003, "reward": 0.5667337765917182, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5667337765917182, "reward_after_std": 0.7148325145244598, "reward_before_mean": 1.2823062911629677, "reward_before_std": 0.599778238683939, "reward_change_max": 0.0007646828889846802, "reward_change_mean": -0.7155724987387657, "reward_change_min": -1.0847780294716358, "reward_change_std": 0.4218271281570196, "reward_std": 0.7148325331509113, "rewards/cosine_scaled_reward": 0.2036531288176775, "rewards/format_reward": 0.875, "step": 216 }, { "advantage_max": 1.9577355980873108, "advantage_mean": -3.476937759927523e-08, "advantage_min": -0.6729211881756783, "advantage_std": 0.9998743459582329, "completion_length": 1361.7500381469727, "epoch": 0.248, "grad_norm": 0.23953408002853394, "kl": 0.0068817138671875, "lambda_div_used": 0.6, "learning_rate": 7.27273859315928e-07, "loss": 0.0003, "reward": 0.3585895048454404, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3585895048454404, "reward_after_std": 0.9671962819993496, "reward_before_mean": 0.894859915599227, "reward_before_std": 0.817153150215745, "reward_change_max": 0.0, "reward_change_mean": -0.5362704452127218, "reward_change_min": -0.9340154454112053, "reward_change_std": 0.35429083555936813, "reward_std": 0.9671962969005108, "rewards/cosine_scaled_reward": 0.020346628269180655, "rewards/format_reward": 0.8541666679084301, "step": 217 }, { "advantage_max": 1.8746764808893204, "advantage_mean": 6.208815683805824e-10, "advantage_min": -0.8116071596741676, "advantage_std": 0.9998414590954781, "completion_length": 1506.9792251586914, "epoch": 0.24914285714285714, "grad_norm": 0.2781512141227722, "kl": 0.006923675537109375, "lambda_div_used": 0.6, "learning_rate": 7.243820139034464e-07, "loss": 0.0003, "reward": 0.06333183636888862, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06333183636888862, "reward_after_std": 0.6848684512078762, "reward_before_mean": 0.49841352133080363, "reward_before_std": 0.6076563782989979, "reward_change_max": 0.00116710364818573, "reward_change_mean": -0.4350816644728184, "reward_change_min": -0.7371813729405403, "reward_change_std": 0.27808909490704536, "reward_std": 0.6848684698343277, "rewards/cosine_scaled_reward": -0.17787658236920834, "rewards/format_reward": 0.854166679084301, "step": 218 }, { "advantage_max": 1.8024124205112457, "advantage_mean": -1.6763806787167823e-08, "advantage_min": -0.9114049077033997, "advantage_std": 0.9998286366462708, "completion_length": 1521.5833587646484, "epoch": 0.2502857142857143, "grad_norm": 0.29169994592666626, "kl": 0.008913993835449219, "lambda_div_used": 0.6, "learning_rate": 7.214816693576234e-07, "loss": 0.0004, "reward": 0.35595322732115164, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.35595322732115164, "reward_after_std": 0.7003892213106155, "reward_before_mean": 0.959366999566555, "reward_before_std": 0.6458369642496109, "reward_change_max": 0.0, "reward_change_mean": -0.6034137718379498, "reward_change_min": -0.9950582347810268, "reward_change_std": 0.3782676439732313, "reward_std": 0.7003892250359058, "rewards/cosine_scaled_reward": 0.05260014161467552, "rewards/format_reward": 0.8541666697710752, "step": 219 }, { "advantage_max": 1.8146659433841705, "advantage_mean": 1.614292521878724e-08, "advantage_min": -1.0002572685480118, "advantage_std": 0.999789372086525, "completion_length": 1465.7708587646484, "epoch": 0.25142857142857145, "grad_norm": 0.3001108765602112, "kl": 0.007877349853515625, "lambda_div_used": 0.6, "learning_rate": 7.185729670371604e-07, "loss": 0.0003, "reward": -0.0560973163228482, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0560973163228482, "reward_after_std": 0.5878814682364464, "reward_before_mean": 0.33406518027186394, "reward_before_std": 0.5440696775913239, "reward_change_max": 0.0018651336431503296, "reward_change_mean": -0.3901625070720911, "reward_change_min": -0.6622885875403881, "reward_change_std": 0.25839220359921455, "reward_std": 0.5878814794123173, "rewards/cosine_scaled_reward": -0.24963408894836903, "rewards/format_reward": 0.8333333488553762, "step": 220 }, { "advantage_max": 1.9332973212003708, "advantage_mean": -4.2840840319691154e-08, "advantage_min": -0.6719999387860298, "advantage_std": 0.9998391345143318, "completion_length": 1181.5208740234375, "epoch": 0.25257142857142856, "grad_norm": 0.23093506693840027, "kl": 0.005725860595703125, "lambda_div_used": 0.6, "learning_rate": 7.156560487081051e-07, "loss": 0.0002, "reward": 0.5290301127824932, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5290301127824932, "reward_after_std": 0.7129516340792179, "reward_before_mean": 1.2169622369110584, "reward_before_std": 0.4925988847389817, "reward_change_max": 0.0, "reward_change_mean": -0.6879321187734604, "reward_change_min": -1.0727623589336872, "reward_change_std": 0.38971428014338017, "reward_std": 0.7129516452550888, "rewards/cosine_scaled_reward": 0.16056441084947437, "rewards/format_reward": 0.895833333954215, "step": 221 }, { "advantage_max": 1.8785310834646225, "advantage_mean": -1.179675268581093e-08, "advantage_min": -0.8870665952563286, "advantage_std": 0.9998476952314377, "completion_length": 1451.041690826416, "epoch": 0.2537142857142857, "grad_norm": 0.27896323800086975, "kl": 0.008668899536132812, "lambda_div_used": 0.6, "learning_rate": 7.127310565369415e-07, "loss": 0.0003, "reward": 0.2508357478072867, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2508357478072867, "reward_after_std": 0.7334035821259022, "reward_before_mean": 0.7815669402480125, "reward_before_std": 0.6281927190721035, "reward_change_max": 0.0011235624551773071, "reward_change_mean": -0.530731188133359, "reward_change_min": -0.8242725133895874, "reward_change_std": 0.3266385905444622, "reward_std": 0.7334035895764828, "rewards/cosine_scaled_reward": 0.005366799421608448, "rewards/format_reward": 0.7708333376795053, "step": 222 }, { "advantage_max": 1.8369051963090897, "advantage_mean": 2.6697914656814703e-08, "advantage_min": -0.8896066732704639, "advantage_std": 0.9998190328478813, "completion_length": 1759.4375305175781, "epoch": 0.25485714285714284, "grad_norm": 0.20690949261188507, "kl": 0.007503509521484375, "lambda_div_used": 0.6, "learning_rate": 7.097981330836616e-07, "loss": 0.0003, "reward": 0.3009045707876794, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3009045707876794, "reward_after_std": 0.6722562201321125, "reward_before_mean": 0.8796945922076702, "reward_before_std": 0.5771395843476057, "reward_change_max": 0.0011511072516441345, "reward_change_mean": -0.5787900015711784, "reward_change_min": -0.9382209926843643, "reward_change_std": 0.36548776365816593, "reward_std": 0.6722562275826931, "rewards/cosine_scaled_reward": 0.09609728120267391, "rewards/format_reward": 0.687500013038516, "step": 223 }, { "advantage_max": 1.8380918353796005, "advantage_mean": -1.117587122845265e-08, "advantage_min": -0.8600411862134933, "advantage_std": 0.9998573735356331, "completion_length": 1852.3958740234375, "epoch": 0.256, "grad_norm": 0.19519250094890594, "kl": 0.0054874420166015625, "lambda_div_used": 0.6, "learning_rate": 7.068574212948169e-07, "loss": 0.0002, "reward": 0.2570896605029702, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2570896605029702, "reward_after_std": 0.8001469634473324, "reward_before_mean": 0.7829917408525944, "reward_before_std": 0.7353864461183548, "reward_change_max": 0.0004997849464416504, "reward_change_mean": -0.5259020812809467, "reward_change_min": -0.9049793109297752, "reward_change_std": 0.344460004940629, "reward_std": 0.8001469932496548, "rewards/cosine_scaled_reward": -0.03558747028000653, "rewards/format_reward": 0.8541666697710752, "step": 224 }, { "advantage_max": 1.8619542717933655, "advantage_mean": -2.4835264955669345e-09, "advantage_min": -0.92620949447155, "advantage_std": 0.9998472630977631, "completion_length": 1862.6875495910645, "epoch": 0.2571428571428571, "grad_norm": 0.2828025817871094, "kl": 0.01148223876953125, "lambda_div_used": 0.6, "learning_rate": 7.039090644965509e-07, "loss": 0.0005, "reward": 0.1418031924404204, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1418031924404204, "reward_after_std": 0.8276291191577911, "reward_before_mean": 0.5936808623373508, "reward_before_std": 0.7773008048534393, "reward_change_max": 0.00016012787818908691, "reward_change_mean": -0.45187763683497906, "reward_change_min": -0.8189064487814903, "reward_change_std": 0.30999560933560133, "reward_std": 0.8276291415095329, "rewards/cosine_scaled_reward": -0.057326255831867456, "rewards/format_reward": 0.7083333507180214, "step": 225 }, { "advantage_max": 1.9105704128742218, "advantage_mean": 3.104408341503273e-09, "advantage_min": -0.7978261336684227, "advantage_std": 0.9998422935605049, "completion_length": 1464.1667022705078, "epoch": 0.2582857142857143, "grad_norm": 0.21695449948310852, "kl": 0.0057373046875, "lambda_div_used": 0.6, "learning_rate": 7.009532063876148e-07, "loss": 0.0002, "reward": 0.4355794661678374, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4355794661678374, "reward_after_std": 0.7580258473753929, "reward_before_mean": 1.0637911558151245, "reward_before_std": 0.6122870594263077, "reward_change_max": 0.0, "reward_change_mean": -0.6282116770744324, "reward_change_min": -0.9939616173505783, "reward_change_std": 0.36170411482453346, "reward_std": 0.7580258622765541, "rewards/cosine_scaled_reward": 0.05272890208289027, "rewards/format_reward": 0.9583333432674408, "step": 226 }, { "advantage_max": 1.880305826663971, "advantage_mean": -9.313226134732844e-09, "advantage_min": -0.8974742889404297, "advantage_std": 0.9998089522123337, "completion_length": 1090.1666946411133, "epoch": 0.25942857142857145, "grad_norm": 0.29648357629776, "kl": 0.007843017578125, "lambda_div_used": 0.6, "learning_rate": 6.979899910323624e-07, "loss": 0.0003, "reward": 0.1774475951679051, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1774475951679051, "reward_after_std": 0.6307259686291218, "reward_before_mean": 0.6877003312110901, "reward_before_std": 0.5237771086394787, "reward_change_max": 0.0, "reward_change_mean": -0.5102527514100075, "reward_change_min": -0.827692836523056, "reward_change_std": 0.295247383415699, "reward_std": 0.6307259723544121, "rewards/cosine_scaled_reward": -0.1561498325318098, "rewards/format_reward": 1.0, "step": 227 }, { "advantage_max": 1.8451535552740097, "advantage_mean": -9.934108202713787e-09, "advantage_min": -0.9320072717964649, "advantage_std": 0.9998236298561096, "completion_length": 1345.3750228881836, "epoch": 0.26057142857142856, "grad_norm": 0.2331770807504654, "kl": 0.00678253173828125, "lambda_div_used": 0.6, "learning_rate": 6.950195628537299e-07, "loss": 0.0003, "reward": 0.5653695005457848, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5653695005457848, "reward_after_std": 0.6357578784227371, "reward_before_mean": 1.294594332575798, "reward_before_std": 0.46521945111453533, "reward_change_max": 0.0007578954100608826, "reward_change_mean": -0.729224868118763, "reward_change_min": -1.0465231351554394, "reward_change_std": 0.40724730119109154, "reward_std": 0.6357578970491886, "rewards/cosine_scaled_reward": 0.23063051141798496, "rewards/format_reward": 0.8333333358168602, "step": 228 }, { "advantage_max": 1.8811430782079697, "advantage_mean": -5.432715166620028e-09, "advantage_min": -0.8746685832738876, "advantage_std": 0.9998182356357574, "completion_length": 1425.8542022705078, "epoch": 0.26171428571428573, "grad_norm": 0.2102670818567276, "kl": 0.00843048095703125, "lambda_div_used": 0.6, "learning_rate": 6.920420666261961e-07, "loss": 0.0003, "reward": 0.21790653312928043, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.21790653312928043, "reward_after_std": 0.6236037760972977, "reward_before_mean": 0.7517698602750897, "reward_before_std": 0.49720613472163677, "reward_change_max": 0.0, "reward_change_mean": -0.5338633358478546, "reward_change_min": -0.8096239566802979, "reward_change_std": 0.30554310977458954, "reward_std": 0.6236037984490395, "rewards/cosine_scaled_reward": -0.061615096405148506, "rewards/format_reward": 0.8750000074505806, "step": 229 }, { "advantage_max": 1.8803343623876572, "advantage_mean": 5.898376453927767e-09, "advantage_min": -0.9339370280504227, "advantage_std": 0.9997898861765862, "completion_length": 1745.7916870117188, "epoch": 0.26285714285714284, "grad_norm": 0.20653089880943298, "kl": 0.00656890869140625, "lambda_div_used": 0.6, "learning_rate": 6.890576474687263e-07, "loss": 0.0003, "reward": 0.045102519914507866, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.045102519914507866, "reward_after_std": 0.5806776024401188, "reward_before_mean": 0.49274973943829536, "reward_before_std": 0.4847143702208996, "reward_change_max": 0.0, "reward_change_mean": -0.44764724373817444, "reward_change_min": -0.7104385755956173, "reward_change_std": 0.26539971493184566, "reward_std": 0.5806776247918606, "rewards/cosine_scaled_reward": -0.14945847261697054, "rewards/format_reward": 0.791666679084301, "step": 230 }, { "advantage_max": 1.925698772072792, "advantage_mean": -3.1044085080367267e-09, "advantage_min": -0.7799288704991341, "advantage_std": 0.9998774752020836, "completion_length": 1532.3125381469727, "epoch": 0.264, "grad_norm": 0.22522780299186707, "kl": 0.008289337158203125, "lambda_div_used": 0.6, "learning_rate": 6.860664508377001e-07, "loss": 0.0003, "reward": 0.49276258889585733, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.49276258889585733, "reward_after_std": 0.9082431048154831, "reward_before_mean": 1.1163499159738421, "reward_before_std": 0.7488042302429676, "reward_change_max": 0.0, "reward_change_mean": -0.6235873140394688, "reward_change_min": -0.9923263862729073, "reward_change_std": 0.361046951264143, "reward_std": 0.9082431308925152, "rewards/cosine_scaled_reward": 0.09984159865416586, "rewards/format_reward": 0.9166666716337204, "step": 231 }, { "advantage_max": 1.7980208545923233, "advantage_mean": 1.6763806787167823e-08, "advantage_min": -1.0510611981153488, "advantage_std": 0.9998587220907211, "completion_length": 2133.2917251586914, "epoch": 0.2651428571428571, "grad_norm": 0.259899377822876, "kl": 0.00949859619140625, "lambda_div_used": 0.6, "learning_rate": 6.83068622519821e-07, "loss": 0.0004, "reward": -0.0644418615847826, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.0644418615847826, "reward_after_std": 0.7792413085699081, "reward_before_mean": 0.28795089083723724, "reward_before_std": 0.7958154529333115, "reward_change_max": 0.0005818530917167664, "reward_change_mean": -0.35239275731146336, "reward_change_min": -0.7125130780041218, "reward_change_std": 0.28704385086894035, "reward_std": 0.7792413383722305, "rewards/cosine_scaled_reward": -0.1997745493426919, "rewards/format_reward": 0.6875000242143869, "step": 232 }, { "advantage_max": 1.9218673408031464, "advantage_mean": -6.51925802230835e-09, "advantage_min": -0.7960270717740059, "advantage_std": 0.9998332411050797, "completion_length": 1141.5208892822266, "epoch": 0.2662857142857143, "grad_norm": 0.22768016159534454, "kl": 0.005718231201171875, "lambda_div_used": 0.6, "learning_rate": 6.800643086250121e-07, "loss": 0.0002, "reward": 0.2717605981742963, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2717605981742963, "reward_after_std": 0.7569721080362797, "reward_before_mean": 0.8070176914334297, "reward_before_std": 0.6270077042281628, "reward_change_max": 0.0, "reward_change_mean": -0.53525710105896, "reward_change_min": -0.835021048784256, "reward_change_std": 0.3094108831137419, "reward_std": 0.7569721303880215, "rewards/cosine_scaled_reward": -0.06524115987122059, "rewards/format_reward": 0.9375000074505806, "step": 233 }, { "advantage_max": 1.9480681866407394, "advantage_mean": 3.104409063148239e-09, "advantage_min": -0.6695451363921165, "advantage_std": 0.9997960180044174, "completion_length": 1621.6250267028809, "epoch": 0.2674285714285714, "grad_norm": 0.25094732642173767, "kl": 0.008075714111328125, "lambda_div_used": 0.6, "learning_rate": 6.770536555792944e-07, "loss": 0.0003, "reward": 0.15022319613490254, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15022319613490254, "reward_after_std": 0.743482006713748, "reward_before_mean": 0.6215195916593075, "reward_before_std": 0.6112698167562485, "reward_change_max": 0.0, "reward_change_mean": -0.47129638865590096, "reward_change_min": -0.8236280344426632, "reward_change_std": 0.3019194579683244, "reward_std": 0.7434820216149092, "rewards/cosine_scaled_reward": -0.08507354371249676, "rewards/format_reward": 0.7916666716337204, "step": 234 }, { "advantage_max": 1.9268249720335007, "advantage_mean": -2.452482850134885e-08, "advantage_min": -0.7404474690556526, "advantage_std": 0.9998696148395538, "completion_length": 1109.9167098999023, "epoch": 0.26857142857142857, "grad_norm": 0.24515864253044128, "kl": 0.006595611572265625, "lambda_div_used": 0.6, "learning_rate": 6.740368101176495e-07, "loss": 0.0003, "reward": 0.5859628189355135, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5859628189355135, "reward_after_std": 0.8730616346001625, "reward_before_mean": 1.2709032725542784, "reward_before_std": 0.6743786497972906, "reward_change_max": 0.0007496699690818787, "reward_change_mean": -0.684940479695797, "reward_change_min": -1.027261570096016, "reward_change_std": 0.39483822137117386, "reward_std": 0.8730616644024849, "rewards/cosine_scaled_reward": 0.1667016496649012, "rewards/format_reward": 0.9375000074505806, "step": 235 }, { "advantage_max": 1.8696034252643585, "advantage_mean": 1.8626452047421083e-09, "advantage_min": -0.7846398204565048, "advantage_std": 0.9998518154025078, "completion_length": 1792.4584045410156, "epoch": 0.26971428571428574, "grad_norm": 0.21424059569835663, "kl": 0.006832122802734375, "lambda_div_used": 0.6, "learning_rate": 6.710139192768694e-07, "loss": 0.0003, "reward": 0.21139677381142974, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21139677381142974, "reward_after_std": 0.9058232493698597, "reward_before_mean": 0.685417864471674, "reward_before_std": 0.8372405208647251, "reward_change_max": 0.0, "reward_change_mean": -0.4740210734307766, "reward_change_min": -0.8449614234268665, "reward_change_std": 0.3198982533067465, "reward_std": 0.9058232642710209, "rewards/cosine_scaled_reward": -0.04270775453187525, "rewards/format_reward": 0.7708333358168602, "step": 236 }, { "advantage_max": 1.885415256023407, "advantage_mean": -4.190951696791956e-09, "advantage_min": -0.8234371915459633, "advantage_std": 0.9998277798295021, "completion_length": 1472.395866394043, "epoch": 0.27085714285714285, "grad_norm": 0.24223242700099945, "kl": 0.00577545166015625, "lambda_div_used": 0.6, "learning_rate": 6.679851303883891e-07, "loss": 0.0002, "reward": 0.3880492812022567, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3880492812022567, "reward_after_std": 0.7263778820633888, "reward_before_mean": 0.9952268004417419, "reward_before_std": 0.57838967256248, "reward_change_max": 0.0, "reward_change_mean": -0.607177471742034, "reward_change_min": -0.9197544753551483, "reward_change_std": 0.3471406549215317, "reward_std": 0.7263779193162918, "rewards/cosine_scaled_reward": 0.0601133843883872, "rewards/format_reward": 0.875, "step": 237 }, { "advantage_max": 1.9389768987894058, "advantage_mean": 2.1730859889323995e-09, "advantage_min": -0.7344888895750046, "advantage_std": 0.9998518079519272, "completion_length": 1094.6667175292969, "epoch": 0.272, "grad_norm": 0.24234844744205475, "kl": 0.00812530517578125, "lambda_div_used": 0.6, "learning_rate": 6.649505910711058e-07, "loss": 0.0003, "reward": 0.4462637463584542, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4462637463584542, "reward_after_std": 0.868531309068203, "reward_before_mean": 1.0553820058703423, "reward_before_std": 0.7022292437031865, "reward_change_max": 0.0, "reward_change_mean": -0.6091182753443718, "reward_change_min": -0.9754799082875252, "reward_change_std": 0.3624349180608988, "reward_std": 0.8685313127934933, "rewards/cosine_scaled_reward": 0.03810766385868192, "rewards/format_reward": 0.9791666716337204, "step": 238 }, { "advantage_max": 1.8280943185091019, "advantage_mean": 1.3348957383918503e-08, "advantage_min": -0.9786246418952942, "advantage_std": 0.9998012036085129, "completion_length": 1448.6041870117188, "epoch": 0.27314285714285713, "grad_norm": 0.22774183750152588, "kl": 0.005977630615234375, "lambda_div_used": 0.6, "learning_rate": 6.619104492241847e-07, "loss": 0.0002, "reward": 0.5649529304355383, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5649529304355383, "reward_after_std": 0.6479036398231983, "reward_before_mean": 1.2952700648456812, "reward_before_std": 0.51944032125175, "reward_change_max": 0.0005435720086097717, "reward_change_mean": -0.7303171027451754, "reward_change_min": -1.0367931462824345, "reward_change_std": 0.41321991570293903, "reward_std": 0.6479036472737789, "rewards/cosine_scaled_reward": 0.262218339368701, "rewards/format_reward": 0.770833333954215, "step": 239 }, { "advantage_max": 1.7769131064414978, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.9458809569478035, "advantage_std": 0.999803252518177, "completion_length": 1617.9583740234375, "epoch": 0.2742857142857143, "grad_norm": 0.2748890519142151, "kl": 0.009830474853515625, "lambda_div_used": 0.6, "learning_rate": 6.588648530198504e-07, "loss": 0.0004, "reward": -0.13307038694620132, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13307038694620132, "reward_after_std": 0.5256015434861183, "reward_before_mean": 0.23460055608302355, "reward_before_std": 0.5078816749155521, "reward_change_max": 0.0, "reward_change_mean": -0.367670938372612, "reward_change_min": -0.6695005521178246, "reward_change_std": 0.2524768877774477, "reward_std": 0.5256015658378601, "rewards/cosine_scaled_reward": -0.27853306010365486, "rewards/format_reward": 0.7916666772216558, "step": 240 }, { "advantage_max": 1.88160939514637, "advantage_mean": -1.1102230246251565e-16, "advantage_min": -0.8645576983690262, "advantage_std": 0.9998082891106606, "completion_length": 1639.1667404174805, "epoch": 0.2754285714285714, "grad_norm": 0.26753294467926025, "kl": 0.008953094482421875, "lambda_div_used": 0.6, "learning_rate": 6.558139508961654e-07, "loss": 0.0004, "reward": -0.031316899927333, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.031316899927333, "reward_after_std": 0.5845673941075802, "reward_before_mean": 0.37243654020130634, "reward_before_std": 0.4885371960699558, "reward_change_max": 0.0, "reward_change_mean": -0.40375345945358276, "reward_change_min": -0.650720402598381, "reward_change_std": 0.24138164147734642, "reward_std": 0.5845674015581608, "rewards/cosine_scaled_reward": -0.25128174014389515, "rewards/format_reward": 0.8750000111758709, "step": 241 }, { "advantage_max": 1.879174917936325, "advantage_mean": -2.4835267176115394e-09, "advantage_min": -0.8415133915841579, "advantage_std": 0.9997778609395027, "completion_length": 1141.4583549499512, "epoch": 0.2765714285714286, "grad_norm": 0.32908982038497925, "kl": 0.011310577392578125, "lambda_div_used": 0.6, "learning_rate": 6.527578915497951e-07, "loss": 0.0005, "reward": 0.08597287524025887, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08597287524025887, "reward_after_std": 0.5845495834946632, "reward_before_mean": 0.554448792245239, "reward_before_std": 0.4750943690305576, "reward_change_max": 0.0019485801458358765, "reward_change_mean": -0.46847590431571007, "reward_change_min": -0.7834129631519318, "reward_change_std": 0.2873268108814955, "reward_std": 0.5845495834946632, "rewards/cosine_scaled_reward": -0.17069227993488312, "rewards/format_reward": 0.8958333507180214, "step": 242 }, { "advantage_max": 1.9060489684343338, "advantage_mean": -2.9491878938969762e-09, "advantage_min": -0.8143318668007851, "advantage_std": 0.9998864382505417, "completion_length": 1576.0208587646484, "epoch": 0.2777142857142857, "grad_norm": 0.21973414719104767, "kl": 0.0056400299072265625, "lambda_div_used": 0.6, "learning_rate": 6.496968239287603e-07, "loss": 0.0002, "reward": 0.404103375505656, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.404103375505656, "reward_after_std": 0.9556259214878082, "reward_before_mean": 0.9712791014462709, "reward_before_std": 0.8410424143075943, "reward_change_max": 0.00015924125909805298, "reward_change_mean": -0.5671757366508245, "reward_change_min": -0.9959732592105865, "reward_change_std": 0.3573100995272398, "reward_std": 0.9556259475648403, "rewards/cosine_scaled_reward": 0.05855620512738824, "rewards/format_reward": 0.8541666772216558, "step": 243 }, { "advantage_max": 1.839727371931076, "advantage_mean": -1.7229468518564772e-08, "advantage_min": -0.9795639514923096, "advantage_std": 0.9998864531517029, "completion_length": 1683.9375305175781, "epoch": 0.27885714285714286, "grad_norm": 0.2994227111339569, "kl": 0.007843017578125, "lambda_div_used": 0.6, "learning_rate": 6.466308972251785e-07, "loss": 0.0003, "reward": 0.41605983674526215, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.41605983674526215, "reward_after_std": 0.9633042551577091, "reward_before_mean": 0.9971684440970421, "reward_before_std": 0.9178141728043556, "reward_change_max": 0.0008343681693077087, "reward_change_mean": -0.5811086297035217, "reward_change_min": -0.9604966826736927, "reward_change_std": 0.3883028831332922, "reward_std": 0.9633043184876442, "rewards/cosine_scaled_reward": 0.09233421226963401, "rewards/format_reward": 0.8125000055879354, "step": 244 }, { "advantage_max": 1.8073162287473679, "advantage_mean": -6.208817238118058e-09, "advantage_min": -0.9385262057185173, "advantage_std": 0.9998811557888985, "completion_length": 1858.3125610351562, "epoch": 0.28, "grad_norm": 0.23238350450992584, "kl": 0.007617950439453125, "lambda_div_used": 0.6, "learning_rate": 6.435602608679916e-07, "loss": 0.0003, "reward": 0.29909435706213117, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.29909435706213117, "reward_after_std": 0.9968580417335033, "reward_before_mean": 0.8094470165669918, "reward_before_std": 0.9918197207152843, "reward_change_max": 0.0008577778935432434, "reward_change_mean": -0.5103526413440704, "reward_change_min": -1.0967907197773457, "reward_change_std": 0.4003028068691492, "reward_std": 0.996858075261116, "rewards/cosine_scaled_reward": 0.019306830130517483, "rewards/format_reward": 0.7708333469927311, "step": 245 }, { "advantage_max": 1.8786978721618652, "advantage_mean": -1.3348957161873898e-08, "advantage_min": -0.8943793699145317, "advantage_std": 0.9998432323336601, "completion_length": 1381.7500457763672, "epoch": 0.28114285714285714, "grad_norm": 0.23658344149589539, "kl": 0.008426666259765625, "lambda_div_used": 0.6, "learning_rate": 6.404850645156841e-07, "loss": 0.0003, "reward": 0.27080630185082555, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.27080630185082555, "reward_after_std": 0.7295896746218204, "reward_before_mean": 0.814372431486845, "reward_before_std": 0.6347568519413471, "reward_change_max": 0.0006069615483283997, "reward_change_mean": -0.5435661226511002, "reward_change_min": -0.8393444195389748, "reward_change_std": 0.32673146948218346, "reward_std": 0.7295896857976913, "rewards/cosine_scaled_reward": -0.040730470209382474, "rewards/format_reward": 0.895833333954215, "step": 246 }, { "advantage_max": 1.8464445322752, "advantage_mean": 1.8626452047421083e-09, "advantage_min": -0.9952505975961685, "advantage_std": 0.999820850789547, "completion_length": 1874.3750534057617, "epoch": 0.2822857142857143, "grad_norm": 0.2620001435279846, "kl": 0.0073604583740234375, "lambda_div_used": 0.6, "learning_rate": 6.374054580489873e-07, "loss": 0.0003, "reward": 0.04168115835636854, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04168115835636854, "reward_after_std": 0.6140828654170036, "reward_before_mean": 0.48145512863993645, "reward_before_std": 0.5548105500638485, "reward_change_max": 0.0, "reward_change_mean": -0.4397739823907614, "reward_change_min": -0.7346926927566528, "reward_change_std": 0.2879637535661459, "reward_std": 0.6140829026699066, "rewards/cosine_scaled_reward": -0.15510577894747257, "rewards/format_reward": 0.7916666828095913, "step": 247 }, { "advantage_max": 1.928673580288887, "advantage_mean": -3.663202191583892e-08, "advantage_min": -0.7065148465335369, "advantage_std": 0.9998361095786095, "completion_length": 1277.9375267028809, "epoch": 0.2834285714285714, "grad_norm": 0.27247554063796997, "kl": 0.0064296722412109375, "lambda_div_used": 0.6, "learning_rate": 6.343215915635761e-07, "loss": 0.0003, "reward": 0.6549166552722454, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6549166552722454, "reward_after_std": 0.7043044194579124, "reward_before_mean": 1.4125083833932877, "reward_before_std": 0.4449807833880186, "reward_change_max": 0.0, "reward_change_mean": -0.7575917467474937, "reward_change_min": -1.0937542356550694, "reward_change_std": 0.4141359133645892, "reward_std": 0.7043044343590736, "rewards/cosine_scaled_reward": 0.27917084423825145, "rewards/format_reward": 0.8541666716337204, "step": 248 }, { "advantage_max": 1.8969251960515976, "advantage_mean": -2.5456151742098143e-08, "advantage_min": -0.7646898031234741, "advantage_std": 0.9998411610722542, "completion_length": 1365.4792022705078, "epoch": 0.2845714285714286, "grad_norm": 0.22245629131793976, "kl": 0.00811767578125, "lambda_div_used": 0.6, "learning_rate": 6.31233615362752e-07, "loss": 0.0003, "reward": 0.5752901515224949, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5752901515224949, "reward_after_std": 0.7512060441076756, "reward_before_mean": 1.2835528128780425, "reward_before_std": 0.5859524626284838, "reward_change_max": 0.0, "reward_change_mean": -0.7082626335322857, "reward_change_min": -1.0432622693479061, "reward_change_std": 0.403097290545702, "reward_std": 0.7512060552835464, "rewards/cosine_scaled_reward": 0.2146930517628789, "rewards/format_reward": 0.8541666697710752, "step": 249 }, { "advantage_max": 1.878608837723732, "advantage_mean": -4.967053768289986e-09, "advantage_min": -0.8505470529198647, "advantage_std": 0.9998346194624901, "completion_length": 1293.0625534057617, "epoch": 0.2857142857142857, "grad_norm": 0.4508991539478302, "kl": 0.009288787841796875, "lambda_div_used": 0.6, "learning_rate": 6.281416799501187e-07, "loss": 0.0004, "reward": 0.303747734404169, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.303747734404169, "reward_after_std": 0.7406436167657375, "reward_before_mean": 0.863355714827776, "reward_before_std": 0.628277663141489, "reward_change_max": 0.0006925761699676514, "reward_change_mean": -0.5596079789102077, "reward_change_min": -0.8469617292284966, "reward_change_std": 0.3360453639179468, "reward_std": 0.7406436540186405, "rewards/cosine_scaled_reward": -0.047488822834566236, "rewards/format_reward": 0.9583333432674408, "step": 250 }, { "advantage_max": 1.872517004609108, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.9513845965266228, "advantage_std": 0.9998447820544243, "completion_length": 1014.9792098999023, "epoch": 0.28685714285714287, "grad_norm": 0.3035866916179657, "kl": 0.011281967163085938, "lambda_div_used": 0.6, "learning_rate": 6.25045936022246e-07, "loss": 0.0005, "reward": 0.3110234094783664, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3110234094783664, "reward_after_std": 0.7761142998933792, "reward_before_mean": 0.8676313664764166, "reward_before_std": 0.6817162893712521, "reward_change_max": 0.0, "reward_change_mean": -0.5566079169511795, "reward_change_min": -0.8789765313267708, "reward_change_std": 0.3465210497379303, "reward_std": 0.7761143408715725, "rewards/cosine_scaled_reward": -0.024517669342458248, "rewards/format_reward": 0.916666679084301, "step": 251 }, { "advantage_max": 1.8302258551120758, "advantage_mean": 1.4280280291600889e-08, "advantage_min": -0.8962804041802883, "advantage_std": 0.9998032450675964, "completion_length": 1662.7291984558105, "epoch": 0.288, "grad_norm": 0.2449835240840912, "kl": 0.009859085083007812, "lambda_div_used": 0.6, "learning_rate": 6.219465344613258e-07, "loss": 0.0004, "reward": 0.0912809963338077, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0912809963338077, "reward_after_std": 0.5700275152921677, "reward_before_mean": 0.5716589528601617, "reward_before_std": 0.47574319737032056, "reward_change_max": 0.0004999637603759766, "reward_change_mean": -0.4803779609501362, "reward_change_min": -0.7776023708283901, "reward_change_std": 0.29937842302024364, "reward_std": 0.5700275264680386, "rewards/cosine_scaled_reward": -0.11000386101659387, "rewards/format_reward": 0.7916666716337204, "step": 252 }, { "advantage_max": 1.9588626027107239, "advantage_mean": 2.980232283178452e-08, "advantage_min": -0.6859035342931747, "advantage_std": 0.9997961297631264, "completion_length": 1476.979206085205, "epoch": 0.28914285714285715, "grad_norm": 0.2885993421077728, "kl": 0.00954437255859375, "lambda_div_used": 0.6, "learning_rate": 6.188436263278172e-07, "loss": 0.0004, "reward": 0.023032560478895903, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.023032560478895903, "reward_after_std": 0.6161117516458035, "reward_before_mean": 0.44603691063821316, "reward_before_std": 0.4674696382135153, "reward_change_max": 0.0, "reward_change_mean": -0.423004312440753, "reward_change_min": -0.6526531614363194, "reward_change_std": 0.236457671970129, "reward_std": 0.6161117758601904, "rewards/cosine_scaled_reward": -0.18323156610131264, "rewards/format_reward": 0.8125000074505806, "step": 253 }, { "advantage_max": 1.9421354234218597, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.7857317849993706, "advantage_std": 0.9998878464102745, "completion_length": 1576.2083740234375, "epoch": 0.29028571428571426, "grad_norm": 0.28780651092529297, "kl": 0.00881195068359375, "lambda_div_used": 0.6, "learning_rate": 6.157373628530852e-07, "loss": 0.0004, "reward": 0.32410276448354125, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.32410276448354125, "reward_after_std": 1.066658116877079, "reward_before_mean": 0.8170572388917208, "reward_before_std": 0.9517768397927284, "reward_change_max": 0.00031269341707229614, "reward_change_mean": -0.4929545000195503, "reward_change_min": -0.8555713668465614, "reward_change_std": 0.3223307225853205, "reward_std": 1.0666581466794014, "rewards/cosine_scaled_reward": -0.00813805649522692, "rewards/format_reward": 0.8333333432674408, "step": 254 }, { "advantage_max": 1.8271007537841797, "advantage_mean": 1.7229468074475562e-08, "advantage_min": -0.8999714367091656, "advantage_std": 0.9998132511973381, "completion_length": 1964.8542022705078, "epoch": 0.2914285714285714, "grad_norm": 0.2708861231803894, "kl": 0.012363433837890625, "lambda_div_used": 0.6, "learning_rate": 6.126278954320294e-07, "loss": 0.0005, "reward": -0.03212953475303948, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.03212953475303948, "reward_after_std": 0.7982987090945244, "reward_before_mean": 0.33184120431542397, "reward_before_std": 0.7744866535067558, "reward_change_max": 0.0005225986242294312, "reward_change_mean": -0.3639707425609231, "reward_change_min": -0.6983498111367226, "reward_change_std": 0.28055456932634115, "reward_std": 0.7982987351715565, "rewards/cosine_scaled_reward": -0.19866274809464812, "rewards/format_reward": 0.729166679084301, "step": 255 }, { "advantage_max": 1.8787826746702194, "advantage_mean": -6.208817904251873e-10, "advantage_min": -0.8882212191820145, "advantage_std": 0.999819703400135, "completion_length": 1394.7500381469727, "epoch": 0.2925714285714286, "grad_norm": 0.2527642548084259, "kl": 0.007617950439453125, "lambda_div_used": 0.6, "learning_rate": 6.095153756157051e-07, "loss": 0.0003, "reward": 0.3247837144881487, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3247837144881487, "reward_after_std": 0.675804752856493, "reward_before_mean": 0.9066541530191898, "reward_before_std": 0.5500271543860435, "reward_change_max": 0.0, "reward_change_mean": -0.5818704478442669, "reward_change_min": -0.9174501821398735, "reward_change_std": 0.33812401071190834, "reward_std": 0.675804790109396, "rewards/cosine_scaled_reward": -0.005006253952160478, "rewards/format_reward": 0.916666679084301, "step": 256 }, { "advantage_max": 1.8178119510412216, "advantage_mean": -3.0423205787943886e-08, "advantage_min": -0.8814026415348053, "advantage_std": 0.9998690858483315, "completion_length": 1931.4583892822266, "epoch": 0.2937142857142857, "grad_norm": 0.21259228885173798, "kl": 0.008331298828125, "lambda_div_used": 0.6, "learning_rate": 6.06399955103937e-07, "loss": 0.0003, "reward": 0.4000783711671829, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4000783711671829, "reward_after_std": 0.8711743615567684, "reward_before_mean": 0.9961195886135101, "reward_before_std": 0.8301573395729065, "reward_change_max": 0.0, "reward_change_mean": -0.5960412230342627, "reward_change_min": -1.0773316584527493, "reward_change_std": 0.4147396646440029, "reward_std": 0.8711744025349617, "rewards/cosine_scaled_reward": 0.11264310358092189, "rewards/format_reward": 0.7708333488553762, "step": 257 }, { "advantage_max": 1.865677148103714, "advantage_mean": -9.313225912688239e-09, "advantage_min": -0.8630052953958511, "advantage_std": 0.9998342022299767, "completion_length": 1912.4167098999023, "epoch": 0.2948571428571429, "grad_norm": 0.23722055554389954, "kl": 0.00826263427734375, "lambda_div_used": 0.6, "learning_rate": 6.032817857379256e-07, "loss": 0.0003, "reward": 0.11609090398997068, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.11609090398997068, "reward_after_std": 0.7722814045846462, "reward_before_mean": 0.5685428031720221, "reward_before_std": 0.7202793788164854, "reward_change_max": 0.001006096601486206, "reward_change_mean": -0.45245190896093845, "reward_change_min": -0.8036795221269131, "reward_change_std": 0.31898451782763004, "reward_std": 0.7722814530134201, "rewards/cosine_scaled_reward": -0.10114526422694325, "rewards/format_reward": 0.7708333414047956, "step": 258 }, { "advantage_max": 1.9082180708646774, "advantage_mean": -2.4214387606136256e-08, "advantage_min": -0.8204468339681625, "advantage_std": 0.9998528063297272, "completion_length": 1272.5833625793457, "epoch": 0.296, "grad_norm": 0.28688231110572815, "kl": 0.009775161743164062, "lambda_div_used": 0.6, "learning_rate": 6.001610194928464e-07, "loss": 0.0004, "reward": 0.4570096703246236, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4570096703246236, "reward_after_std": 0.7713707610964775, "reward_before_mean": 1.09649408608675, "reward_before_std": 0.6217096205800772, "reward_change_max": 0.0, "reward_change_mean": -0.6394844427704811, "reward_change_min": -0.9701583310961723, "reward_change_std": 0.374478030949831, "reward_std": 0.771370779722929, "rewards/cosine_scaled_reward": 0.07949701510369778, "rewards/format_reward": 0.9375000074505806, "step": 259 }, { "advantage_max": 1.828286275267601, "advantage_mean": -1.9247333282734758e-08, "advantage_min": -0.9948167651891708, "advantage_std": 0.9998681172728539, "completion_length": 1070.5000267028809, "epoch": 0.29714285714285715, "grad_norm": 0.3003004193305969, "kl": 0.007595062255859375, "lambda_div_used": 0.6, "learning_rate": 5.97037808470444e-07, "loss": 0.0003, "reward": 0.4869587696157396, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4869587696157396, "reward_after_std": 0.8302631750702858, "reward_before_mean": 1.1328422725200653, "reward_before_std": 0.7522343434393406, "reward_change_max": 0.0, "reward_change_mean": -0.6458835043013096, "reward_change_min": -1.0198750123381615, "reward_change_std": 0.39399000257253647, "reward_std": 0.8302631825208664, "rewards/cosine_scaled_reward": 0.09767113672569394, "rewards/format_reward": 0.9375, "step": 260 }, { "advantage_max": 1.8792397528886795, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.8463708981871605, "advantage_std": 0.9997694864869118, "completion_length": 2166.270851135254, "epoch": 0.29828571428571427, "grad_norm": 0.20295003056526184, "kl": 0.007968902587890625, "lambda_div_used": 0.6, "learning_rate": 5.939123048916173e-07, "loss": 0.0003, "reward": 0.02313760167453438, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.02313760167453438, "reward_after_std": 0.5842309631407261, "reward_before_mean": 0.46202369034290314, "reward_before_std": 0.5016245627775788, "reward_change_max": 0.0, "reward_change_mean": -0.43888608552515507, "reward_change_min": -0.7771935053169727, "reward_change_std": 0.27092802058905363, "reward_std": 0.5842309780418873, "rewards/cosine_scaled_reward": -0.09190484462305903, "rewards/format_reward": 0.645833333954215, "step": 261 }, { "advantage_max": 1.828007161617279, "advantage_mean": -1.955777406692505e-08, "advantage_min": -0.8678448721766472, "advantage_std": 0.9998091235756874, "completion_length": 1643.0625305175781, "epoch": 0.29942857142857143, "grad_norm": 0.28205162286758423, "kl": 0.0093994140625, "lambda_div_used": 0.6, "learning_rate": 5.907846610890011e-07, "loss": 0.0004, "reward": -0.08022703300230205, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08022703300230205, "reward_after_std": 0.5777127407491207, "reward_before_mean": 0.2998348996043205, "reward_before_std": 0.5390517674386501, "reward_change_max": 0.0, "reward_change_mean": -0.3800619672983885, "reward_change_min": -0.6844152100384235, "reward_change_std": 0.2551191672682762, "reward_std": 0.577712744474411, "rewards/cosine_scaled_reward": -0.2250825520604849, "rewards/format_reward": 0.7500000093132257, "step": 262 }, { "advantage_max": 1.9083099365234375, "advantage_mean": -1.986821573929376e-08, "advantage_min": -0.8035422787070274, "advantage_std": 0.9997920989990234, "completion_length": 1199.625015258789, "epoch": 0.30057142857142854, "grad_norm": 0.24454443156719208, "kl": 0.006107330322265625, "lambda_div_used": 0.6, "learning_rate": 5.87655029499542e-07, "loss": 0.0002, "reward": 0.10955283138900995, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10955283138900995, "reward_after_std": 0.5718753635883331, "reward_before_mean": 0.5939783789217472, "reward_before_std": 0.45454465225338936, "reward_change_max": 0.0, "reward_change_mean": -0.4844255559146404, "reward_change_min": -0.7876281961798668, "reward_change_std": 0.28957670740783215, "reward_std": 0.5718753710389137, "rewards/cosine_scaled_reward": -0.17176082776859403, "rewards/format_reward": 0.9375, "step": 263 }, { "advantage_max": 1.8229975551366806, "advantage_mean": -8.84756468089165e-09, "advantage_min": -0.9802969917654991, "advantage_std": 0.999835766851902, "completion_length": 1350.2916946411133, "epoch": 0.3017142857142857, "grad_norm": 0.23903900384902954, "kl": 0.006744384765625, "lambda_div_used": 0.6, "learning_rate": 5.845235626570683e-07, "loss": 0.0003, "reward": 0.22060711891390383, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22060711891390383, "reward_after_std": 0.6753030680119991, "reward_before_mean": 0.7518087485805154, "reward_before_std": 0.6103526018559933, "reward_change_max": 0.0, "reward_change_mean": -0.5312016159296036, "reward_change_min": -0.9094065576791763, "reward_change_std": 0.33360027708113194, "reward_std": 0.6753031127154827, "rewards/cosine_scaled_reward": -0.07201230898499489, "rewards/format_reward": 0.8958333395421505, "step": 264 }, { "advantage_max": 1.8557786792516708, "advantage_mean": -1.7384688466570708e-08, "advantage_min": -0.8617202043533325, "advantage_std": 0.9998458698391914, "completion_length": 1349.3750305175781, "epoch": 0.3028571428571429, "grad_norm": 0.259480744600296, "kl": 0.00733184814453125, "lambda_div_used": 0.6, "learning_rate": 5.813904131848564e-07, "loss": 0.0003, "reward": 0.5593267162330449, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5593267162330449, "reward_after_std": 0.7222743555903435, "reward_before_mean": 1.2658403292298317, "reward_before_std": 0.5619801264256239, "reward_change_max": 0.0, "reward_change_mean": -0.7065136171877384, "reward_change_min": -1.0326689183712006, "reward_change_std": 0.401592755690217, "reward_std": 0.7222743816673756, "rewards/cosine_scaled_reward": 0.1537534836679697, "rewards/format_reward": 0.9583333358168602, "step": 265 }, { "advantage_max": 1.775598168373108, "advantage_mean": -1.1796753240922442e-08, "advantage_min": -1.0651524811983109, "advantage_std": 0.9998323395848274, "completion_length": 1781.8333740234375, "epoch": 0.304, "grad_norm": 0.2434740513563156, "kl": 0.00922393798828125, "lambda_div_used": 0.6, "learning_rate": 5.78255733788191e-07, "loss": 0.0004, "reward": 0.1264530853368342, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1264530853368342, "reward_after_std": 0.7009001858532429, "reward_before_mean": 0.6023802892304957, "reward_before_std": 0.6759253405034542, "reward_change_max": 9.702146053314209e-05, "reward_change_mean": -0.4759272299706936, "reward_change_min": -0.7876431345939636, "reward_change_std": 0.3144048321992159, "reward_std": 0.7009001895785332, "rewards/cosine_scaled_reward": -0.10505986865609884, "rewards/format_reward": 0.812500013038516, "step": 266 }, { "advantage_max": 1.9578577429056168, "advantage_mean": 2.3593505260599557e-08, "advantage_min": -0.7362656965851784, "advantage_std": 0.99981639534235, "completion_length": 1887.8541946411133, "epoch": 0.30514285714285716, "grad_norm": 0.2461211085319519, "kl": 0.010284423828125, "lambda_div_used": 0.6, "learning_rate": 5.751196772469237e-07, "loss": 0.0004, "reward": -0.1160497977398336, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1160497977398336, "reward_after_std": 0.6473545543849468, "reward_before_mean": 0.2215914549306035, "reward_before_std": 0.5311122592538595, "reward_change_max": 0.0, "reward_change_mean": -0.33764123916625977, "reward_change_min": -0.5379331484436989, "reward_change_std": 0.19541364442557096, "reward_std": 0.6473545767366886, "rewards/cosine_scaled_reward": -0.23295428697019815, "rewards/format_reward": 0.6875000055879354, "step": 267 }, { "advantage_max": 1.9218407273292542, "advantage_mean": -2.793967879277659e-08, "advantage_min": -0.8416651785373688, "advantage_std": 0.9998409524559975, "completion_length": 1430.9791870117188, "epoch": 0.3062857142857143, "grad_norm": 0.2900254428386688, "kl": 0.01381683349609375, "lambda_div_used": 0.6, "learning_rate": 5.71982396408026e-07, "loss": 0.0006, "reward": 0.20641274470835924, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.20641274470835924, "reward_after_std": 0.7826429642736912, "reward_before_mean": 0.6979242414236069, "reward_before_std": 0.6802052427083254, "reward_change_max": 0.0005292147397994995, "reward_change_mean": -0.4915114976465702, "reward_change_min": -0.8061510771512985, "reward_change_std": 0.30862017907202244, "reward_std": 0.7826430015265942, "rewards/cosine_scaled_reward": -0.06770455720834434, "rewards/format_reward": 0.8333333395421505, "step": 268 }, { "advantage_max": 1.86625137925148, "advantage_mean": -6.829699250587851e-09, "advantage_min": -0.8370335847139359, "advantage_std": 0.9998442083597183, "completion_length": 1545.520866394043, "epoch": 0.30742857142857144, "grad_norm": 0.22027085721492767, "kl": 0.008152008056640625, "lambda_div_used": 0.6, "learning_rate": 5.688440441781398e-07, "loss": 0.0003, "reward": 0.21930112247355282, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21930112247355282, "reward_after_std": 0.7824936024844646, "reward_before_mean": 0.7232769569382071, "reward_before_std": 0.67463708226569, "reward_change_max": 0.00043030083179473877, "reward_change_mean": -0.5039758523926139, "reward_change_min": -0.8153284899890423, "reward_change_std": 0.31488157622516155, "reward_std": 0.7824936211109161, "rewards/cosine_scaled_reward": -0.044611528515815735, "rewards/format_reward": 0.8125000074505806, "step": 269 }, { "advantage_max": 1.9493401795625687, "advantage_mean": -1.862645193639878e-08, "advantage_min": -0.6977572962641716, "advantage_std": 0.9998801946640015, "completion_length": 1448.583381652832, "epoch": 0.30857142857142855, "grad_norm": 0.20074597001075745, "kl": 0.0062274932861328125, "lambda_div_used": 0.6, "learning_rate": 5.657047735161255e-07, "loss": 0.0002, "reward": 0.5448911990970373, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5448911990970373, "reward_after_std": 0.9793307185173035, "reward_before_mean": 1.1831915229558945, "reward_before_std": 0.7954367697238922, "reward_change_max": 0.0, "reward_change_mean": -0.6383003033697605, "reward_change_min": -1.0107336267828941, "reward_change_std": 0.3763220179826021, "reward_std": 0.9793307483196259, "rewards/cosine_scaled_reward": 0.11242906516417861, "rewards/format_reward": 0.9583333358168602, "step": 270 }, { "advantage_max": 1.9000986367464066, "advantage_mean": -1.1098261587516589e-08, "advantage_min": -0.8887056112289429, "advantage_std": 0.9998585283756256, "completion_length": 1232.270851135254, "epoch": 0.3097142857142857, "grad_norm": 0.2368963658809662, "kl": 0.008569717407226562, "lambda_div_used": 0.6, "learning_rate": 5.625647374256061e-07, "loss": 0.0003, "reward": 0.6372459325939417, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6372459325939417, "reward_after_std": 0.8309118635952473, "reward_before_mean": 1.3628186136484146, "reward_before_std": 0.6561471298336983, "reward_change_max": 0.0003469809889793396, "reward_change_mean": -0.725572694092989, "reward_change_min": -1.076353020966053, "reward_change_std": 0.4148294348269701, "reward_std": 0.8309118933975697, "rewards/cosine_scaled_reward": 0.23349263006821275, "rewards/format_reward": 0.8958333395421505, "step": 271 }, { "advantage_max": 1.8306615948677063, "advantage_mean": 1.3659398168108794e-08, "advantage_min": -0.8924771621823311, "advantage_std": 0.9998399615287781, "completion_length": 1688.9583740234375, "epoch": 0.31085714285714283, "grad_norm": 0.22661247849464417, "kl": 0.00933837890625, "lambda_div_used": 0.6, "learning_rate": 5.594240889475106e-07, "loss": 0.0004, "reward": 0.17267357744276524, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17267357744276524, "reward_after_std": 0.7733958214521408, "reward_before_mean": 0.6572124548256397, "reward_before_std": 0.7261578030884266, "reward_change_max": 0.00031144171953201294, "reward_change_mean": -0.48453884944319725, "reward_change_min": -0.8327630683779716, "reward_change_std": 0.3227216098457575, "reward_std": 0.7733958400785923, "rewards/cosine_scaled_reward": -0.08806046470999718, "rewards/format_reward": 0.8333333358168602, "step": 272 }, { "advantage_max": 1.8778915852308273, "advantage_mean": -1.614292477469803e-08, "advantage_min": -0.9554785639047623, "advantage_std": 0.9998368248343468, "completion_length": 1353.0000305175781, "epoch": 0.312, "grad_norm": 0.24040770530700684, "kl": 0.008619308471679688, "lambda_div_used": 0.6, "learning_rate": 5.562829811526154e-07, "loss": 0.0003, "reward": 0.39927546092076227, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.39927546092076227, "reward_after_std": 0.736445277929306, "reward_before_mean": 1.0131700951606035, "reward_before_std": 0.6133726499974728, "reward_change_max": 0.0, "reward_change_mean": -0.613894646987319, "reward_change_min": -0.9457522034645081, "reward_change_std": 0.35532393865287304, "reward_std": 0.7364452891051769, "rewards/cosine_scaled_reward": 0.07950170524418354, "rewards/format_reward": 0.8541666772216558, "step": 273 }, { "advantage_max": 1.882187381386757, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.848646879196167, "advantage_std": 0.9998696744441986, "completion_length": 888.4791793823242, "epoch": 0.31314285714285717, "grad_norm": 0.26737749576568604, "kl": 0.007991790771484375, "lambda_div_used": 0.6, "learning_rate": 5.531415671340826e-07, "loss": 0.0003, "reward": 0.5258622104302049, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5258622104302049, "reward_after_std": 0.9269070662558079, "reward_before_mean": 1.1696142181754112, "reward_before_std": 0.8161358386278152, "reward_change_max": 0.0, "reward_change_mean": -0.643751971423626, "reward_change_min": -1.0380645543336868, "reward_change_std": 0.38927505910396576, "reward_std": 0.926907118409872, "rewards/cosine_scaled_reward": 0.09522374533116817, "rewards/format_reward": 0.9791666716337204, "step": 274 }, { "advantage_max": 1.877179205417633, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -0.870720274746418, "advantage_std": 0.9998558908700943, "completion_length": 1558.8958740234375, "epoch": 0.3142857142857143, "grad_norm": 0.21030379831790924, "kl": 0.00858306884765625, "lambda_div_used": 0.6, "learning_rate": 5.5e-07, "loss": 0.0003, "reward": 0.5669933171011508, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5669933171011508, "reward_after_std": 0.8992337211966515, "reward_before_mean": 1.2377357706427574, "reward_before_std": 0.7513070069253445, "reward_change_max": 0.0002499818801879883, "reward_change_mean": -0.6707424111664295, "reward_change_min": -1.0658031031489372, "reward_change_std": 0.40347418934106827, "reward_std": 0.8992337286472321, "rewards/cosine_scaled_reward": 0.23345119040459394, "rewards/format_reward": 0.7708333395421505, "step": 275 }, { "advantage_max": 1.909448117017746, "advantage_mean": -2.1109978876054925e-08, "advantage_min": -0.7440906390547752, "advantage_std": 0.9998734965920448, "completion_length": 1220.9583740234375, "epoch": 0.31542857142857145, "grad_norm": 0.2865343689918518, "kl": 0.0120086669921875, "lambda_div_used": 0.6, "learning_rate": 5.468584328659172e-07, "loss": 0.0005, "reward": 0.4875176604837179, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4875176604837179, "reward_after_std": 0.8809028156101704, "reward_before_mean": 1.1188461929559708, "reward_before_std": 0.7294301693327725, "reward_change_max": 0.0, "reward_change_mean": -0.6313285455107689, "reward_change_min": -1.0723997987806797, "reward_change_std": 0.39141695387661457, "reward_std": 0.8809028305113316, "rewards/cosine_scaled_reward": 0.11150642792927101, "rewards/format_reward": 0.8958333395421505, "step": 276 }, { "advantage_max": 1.8797271996736526, "advantage_mean": 1.5832484323574647e-08, "advantage_min": -0.8954315483570099, "advantage_std": 0.999833457171917, "completion_length": 1340.333351135254, "epoch": 0.31657142857142856, "grad_norm": 0.2681465148925781, "kl": 0.009929656982421875, "lambda_div_used": 0.6, "learning_rate": 5.437170188473847e-07, "loss": 0.0004, "reward": 0.42410989105701447, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.42410989105701447, "reward_after_std": 0.7413080781698227, "reward_before_mean": 1.0489613488316536, "reward_before_std": 0.6017416436225176, "reward_change_max": 0.0, "reward_change_mean": -0.6248514233157039, "reward_change_min": -0.9577978886663914, "reward_change_std": 0.3677874878048897, "reward_std": 0.741308081895113, "rewards/cosine_scaled_reward": 0.09739732113666832, "rewards/format_reward": 0.8541666716337204, "step": 277 }, { "advantage_max": 1.9071197807788849, "advantage_mean": 1.7384688799637615e-08, "advantage_min": -0.8232261501252651, "advantage_std": 0.9998288676142693, "completion_length": 1538.6666870117188, "epoch": 0.3177142857142857, "grad_norm": 0.2890927791595459, "kl": 0.007373809814453125, "lambda_div_used": 0.6, "learning_rate": 5.405759110524894e-07, "loss": 0.0003, "reward": 0.3901700456626713, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3901700456626713, "reward_after_std": 0.6501722633838654, "reward_before_mean": 1.0097860351670533, "reward_before_std": 0.47574078012257814, "reward_change_max": 0.0, "reward_change_mean": -0.6196159720420837, "reward_change_min": -0.9255659654736519, "reward_change_std": 0.35610699094831944, "reward_std": 0.6501722633838654, "rewards/cosine_scaled_reward": 0.07780967373400927, "rewards/format_reward": 0.8541666679084301, "step": 278 }, { "advantage_max": 1.943043828010559, "advantage_mean": -9.934107647602275e-09, "advantage_min": -0.7475237399339676, "advantage_std": 0.9998511970043182, "completion_length": 1437.1666870117188, "epoch": 0.31885714285714284, "grad_norm": 0.21998536586761475, "kl": 0.009319305419921875, "lambda_div_used": 0.6, "learning_rate": 5.37435262574394e-07, "loss": 0.0004, "reward": 0.2873132990207523, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2873132990207523, "reward_after_std": 0.803663358092308, "reward_before_mean": 0.8204641807824373, "reward_before_std": 0.6527150310575962, "reward_change_max": 0.0, "reward_change_mean": -0.533150888979435, "reward_change_min": -0.8645448237657547, "reward_change_std": 0.3154606595635414, "reward_std": 0.8036633767187595, "rewards/cosine_scaled_reward": -0.037684588925912976, "rewards/format_reward": 0.8958333432674408, "step": 279 }, { "advantage_max": 1.8220892250537872, "advantage_mean": -3.042320539936583e-08, "advantage_min": -0.9527215585112572, "advantage_std": 0.999868743121624, "completion_length": 1730.9792175292969, "epoch": 0.32, "grad_norm": 0.38605308532714844, "kl": 0.01398468017578125, "lambda_div_used": 0.6, "learning_rate": 5.342952264838747e-07, "loss": 0.0006, "reward": 0.5508183864876628, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5508183864876628, "reward_after_std": 0.9038041196763515, "reward_before_mean": 1.2189355352893472, "reward_before_std": 0.8520298786461353, "reward_change_max": 0.0009787678718566895, "reward_change_mean": -0.6681171432137489, "reward_change_min": -1.149309366941452, "reward_change_std": 0.4476726073771715, "reward_std": 0.903804138302803, "rewards/cosine_scaled_reward": 0.1928010657429695, "rewards/format_reward": 0.8333333395421505, "step": 280 }, { "advantage_max": 1.9217159450054169, "advantage_mean": 1.9247333948868572e-08, "advantage_min": -0.775476261973381, "advantage_std": 0.9998063147068024, "completion_length": 2434.8541870117188, "epoch": 0.3211428571428571, "grad_norm": 0.2845155894756317, "kl": 0.012725830078125, "lambda_div_used": 0.6, "learning_rate": 5.311559558218603e-07, "loss": 0.0005, "reward": -0.05798890208825469, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05798890208825469, "reward_after_std": 0.7850078567862511, "reward_before_mean": 0.28525743540376425, "reward_before_std": 0.6987994946539402, "reward_change_max": 0.0, "reward_change_mean": -0.34324632212519646, "reward_change_min": -0.635148648172617, "reward_change_std": 0.23209808766841888, "reward_std": 0.7850079126656055, "rewards/cosine_scaled_reward": -0.149037959985435, "rewards/format_reward": 0.5833333358168602, "step": 281 }, { "advantage_max": 1.8430883884429932, "advantage_mean": 8.07146260939362e-09, "advantage_min": -0.9225859716534615, "advantage_std": 0.9998578503727913, "completion_length": 1281.3333702087402, "epoch": 0.3222857142857143, "grad_norm": 0.26052939891815186, "kl": 0.009288787841796875, "lambda_div_used": 0.6, "learning_rate": 5.28017603591974e-07, "loss": 0.0004, "reward": 0.29589394642971456, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.29589394642971456, "reward_after_std": 0.7819314710795879, "reward_before_mean": 0.8467987608164549, "reward_before_std": 0.6957082394510508, "reward_change_max": 0.0, "reward_change_mean": -0.5509047862142324, "reward_change_min": -0.938206635415554, "reward_change_std": 0.3565885201096535, "reward_std": 0.7819315008819103, "rewards/cosine_scaled_reward": -0.014100641012191772, "rewards/format_reward": 0.8750000149011612, "step": 282 }, { "advantage_max": 1.8591952323913574, "advantage_mean": -1.2417634032146907e-08, "advantage_min": -0.8575943261384964, "advantage_std": 0.9998741522431374, "completion_length": 2172.0625228881836, "epoch": 0.32342857142857145, "grad_norm": 0.18784987926483154, "kl": 0.0093994140625, "lambda_div_used": 0.6, "learning_rate": 5.248803227530763e-07, "loss": 0.0004, "reward": 0.4316023401916027, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4316023401916027, "reward_after_std": 0.9680920131504536, "reward_before_mean": 1.013820081949234, "reward_before_std": 0.8663015514612198, "reward_change_max": 0.00028090178966522217, "reward_change_mean": -0.5822177156805992, "reward_change_min": -1.0923729129135609, "reward_change_std": 0.39628793857991695, "reward_std": 0.9680920168757439, "rewards/cosine_scaled_reward": 0.1319100260734558, "rewards/format_reward": 0.7500000037252903, "step": 283 }, { "advantage_max": 1.9308781772851944, "advantage_mean": 1.862645426786713e-09, "advantage_min": -0.7617431655526161, "advantage_std": 0.9998421370983124, "completion_length": 1170.3542022705078, "epoch": 0.32457142857142857, "grad_norm": 0.26644590497016907, "kl": 0.009359359741210938, "lambda_div_used": 0.6, "learning_rate": 5.21744266211809e-07, "loss": 0.0004, "reward": 0.2046554802218452, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2046554802218452, "reward_after_std": 0.8015350475907326, "reward_before_mean": 0.688809160143137, "reward_before_std": 0.6861891858279705, "reward_change_max": 0.0, "reward_change_mean": -0.4841536581516266, "reward_change_min": -0.8655692115426064, "reward_change_std": 0.30566604621708393, "reward_std": 0.8015350624918938, "rewards/cosine_scaled_reward": -0.12434543017297983, "rewards/format_reward": 0.9375000074505806, "step": 284 }, { "advantage_max": 1.9677094668149948, "advantage_mean": -2.110997954218874e-08, "advantage_min": -0.7027824819087982, "advantage_std": 0.9998104050755501, "completion_length": 1104.5000381469727, "epoch": 0.32571428571428573, "grad_norm": 0.26687729358673096, "kl": 0.009538650512695312, "lambda_div_used": 0.6, "learning_rate": 5.186095868151436e-07, "loss": 0.0004, "reward": 0.2683999980799854, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2683999980799854, "reward_after_std": 0.6404694318771362, "reward_before_mean": 0.8243867550045252, "reward_before_std": 0.4349752515554428, "reward_change_max": 0.0, "reward_change_mean": -0.5559867545962334, "reward_change_min": -0.7854268550872803, "reward_change_std": 0.300765099003911, "reward_std": 0.6404694765806198, "rewards/cosine_scaled_reward": -0.056556637631729245, "rewards/format_reward": 0.9375000074505806, "step": 285 }, { "advantage_max": 1.8454220443964005, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -1.0381435677409172, "advantage_std": 0.99984922260046, "completion_length": 1362.6667175292969, "epoch": 0.32685714285714285, "grad_norm": 0.26756566762924194, "kl": 0.01204681396484375, "lambda_div_used": 0.6, "learning_rate": 5.154764373429315e-07, "loss": 0.0005, "reward": 0.24126607986545423, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.24126607986545423, "reward_after_std": 0.777409877628088, "reward_before_mean": 0.7604061793535948, "reward_before_std": 0.7200019955635071, "reward_change_max": 0.00024990737438201904, "reward_change_mean": -0.5191400870680809, "reward_change_min": -0.8817098364233971, "reward_change_std": 0.3420996591448784, "reward_std": 0.7774099223315716, "rewards/cosine_scaled_reward": -0.06771359359845519, "rewards/format_reward": 0.8958333507180214, "step": 286 }, { "advantage_max": 1.8230135142803192, "advantage_mean": -4.4703486135055925e-08, "advantage_min": -0.9058626368641853, "advantage_std": 0.9998276680707932, "completion_length": 1362.9583549499512, "epoch": 0.328, "grad_norm": 0.30255770683288574, "kl": 0.01079559326171875, "lambda_div_used": 0.6, "learning_rate": 5.123449705004581e-07, "loss": 0.0004, "reward": 0.28222215245477855, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.28222215245477855, "reward_after_std": 0.6900255009531975, "reward_before_mean": 0.8456722013652325, "reward_before_std": 0.6139935115352273, "reward_change_max": 0.00031256675720214844, "reward_change_mean": -0.5634500700980425, "reward_change_min": -0.9192318581044674, "reward_change_std": 0.36021300964057446, "reward_std": 0.6900255233049393, "rewards/cosine_scaled_reward": 0.027002752758562565, "rewards/format_reward": 0.7916666734963655, "step": 287 }, { "advantage_max": 1.8479500263929367, "advantage_mean": 1.490116185998147e-08, "advantage_min": -0.8085650354623795, "advantage_std": 0.9998614192008972, "completion_length": 1352.2500381469727, "epoch": 0.3291428571428571, "grad_norm": 0.21222001314163208, "kl": 0.009647369384765625, "lambda_div_used": 0.6, "learning_rate": 5.09215338910999e-07, "loss": 0.0004, "reward": 0.38232562225311995, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.38232562225311995, "reward_after_std": 0.8447245806455612, "reward_before_mean": 0.9674149639904499, "reward_before_std": 0.7541538216173649, "reward_change_max": 0.0004512891173362732, "reward_change_mean": -0.5850893221795559, "reward_change_min": -1.0319984778761864, "reward_change_std": 0.38284142315387726, "reward_std": 0.8447245843708515, "rewards/cosine_scaled_reward": 0.014957469655200839, "rewards/format_reward": 0.9375000074505806, "step": 288 }, { "advantage_max": 1.9135861545801163, "advantage_mean": 1.5522040874849097e-09, "advantage_min": -0.7802659943699837, "advantage_std": 0.9997969791293144, "completion_length": 1395.208339691162, "epoch": 0.3302857142857143, "grad_norm": 0.3205528259277344, "kl": 0.0114288330078125, "lambda_div_used": 0.6, "learning_rate": 5.060876951083828e-07, "loss": 0.0005, "reward": 0.2357093554455787, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.2357093554455787, "reward_after_std": 0.609809685498476, "reward_before_mean": 0.7810433376580477, "reward_before_std": 0.4604929033666849, "reward_change_max": 0.0005588680505752563, "reward_change_mean": -0.5453339908272028, "reward_change_min": -0.8528439849615097, "reward_change_std": 0.3111587315797806, "reward_std": 0.6098097078502178, "rewards/cosine_scaled_reward": -0.026145002455450594, "rewards/format_reward": 0.8333333358168602, "step": 289 }, { "advantage_max": 1.811624899506569, "advantage_mean": -7.45058070794613e-09, "advantage_min": -0.9065205976366997, "advantage_std": 0.9998824968934059, "completion_length": 1016.0625457763672, "epoch": 0.3314285714285714, "grad_norm": 0.31949731707572937, "kl": 0.008113861083984375, "lambda_div_used": 0.6, "learning_rate": 5.02962191529556e-07, "loss": 0.0003, "reward": 0.5378194686491042, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5378194686491042, "reward_after_std": 0.9473526403307915, "reward_before_mean": 1.1925920248031616, "reward_before_std": 0.8950722739100456, "reward_change_max": 0.0, "reward_change_mean": -0.65477254986763, "reward_change_min": -1.1149347499012947, "reward_change_std": 0.4268352948129177, "reward_std": 0.9473526403307915, "rewards/cosine_scaled_reward": 0.13796266820281744, "rewards/format_reward": 0.9166666716337204, "step": 290 }, { "advantage_max": 1.8208087533712387, "advantage_mean": -6.208817682207268e-09, "advantage_min": -0.9952463805675507, "advantage_std": 0.9998692199587822, "completion_length": 1115.0833587646484, "epoch": 0.3325714285714286, "grad_norm": 0.20173849165439606, "kl": 0.0072803497314453125, "lambda_div_used": 0.6, "learning_rate": 4.998389805071536e-07, "loss": 0.0003, "reward": 0.4247822118923068, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4247822118923068, "reward_after_std": 0.9100354351103306, "reward_before_mean": 1.0233391895890236, "reward_before_std": 0.8653226383030415, "reward_change_max": 0.0, "reward_change_mean": -0.5985569916665554, "reward_change_min": -1.0455361381173134, "reward_change_std": 0.4032821226865053, "reward_std": 0.9100354537367821, "rewards/cosine_scaled_reward": 0.04291958408430219, "rewards/format_reward": 0.9375000149011612, "step": 291 }, { "advantage_max": 1.9008750915527344, "advantage_mean": 6.208817127095756e-09, "advantage_min": -0.855563573539257, "advantage_std": 0.9998486787080765, "completion_length": 1590.5000305175781, "epoch": 0.33371428571428574, "grad_norm": 0.2894393503665924, "kl": 0.0106658935546875, "lambda_div_used": 0.6, "learning_rate": 4.967182142620745e-07, "loss": 0.0004, "reward": 0.16041000466793776, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16041000466793776, "reward_after_std": 0.8050662241876125, "reward_before_mean": 0.6222329139709473, "reward_before_std": 0.7005427153781056, "reward_change_max": 0.0, "reward_change_mean": -0.46182290092110634, "reward_change_min": -0.7366148829460144, "reward_change_std": 0.2943364791572094, "reward_std": 0.8050662279129028, "rewards/cosine_scaled_reward": -0.10555021092295647, "rewards/format_reward": 0.8333333395421505, "step": 292 }, { "advantage_max": 1.8485536128282547, "advantage_mean": -4.190951696791956e-09, "advantage_min": -0.9521011859178543, "advantage_std": 0.9998137950897217, "completion_length": 979.8541946411133, "epoch": 0.33485714285714285, "grad_norm": 0.241227388381958, "kl": 0.008237838745117188, "lambda_div_used": 0.6, "learning_rate": 4.93600044896063e-07, "loss": 0.0003, "reward": 0.3330893259262666, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3330893259262666, "reward_after_std": 0.5936389490962029, "reward_before_mean": 0.9396286457777023, "reward_before_std": 0.48794906213879585, "reward_change_max": 0.0, "reward_change_mean": -0.6065393388271332, "reward_change_min": -0.9153870195150375, "reward_change_std": 0.3452066704630852, "reward_std": 0.5936389714479446, "rewards/cosine_scaled_reward": -0.019769012928009033, "rewards/format_reward": 0.9791666716337204, "step": 293 }, { "advantage_max": 1.8205612301826477, "advantage_mean": 9.313225635132483e-09, "advantage_min": -0.9388800859451294, "advantage_std": 0.999842956662178, "completion_length": 1574.6250228881836, "epoch": 0.336, "grad_norm": 0.2210991382598877, "kl": 0.010227203369140625, "lambda_div_used": 0.6, "learning_rate": 4.904846243842949e-07, "loss": 0.0004, "reward": 0.2603410785086453, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2603410785086453, "reward_after_std": 0.780572846531868, "reward_before_mean": 0.7907075211405754, "reward_before_std": 0.7266894429922104, "reward_change_max": 0.00038830190896987915, "reward_change_mean": -0.5303664095699787, "reward_change_min": -0.902099747210741, "reward_change_std": 0.34440297074615955, "reward_std": 0.7805728502571583, "rewards/cosine_scaled_reward": -0.031729597598314285, "rewards/format_reward": 0.8541666772216558, "step": 294 }, { "advantage_max": 1.910035789012909, "advantage_mean": 2.8250119410433427e-08, "advantage_min": -0.7845353484153748, "advantage_std": 0.9998215660452843, "completion_length": 1460.7500305175781, "epoch": 0.33714285714285713, "grad_norm": 0.22684507071971893, "kl": 0.0109710693359375, "lambda_div_used": 0.6, "learning_rate": 4.873721045679706e-07, "loss": 0.0004, "reward": 0.46327299624681473, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.46327299624681473, "reward_after_std": 0.6574340984225273, "reward_before_mean": 1.1236709244549274, "reward_before_std": 0.45060044154524803, "reward_change_max": 0.0, "reward_change_mean": -0.660397931933403, "reward_change_min": -0.9777256101369858, "reward_change_std": 0.3532063625752926, "reward_std": 0.6574341058731079, "rewards/cosine_scaled_reward": 0.11391878468566574, "rewards/format_reward": 0.8958333358168602, "step": 295 }, { "advantage_max": 1.8386378586292267, "advantage_mean": -4.9670543234014986e-09, "advantage_min": -0.8036707676947117, "advantage_std": 0.9998711794614792, "completion_length": 1556.6667098999023, "epoch": 0.3382857142857143, "grad_norm": 0.287063330411911, "kl": 0.012338638305664062, "lambda_div_used": 0.6, "learning_rate": 4.842626371469149e-07, "loss": 0.0005, "reward": 0.3221674086526036, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3221674086526036, "reward_after_std": 0.9312387257814407, "reward_before_mean": 0.8596511520445347, "reward_before_std": 0.8932880535721779, "reward_change_max": 0.0, "reward_change_mean": -0.5374837517738342, "reward_change_min": -0.9401872828602791, "reward_change_std": 0.36972188390791416, "reward_std": 0.9312387406826019, "rewards/cosine_scaled_reward": -0.018091095611453056, "rewards/format_reward": 0.8958333432674408, "step": 296 }, { "advantage_max": 1.873251423239708, "advantage_mean": 2.0489097085629737e-08, "advantage_min": -0.877208910882473, "advantage_std": 0.9998420774936676, "completion_length": 2009.8333740234375, "epoch": 0.3394285714285714, "grad_norm": 0.22784045338630676, "kl": 0.0127410888671875, "lambda_div_used": 0.6, "learning_rate": 4.811563736721829e-07, "loss": 0.0005, "reward": 0.1964537873864174, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1964537873864174, "reward_after_std": 0.7367817126214504, "reward_before_mean": 0.6939435098320246, "reward_before_std": 0.6230933777987957, "reward_change_max": 0.0023637935519218445, "reward_change_mean": -0.497489670291543, "reward_change_min": -0.8366812095046043, "reward_change_std": 0.3124427292495966, "reward_std": 0.7367817237973213, "rewards/cosine_scaled_reward": -0.03844492509961128, "rewards/format_reward": 0.770833345130086, "step": 297 }, { "advantage_max": 1.9013422727584839, "advantage_mean": -1.6453366002977532e-08, "advantage_min": -0.8251728340983391, "advantage_std": 0.9998217076063156, "completion_length": 1473.8125381469727, "epoch": 0.3405714285714286, "grad_norm": 0.2259664386510849, "kl": 0.009435653686523438, "lambda_div_used": 0.6, "learning_rate": 4.780534655386743e-07, "loss": 0.0004, "reward": 0.17620949761476368, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17620949761476368, "reward_after_std": 0.6479051597416401, "reward_before_mean": 0.6807891931384802, "reward_before_std": 0.5183268450200558, "reward_change_max": 0.0, "reward_change_mean": -0.5045796744525433, "reward_change_min": -0.8199915066361427, "reward_change_std": 0.3016477432101965, "reward_std": 0.647905170917511, "rewards/cosine_scaled_reward": -0.08668875135481358, "rewards/format_reward": 0.8541666828095913, "step": 298 }, { "advantage_max": 1.8082835525274277, "advantage_mean": -6.829699250587851e-09, "advantage_min": -0.9486165568232536, "advantage_std": 0.9998417943716049, "completion_length": 1438.270851135254, "epoch": 0.3417142857142857, "grad_norm": 0.2638239562511444, "kl": 0.01007843017578125, "lambda_div_used": 0.6, "learning_rate": 4.749540639777539e-07, "loss": 0.0004, "reward": 0.09841901052277535, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09841901052277535, "reward_after_std": 0.7489665485918522, "reward_before_mean": 0.5462133912369609, "reward_before_std": 0.7167009748518467, "reward_change_max": 0.00038520991802215576, "reward_change_mean": -0.4477943815290928, "reward_change_min": -0.828592486679554, "reward_change_std": 0.3159326184540987, "reward_std": 0.748966570943594, "rewards/cosine_scaled_reward": -0.17480998300015926, "rewards/format_reward": 0.8958333432674408, "step": 299 }, { "advantage_max": 1.8891912400722504, "advantage_mean": 8.071462831438225e-09, "advantage_min": -0.8375274538993835, "advantage_std": 0.999823771417141, "completion_length": 1474.1875305175781, "epoch": 0.34285714285714286, "grad_norm": 0.34135720133781433, "kl": 0.01102447509765625, "lambda_div_used": 0.6, "learning_rate": 4.7185832004988133e-07, "loss": 0.0004, "reward": 0.18832044645387214, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18832044645387214, "reward_after_std": 0.682534109801054, "reward_before_mean": 0.6911065131425858, "reward_before_std": 0.574971929192543, "reward_change_max": 0.0005830526351928711, "reward_change_mean": -0.5027860454283655, "reward_change_min": -0.8069445490837097, "reward_change_std": 0.29158598743379116, "reward_std": 0.6825341135263443, "rewards/cosine_scaled_reward": -0.07111341133713722, "rewards/format_reward": 0.8333333432674408, "step": 300 }, { "advantage_max": 1.8619256168603897, "advantage_mean": -1.6763806787167823e-08, "advantage_min": -0.9469154067337513, "advantage_std": 0.9998360425233841, "completion_length": 1291.520896911621, "epoch": 0.344, "grad_norm": 0.30536800622940063, "kl": 0.012241363525390625, "lambda_div_used": 0.6, "learning_rate": 4.68766384637248e-07, "loss": 0.0005, "reward": 0.3281488213688135, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3281488213688135, "reward_after_std": 0.7333485223352909, "reward_before_mean": 0.9018230102956295, "reward_before_std": 0.6293862089514732, "reward_change_max": 0.0, "reward_change_mean": -0.5736742094159126, "reward_change_min": -0.9062788337469101, "reward_change_std": 0.33993304893374443, "reward_std": 0.7333485260605812, "rewards/cosine_scaled_reward": -0.03867182228714228, "rewards/format_reward": 0.9791666716337204, "step": 301 }, { "advantage_max": 1.8721920400857925, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.7614920884370804, "advantage_std": 0.9998584762215614, "completion_length": 1661.166748046875, "epoch": 0.34514285714285714, "grad_norm": 0.33685818314552307, "kl": 0.01244354248046875, "lambda_div_used": 0.6, "learning_rate": 4.656784084364238e-07, "loss": 0.0005, "reward": 0.38028667867183685, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.38028667867183685, "reward_after_std": 0.7807257771492004, "reward_before_mean": 0.9740477418527007, "reward_before_std": 0.6580136283300817, "reward_change_max": 0.0, "reward_change_mean": -0.5937610603868961, "reward_change_min": -1.0487833842635155, "reward_change_std": 0.3755612950772047, "reward_std": 0.7807258144021034, "rewards/cosine_scaled_reward": 0.11202386766672134, "rewards/format_reward": 0.7500000018626451, "step": 302 }, { "advantage_max": 1.8254009038209915, "advantage_mean": -5.587935614226325e-09, "advantage_min": -0.9064772799611092, "advantage_std": 0.9998302906751633, "completion_length": 1085.5208587646484, "epoch": 0.3462857142857143, "grad_norm": 0.3099322021007538, "kl": 0.00725555419921875, "lambda_div_used": 0.6, "learning_rate": 4.6259454195101267e-07, "loss": 0.0003, "reward": 0.4087141342461109, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4087141342461109, "reward_after_std": 0.7723647579550743, "reward_before_mean": 1.0217244923114777, "reward_before_std": 0.7018113266676664, "reward_change_max": 0.0, "reward_change_mean": -0.6130103468894958, "reward_change_min": -1.0503919497132301, "reward_change_std": 0.38757357373833656, "reward_std": 0.7723647765815258, "rewards/cosine_scaled_reward": 0.052528894040733576, "rewards/format_reward": 0.916666679084301, "step": 303 }, { "advantage_max": 1.8945975750684738, "advantage_mean": 4.1599077515996896e-08, "advantage_min": -0.7990177571773529, "advantage_std": 0.9998310655355453, "completion_length": 1432.6875305175781, "epoch": 0.3474285714285714, "grad_norm": 0.2642192244529724, "kl": 0.01309967041015625, "lambda_div_used": 0.6, "learning_rate": 4.59514935484316e-07, "loss": 0.0005, "reward": 0.27664350939448923, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.27664350939448923, "reward_after_std": 0.7569139413535595, "reward_before_mean": 0.8116565495729446, "reward_before_std": 0.6271609403192997, "reward_change_max": 0.0004661977291107178, "reward_change_mean": -0.5350130377337337, "reward_change_min": -0.850684642791748, "reward_change_std": 0.3104001171886921, "reward_std": 0.7569139413535595, "rewards/cosine_scaled_reward": -0.021255063824355602, "rewards/format_reward": 0.8541666716337204, "step": 304 }, { "advantage_max": 1.7953919023275375, "advantage_mean": -1.4280279736489376e-08, "advantage_min": -0.9705780595541, "advantage_std": 0.9998304322361946, "completion_length": 1441.645881652832, "epoch": 0.3485714285714286, "grad_norm": 0.35386335849761963, "kl": 0.01190948486328125, "lambda_div_used": 0.6, "learning_rate": 4.5643973913200837e-07, "loss": 0.0005, "reward": 0.06580792646855116, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06580792646855116, "reward_after_std": 0.6579489149153233, "reward_before_mean": 0.5137032084167004, "reward_before_std": 0.6019007451832294, "reward_change_max": 0.0008752346038818359, "reward_change_mean": -0.4478952884674072, "reward_change_min": -0.7521562948822975, "reward_change_std": 0.28863073140382767, "reward_std": 0.6579489223659039, "rewards/cosine_scaled_reward": -0.17023173440247774, "rewards/format_reward": 0.854166679084301, "step": 305 }, { "advantage_max": 1.8763258010149002, "advantage_mean": -3.2285850215529877e-08, "advantage_min": -0.7838627435266972, "advantage_std": 0.9998114481568336, "completion_length": 1086.4375381469727, "epoch": 0.3497142857142857, "grad_norm": 0.24181218445301056, "kl": 0.00975799560546875, "lambda_div_used": 0.6, "learning_rate": 4.5336910277482155e-07, "loss": 0.0004, "reward": 0.48075782135128975, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.48075782135128975, "reward_after_std": 0.6983195133507252, "reward_before_mean": 1.1519020181149244, "reward_before_std": 0.5561750112101436, "reward_change_max": 0.0, "reward_change_mean": -0.6711442433297634, "reward_change_min": -1.0415485128760338, "reward_change_std": 0.40287046879529953, "reward_std": 0.6983195282518864, "rewards/cosine_scaled_reward": 0.09678435418754816, "rewards/format_reward": 0.9583333358168602, "step": 306 }, { "advantage_max": 1.8967533856630325, "advantage_mean": -2.980232305382913e-08, "advantage_min": -0.8466514945030212, "advantage_std": 0.9998496323823929, "completion_length": 1320.62504196167, "epoch": 0.35085714285714287, "grad_norm": 0.29407626390457153, "kl": 0.01064300537109375, "lambda_div_used": 0.6, "learning_rate": 4.503031760712397e-07, "loss": 0.0004, "reward": 0.28989268373697996, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.28989268373697996, "reward_after_std": 0.8201549686491489, "reward_before_mean": 0.8231144957244396, "reward_before_std": 0.6994161698967218, "reward_change_max": 0.0011776536703109741, "reward_change_mean": -0.5332218129187822, "reward_change_min": -0.8389204330742359, "reward_change_std": 0.31923081912100315, "reward_std": 0.8201549835503101, "rewards/cosine_scaled_reward": -0.025942761451005936, "rewards/format_reward": 0.8750000055879354, "step": 307 }, { "advantage_max": 1.8265314847230911, "advantage_mean": 2.4214386717957836e-08, "advantage_min": -1.0207050442695618, "advantage_std": 0.9998078942298889, "completion_length": 2310.291763305664, "epoch": 0.352, "grad_norm": 0.18933595716953278, "kl": 0.01651763916015625, "lambda_div_used": 0.6, "learning_rate": 4.4724210845020494e-07, "loss": 0.0007, "reward": 0.1366605656221509, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1366605656221509, "reward_after_std": 0.7060784362256527, "reward_before_mean": 0.6142788184806705, "reward_before_std": 0.6519606187939644, "reward_change_max": 0.00028352439403533936, "reward_change_mean": -0.4776182733476162, "reward_change_min": -0.7881277278065681, "reward_change_std": 0.29920427687466145, "reward_std": 0.7060784623026848, "rewards/cosine_scaled_reward": -0.03661058656871319, "rewards/format_reward": 0.6875000055879354, "step": 308 }, { "advantage_max": 1.7872188687324524, "advantage_mean": 2.4835271617007493e-09, "advantage_min": -0.9430239722132683, "advantage_std": 0.9998209476470947, "completion_length": 1762.8333740234375, "epoch": 0.35314285714285715, "grad_norm": 0.22261229157447815, "kl": 0.01409912109375, "lambda_div_used": 0.6, "learning_rate": 4.441860491038345e-07, "loss": 0.0006, "reward": 0.12087270012125373, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12087270012125373, "reward_after_std": 0.7508501298725605, "reward_before_mean": 0.5837531480938196, "reward_before_std": 0.7245769258588552, "reward_change_max": 0.0, "reward_change_mean": -0.4628804475069046, "reward_change_min": -0.8756393194198608, "reward_change_std": 0.3242361284792423, "reward_std": 0.7508501410484314, "rewards/cosine_scaled_reward": -0.10395676456391811, "rewards/format_reward": 0.7916666716337204, "step": 309 }, { "advantage_max": 1.8359250724315643, "advantage_mean": -6.208814573582799e-10, "advantage_min": -0.9323371052742004, "advantage_std": 0.9998283386230469, "completion_length": 1442.5625534057617, "epoch": 0.35428571428571426, "grad_norm": 0.3380570411682129, "kl": 0.01674652099609375, "lambda_div_used": 0.6, "learning_rate": 4.4113514698014953e-07, "loss": 0.0007, "reward": 0.17298853071406484, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17298853071406484, "reward_after_std": 0.649883933365345, "reward_before_mean": 0.6802481282502413, "reward_before_std": 0.5928199477493763, "reward_change_max": 0.0, "reward_change_mean": -0.5072596073150635, "reward_change_min": -0.874159500002861, "reward_change_std": 0.3214066829532385, "reward_std": 0.6498839743435383, "rewards/cosine_scaled_reward": -0.10779260657727718, "rewards/format_reward": 0.8958333358168602, "step": 310 }, { "advantage_max": 1.922322541475296, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.7284784689545631, "advantage_std": 0.999852791428566, "completion_length": 1135.8750228881836, "epoch": 0.3554285714285714, "grad_norm": 0.22316350042819977, "kl": 0.008321762084960938, "lambda_div_used": 0.6, "learning_rate": 4.3808955077581546e-07, "loss": 0.0003, "reward": 0.466993102512788, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.466993102512788, "reward_after_std": 0.8413489013910294, "reward_before_mean": 1.0938301347196102, "reward_before_std": 0.6916535142809153, "reward_change_max": 0.0, "reward_change_mean": -0.6268370077013969, "reward_change_min": -1.0190090090036392, "reward_change_std": 0.3714268747717142, "reward_std": 0.8413489013910294, "rewards/cosine_scaled_reward": 0.04691504535730928, "rewards/format_reward": 1.0, "step": 311 }, { "advantage_max": 1.9405202120542526, "advantage_mean": -4.315127988263612e-08, "advantage_min": -0.7634863182902336, "advantage_std": 0.9998544678092003, "completion_length": 1331.7708587646484, "epoch": 0.3565714285714286, "grad_norm": 0.287652850151062, "kl": 0.011852264404296875, "lambda_div_used": 0.6, "learning_rate": 4.350494089288943e-07, "loss": 0.0005, "reward": 0.6322616841644049, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6322616841644049, "reward_after_std": 0.7453844994306564, "reward_before_mean": 1.3673997856676579, "reward_before_std": 0.5059651434421539, "reward_change_max": 0.0, "reward_change_mean": -0.7351380791515112, "reward_change_min": -1.0781624987721443, "reward_change_std": 0.4042064417153597, "reward_std": 0.7453845143318176, "rewards/cosine_scaled_reward": 0.24619986613106448, "rewards/format_reward": 0.8750000111758709, "step": 312 }, { "advantage_max": 1.8338307291269302, "advantage_mean": -8.692344399818808e-09, "advantage_min": -0.9592939876019955, "advantage_std": 0.9998330473899841, "completion_length": 1714.4791984558105, "epoch": 0.3577142857142857, "grad_norm": 0.24802806973457336, "kl": 0.01470947265625, "lambda_div_used": 0.6, "learning_rate": 4.3201486961161093e-07, "loss": 0.0006, "reward": 0.3089195266366005, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.3089195266366005, "reward_after_std": 0.8284649103879929, "reward_before_mean": 0.8583853915333748, "reward_before_std": 0.7549590589478612, "reward_change_max": 0.0007516443729400635, "reward_change_mean": -0.5494658825919032, "reward_change_min": -0.9307254888117313, "reward_change_std": 0.3660744549706578, "reward_std": 0.8284649439156055, "rewards/cosine_scaled_reward": 0.0750260278582573, "rewards/format_reward": 0.7083333488553762, "step": 313 }, { "advantage_max": 1.8229438215494156, "advantage_mean": -8.692344732885715e-09, "advantage_min": -0.9871519505977631, "advantage_std": 0.9998666346073151, "completion_length": 1355.0208778381348, "epoch": 0.3588571428571429, "grad_norm": 0.2961985766887665, "kl": 0.01201629638671875, "lambda_div_used": 0.6, "learning_rate": 4.2898608072313045e-07, "loss": 0.0005, "reward": 0.42520161904394627, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.42520161904394627, "reward_after_std": 0.840074960142374, "reward_before_mean": 1.0372523088008165, "reward_before_std": 0.7808595895767212, "reward_change_max": 0.0003578066825866699, "reward_change_mean": -0.6120506376028061, "reward_change_min": -1.002462938427925, "reward_change_std": 0.39382885955274105, "reward_std": 0.8400749824941158, "rewards/cosine_scaled_reward": 0.13320946041494608, "rewards/format_reward": 0.7708333395421505, "step": 314 }, { "advantage_max": 1.8934376388788223, "advantage_mean": -1.1020650614312899e-08, "advantage_min": -0.9089367166161537, "advantage_std": 0.999834805727005, "completion_length": 1825.6458587646484, "epoch": 0.36, "grad_norm": 0.3252292573451996, "kl": 0.024791717529296875, "lambda_div_used": 0.6, "learning_rate": 4.2596318988235037e-07, "loss": 0.001, "reward": 0.2225517202168703, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2225517202168703, "reward_after_std": 0.6751000694930553, "reward_before_mean": 0.748973336070776, "reward_before_std": 0.5620210394263268, "reward_change_max": 0.0004638954997062683, "reward_change_mean": -0.5264216009527445, "reward_change_min": -0.8017611876130104, "reward_change_std": 0.3129722382873297, "reward_std": 0.6751000992953777, "rewards/cosine_scaled_reward": -0.0005133431404829025, "rewards/format_reward": 0.750000013038516, "step": 315 }, { "advantage_max": 1.8182682543992996, "advantage_mean": 4.3461718668424965e-09, "advantage_min": -0.927369087934494, "advantage_std": 0.9998475015163422, "completion_length": 1909.4583892822266, "epoch": 0.36114285714285715, "grad_norm": 0.3349359333515167, "kl": 0.019321441650390625, "lambda_div_used": 0.6, "learning_rate": 4.2294634442070553e-07, "loss": 0.0008, "reward": 0.019429476466029882, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.019429476466029882, "reward_after_std": 0.7821727208793163, "reward_before_mean": 0.4113868260756135, "reward_before_std": 0.7491345182061195, "reward_change_max": 0.0010381415486335754, "reward_change_mean": -0.39195735938847065, "reward_change_min": -0.6826085112988949, "reward_change_std": 0.27011805586516857, "reward_std": 0.7821727506816387, "rewards/cosine_scaled_reward": -0.1588899241760373, "rewards/format_reward": 0.7291666828095913, "step": 316 }, { "advantage_max": 1.8804281800985336, "advantage_mean": -5.587935725248627e-09, "advantage_min": -0.7781447246670723, "advantage_std": 0.9998077526688576, "completion_length": 1622.0417175292969, "epoch": 0.36228571428571427, "grad_norm": 0.3521542549133301, "kl": 0.017303466796875, "lambda_div_used": 0.6, "learning_rate": 4.1993569137498776e-07, "loss": 0.0007, "reward": 0.11890967702493072, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11890967702493072, "reward_after_std": 0.6751161739230156, "reward_before_mean": 0.587021853774786, "reward_before_std": 0.5646471055224538, "reward_change_max": 0.0004993155598640442, "reward_change_mean": -0.46811217814683914, "reward_change_min": -0.7818466536700726, "reward_change_std": 0.29836034402251244, "reward_std": 0.6751161776483059, "rewards/cosine_scaled_reward": -0.0710724163800478, "rewards/format_reward": 0.7291666697710752, "step": 317 }, { "advantage_max": 1.8058977276086807, "advantage_mean": -1.9247333504779363e-08, "advantage_min": -0.9391337558627129, "advantage_std": 0.9998605996370316, "completion_length": 1382.2500267028809, "epoch": 0.36342857142857143, "grad_norm": 0.5408955812454224, "kl": 0.020711898803710938, "lambda_div_used": 0.6, "learning_rate": 4.1693137748017915e-07, "loss": 0.0008, "reward": 0.19501495765871368, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19501495765871368, "reward_after_std": 0.805657759308815, "reward_before_mean": 0.6848395057022572, "reward_before_std": 0.7711063474416733, "reward_change_max": 0.0032318681478500366, "reward_change_mean": -0.489824540913105, "reward_change_min": -0.9173102602362633, "reward_change_std": 0.3421527110040188, "reward_std": 0.8056577667593956, "rewards/cosine_scaled_reward": -0.09508026950061321, "rewards/format_reward": 0.8750000111758709, "step": 318 }, { "advantage_max": 1.875873938202858, "advantage_mean": 4.656612984099695e-09, "advantage_min": -0.8540654145181179, "advantage_std": 0.9998373687267303, "completion_length": 1421.8125610351562, "epoch": 0.36457142857142855, "grad_norm": 0.3166425824165344, "kl": 0.011302947998046875, "lambda_div_used": 0.6, "learning_rate": 4.1393354916230005e-07, "loss": 0.0005, "reward": 0.0982060037786141, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0982060037786141, "reward_after_std": 0.711435116827488, "reward_before_mean": 0.548051243647933, "reward_before_std": 0.6334625743329525, "reward_change_max": 1.6868114471435547e-05, "reward_change_mean": -0.44984522834420204, "reward_change_min": -0.7615330517292023, "reward_change_std": 0.2841661609709263, "reward_std": 0.7114351354539394, "rewards/cosine_scaled_reward": -0.17389106666087173, "rewards/format_reward": 0.8958333432674408, "step": 319 }, { "advantage_max": 1.844655841588974, "advantage_mean": 6.208817460162663e-09, "advantage_min": -0.8964797705411911, "advantage_std": 0.99985171854496, "completion_length": 1050.1666946411133, "epoch": 0.3657142857142857, "grad_norm": 0.32885971665382385, "kl": 0.016178131103515625, "lambda_div_used": 0.6, "learning_rate": 4.1094235253127374e-07, "loss": 0.0006, "reward": 0.34835223481059074, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.34835223481059074, "reward_after_std": 0.8526491969823837, "reward_before_mean": 0.905394246801734, "reward_before_std": 0.7732513155788183, "reward_change_max": 0.0, "reward_change_mean": -0.5570420175790787, "reward_change_min": -0.9411030262708664, "reward_change_std": 0.35189635306596756, "reward_std": 0.8526492044329643, "rewards/cosine_scaled_reward": -0.01605289150029421, "rewards/format_reward": 0.9375000074505806, "step": 320 }, { "advantage_max": 1.9735696017742157, "advantage_mean": -3.725290431688677e-08, "advantage_min": -0.6550575718283653, "advantage_std": 0.9998459443449974, "completion_length": 912.5625152587891, "epoch": 0.3668571428571429, "grad_norm": 0.252029687166214, "kl": 0.0068225860595703125, "lambda_div_used": 0.6, "learning_rate": 4.079579333738039e-07, "loss": 0.0003, "reward": 0.574149573687464, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.574149573687464, "reward_after_std": 0.7687293998897076, "reward_before_mean": 1.2689920328557491, "reward_before_std": 0.5234791114926338, "reward_change_max": 0.0, "reward_change_mean": -0.6948424577713013, "reward_change_min": -1.0286077186465263, "reward_change_std": 0.37573184818029404, "reward_std": 0.7687294036149979, "rewards/cosine_scaled_reward": 0.13449599593877792, "rewards/format_reward": 1.0, "step": 321 }, { "advantage_max": 1.8821515142917633, "advantage_mean": 6.829699028543246e-09, "advantage_min": -0.8529209233820438, "advantage_std": 0.9998260661959648, "completion_length": 1869.2083854675293, "epoch": 0.368, "grad_norm": 0.5277259349822998, "kl": 0.0365142822265625, "lambda_div_used": 0.6, "learning_rate": 4.0498043714627006e-07, "loss": 0.0015, "reward": -0.018657252425327897, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.018657252425327897, "reward_after_std": 0.7735125869512558, "reward_before_mean": 0.3417943734675646, "reward_before_std": 0.7077783048152924, "reward_change_max": 0.001621730625629425, "reward_change_mean": -0.36045162566006184, "reward_change_min": -0.6681258156895638, "reward_change_std": 0.24554293975234032, "reward_std": 0.773512601852417, "rewards/cosine_scaled_reward": -0.17285283096134663, "rewards/format_reward": 0.6875000055879354, "step": 322 }, { "advantage_max": 1.9139571785926819, "advantage_mean": 3.3306690738754696e-16, "advantage_min": -0.8632525950670242, "advantage_std": 0.9998295903205872, "completion_length": 1469.3958892822266, "epoch": 0.36914285714285716, "grad_norm": 0.43059584498405457, "kl": 0.01897430419921875, "lambda_div_used": 0.6, "learning_rate": 4.020100089676376e-07, "loss": 0.0008, "reward": 0.16783785168081522, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16783785168081522, "reward_after_std": 0.6461106389760971, "reward_before_mean": 0.665590648073703, "reward_before_std": 0.5015575010329485, "reward_change_max": 0.0, "reward_change_mean": -0.49775280244648457, "reward_change_min": -0.7560123316943645, "reward_change_std": 0.2950308583676815, "reward_std": 0.6461106389760971, "rewards/cosine_scaled_reward": -0.06303800735622644, "rewards/format_reward": 0.7916666716337204, "step": 323 }, { "advantage_max": 1.8660519123077393, "advantage_mean": -2.048909675256283e-08, "advantage_min": -0.8690266758203506, "advantage_std": 0.9998775646090508, "completion_length": 1304.645866394043, "epoch": 0.3702857142857143, "grad_norm": 0.4782713055610657, "kl": 0.018848419189453125, "lambda_div_used": 0.6, "learning_rate": 3.9904679361238526e-07, "loss": 0.0008, "reward": 0.309486435726285, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.309486435726285, "reward_after_std": 0.909374438226223, "reward_before_mean": 0.8317293685395271, "reward_before_std": 0.8388629630208015, "reward_change_max": 0.0, "reward_change_mean": -0.5222429521381855, "reward_change_min": -0.950531929731369, "reward_change_std": 0.3524559233337641, "reward_std": 0.9093744680285454, "rewards/cosine_scaled_reward": 0.009614669252187014, "rewards/format_reward": 0.8125000037252903, "step": 324 }, { "advantage_max": 1.8715059906244278, "advantage_mean": -1.6763806787167823e-08, "advantage_min": -0.8543071523308754, "advantage_std": 0.9998787268996239, "completion_length": 1794.1666946411133, "epoch": 0.37142857142857144, "grad_norm": 0.29662418365478516, "kl": 0.016979217529296875, "lambda_div_used": 0.6, "learning_rate": 3.9609093550344907e-07, "loss": 0.0007, "reward": 0.2978088464587927, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2978088464587927, "reward_after_std": 0.8947106897830963, "reward_before_mean": 0.8223866783082485, "reward_before_std": 0.7958251684904099, "reward_change_max": 0.0008146986365318298, "reward_change_mean": -0.5245778262615204, "reward_change_min": -0.9071747809648514, "reward_change_std": 0.3426001761108637, "reward_std": 0.8947107121348381, "rewards/cosine_scaled_reward": 0.046609988203272223, "rewards/format_reward": 0.729166679084301, "step": 325 }, { "advantage_max": 1.8180214762687683, "advantage_mean": -2.6697914268236644e-08, "advantage_min": -0.9289712607860565, "advantage_std": 0.9998547285795212, "completion_length": 1444.2292098999023, "epoch": 0.37257142857142855, "grad_norm": 0.3878525495529175, "kl": 0.02288818359375, "lambda_div_used": 0.6, "learning_rate": 3.931425787051832e-07, "loss": 0.0009, "reward": 0.26968370797112584, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26968370797112584, "reward_after_std": 0.8076260313391685, "reward_before_mean": 0.80234469845891, "reward_before_std": 0.7833601534366608, "reward_change_max": 0.0002892613410949707, "reward_change_mean": -0.5326609779149294, "reward_change_min": -0.9844692200422287, "reward_change_std": 0.37313529662787914, "reward_std": 0.8076260536909103, "rewards/cosine_scaled_reward": 0.005338998977094889, "rewards/format_reward": 0.7916666716337204, "step": 326 }, { "advantage_max": 1.9538254141807556, "advantage_mean": -3.476937671109681e-08, "advantage_min": -0.742390725761652, "advantage_std": 0.9998945817351341, "completion_length": 1554.2500381469727, "epoch": 0.3737142857142857, "grad_norm": 0.3174353539943695, "kl": 0.012559890747070312, "lambda_div_used": 0.6, "learning_rate": 3.902018669163384e-07, "loss": 0.0005, "reward": 0.49121918249875307, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.49121918249875307, "reward_after_std": 1.0039337873458862, "reward_before_mean": 1.0879922366584651, "reward_before_std": 0.8203750047832727, "reward_change_max": 0.0, "reward_change_mean": -0.596773061901331, "reward_change_min": -0.9268834516406059, "reward_change_std": 0.33764065988361835, "reward_std": 1.003933809697628, "rewards/cosine_scaled_reward": 0.12732943054288626, "rewards/format_reward": 0.833333333954215, "step": 327 }, { "advantage_max": 1.8848908692598343, "advantage_mean": 3.166496809203778e-08, "advantage_min": -0.8913846984505653, "advantage_std": 0.9998078942298889, "completion_length": 1600.4792098999023, "epoch": 0.37485714285714283, "grad_norm": 0.3274012506008148, "kl": 0.017192840576171875, "lambda_div_used": 0.6, "learning_rate": 3.872689434630585e-07, "loss": 0.0007, "reward": 0.1585409319959581, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1585409319959581, "reward_after_std": 0.7077501621097326, "reward_before_mean": 0.6421646438539028, "reward_before_std": 0.6207192353904247, "reward_change_max": 0.00017159432172775269, "reward_change_mean": -0.48362372256815434, "reward_change_min": -0.842924177646637, "reward_change_std": 0.3040748070925474, "reward_std": 0.7077501658350229, "rewards/cosine_scaled_reward": -0.06433434877544641, "rewards/format_reward": 0.770833333954215, "step": 328 }, { "advantage_max": 1.8619430512189865, "advantage_mean": -9.468446360294536e-09, "advantage_min": -0.9495992064476013, "advantage_std": 0.9998591840267181, "completion_length": 929.3542060852051, "epoch": 0.376, "grad_norm": 0.30674153566360474, "kl": 0.0108184814453125, "lambda_div_used": 0.6, "learning_rate": 3.843439512918949e-07, "loss": 0.0004, "reward": 0.4454690790735185, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4454690790735185, "reward_after_std": 0.8043604120612144, "reward_before_mean": 1.0710258733015507, "reward_before_std": 0.7005043588578701, "reward_change_max": 0.0, "reward_change_mean": -0.625556755810976, "reward_change_min": -0.9712866432964802, "reward_change_std": 0.37744545564055443, "reward_std": 0.8043604269623756, "rewards/cosine_scaled_reward": 0.0667628962546587, "rewards/format_reward": 0.9375000074505806, "step": 329 }, { "advantage_max": 1.9051975160837173, "advantage_mean": -7.062529561174813e-09, "advantage_min": -0.834363654255867, "advantage_std": 0.9998826459050179, "completion_length": 1213.083366394043, "epoch": 0.37714285714285717, "grad_norm": 0.4489211440086365, "kl": 0.019287109375, "lambda_div_used": 0.6, "learning_rate": 3.8142703296283953e-07, "loss": 0.0008, "reward": 0.23261515237390995, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.23261515237390995, "reward_after_std": 1.0043475143611431, "reward_before_mean": 0.6893374051433057, "reward_before_std": 0.9294507168233395, "reward_change_max": 0.0012516528367996216, "reward_change_mean": -0.45672223158180714, "reward_change_min": -0.8383929058909416, "reward_change_std": 0.31927052699029446, "reward_std": 1.0043475553393364, "rewards/cosine_scaled_reward": -0.0719979761634022, "rewards/format_reward": 0.8333333432674408, "step": 330 }, { "advantage_max": 1.80654077231884, "advantage_mean": -3.042320462220971e-08, "advantage_min": -0.9978099688887596, "advantage_std": 0.9997223243117332, "completion_length": 1831.7708625793457, "epoch": 0.3782857142857143, "grad_norm": 0.3333419859409332, "kl": 0.01973724365234375, "lambda_div_used": 0.6, "learning_rate": 3.785183306423767e-07, "loss": 0.0008, "reward": -0.053731471532955766, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.053731471532955766, "reward_after_std": 0.4776611737906933, "reward_before_mean": 0.3651720993220806, "reward_before_std": 0.4157779663801193, "reward_change_max": 0.00035496801137924194, "reward_change_mean": -0.4189036013558507, "reward_change_min": -0.6600331999361515, "reward_change_std": 0.2541554179042578, "reward_std": 0.4776612054556608, "rewards/cosine_scaled_reward": -0.1507472936064005, "rewards/format_reward": 0.6666666679084301, "step": 331 }, { "advantage_max": 1.8885580450296402, "advantage_mean": -2.8560560139112567e-08, "advantage_min": -0.7655205801129341, "advantage_std": 0.9998506605625153, "completion_length": 1258.0416870117188, "epoch": 0.37942857142857145, "grad_norm": 0.6487593650817871, "kl": 0.017627716064453125, "lambda_div_used": 0.6, "learning_rate": 3.7561798609655373e-07, "loss": 0.0007, "reward": 0.21511683403514326, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.21511683403514326, "reward_after_std": 0.8157045319676399, "reward_before_mean": 0.7045507121365517, "reward_before_std": 0.7418462634086609, "reward_change_max": 0.00038327276706695557, "reward_change_mean": -0.48943387530744076, "reward_change_min": -0.854445330798626, "reward_change_std": 0.3147754594683647, "reward_std": 0.8157045654952526, "rewards/cosine_scaled_reward": -0.08522465638816357, "rewards/format_reward": 0.8750000055879354, "step": 332 }, { "advantage_max": 1.885246142745018, "advantage_mean": -3.0423204400165105e-08, "advantage_min": -0.8369051367044449, "advantage_std": 0.9998035356402397, "completion_length": 1188.6458587646484, "epoch": 0.38057142857142856, "grad_norm": 0.3059176504611969, "kl": 0.012359619140625, "lambda_div_used": 0.6, "learning_rate": 3.72726140684072e-07, "loss": 0.0005, "reward": 0.38543284498155117, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.38543284498155117, "reward_after_std": 0.6527124047279358, "reward_before_mean": 1.0101193934679031, "reward_before_std": 0.5245114741846919, "reward_change_max": 0.0, "reward_change_mean": -0.6246866025030613, "reward_change_min": -0.9763289764523506, "reward_change_std": 0.3626061100512743, "reward_std": 0.6527124308049679, "rewards/cosine_scaled_reward": 0.025893021374940872, "rewards/format_reward": 0.9583333432674408, "step": 333 }, { "advantage_max": 1.857540786266327, "advantage_mean": 8.071462720415923e-09, "advantage_min": -0.799895916134119, "advantage_std": 0.9998114556074142, "completion_length": 2156.4167251586914, "epoch": 0.38171428571428573, "grad_norm": 0.33751410245895386, "kl": 0.02600860595703125, "lambda_div_used": 0.6, "learning_rate": 3.6984293534939737e-07, "loss": 0.001, "reward": -0.1896799481473863, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1896799481473863, "reward_after_std": 0.6050053462386131, "reward_before_mean": 0.12263576500117779, "reward_before_std": 0.5591881051659584, "reward_change_max": 0.0009383410215377808, "reward_change_mean": -0.31231571082025766, "reward_change_min": -0.5923841707408428, "reward_change_std": 0.21185853891074657, "reward_std": 0.6050053536891937, "rewards/cosine_scaled_reward": -0.23034879751503468, "rewards/format_reward": 0.5833333395421505, "step": 334 }, { "advantage_max": 1.8346337229013443, "advantage_mean": -9.934107647602275e-09, "advantage_min": -0.9422204568982124, "advantage_std": 0.999869279563427, "completion_length": 1255.1250534057617, "epoch": 0.38285714285714284, "grad_norm": 0.20903566479682922, "kl": 0.008930206298828125, "lambda_div_used": 0.6, "learning_rate": 3.6696851061588994e-07, "loss": 0.0004, "reward": 0.4035352533683181, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4035352533683181, "reward_after_std": 0.8650985062122345, "reward_before_mean": 0.9930033460259438, "reward_before_std": 0.7928112968802452, "reward_change_max": 0.00031920522451400757, "reward_change_mean": -0.5894680954515934, "reward_change_min": -0.9780095741152763, "reward_change_std": 0.37678316980600357, "reward_std": 0.8650985211133957, "rewards/cosine_scaled_reward": 0.03816831856966019, "rewards/format_reward": 0.9166666716337204, "step": 335 }, { "advantage_max": 1.6843600422143936, "advantage_mean": -2.0489098251363913e-08, "advantage_min": -1.1067031100392342, "advantage_std": 0.9998784810304642, "completion_length": 1705.6042098999023, "epoch": 0.384, "grad_norm": 0.3693605065345764, "kl": 0.018991470336914062, "lambda_div_used": 0.6, "learning_rate": 3.641030065789562e-07, "loss": 0.0008, "reward": 0.3400895514059812, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3400895514059812, "reward_after_std": 0.96578698605299, "reward_before_mean": 0.8908323775976896, "reward_before_std": 1.0428907200694084, "reward_change_max": 0.0005368068814277649, "reward_change_mean": -0.5507428087294102, "reward_change_min": -1.1123467236757278, "reward_change_std": 0.4507921673357487, "reward_std": 0.9657870009541512, "rewards/cosine_scaled_reward": 0.1016661785542965, "rewards/format_reward": 0.6875000149011612, "step": 336 }, { "advantage_max": 1.8957796841859818, "advantage_mean": -3.725290520506519e-09, "advantage_min": -0.7205033674836159, "advantage_std": 0.9998714700341225, "completion_length": 1377.7083740234375, "epoch": 0.3851428571428571, "grad_norm": 0.25850510597229004, "kl": 0.011775970458984375, "lambda_div_used": 0.6, "learning_rate": 3.612465628992203e-07, "loss": 0.0005, "reward": 0.3554620500653982, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3554620500653982, "reward_after_std": 0.9742900244891644, "reward_before_mean": 0.8959630839526653, "reward_before_std": 0.8873303392902017, "reward_change_max": 0.0, "reward_change_mean": -0.540501032024622, "reward_change_min": -1.0388685837388039, "reward_change_std": 0.36955892480909824, "reward_std": 0.9742900505661964, "rewards/cosine_scaled_reward": -0.03118512872606516, "rewards/format_reward": 0.9583333432674408, "step": 337 }, { "advantage_max": 1.8836781829595566, "advantage_mean": 1.1175870007207322e-08, "advantage_min": -0.8648780807852745, "advantage_std": 0.9998218789696693, "completion_length": 1293.7917098999023, "epoch": 0.3862857142857143, "grad_norm": 0.2512975037097931, "kl": 0.009267807006835938, "lambda_div_used": 0.6, "learning_rate": 3.5839931879571725e-07, "loss": 0.0004, "reward": 0.318469176068902, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.318469176068902, "reward_after_std": 0.7098839841783047, "reward_before_mean": 0.8908936120569706, "reward_before_std": 0.5762915294617414, "reward_change_max": 0.0, "reward_change_mean": -0.5724244341254234, "reward_change_min": -0.8914236910641193, "reward_change_std": 0.32981424406170845, "reward_std": 0.709883987903595, "rewards/cosine_scaled_reward": 0.007946796715259552, "rewards/format_reward": 0.875, "step": 338 }, { "advantage_max": 1.8676584959030151, "advantage_mean": 1.4280279680978225e-08, "advantage_min": -0.8956063166260719, "advantage_std": 0.9998053833842278, "completion_length": 1835.2083435058594, "epoch": 0.38742857142857146, "grad_norm": 0.3322133719921112, "kl": 0.0276947021484375, "lambda_div_used": 0.6, "learning_rate": 3.555614130391079e-07, "loss": 0.0011, "reward": -0.054544417187571526, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.054544417187571526, "reward_after_std": 0.6542263887822628, "reward_before_mean": 0.32189842336811125, "reward_before_std": 0.6224677935242653, "reward_change_max": 0.0010414794087409973, "reward_change_mean": -0.3764428086578846, "reward_change_min": -0.6626890636980534, "reward_change_std": 0.26315341144800186, "reward_std": 0.654226390644908, "rewards/cosine_scaled_reward": -0.16196747706271708, "rewards/format_reward": 0.6458333488553762, "step": 339 }, { "advantage_max": 1.9051592350006104, "advantage_mean": -3.166496753692627e-08, "advantage_min": -0.8037348762154579, "advantage_std": 0.9998365417122841, "completion_length": 1288.6042098999023, "epoch": 0.38857142857142857, "grad_norm": 0.25094640254974365, "kl": 0.010942459106445312, "lambda_div_used": 0.6, "learning_rate": 3.5273298394491515e-07, "loss": 0.0004, "reward": 0.28444598644273356, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.28444598644273356, "reward_after_std": 0.6651497408747673, "reward_before_mean": 0.8474188968539238, "reward_before_std": 0.528843104839325, "reward_change_max": 0.0, "reward_change_mean": -0.5629729311913252, "reward_change_min": -0.9151150286197662, "reward_change_std": 0.3289155066013336, "reward_std": 0.6651497483253479, "rewards/cosine_scaled_reward": -0.013790564611554146, "rewards/format_reward": 0.8750000055879354, "step": 340 }, { "advantage_max": 1.8734830617904663, "advantage_mean": -4.34617203337595e-09, "advantage_min": -0.8691454865038395, "advantage_std": 0.9998512864112854, "completion_length": 1238.9166946411133, "epoch": 0.38971428571428574, "grad_norm": 0.31012919545173645, "kl": 0.014537811279296875, "lambda_div_used": 0.6, "learning_rate": 3.4991416936678276e-07, "loss": 0.0006, "reward": 0.577342574018985, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.577342574018985, "reward_after_std": 0.8173014968633652, "reward_before_mean": 1.2706860266625881, "reward_before_std": 0.6470533646643162, "reward_change_max": 0.0009817034006118774, "reward_change_mean": -0.6933434382081032, "reward_change_min": -1.1341840326786041, "reward_change_std": 0.4181734323501587, "reward_std": 0.8173014968633652, "rewards/cosine_scaled_reward": 0.19784300029277802, "rewards/format_reward": 0.8750000074505806, "step": 341 }, { "advantage_max": 1.8878445029258728, "advantage_mean": -1.800557042352935e-08, "advantage_min": -0.7828846871852875, "advantage_std": 0.9998863115906715, "completion_length": 1433.562515258789, "epoch": 0.39085714285714285, "grad_norm": 0.4130045771598816, "kl": 0.020595550537109375, "lambda_div_used": 0.6, "learning_rate": 3.471051066897562e-07, "loss": 0.0008, "reward": 0.36572215892374516, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.36572215892374516, "reward_after_std": 1.022568255662918, "reward_before_mean": 0.8937026299536228, "reward_before_std": 0.9488763399422169, "reward_change_max": 0.0, "reward_change_mean": -0.5279804691672325, "reward_change_min": -1.0131430327892303, "reward_change_std": 0.36056538484990597, "reward_std": 1.0225682780146599, "rewards/cosine_scaled_reward": -0.0010653771460056305, "rewards/format_reward": 0.8958333432674408, "step": 342 }, { "advantage_max": 1.852153331041336, "advantage_mean": 6.208817904251873e-10, "advantage_min": -0.8911474421620369, "advantage_std": 0.9999059438705444, "completion_length": 1568.7917098999023, "epoch": 0.392, "grad_norm": 0.3222767114639282, "kl": 0.019664764404296875, "lambda_div_used": 0.6, "learning_rate": 3.4430593282358777e-07, "loss": 0.0008, "reward": 0.5172849660739303, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5172849660739303, "reward_after_std": 1.1370365917682648, "reward_before_mean": 1.1116691306233406, "reward_before_std": 1.0664770379662514, "reward_change_max": 0.0, "reward_change_mean": -0.5943841449916363, "reward_change_min": -1.137386292219162, "reward_change_std": 0.41275884211063385, "reward_std": 1.1370366215705872, "rewards/cosine_scaled_reward": 0.09750120915123262, "rewards/format_reward": 0.9166666716337204, "step": 343 }, { "advantage_max": 1.8314592689275742, "advantage_mean": -8.22668272393301e-09, "advantage_min": -1.0410834476351738, "advantage_std": 0.9998433589935303, "completion_length": 1337.0417022705078, "epoch": 0.3931428571428571, "grad_norm": 0.33922290802001953, "kl": 0.013172149658203125, "lambda_div_used": 0.6, "learning_rate": 3.4151678419606233e-07, "loss": 0.0005, "reward": 0.5709733965341002, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5709733965341002, "reward_after_std": 0.7575945109128952, "reward_before_mean": 1.2816130220890045, "reward_before_std": 0.6747047891840339, "reward_change_max": 0.0, "reward_change_mean": -0.7106395661830902, "reward_change_min": -1.081616371870041, "reward_change_std": 0.425860196352005, "reward_std": 0.7575945146381855, "rewards/cosine_scaled_reward": 0.203306476585567, "rewards/format_reward": 0.8750000111758709, "step": 344 }, { "advantage_max": 1.7418056577444077, "advantage_mean": -1.800557097864086e-08, "advantage_min": -0.9639134332537651, "advantage_std": 0.9998338147997856, "completion_length": 1639.1041946411133, "epoch": 0.3942857142857143, "grad_norm": 0.4215705692768097, "kl": 0.027713775634765625, "lambda_div_used": 0.6, "learning_rate": 3.387377967463493e-07, "loss": 0.0011, "reward": 0.3227203474380076, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3227203474380076, "reward_after_std": 0.686117697507143, "reward_before_mean": 0.9131767749786377, "reward_before_std": 0.6375276725739241, "reward_change_max": 0.00036709755659103394, "reward_change_mean": -0.5904564298689365, "reward_change_min": -0.9719649441540241, "reward_change_std": 0.3853672593832016, "reward_std": 0.6861177161335945, "rewards/cosine_scaled_reward": 0.05033837631344795, "rewards/format_reward": 0.8125000111758709, "step": 345 }, { "advantage_max": 1.8243045955896378, "advantage_mean": -2.8560559584001055e-08, "advantage_min": -1.0744804069399834, "advantage_std": 0.9998216480016708, "completion_length": 1337.2083587646484, "epoch": 0.3954285714285714, "grad_norm": 0.289594829082489, "kl": 0.013706207275390625, "lambda_div_used": 0.6, "learning_rate": 3.359691059183761e-07, "loss": 0.0005, "reward": 0.27365553192794323, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.27365553192794323, "reward_after_std": 0.6229892708361149, "reward_before_mean": 0.8438735082745552, "reward_before_std": 0.560555960983038, "reward_change_max": 0.0, "reward_change_mean": -0.570217976346612, "reward_change_min": -0.8716960623860359, "reward_change_std": 0.3414479810744524, "reward_std": 0.6229892894625664, "rewards/cosine_scaled_reward": -0.015563266351819038, "rewards/format_reward": 0.8750000111758709, "step": 346 }, { "advantage_max": 1.8952476382255554, "advantage_mean": -9.934107647602275e-09, "advantage_min": -0.8498244062066078, "advantage_std": 0.9998175203800201, "completion_length": 1423.2291946411133, "epoch": 0.3965714285714286, "grad_norm": 0.2076168954372406, "kl": 0.013187408447265625, "lambda_div_used": 0.6, "learning_rate": 3.3321084665422803e-07, "loss": 0.0005, "reward": 0.11660511128138751, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11660511128138751, "reward_after_std": 0.6356964744627476, "reward_before_mean": 0.5917580462992191, "reward_before_std": 0.5249120108783245, "reward_change_max": 0.0, "reward_change_mean": -0.475152924656868, "reward_change_min": -0.7603438273072243, "reward_change_std": 0.27699695713818073, "reward_std": 0.6356964930891991, "rewards/cosine_scaled_reward": -0.19370432803407311, "rewards/format_reward": 0.9791666716337204, "step": 347 }, { "advantage_max": 1.8410307168960571, "advantage_mean": -2.0178656634506353e-08, "advantage_min": -0.9013228639960289, "advantage_std": 0.9998410418629646, "completion_length": 1313.1458740234375, "epoch": 0.3977142857142857, "grad_norm": 0.3774625360965729, "kl": 0.018695831298828125, "lambda_div_used": 0.6, "learning_rate": 3.3046315338757026e-07, "loss": 0.0007, "reward": 0.379140455275774, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.379140455275774, "reward_after_std": 0.7235062941908836, "reward_before_mean": 0.9868384040892124, "reward_before_std": 0.628569420427084, "reward_change_max": 0.0, "reward_change_mean": -0.6076979450881481, "reward_change_min": -1.0189422145485878, "reward_change_std": 0.375955443829298, "reward_std": 0.7235063090920448, "rewards/cosine_scaled_reward": 0.045502522960305214, "rewards/format_reward": 0.8958333432674408, "step": 348 }, { "advantage_max": 1.8501978665590286, "advantage_mean": -9.002785184009099e-09, "advantage_min": -0.9660811722278595, "advantage_std": 0.9998482018709183, "completion_length": 1379.395866394043, "epoch": 0.39885714285714285, "grad_norm": 0.35212260484695435, "kl": 0.018650054931640625, "lambda_div_used": 0.6, "learning_rate": 3.2772616003709616e-07, "loss": 0.0007, "reward": 0.33767664176411927, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.33767664176411927, "reward_after_std": 0.8225687071681023, "reward_before_mean": 0.9022767737042159, "reward_before_std": 0.7706983331590891, "reward_change_max": 0.0, "reward_change_mean": -0.5646001249551773, "reward_change_min": -0.9951711148023605, "reward_change_std": 0.3776344805955887, "reward_std": 0.822568740695715, "rewards/cosine_scaled_reward": -0.017611628398299217, "rewards/format_reward": 0.9375000074505806, "step": 349 }, { "advantage_max": 1.929465413093567, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.7032685950398445, "advantage_std": 0.9998827949166298, "completion_length": 911.0000305175781, "epoch": 0.4, "grad_norm": 0.377990186214447, "kl": 0.007293701171875, "lambda_div_used": 0.6, "learning_rate": 3.250000000000001e-07, "loss": 0.0003, "reward": 0.399724748916924, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.399724748916924, "reward_after_std": 1.016660876572132, "reward_before_mean": 0.9504643231630325, "reward_before_std": 0.8899853974580765, "reward_change_max": 0.0, "reward_change_mean": -0.5507395602762699, "reward_change_min": -0.9786839932203293, "reward_change_std": 0.34146598540246487, "reward_std": 1.0166608802974224, "rewards/cosine_scaled_reward": -0.0247678579762578, "rewards/format_reward": 1.0, "step": 350 }, { "advantage_max": 1.9297864735126495, "advantage_mean": -1.4901161526914564e-08, "advantage_min": -0.7317093014717102, "advantage_std": 0.9998589679598808, "completion_length": 1339.6667175292969, "epoch": 0.40114285714285713, "grad_norm": 0.30346372723579407, "kl": 0.01879119873046875, "lambda_div_used": 0.6, "learning_rate": 3.222848061454764e-07, "loss": 0.0008, "reward": 0.3753087054938078, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3753087054938078, "reward_after_std": 0.8502420969307423, "reward_before_mean": 0.945469731464982, "reward_before_std": 0.710259310901165, "reward_change_max": 0.0, "reward_change_mean": -0.5701610464602709, "reward_change_min": -0.9608272425830364, "reward_change_std": 0.3396002743393183, "reward_std": 0.8502420969307423, "rewards/cosine_scaled_reward": 0.014401533640921116, "rewards/format_reward": 0.9166666679084301, "step": 351 }, { "advantage_max": 1.8952407985925674, "advantage_mean": -5.277494968813912e-09, "advantage_min": -0.7425141632556915, "advantage_std": 0.9998616054654121, "completion_length": 1668.0208587646484, "epoch": 0.4022857142857143, "grad_norm": 0.36306384205818176, "kl": 0.027172088623046875, "lambda_div_used": 0.6, "learning_rate": 3.195807108082429e-07, "loss": 0.0011, "reward": 0.17020612582564354, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.17020612582564354, "reward_after_std": 0.9018196687102318, "reward_before_mean": 0.6187307387590408, "reward_before_std": 0.8488708660006523, "reward_change_max": 0.0, "reward_change_mean": -0.44852462224662304, "reward_change_min": -0.8682103119790554, "reward_change_std": 0.32232335302978754, "reward_std": 0.9018196910619736, "rewards/cosine_scaled_reward": -0.06563465157523751, "rewards/format_reward": 0.7500000018626451, "step": 352 }, { "advantage_max": 1.8312696814537048, "advantage_mean": -6.829699583654758e-09, "advantage_min": -0.9065942466259003, "advantage_std": 0.999829463660717, "completion_length": 1157.708366394043, "epoch": 0.4034285714285714, "grad_norm": 0.3327624797821045, "kl": 0.010297775268554688, "lambda_div_used": 0.6, "learning_rate": 3.168878457820915e-07, "loss": 0.0004, "reward": 0.33716506394557655, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.33716506394557655, "reward_after_std": 0.7595427818596363, "reward_before_mean": 0.9169227406382561, "reward_before_std": 0.7094110660254955, "reward_change_max": 0.0, "reward_change_mean": -0.5797576904296875, "reward_change_min": -1.044652320444584, "reward_change_std": 0.3778637405484915, "reward_std": 0.7595428004860878, "rewards/cosine_scaled_reward": -0.031121966429054737, "rewards/format_reward": 0.9791666716337204, "step": 353 }, { "advantage_max": 1.874330386519432, "advantage_mean": -1.1796752963366686e-08, "advantage_min": -0.8423102647066116, "advantage_std": 0.9998380094766617, "completion_length": 1096.2292098999023, "epoch": 0.4045714285714286, "grad_norm": 0.23596711456775665, "kl": 0.01212310791015625, "lambda_div_used": 0.6, "learning_rate": 3.142063423134644e-07, "loss": 0.0005, "reward": 0.4981326200067997, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4981326200067997, "reward_after_std": 0.7721955999732018, "reward_before_mean": 1.1596158780157566, "reward_before_std": 0.6328756362199783, "reward_change_max": 0.0, "reward_change_mean": -0.6614832170307636, "reward_change_min": -1.0063703507184982, "reward_change_std": 0.38712820969522, "reward_std": 0.772195640951395, "rewards/cosine_scaled_reward": 0.0902245668694377, "rewards/format_reward": 0.9791666716337204, "step": 354 }, { "advantage_max": 1.8732577413320541, "advantage_mean": -1.4901161637936866e-08, "advantage_min": -0.8534680753946304, "advantage_std": 0.9998545199632645, "completion_length": 1047.4166946411133, "epoch": 0.4057142857142857, "grad_norm": 0.29967862367630005, "kl": 0.013187408447265625, "lambda_div_used": 0.6, "learning_rate": 3.115363310950578e-07, "loss": 0.0005, "reward": 0.5733014561701566, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5733014561701566, "reward_after_std": 0.8431755602359772, "reward_before_mean": 1.2605398669838905, "reward_before_std": 0.706612853333354, "reward_change_max": 0.0, "reward_change_mean": -0.6872383803129196, "reward_change_min": -1.055651679635048, "reward_change_std": 0.41943654976785183, "reward_std": 0.8431755751371384, "rewards/cosine_scaled_reward": 0.15110323758563027, "rewards/format_reward": 0.9583333432674408, "step": 355 }, { "advantage_max": 1.8735899478197098, "advantage_mean": -6.014791997799307e-09, "advantage_min": -0.8832316547632217, "advantage_std": 0.9998319298028946, "completion_length": 1665.6667175292969, "epoch": 0.40685714285714286, "grad_norm": 0.24277737736701965, "kl": 0.02996063232421875, "lambda_div_used": 0.6, "learning_rate": 3.0887794225945143e-07, "loss": 0.0012, "reward": 0.21907207665208261, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21907207665208261, "reward_after_std": 0.7408511564135551, "reward_before_mean": 0.7323421612381935, "reward_before_std": 0.6679595708847046, "reward_change_max": 0.002227187156677246, "reward_change_mean": -0.5132700940594077, "reward_change_min": -0.8826829344034195, "reward_change_std": 0.3406843263655901, "reward_std": 0.7408512085676193, "rewards/cosine_scaled_reward": -0.050495600793510675, "rewards/format_reward": 0.8333333414047956, "step": 356 }, { "advantage_max": 1.8176670521497726, "advantage_mean": -6.208817904251873e-10, "advantage_min": -0.9782910048961639, "advantage_std": 0.9998316466808319, "completion_length": 1771.520881652832, "epoch": 0.408, "grad_norm": 0.3803209960460663, "kl": 0.026142120361328125, "lambda_div_used": 0.6, "learning_rate": 3.062313053727671e-07, "loss": 0.001, "reward": 0.07317158195655793, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.07317158195655793, "reward_after_std": 0.6787878163158894, "reward_before_mean": 0.5202360115945339, "reward_before_std": 0.6421620734035969, "reward_change_max": 0.0007302388548851013, "reward_change_mean": -0.44706446304917336, "reward_change_min": -0.7746949233114719, "reward_change_std": 0.30314655415713787, "reward_std": 0.6787878349423409, "rewards/cosine_scaled_reward": -0.17738200351595879, "rewards/format_reward": 0.8750000149011612, "step": 357 }, { "advantage_max": 1.9141297489404678, "advantage_mean": -1.8005570368018198e-08, "advantage_min": -0.758477583527565, "advantage_std": 0.9998798295855522, "completion_length": 1370.3333587646484, "epoch": 0.40914285714285714, "grad_norm": 0.23564986884593964, "kl": 0.0139007568359375, "lambda_div_used": 0.6, "learning_rate": 3.0359654942835247e-07, "loss": 0.0006, "reward": 0.49675935972481966, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.49675935972481966, "reward_after_std": 0.9583783783018589, "reward_before_mean": 1.1124700075015426, "reward_before_std": 0.8288324661552906, "reward_change_max": 0.0, "reward_change_mean": -0.6157106459140778, "reward_change_min": -1.02113651111722, "reward_change_std": 0.37534380331635475, "reward_std": 0.9583783894777298, "rewards/cosine_scaled_reward": 0.09790166141465306, "rewards/format_reward": 0.9166666679084301, "step": 358 }, { "advantage_max": 1.8960884362459183, "advantage_mean": -1.4590720853746575e-08, "advantage_min": -0.8398799225687981, "advantage_std": 0.9998002797365189, "completion_length": 902.458366394043, "epoch": 0.4102857142857143, "grad_norm": 0.3054303526878357, "kl": 0.00920867919921875, "lambda_div_used": 0.6, "learning_rate": 3.0097380284049523e-07, "loss": 0.0004, "reward": 0.3241674543824047, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3241674543824047, "reward_after_std": 0.6032501570880413, "reward_before_mean": 0.9223264642059803, "reward_before_std": 0.4702158570289612, "reward_change_max": 0.0, "reward_change_mean": -0.598159022629261, "reward_change_min": -0.9134200140833855, "reward_change_std": 0.33777002803981304, "reward_std": 0.6032501794397831, "rewards/cosine_scaled_reward": -0.028420104179531336, "rewards/format_reward": 0.9791666716337204, "step": 359 }, { "advantage_max": 1.8649418652057648, "advantage_mean": -2.250696218286663e-08, "advantage_min": -0.8667888045310974, "advantage_std": 0.9998376667499542, "completion_length": 1136.8333435058594, "epoch": 0.4114285714285714, "grad_norm": 0.27571192383766174, "kl": 0.016468048095703125, "lambda_div_used": 0.6, "learning_rate": 2.9836319343816397e-07, "loss": 0.0007, "reward": 0.5144574351143092, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5144574351143092, "reward_after_std": 0.7366812415421009, "reward_before_mean": 1.1926300078630447, "reward_before_std": 0.5972456242889166, "reward_change_max": 0.0, "reward_change_mean": -0.6781725883483887, "reward_change_min": -1.0510307103395462, "reward_change_std": 0.39387187361717224, "reward_std": 0.7366812489926815, "rewards/cosine_scaled_reward": 0.0963149992749095, "rewards/format_reward": 1.0, "step": 360 }, { "advantage_max": 1.8617792576551437, "advantage_mean": -2.1109979042588378e-08, "advantage_min": -0.8186349608004093, "advantage_std": 0.9998375251889229, "completion_length": 1194.4791793823242, "epoch": 0.4125714285714286, "grad_norm": 0.3388521075248718, "kl": 0.01670074462890625, "lambda_div_used": 0.6, "learning_rate": 2.9576484845877793e-07, "loss": 0.0007, "reward": 0.19046252546831965, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19046252546831965, "reward_after_std": 0.756943553686142, "reward_before_mean": 0.6817769166082144, "reward_before_std": 0.6761543806642294, "reward_change_max": 0.0, "reward_change_mean": -0.4913143888115883, "reward_change_min": -0.8694675639271736, "reward_change_std": 0.31585903838276863, "reward_std": 0.7569435648620129, "rewards/cosine_scaled_reward": -0.12786155845969915, "rewards/format_reward": 0.9375000074505806, "step": 361 }, { "advantage_max": 1.874962106347084, "advantage_mean": -1.1874363270436561e-08, "advantage_min": -0.9260348118841648, "advantage_std": 0.9998194351792336, "completion_length": 956.8542060852051, "epoch": 0.4137142857142857, "grad_norm": 0.3070959448814392, "kl": 0.012874603271484375, "lambda_div_used": 0.6, "learning_rate": 2.931788945420058e-07, "loss": 0.0005, "reward": 0.501685687340796, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.501685687340796, "reward_after_std": 0.6610399819910526, "reward_before_mean": 1.1869359854608774, "reward_before_std": 0.49332827515900135, "reward_change_max": 0.0, "reward_change_mean": -0.6852503418922424, "reward_change_min": -0.9486516639590263, "reward_change_std": 0.3753964975476265, "reward_std": 0.6610400229692459, "rewards/cosine_scaled_reward": 0.10388467647135258, "rewards/format_reward": 0.9791666716337204, "step": 362 }, { "advantage_max": 1.888390600681305, "advantage_mean": -4.3461723664428575e-09, "advantage_min": -0.825883325189352, "advantage_std": 0.9998545944690704, "completion_length": 1159.9166946411133, "epoch": 0.41485714285714287, "grad_norm": 0.3400725722312927, "kl": 0.0212554931640625, "lambda_div_used": 0.6, "learning_rate": 2.9060545772359305e-07, "loss": 0.0008, "reward": 0.6914736162871122, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6914736162871122, "reward_after_std": 0.7680247835814953, "reward_before_mean": 1.4520082499366254, "reward_before_std": 0.5585946920327842, "reward_change_max": 0.0, "reward_change_mean": -0.7605346217751503, "reward_change_min": -1.129803366959095, "reward_change_std": 0.43319543078541756, "reward_std": 0.7680247910320759, "rewards/cosine_scaled_reward": 0.27808745484799147, "rewards/format_reward": 0.8958333432674408, "step": 363 }, { "advantage_max": 1.8743097186088562, "advantage_mean": -1.3659397946064189e-08, "advantage_min": -0.964237779378891, "advantage_std": 0.999806247651577, "completion_length": 1216.3125305175781, "epoch": 0.416, "grad_norm": 0.30530038475990295, "kl": 0.01313018798828125, "lambda_div_used": 0.6, "learning_rate": 2.8804466342921987e-07, "loss": 0.0005, "reward": 0.10216320748440921, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10216320748440921, "reward_after_std": 0.5930333361029625, "reward_before_mean": 0.5751078221946955, "reward_before_std": 0.4904471728950739, "reward_change_max": 0.0, "reward_change_mean": -0.47294463589787483, "reward_change_min": -0.7347556799650192, "reward_change_std": 0.27974462509155273, "reward_std": 0.5930333621799946, "rewards/cosine_scaled_reward": -0.19161276146769524, "rewards/format_reward": 0.9583333432674408, "step": 364 }, { "advantage_max": 1.765339434146881, "advantage_mean": -3.3306690738754696e-16, "advantage_min": -1.010600470006466, "advantage_std": 0.9998663514852524, "completion_length": 2147.541748046875, "epoch": 0.41714285714285715, "grad_norm": 0.8073046803474426, "kl": 0.047985076904296875, "lambda_div_used": 0.6, "learning_rate": 2.854966364683872e-07, "loss": 0.0019, "reward": 0.03060930408537388, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03060930408537388, "reward_after_std": 0.854483887553215, "reward_before_mean": 0.42119893501512706, "reward_before_std": 0.90941421687603, "reward_change_max": 0.0013112276792526245, "reward_change_mean": -0.39058958552777767, "reward_change_min": -0.8304200060665607, "reward_change_std": 0.3401966169476509, "reward_std": 0.8544838950037956, "rewards/cosine_scaled_reward": -0.07065055519342422, "rewards/format_reward": 0.5625000111758709, "step": 365 }, { "advantage_max": 1.8821345120668411, "advantage_mean": -2.3671115845225188e-08, "advantage_min": -0.8438076861202717, "advantage_std": 0.9998635575175285, "completion_length": 1135.8541870117188, "epoch": 0.41828571428571426, "grad_norm": 0.27431872487068176, "kl": 0.01039886474609375, "lambda_div_used": 0.6, "learning_rate": 2.829615010283344e-07, "loss": 0.0004, "reward": 0.5668425522744656, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5668425522744656, "reward_after_std": 0.858330711722374, "reward_before_mean": 1.2461014911532402, "reward_before_std": 0.7163833295926452, "reward_change_max": 0.0, "reward_change_mean": -0.6792589277029037, "reward_change_min": -1.054423488676548, "reward_change_std": 0.3998459428548813, "reward_std": 0.858330748975277, "rewards/cosine_scaled_reward": 0.1334673846140504, "rewards/format_reward": 0.9791666716337204, "step": 366 }, { "advantage_max": 1.827427864074707, "advantage_mean": -2.2351743123039114e-08, "advantage_min": -0.9146878495812416, "advantage_std": 0.9998784735798836, "completion_length": 1681.7084197998047, "epoch": 0.41942857142857143, "grad_norm": 0.31206369400024414, "kl": 0.02565765380859375, "lambda_div_used": 0.6, "learning_rate": 2.8043938066798645e-07, "loss": 0.001, "reward": 0.35777226043865085, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.35777226043865085, "reward_after_std": 0.9370049685239792, "reward_before_mean": 0.9072601944208145, "reward_before_std": 0.892977561801672, "reward_change_max": 0.0, "reward_change_mean": -0.5494879521429539, "reward_change_min": -0.9770361036062241, "reward_change_std": 0.38045726902782917, "reward_std": 0.9370049983263016, "rewards/cosine_scaled_reward": 0.04738008719868958, "rewards/format_reward": 0.8125000186264515, "step": 367 }, { "advantage_max": 1.8542225509881973, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.9214349538087845, "advantage_std": 0.9998540431261063, "completion_length": 2178.3334045410156, "epoch": 0.4205714285714286, "grad_norm": 0.5161811113357544, "kl": 0.04541015625, "lambda_div_used": 0.6, "learning_rate": 2.7793039831193133e-07, "loss": 0.0018, "reward": 0.2744460329413414, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2744460329413414, "reward_after_std": 0.8940634317696095, "reward_before_mean": 0.7882697209715843, "reward_before_std": 0.8432548437267542, "reward_change_max": 0.0, "reward_change_mean": -0.5138236656785011, "reward_change_min": -0.9380548447370529, "reward_change_std": 0.35526866372674704, "reward_std": 0.8940634839236736, "rewards/cosine_scaled_reward": 0.019134832313284278, "rewards/format_reward": 0.7500000111758709, "step": 368 }, { "advantage_max": 1.8268895745277405, "advantage_mean": -5.277494830036034e-09, "advantage_min": -0.9020405560731888, "advantage_std": 0.9998411163687706, "completion_length": 1538.3750305175781, "epoch": 0.4217142857142857, "grad_norm": 0.5276854038238525, "kl": 0.029750823974609375, "lambda_div_used": 0.6, "learning_rate": 2.7543467624442956e-07, "loss": 0.0012, "reward": 0.23423784598708153, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.23423784598708153, "reward_after_std": 0.740181528031826, "reward_before_mean": 0.7611953113228083, "reward_before_std": 0.7027287185192108, "reward_change_max": 0.0, "reward_change_mean": -0.5269574634730816, "reward_change_min": -0.9228874072432518, "reward_change_std": 0.35378825664520264, "reward_std": 0.740181565284729, "rewards/cosine_scaled_reward": -0.036069024819880724, "rewards/format_reward": 0.8333333507180214, "step": 369 }, { "advantage_max": 1.8950076550245285, "advantage_mean": 6.208812908248262e-10, "advantage_min": -0.7749098390340805, "advantage_std": 0.9998090341687202, "completion_length": 1539.0208892822266, "epoch": 0.4228571428571429, "grad_norm": 0.4318285584449768, "kl": 0.027801513671875, "lambda_div_used": 0.6, "learning_rate": 2.729523361034538e-07, "loss": 0.0011, "reward": 0.267172476509586, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.267172476509586, "reward_after_std": 0.5785079337656498, "reward_before_mean": 0.835631494410336, "reward_before_std": 0.42543186247348785, "reward_change_max": 0.0, "reward_change_mean": -0.5684590209275484, "reward_change_min": -0.8694842867553234, "reward_change_std": 0.32806423865258694, "reward_std": 0.5785079672932625, "rewards/cosine_scaled_reward": -0.0301009276881814, "rewards/format_reward": 0.8958333432674408, "step": 370 }, { "advantage_max": 1.8320594280958176, "advantage_mean": -3.616636157222075e-08, "advantage_min": -0.8768892697989941, "advantage_std": 0.9998387768864632, "completion_length": 883.2500343322754, "epoch": 0.424, "grad_norm": 0.3734484314918518, "kl": 0.02027130126953125, "lambda_div_used": 0.6, "learning_rate": 2.7048349887476037e-07, "loss": 0.0008, "reward": 0.432393487659283, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.432393487659283, "reward_after_std": 0.6572475507855415, "reward_before_mean": 1.0829788483679295, "reward_before_std": 0.5263035781681538, "reward_change_max": 0.0, "reward_change_mean": -0.6505853645503521, "reward_change_min": -0.9898973554372787, "reward_change_std": 0.3860883489251137, "reward_std": 0.6572475582361221, "rewards/cosine_scaled_reward": 0.07273940369486809, "rewards/format_reward": 0.9375000074505806, "step": 371 }, { "advantage_max": 1.8835410475730896, "advantage_mean": -7.761022241536963e-09, "advantage_min": -0.9432180970907211, "advantage_std": 0.9998204559087753, "completion_length": 1505.895851135254, "epoch": 0.42514285714285716, "grad_norm": 0.3226718008518219, "kl": 0.02196502685546875, "lambda_div_used": 0.6, "learning_rate": 2.6802828488599294e-07, "loss": 0.0009, "reward": 0.45580523181706667, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.45580523181706667, "reward_after_std": 0.629872802644968, "reward_before_mean": 1.1213063728064299, "reward_before_std": 0.4603481814265251, "reward_change_max": 0.0, "reward_change_mean": -0.6655011437833309, "reward_change_min": -0.9581310376524925, "reward_change_std": 0.37093730084598064, "reward_std": 0.6298728287220001, "rewards/cosine_scaled_reward": 0.09190318267792463, "rewards/format_reward": 0.9375000149011612, "step": 372 }, { "advantage_max": 1.844840943813324, "advantage_mean": 3.104408785592483e-09, "advantage_min": -0.8586374893784523, "advantage_std": 0.9997843205928802, "completion_length": 970.4791946411133, "epoch": 0.42628571428571427, "grad_norm": 0.3689638078212738, "kl": 0.0161895751953125, "lambda_div_used": 0.6, "learning_rate": 2.655868138008171e-07, "loss": 0.0006, "reward": 0.15545231167925522, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15545231167925522, "reward_after_std": 0.5446327701210976, "reward_before_mean": 0.6714023929089308, "reward_before_std": 0.4520675241947174, "reward_change_max": 0.0, "reward_change_mean": -0.5159501116722822, "reward_change_min": -0.7823366560041904, "reward_change_std": 0.30494451709091663, "reward_std": 0.5446327850222588, "rewards/cosine_scaled_reward": -0.143465468659997, "rewards/format_reward": 0.9583333358168602, "step": 373 }, { "advantage_max": 1.8741025775671005, "advantage_mean": -1.6142925329809543e-08, "advantage_min": -0.9768885672092438, "advantage_std": 0.9998114258050919, "completion_length": 1107.9375228881836, "epoch": 0.42742857142857144, "grad_norm": 0.42821618914604187, "kl": 0.01703643798828125, "lambda_div_used": 0.6, "learning_rate": 2.631592046130896e-07, "loss": 0.0007, "reward": 0.3539789589121938, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3539789589121938, "reward_after_std": 0.5764170140028, "reward_before_mean": 0.972320668399334, "reward_before_std": 0.41725558042526245, "reward_change_max": 0.0, "reward_change_mean": -0.6183416955173016, "reward_change_min": -0.8753276914358139, "reward_change_std": 0.3396179787814617, "reward_std": 0.5764170251786709, "rewards/cosine_scaled_reward": 0.0069936420768499374, "rewards/format_reward": 0.9583333432674408, "step": 374 }, { "advantage_max": 1.8370700776576996, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -0.9200460538268089, "advantage_std": 0.9998407140374184, "completion_length": 1740.5416793823242, "epoch": 0.42857142857142855, "grad_norm": 0.4497852027416229, "kl": 0.0552825927734375, "lambda_div_used": 0.6, "learning_rate": 2.6074557564105724e-07, "loss": 0.0022, "reward": 0.3787726857699454, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3787726857699454, "reward_after_std": 0.7555015608668327, "reward_before_mean": 0.978042071685195, "reward_before_std": 0.6583366394042969, "reward_change_max": 0.001976780593395233, "reward_change_mean": -0.5992693491280079, "reward_change_min": -0.9710818901658058, "reward_change_std": 0.3688979558646679, "reward_std": 0.7555015906691551, "rewards/cosine_scaled_reward": 0.11402101069688797, "rewards/format_reward": 0.7500000055879354, "step": 375 }, { "advantage_max": 1.866427093744278, "advantage_mean": 0.0, "advantage_min": -0.9301695749163628, "advantage_std": 0.9998011514544487, "completion_length": 1490.479232788086, "epoch": 0.4297142857142857, "grad_norm": 0.36968451738357544, "kl": 0.03911590576171875, "lambda_div_used": 0.6, "learning_rate": 2.583460445215911e-07, "loss": 0.0016, "reward": 0.2085689200903289, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2085689200903289, "reward_after_std": 0.6105855852365494, "reward_before_mean": 0.7465545944869518, "reward_before_std": 0.5230113845318556, "reward_change_max": 0.0, "reward_change_mean": -0.5379856768995523, "reward_change_min": -0.8638027310371399, "reward_change_std": 0.3324153460562229, "reward_std": 0.6105856001377106, "rewards/cosine_scaled_reward": -0.07463938370347023, "rewards/format_reward": 0.8958333395421505, "step": 376 }, { "advantage_max": 1.8543337881565094, "advantage_mean": 2.110997954218874e-08, "advantage_min": -0.9540010169148445, "advantage_std": 0.9997950345277786, "completion_length": 1885.8334045410156, "epoch": 0.4308571428571429, "grad_norm": 0.45212268829345703, "kl": 0.0592498779296875, "lambda_div_used": 0.6, "learning_rate": 2.5596072820445254e-07, "loss": 0.0024, "reward": -0.0285780755802989, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0285780755802989, "reward_after_std": 0.687009047716856, "reward_before_mean": 0.3567437110468745, "reward_before_std": 0.6372678130865097, "reward_change_max": 0.0009665042161941528, "reward_change_mean": -0.3853217884898186, "reward_change_min": -0.6375828981399536, "reward_change_std": 0.26585924066603184, "reward_std": 0.6870090700685978, "rewards/cosine_scaled_reward": -0.18621148075908422, "rewards/format_reward": 0.7291666828095913, "step": 377 }, { "advantage_max": 1.7750205844640732, "advantage_mean": -2.856055936195645e-08, "advantage_min": -1.0438815727829933, "advantage_std": 0.9998446851968765, "completion_length": 1369.6667003631592, "epoch": 0.432, "grad_norm": 0.44347718358039856, "kl": 0.026195526123046875, "lambda_div_used": 0.6, "learning_rate": 2.5358974294659373e-07, "loss": 0.001, "reward": 0.45271228021010756, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.45271228021010756, "reward_after_std": 0.7842323668301105, "reward_before_mean": 1.0912157818675041, "reward_before_std": 0.7611095923930407, "reward_change_max": 0.0, "reward_change_mean": -0.6385035067796707, "reward_change_min": -1.0665529370307922, "reward_change_std": 0.4174453355371952, "reward_std": 0.7842323780059814, "rewards/cosine_scaled_reward": 0.06644121464341879, "rewards/format_reward": 0.9583333432674408, "step": 378 }, { "advantage_max": 1.807452067732811, "advantage_mean": -1.1175871450497255e-08, "advantage_min": -0.9612009599804878, "advantage_std": 0.9998452290892601, "completion_length": 1867.4584197998047, "epoch": 0.43314285714285716, "grad_norm": 0.5786564946174622, "kl": 0.04692649841308594, "lambda_div_used": 0.6, "learning_rate": 2.512332043064913e-07, "loss": 0.0019, "reward": 0.22903996147215366, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22903996147215366, "reward_after_std": 0.7921751923859119, "reward_before_mean": 0.7403265759348869, "reward_before_std": 0.7686562575399876, "reward_change_max": 0.001318424940109253, "reward_change_mean": -0.5112866014242172, "reward_change_min": -0.8982795923948288, "reward_change_std": 0.36374893598258495, "reward_std": 0.7921752035617828, "rewards/cosine_scaled_reward": -0.036086732521653175, "rewards/format_reward": 0.8125000149011612, "step": 379 }, { "advantage_max": 1.8599117398262024, "advantage_mean": 4.3461718668424965e-09, "advantage_min": -0.9834437742829323, "advantage_std": 0.9998585060238838, "completion_length": 1566.6041946411133, "epoch": 0.4342857142857143, "grad_norm": 0.598343551158905, "kl": 0.038066864013671875, "lambda_div_used": 0.6, "learning_rate": 2.488912271385139e-07, "loss": 0.0015, "reward": 0.28783006872981787, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.28783006872981787, "reward_after_std": 0.8142180219292641, "reward_before_mean": 0.8213298991322517, "reward_before_std": 0.7334345877170563, "reward_change_max": 0.005948290228843689, "reward_change_mean": -0.533499825745821, "reward_change_min": -0.8682552352547646, "reward_change_std": 0.3408501222729683, "reward_std": 0.8142180517315865, "rewards/cosine_scaled_reward": -0.01641839649528265, "rewards/format_reward": 0.854166679084301, "step": 380 }, { "advantage_max": 1.9078808277845383, "advantage_mean": -1.4280279792000528e-08, "advantage_min": -0.8441812470555305, "advantage_std": 0.999834418296814, "completion_length": 1700.0000190734863, "epoch": 0.43542857142857144, "grad_norm": 0.6566035151481628, "kl": 0.05217742919921875, "lambda_div_used": 0.6, "learning_rate": 2.465639255873246e-07, "loss": 0.0021, "reward": 0.044323298148810863, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.044323298148810863, "reward_after_std": 0.7120475806295872, "reward_before_mean": 0.455623185262084, "reward_before_std": 0.6162074208259583, "reward_change_max": 0.0, "reward_change_mean": -0.41129989735782146, "reward_change_min": -0.6884924322366714, "reward_change_std": 0.25980154797434807, "reward_std": 0.7120476141571999, "rewards/cosine_scaled_reward": -0.15760508552193642, "rewards/format_reward": 0.7708333469927311, "step": 381 }, { "advantage_max": 1.91932113468647, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.8044084087014198, "advantage_std": 0.9998442009091377, "completion_length": 1039.5833778381348, "epoch": 0.43657142857142855, "grad_norm": 0.36317184567451477, "kl": 0.012350082397460938, "lambda_div_used": 0.6, "learning_rate": 2.4425141308231765e-07, "loss": 0.0005, "reward": 0.20284398877993226, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20284398877993226, "reward_after_std": 0.7599193342030048, "reward_before_mean": 0.6968522891402245, "reward_before_std": 0.6381727494299412, "reward_change_max": 0.0, "reward_change_mean": -0.49400831013917923, "reward_change_min": -0.8446781784296036, "reward_change_std": 0.29265458323061466, "reward_std": 0.7599193751811981, "rewards/cosine_scaled_reward": -0.13074052496813238, "rewards/format_reward": 0.9583333358168602, "step": 382 }, { "advantage_max": 1.8412632197141647, "advantage_mean": -4.035731332452386e-09, "advantage_min": -0.9618307873606682, "advantage_std": 0.9998642280697823, "completion_length": 1811.7709045410156, "epoch": 0.4377142857142857, "grad_norm": 0.5148884057998657, "kl": 0.07571983337402344, "lambda_div_used": 0.6, "learning_rate": 2.4195380233209006e-07, "loss": 0.003, "reward": 0.3637051163241267, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3637051163241267, "reward_after_std": 0.8799832984805107, "reward_before_mean": 0.9258610289543867, "reward_before_std": 0.8425909131765366, "reward_change_max": 0.0, "reward_change_mean": -0.5621559210121632, "reward_change_min": -0.9556906446814537, "reward_change_std": 0.38166039250791073, "reward_std": 0.8799833245575428, "rewards/cosine_scaled_reward": 0.05668050143867731, "rewards/format_reward": 0.8125000111758709, "step": 383 }, { "advantage_max": 1.89041306078434, "advantage_mean": -2.6077033199456423e-08, "advantage_min": -0.83429766446352, "advantage_std": 0.9998781979084015, "completion_length": 1192.3125381469727, "epoch": 0.43885714285714283, "grad_norm": 0.3472490608692169, "kl": 0.0273590087890625, "lambda_div_used": 0.6, "learning_rate": 2.3967120531894857e-07, "loss": 0.0011, "reward": 0.7900097626261413, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7900097626261413, "reward_after_std": 0.9402679018676281, "reward_before_mean": 1.5731139779090881, "reward_before_std": 0.7669761152938008, "reward_change_max": 0.00027222931385040283, "reward_change_mean": -0.7831042222678661, "reward_change_min": -1.2078073993325233, "reward_change_std": 0.46871116384863853, "reward_std": 0.9402679279446602, "rewards/cosine_scaled_reward": 0.307390327565372, "rewards/format_reward": 0.9583333358168602, "step": 384 }, { "advantage_max": 1.8527950644493103, "advantage_mean": -1.6142925107764938e-08, "advantage_min": -0.8780858963727951, "advantage_std": 0.9998686984181404, "completion_length": 1546.958381652832, "epoch": 0.44, "grad_norm": 0.42252200841903687, "kl": 0.033329010009765625, "lambda_div_used": 0.6, "learning_rate": 2.374037332934512e-07, "loss": 0.0013, "reward": 0.3363357661291957, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3363357661291957, "reward_after_std": 0.9530099332332611, "reward_before_mean": 0.8679481521248817, "reward_before_std": 0.8937208689749241, "reward_change_max": 0.0, "reward_change_mean": -0.5316124074161053, "reward_change_min": -0.9706587940454483, "reward_change_std": 0.37425510212779045, "reward_std": 0.9530099704861641, "rewards/cosine_scaled_reward": 0.006890743970870972, "rewards/format_reward": 0.854166679084301, "step": 385 }, { "advantage_max": 1.8089945912361145, "advantage_mean": -2.980232260973992e-08, "advantage_min": -0.9716885611414909, "advantage_std": 0.9998600035905838, "completion_length": 1505.354232788086, "epoch": 0.44114285714285717, "grad_norm": 0.46533486247062683, "kl": 0.04397773742675781, "lambda_div_used": 0.6, "learning_rate": 2.3515149676898552e-07, "loss": 0.0018, "reward": 0.563620753120631, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.563620753120631, "reward_after_std": 0.8097805194556713, "reward_before_mean": 1.2603773300070316, "reward_before_std": 0.7409437410533428, "reward_change_max": 0.0007016360759735107, "reward_change_mean": -0.6967565380036831, "reward_change_min": -1.082048561424017, "reward_change_std": 0.4333069808781147, "reward_std": 0.8097805455327034, "rewards/cosine_scaled_reward": 0.18227194994688034, "rewards/format_reward": 0.8958333395421505, "step": 386 }, { "advantage_max": 1.8622557073831558, "advantage_mean": -2.4835267176115394e-09, "advantage_min": -0.8337085545063019, "advantage_std": 0.9998394101858139, "completion_length": 1601.8125610351562, "epoch": 0.4422857142857143, "grad_norm": 0.5599687099456787, "kl": 0.0380706787109375, "lambda_div_used": 0.6, "learning_rate": 2.3291460551638237e-07, "loss": 0.0015, "reward": 0.24382829433307052, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.24382829433307052, "reward_after_std": 0.7119114100933075, "reward_before_mean": 0.7761804088950157, "reward_before_std": 0.6253592558205128, "reward_change_max": 0.0, "reward_change_mean": -0.5323521457612514, "reward_change_min": -0.9017984233796597, "reward_change_std": 0.34474920108914375, "reward_std": 0.7119114361703396, "rewards/cosine_scaled_reward": -0.028576454147696495, "rewards/format_reward": 0.8333333395421505, "step": 387 }, { "advantage_max": 1.8835696578025818, "advantage_mean": -4.470348546892211e-08, "advantage_min": -0.8850356787443161, "advantage_std": 0.9998294040560722, "completion_length": 1042.2291946411133, "epoch": 0.44342857142857145, "grad_norm": 0.33819589018821716, "kl": 0.0192108154296875, "lambda_div_used": 0.6, "learning_rate": 2.306931685585657e-07, "loss": 0.0008, "reward": 0.40724813751876354, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.40724813751876354, "reward_after_std": 0.7526779770851135, "reward_before_mean": 1.0167050827294588, "reward_before_std": 0.6314396969974041, "reward_change_max": 0.0, "reward_change_mean": -0.6094569526612759, "reward_change_min": -0.9567462503910065, "reward_change_std": 0.35661482252180576, "reward_std": 0.7526780031621456, "rewards/cosine_scaled_reward": 0.018769189715385437, "rewards/format_reward": 0.9791666716337204, "step": 388 }, { "advantage_max": 1.9048434495925903, "advantage_mean": -2.297262352568552e-08, "advantage_min": -0.835426326841116, "advantage_std": 0.9998196735978127, "completion_length": 1316.6250228881836, "epoch": 0.44457142857142856, "grad_norm": 0.38188523054122925, "kl": 0.02446746826171875, "lambda_div_used": 0.6, "learning_rate": 2.2848729416523859e-07, "loss": 0.001, "reward": 0.32476662658154964, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.32476662658154964, "reward_after_std": 0.6799643561244011, "reward_before_mean": 0.9025517366826534, "reward_before_std": 0.5456170169636607, "reward_change_max": 0.0, "reward_change_mean": -0.5777850933372974, "reward_change_min": -0.8595723137259483, "reward_change_std": 0.3257438950240612, "reward_std": 0.6799643859267235, "rewards/cosine_scaled_reward": -0.04872416495345533, "rewards/format_reward": 1.0, "step": 389 }, { "advantage_max": 1.8400477319955826, "advantage_mean": -6.829698917520943e-09, "advantage_min": -0.854744978249073, "advantage_std": 0.9998657703399658, "completion_length": 1577.2500457763672, "epoch": 0.44571428571428573, "grad_norm": 0.6859716176986694, "kl": 0.0335693359375, "lambda_div_used": 0.6, "learning_rate": 2.2629708984760706e-07, "loss": 0.0013, "reward": 0.1753460403997451, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1753460403997451, "reward_after_std": 0.871589045971632, "reward_before_mean": 0.6376499533653259, "reward_before_std": 0.8287604749202728, "reward_change_max": 0.0008363723754882812, "reward_change_mean": -0.46230391412973404, "reward_change_min": -0.935205452144146, "reward_change_std": 0.3333991579711437, "reward_std": 0.8715890794992447, "rewards/cosine_scaled_reward": -0.07700837170705199, "rewards/format_reward": 0.791666679084301, "step": 390 }, { "advantage_max": 1.912625327706337, "advantage_mean": -2.173086099954702e-09, "advantage_min": -0.7925033271312714, "advantage_std": 0.9998733997344971, "completion_length": 1489.00004196167, "epoch": 0.44685714285714284, "grad_norm": 0.44128429889678955, "kl": 0.06465911865234375, "lambda_div_used": 0.6, "learning_rate": 2.2412266235313973e-07, "loss": 0.0026, "reward": 0.348428251221776, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.348428251221776, "reward_after_std": 0.9207301996648312, "reward_before_mean": 0.8853506334125996, "reward_before_std": 0.7893150560557842, "reward_change_max": 0.00031003355979919434, "reward_change_mean": -0.536922387778759, "reward_change_min": -0.9096999578177929, "reward_change_std": 0.33434988744556904, "reward_std": 0.9207301996648312, "rewards/cosine_scaled_reward": 0.03642530972138047, "rewards/format_reward": 0.8125000111758709, "step": 391 }, { "advantage_max": 1.841011866927147, "advantage_mean": -4.967053990334591e-09, "advantage_min": -0.9479911550879478, "advantage_std": 0.9998376369476318, "completion_length": 1231.2500381469727, "epoch": 0.448, "grad_norm": 0.41627851128578186, "kl": 0.026950836181640625, "lambda_div_used": 0.6, "learning_rate": 2.2196411766036487e-07, "loss": 0.0011, "reward": 0.238603868172504, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.238603868172504, "reward_after_std": 0.7660604529082775, "reward_before_mean": 0.7568782866001129, "reward_before_std": 0.7045793607831001, "reward_change_max": 0.0005902126431465149, "reward_change_mean": -0.5182744301855564, "reward_change_min": -0.850563645362854, "reward_change_std": 0.33581987768411636, "reward_std": 0.7660604864358902, "rewards/cosine_scaled_reward": -0.07989420369267464, "rewards/format_reward": 0.916666679084301, "step": 392 }, { "advantage_max": 1.850405141711235, "advantage_mean": -4.967053990334591e-09, "advantage_min": -0.813638836145401, "advantage_std": 0.9998711571097374, "completion_length": 1491.2292175292969, "epoch": 0.4491428571428571, "grad_norm": 0.46879175305366516, "kl": 0.04475212097167969, "lambda_div_used": 0.6, "learning_rate": 2.1982156097370557e-07, "loss": 0.0018, "reward": 0.22369152214378119, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22369152214378119, "reward_after_std": 0.9152826145291328, "reward_before_mean": 0.6987034790217876, "reward_before_std": 0.8574752546846867, "reward_change_max": 0.0006786957383155823, "reward_change_mean": -0.47501196525990963, "reward_change_min": -0.8072455041110516, "reward_change_std": 0.3139333399012685, "reward_std": 0.9152826257050037, "rewards/cosine_scaled_reward": -0.06731493026018143, "rewards/format_reward": 0.833333333954215, "step": 393 }, { "advantage_max": 1.8343770503997803, "advantage_mean": 9.313225746154785e-10, "advantage_min": -1.0088716968894005, "advantage_std": 0.9998288080096245, "completion_length": 1652.5625534057617, "epoch": 0.4502857142857143, "grad_norm": 0.7100973725318909, "kl": 0.06395339965820312, "lambda_div_used": 0.6, "learning_rate": 2.1769509671835223e-07, "loss": 0.0026, "reward": -0.015374501468613744, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.015374501468613744, "reward_after_std": 0.6524021811783314, "reward_before_mean": 0.38132119935471565, "reward_before_std": 0.6267807520925999, "reward_change_max": 0.005344957113265991, "reward_change_mean": -0.39669569209218025, "reward_change_min": -0.7064265944063663, "reward_change_std": 0.28260448575019836, "reward_std": 0.6524022221565247, "rewards/cosine_scaled_reward": -0.18433942459523678, "rewards/format_reward": 0.7500000186264515, "step": 394 }, { "advantage_max": 1.9344320595264435, "advantage_mean": -1.1796753018877837e-08, "advantage_min": -0.7464034222066402, "advantage_std": 0.9998317658901215, "completion_length": 1422.31254196167, "epoch": 0.4514285714285714, "grad_norm": 0.5548021197319031, "kl": 0.04305839538574219, "lambda_div_used": 0.6, "learning_rate": 2.1558482853517253e-07, "loss": 0.0017, "reward": 0.12881008815020323, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12881008815020323, "reward_after_std": 0.7306486591696739, "reward_before_mean": 0.5857934057712555, "reward_before_std": 0.6257657716050744, "reward_change_max": 0.0, "reward_change_mean": -0.45698332041502, "reward_change_min": -0.7511992193758488, "reward_change_std": 0.30648272577673197, "reward_std": 0.7306486964225769, "rewards/cosine_scaled_reward": -0.07168664503842592, "rewards/format_reward": 0.7291666772216558, "step": 395 }, { "advantage_max": 1.8720027953386307, "advantage_mean": -1.5522043372850902e-08, "advantage_min": -0.897967129945755, "advantage_std": 0.9998262673616409, "completion_length": 1115.2500305175781, "epoch": 0.45257142857142857, "grad_norm": 0.4723523259162903, "kl": 0.023822784423828125, "lambda_div_used": 0.6, "learning_rate": 2.134908592756607e-07, "loss": 0.001, "reward": 0.3313878992339596, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3313878992339596, "reward_after_std": 0.6471819877624512, "reward_before_mean": 0.9213565066456795, "reward_before_std": 0.5130888111889362, "reward_change_max": 3.838539123535156e-05, "reward_change_mean": -0.5899685919284821, "reward_change_min": -0.8697409331798553, "reward_change_std": 0.3338143788278103, "reward_std": 0.6471820063889027, "rewards/cosine_scaled_reward": 0.03359490446746349, "rewards/format_reward": 0.8541666716337204, "step": 396 }, { "advantage_max": 1.883902370929718, "advantage_mean": -3.632158152022669e-08, "advantage_min": -0.9016060680150986, "advantage_std": 0.9997219517827034, "completion_length": 1352.0625228881836, "epoch": 0.45371428571428574, "grad_norm": 0.37755143642425537, "kl": 0.04022216796875, "lambda_div_used": 0.6, "learning_rate": 2.1141329099692406e-07, "loss": 0.0016, "reward": -0.04198963730596006, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.04198963730596006, "reward_after_std": 0.5525118494406343, "reward_before_mean": 0.3588534388691187, "reward_before_std": 0.45337067916989326, "reward_change_max": 0.0, "reward_change_mean": -0.40084310434758663, "reward_change_min": -0.6106414273381233, "reward_change_std": 0.22512250347062945, "reward_std": 0.5525118727236986, "rewards/cosine_scaled_reward": -0.19557328848168254, "rewards/format_reward": 0.7500000055879354, "step": 397 }, { "advantage_max": 1.7940724939107895, "advantage_mean": 1.8781671828893565e-08, "advantage_min": -0.9255733340978622, "advantage_std": 0.9998294785618782, "completion_length": 1550.7500457763672, "epoch": 0.45485714285714285, "grad_norm": 0.37187108397483826, "kl": 0.07955551147460938, "lambda_div_used": 0.6, "learning_rate": 2.0935222495670968e-07, "loss": 0.0032, "reward": 0.01694987085647881, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.01694987085647881, "reward_after_std": 0.7360811270773411, "reward_before_mean": 0.4198214989155531, "reward_before_std": 0.7361685782670975, "reward_change_max": 0.0, "reward_change_mean": -0.4028716376051307, "reward_change_min": -0.7846706658601761, "reward_change_std": 0.30554668232798576, "reward_std": 0.7360811606049538, "rewards/cosine_scaled_reward": -0.12342259637080133, "rewards/format_reward": 0.6666666734963655, "step": 398 }, { "advantage_max": 1.8436774611473083, "advantage_mean": -1.0244548376281415e-08, "advantage_min": -0.8891482055187225, "advantage_std": 0.9998530372977257, "completion_length": 1050.0000228881836, "epoch": 0.456, "grad_norm": 0.4016158878803253, "kl": 0.01065826416015625, "lambda_div_used": 0.6, "learning_rate": 2.0730776160846853e-07, "loss": 0.0004, "reward": 0.5273470878601074, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5273470878601074, "reward_after_std": 0.960618756711483, "reward_before_mean": 1.1682646535336971, "reward_before_std": 0.8969470211304724, "reward_change_max": 0.0, "reward_change_mean": -0.6409175284206867, "reward_change_min": -1.093371707946062, "reward_change_std": 0.41930639930069447, "reward_std": 0.9606187716126442, "rewards/cosine_scaled_reward": 0.08413228066638112, "rewards/format_reward": 1.0, "step": 399 }, { "advantage_max": 1.8732015937566757, "advantage_mean": -2.3593505593666464e-08, "advantage_min": -0.9000465795397758, "advantage_std": 0.9998682737350464, "completion_length": 1030.270881652832, "epoch": 0.45714285714285713, "grad_norm": 0.6417791843414307, "kl": 0.0258026123046875, "lambda_div_used": 0.6, "learning_rate": 2.0528000059645995e-07, "loss": 0.001, "reward": 0.6478320201858878, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6478320201858878, "reward_after_std": 0.9242336377501488, "reward_before_mean": 1.3621239997446537, "reward_before_std": 0.8079969696700573, "reward_change_max": 0.0, "reward_change_mean": -0.7142920345067978, "reward_change_min": -1.1032118797302246, "reward_change_std": 0.4413549527525902, "reward_std": 0.9242336414754391, "rewards/cosine_scaled_reward": 0.20189532358199358, "rewards/format_reward": 0.9583333358168602, "step": 400 }, { "advantage_max": 1.8797823637723923, "advantage_mean": -2.359350581571107e-08, "advantage_min": -0.8693061619997025, "advantage_std": 0.9998289421200752, "completion_length": 1648.9792022705078, "epoch": 0.4582857142857143, "grad_norm": 0.8710258603096008, "kl": 0.07901382446289062, "lambda_div_used": 0.6, "learning_rate": 2.032690407508949e-07, "loss": 0.0032, "reward": 0.18920016940683126, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18920016940683126, "reward_after_std": 0.6232438683509827, "reward_before_mean": 0.7045781388878822, "reward_before_std": 0.500193178653717, "reward_change_max": 0.001879081130027771, "reward_change_mean": -0.515377989038825, "reward_change_min": -0.8118748292326927, "reward_change_std": 0.30811250768601894, "reward_std": 0.6232438869774342, "rewards/cosine_scaled_reward": -0.043544284999370575, "rewards/format_reward": 0.7916666753590107, "step": 401 }, { "advantage_max": 1.8284994959831238, "advantage_mean": -4.4703484247676784e-08, "advantage_min": -0.9610132426023483, "advantage_std": 0.9998360648751259, "completion_length": 1549.2500534057617, "epoch": 0.4594285714285714, "grad_norm": 0.7397347092628479, "kl": 0.09239578247070312, "lambda_div_used": 0.6, "learning_rate": 2.0127498008311922e-07, "loss": 0.0037, "reward": 0.2838644115254283, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2838644115254283, "reward_after_std": 0.6427106708288193, "reward_before_mean": 0.8501286339014769, "reward_before_std": 0.561686310917139, "reward_change_max": 0.0, "reward_change_mean": -0.5662642568349838, "reward_change_min": -0.9427123293280602, "reward_change_std": 0.3427322842180729, "reward_std": 0.6427106931805611, "rewards/cosine_scaled_reward": 0.039647649973630905, "rewards/format_reward": 0.7708333544433117, "step": 402 }, { "advantage_max": 1.9294809103012085, "advantage_mean": 6.208817349140361e-09, "advantage_min": -0.7606799304485321, "advantage_std": 0.9998130202293396, "completion_length": 1098.4167022705078, "epoch": 0.4605714285714286, "grad_norm": 0.7467202544212341, "kl": 0.0480804443359375, "lambda_div_used": 0.6, "learning_rate": 1.9929791578083655e-07, "loss": 0.0019, "reward": 0.3443166771903634, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3443166771903634, "reward_after_std": 0.5857836827635765, "reward_before_mean": 0.9529151804745197, "reward_before_std": 0.40462041553109884, "reward_change_max": 0.0, "reward_change_mean": -0.6085985079407692, "reward_change_min": -0.8830667324364185, "reward_change_std": 0.3298831805586815, "reward_std": 0.5857837088406086, "rewards/cosine_scaled_reward": 0.05979091301560402, "rewards/format_reward": 0.8333333414047956, "step": 403 }, { "advantage_max": 1.8348060548305511, "advantage_mean": -2.0799537647775423e-08, "advantage_min": -0.9691257327795029, "advantage_std": 0.999817743897438, "completion_length": 1306.0416946411133, "epoch": 0.4617142857142857, "grad_norm": 0.43737542629241943, "kl": 0.037811279296875, "lambda_div_used": 0.6, "learning_rate": 1.9733794420337213e-07, "loss": 0.0015, "reward": 0.34468303504399955, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.34468303504399955, "reward_after_std": 0.5768958441913128, "reward_before_mean": 0.9605327546596527, "reward_before_std": 0.44061626866459846, "reward_change_max": 0.0020405128598213196, "reward_change_mean": -0.6158497631549835, "reward_change_min": -0.9091715142130852, "reward_change_std": 0.3500679489225149, "reward_std": 0.576895859092474, "rewards/cosine_scaled_reward": 0.0010997112840414047, "rewards/format_reward": 0.9583333432674408, "step": 404 }, { "advantage_max": 1.8543145060539246, "advantage_mean": -3.1664968203060084e-08, "advantage_min": -0.898287508636713, "advantage_std": 0.9998418316245079, "completion_length": 1262.7500343322754, "epoch": 0.46285714285714286, "grad_norm": 0.570946991443634, "kl": 0.04802703857421875, "lambda_div_used": 0.6, "learning_rate": 1.9539516087697517e-07, "loss": 0.0019, "reward": 0.6255162106826901, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6255162106826901, "reward_after_std": 0.7487674653530121, "reward_before_mean": 1.3644494786858559, "reward_before_std": 0.5858640410006046, "reward_change_max": 0.0, "reward_change_mean": -0.7389333806931973, "reward_change_min": -1.0935213565826416, "reward_change_std": 0.43595005199313164, "reward_std": 0.7487674877047539, "rewards/cosine_scaled_reward": 0.22389142867177725, "rewards/format_reward": 0.916666679084301, "step": 405 }, { "advantage_max": 1.8656399846076965, "advantage_mean": -9.934107980669182e-09, "advantage_min": -0.9580504596233368, "advantage_std": 0.9998698681592941, "completion_length": 1351.3542098999023, "epoch": 0.464, "grad_norm": 0.30655261874198914, "kl": 0.04695701599121094, "lambda_div_used": 0.6, "learning_rate": 1.934696604901642e-07, "loss": 0.0019, "reward": 0.5224170899018645, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5224170899018645, "reward_after_std": 0.9001653417944908, "reward_before_mean": 1.1674301028251648, "reward_before_std": 0.7968062944710255, "reward_change_max": 0.0, "reward_change_mean": -0.6450129598379135, "reward_change_min": -1.0377557501196861, "reward_change_std": 0.3952809479087591, "reward_std": 0.9001653641462326, "rewards/cosine_scaled_reward": 0.10454833297990263, "rewards/format_reward": 0.9583333358168602, "step": 406 }, { "advantage_max": 1.8426674157381058, "advantage_mean": 2.3283066585833012e-09, "advantage_min": -0.8966666460037231, "advantage_std": 0.9998308792710304, "completion_length": 1636.6250305175781, "epoch": 0.46514285714285714, "grad_norm": 0.524170994758606, "kl": 0.07071685791015625, "lambda_div_used": 0.6, "learning_rate": 1.915615368891117e-07, "loss": 0.0028, "reward": 0.2841284740716219, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2841284740716219, "reward_after_std": 0.7291165739297867, "reward_before_mean": 0.8379490524530411, "reward_before_std": 0.6585894171148539, "reward_change_max": 0.0015131905674934387, "reward_change_mean": -0.5538205932825804, "reward_change_min": -0.9359328970313072, "reward_change_std": 0.3618352375924587, "reward_std": 0.7291165962815285, "rewards/cosine_scaled_reward": -0.008108798414468765, "rewards/format_reward": 0.8541666753590107, "step": 407 }, { "advantage_max": 1.8449404537677765, "advantage_mean": -2.1109978542988017e-08, "advantage_min": -0.8859404623508453, "advantage_std": 0.9998802319169044, "completion_length": 1421.2083854675293, "epoch": 0.4662857142857143, "grad_norm": 0.5737088322639465, "kl": 0.049816131591796875, "lambda_div_used": 0.6, "learning_rate": 1.8967088307307e-07, "loss": 0.002, "reward": 0.5460685240104795, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5460685240104795, "reward_after_std": 0.9580986090004444, "reward_before_mean": 1.1958689044695348, "reward_before_std": 0.8734465874731541, "reward_change_max": 0.0, "reward_change_mean": -0.6498003713786602, "reward_change_min": -1.1185031943023205, "reward_change_std": 0.413064856082201, "reward_std": 0.9580986239016056, "rewards/cosine_scaled_reward": 0.1708511160686612, "rewards/format_reward": 0.8541666697710752, "step": 408 }, { "advantage_max": 1.8438959568738937, "advantage_mean": -7.450580818968433e-09, "advantage_min": -1.0017603039741516, "advantage_std": 0.9998411983251572, "completion_length": 1914.6875457763672, "epoch": 0.4674285714285714, "grad_norm": 0.48147112131118774, "kl": 0.08876800537109375, "lambda_div_used": 0.6, "learning_rate": 1.8779779118983867e-07, "loss": 0.0036, "reward": 0.13925567595288157, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13925567595288157, "reward_after_std": 0.7202729880809784, "reward_before_mean": 0.6109248884022236, "reward_before_std": 0.6696692258119583, "reward_change_max": 0.0009178891777992249, "reward_change_mean": -0.4716692119836807, "reward_change_min": -0.7645023390650749, "reward_change_std": 0.306436350569129, "reward_std": 0.720272995531559, "rewards/cosine_scaled_reward": -0.09037090092897415, "rewards/format_reward": 0.7916666828095913, "step": 409 }, { "advantage_max": 1.7967534363269806, "advantage_mean": -4.656615093523442e-10, "advantage_min": -0.9627714604139328, "advantage_std": 0.9998459294438362, "completion_length": 1528.6667137145996, "epoch": 0.4685714285714286, "grad_norm": 0.7331544756889343, "kl": 0.0760650634765625, "lambda_div_used": 0.6, "learning_rate": 1.8594235253127372e-07, "loss": 0.003, "reward": 0.42353655165061355, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.42353655165061355, "reward_after_std": 0.7990189306437969, "reward_before_mean": 1.042414367198944, "reward_before_std": 0.7414693534374237, "reward_change_max": 0.00046183913946151733, "reward_change_mean": -0.6188778132200241, "reward_change_min": -1.0362925231456757, "reward_change_std": 0.405984902754426, "reward_std": 0.7990189418196678, "rewards/cosine_scaled_reward": 0.05245716869831085, "rewards/format_reward": 0.9375000149011612, "step": 410 }, { "advantage_max": 1.8948615789413452, "advantage_mean": 1.2417634254191512e-08, "advantage_min": -0.7667038217186928, "advantage_std": 0.999861590564251, "completion_length": 1889.2708854675293, "epoch": 0.4697142857142857, "grad_norm": 0.6439375877380371, "kl": 0.09022903442382812, "lambda_div_used": 0.6, "learning_rate": 1.8410465752883758e-07, "loss": 0.0036, "reward": 0.07135952671524137, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07135952671524137, "reward_after_std": 0.8766655959188938, "reward_before_mean": 0.47151265665888786, "reward_before_std": 0.8228470720350742, "reward_change_max": 0.0, "reward_change_mean": -0.40015316288918257, "reward_change_min": -0.7908356711268425, "reward_change_std": 0.29624347295612097, "reward_std": 0.8766656406223774, "rewards/cosine_scaled_reward": -0.12882700935006142, "rewards/format_reward": 0.7291666679084301, "step": 411 }, { "advantage_max": 1.9090563207864761, "advantage_mean": -2.6697914767837005e-08, "advantage_min": -0.7822853326797485, "advantage_std": 0.9998724535107613, "completion_length": 1202.1875381469727, "epoch": 0.47085714285714286, "grad_norm": 0.670369565486908, "kl": 0.04871368408203125, "lambda_div_used": 0.6, "learning_rate": 1.822847957491922e-07, "loss": 0.0019, "reward": 0.22458704718155786, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.22458704718155786, "reward_after_std": 0.908806387335062, "reward_before_mean": 0.6955942697823048, "reward_before_std": 0.8164355047047138, "reward_change_max": 0.0008294880390167236, "reward_change_mean": -0.471007265150547, "reward_change_min": -0.9000264182686806, "reward_change_std": 0.31314732879400253, "reward_std": 0.9088063985109329, "rewards/cosine_scaled_reward": -0.11053620371967554, "rewards/format_reward": 0.916666679084301, "step": 412 }, { "advantage_max": 1.8098042160272598, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.9544949308037758, "advantage_std": 0.9998386353254318, "completion_length": 1417.1042137145996, "epoch": 0.472, "grad_norm": 0.3730092942714691, "kl": 0.0497283935546875, "lambda_div_used": 0.6, "learning_rate": 1.804828558898332e-07, "loss": 0.002, "reward": 0.2984967448282987, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2984967448282987, "reward_after_std": 0.7107267417013645, "reward_before_mean": 0.8657870907336473, "reward_before_std": 0.6707973293960094, "reward_change_max": 0.0, "reward_change_mean": -0.5672903321683407, "reward_change_min": -0.9352284893393517, "reward_change_std": 0.36285562813282013, "reward_std": 0.7107267566025257, "rewards/cosine_scaled_reward": -0.004606468603014946, "rewards/format_reward": 0.8750000037252903, "step": 413 }, { "advantage_max": 1.858881652355194, "advantage_mean": 5.587935225648266e-09, "advantage_min": -0.8481370061635971, "advantage_std": 0.9998633489012718, "completion_length": 1995.2500457763672, "epoch": 0.47314285714285714, "grad_norm": 0.3799186944961548, "kl": 0.0695343017578125, "lambda_div_used": 0.6, "learning_rate": 1.7869892577476722e-07, "loss": 0.0028, "reward": 0.12761934008449316, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.12761934008449316, "reward_after_std": 0.8530314080417156, "reward_before_mean": 0.5639428496360779, "reward_before_std": 0.7966863811016083, "reward_change_max": 0.0, "reward_change_mean": -0.43632352352142334, "reward_change_min": -0.7810782827436924, "reward_change_std": 0.29841686226427555, "reward_std": 0.853031437844038, "rewards/cosine_scaled_reward": -0.10344524041283876, "rewards/format_reward": 0.7708333414047956, "step": 414 }, { "advantage_max": 1.806399554014206, "advantage_mean": -1.6142924996742636e-08, "advantage_min": -1.0417326241731644, "advantage_std": 0.9998660162091255, "completion_length": 1612.1667022705078, "epoch": 0.4742857142857143, "grad_norm": 1.021669864654541, "kl": 0.0842742919921875, "lambda_div_used": 0.6, "learning_rate": 1.7693309235023127e-07, "loss": 0.0034, "reward": 0.1636495697312057, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1636495697312057, "reward_after_std": 0.8222077488899231, "reward_before_mean": 0.6326027903705835, "reward_before_std": 0.801915667951107, "reward_change_max": 0.0015484318137168884, "reward_change_mean": -0.4689532145857811, "reward_change_min": -0.8480212837457657, "reward_change_std": 0.3447183482348919, "reward_std": 0.8222077712416649, "rewards/cosine_scaled_reward": -0.0795319527387619, "rewards/format_reward": 0.7916666865348816, "step": 415 }, { "advantage_max": 1.946450263261795, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.6989743858575821, "advantage_std": 0.999872125685215, "completion_length": 1185.583381652832, "epoch": 0.4754285714285714, "grad_norm": 0.5290063619613647, "kl": 0.032833099365234375, "lambda_div_used": 0.6, "learning_rate": 1.7518544168045524e-07, "loss": 0.0013, "reward": 0.4655949706211686, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4655949706211686, "reward_after_std": 0.915255781263113, "reward_before_mean": 1.0712868720293045, "reward_before_std": 0.7395226247608662, "reward_change_max": 0.0007962062954902649, "reward_change_mean": -0.6056919172406197, "reward_change_min": -1.005474604666233, "reward_change_std": 0.36948048509657383, "reward_std": 0.9152557961642742, "rewards/cosine_scaled_reward": 0.0668934234417975, "rewards/format_reward": 0.9375000074505806, "step": 416 }, { "advantage_max": 1.818949431180954, "advantage_mean": -1.3659397946064189e-08, "advantage_min": -1.0677797496318817, "advantage_std": 0.9998498633503914, "completion_length": 1563.0000228881836, "epoch": 0.4765714285714286, "grad_norm": 0.9102173447608948, "kl": 0.0742950439453125, "lambda_div_used": 0.6, "learning_rate": 1.7345605894346726e-07, "loss": 0.003, "reward": 0.1907540822867304, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1907540822867304, "reward_after_std": 0.7381607219576836, "reward_before_mean": 0.6873529492877424, "reward_before_std": 0.675433199852705, "reward_change_max": 0.0027615725994110107, "reward_change_mean": -0.49659889191389084, "reward_change_min": -0.8365379236638546, "reward_change_std": 0.32951972633600235, "reward_std": 0.7381607331335545, "rewards/cosine_scaled_reward": -0.052156862802803516, "rewards/format_reward": 0.7916666828095913, "step": 417 }, { "advantage_max": 1.9242968559265137, "advantage_mean": -5.091230215192866e-08, "advantage_min": -0.7926112860441208, "advantage_std": 0.9998694732785225, "completion_length": 1250.062557220459, "epoch": 0.4777142857142857, "grad_norm": 0.781521201133728, "kl": 0.0989990234375, "lambda_div_used": 0.6, "learning_rate": 1.7174502842694212e-07, "loss": 0.004, "reward": 0.7606905307620764, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7606905307620764, "reward_after_std": 0.8790313825011253, "reward_before_mean": 1.541863800957799, "reward_before_std": 0.6521397422766313, "reward_change_max": 0.0, "reward_change_mean": -0.7811733037233353, "reward_change_min": -1.084649033844471, "reward_change_std": 0.4253282528370619, "reward_std": 0.8790314197540283, "rewards/cosine_scaled_reward": 0.30218187253922224, "rewards/format_reward": 0.9375000074505806, "step": 418 }, { "advantage_max": 1.8819816559553146, "advantage_mean": -3.725290076417309e-09, "advantage_min": -0.7932135835289955, "advantage_std": 0.9998789504170418, "completion_length": 1468.895881652832, "epoch": 0.47885714285714287, "grad_norm": 0.7229182720184326, "kl": 0.0759429931640625, "lambda_div_used": 0.6, "learning_rate": 1.7005243352409333e-07, "loss": 0.003, "reward": 0.35275646578520536, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.35275646578520536, "reward_after_std": 0.9550173804163933, "reward_before_mean": 0.8900185525417328, "reward_before_std": 0.8626913130283356, "reward_change_max": 0.0007177069783210754, "reward_change_mean": -0.5372620560228825, "reward_change_min": -0.9783233143389225, "reward_change_std": 0.3662260863929987, "reward_std": 0.9550173878669739, "rewards/cosine_scaled_reward": 0.0595925732050091, "rewards/format_reward": 0.7708333414047956, "step": 419 }, { "advantage_max": 1.8907837122678757, "advantage_mean": -1.5832484323574647e-08, "advantage_min": -0.8471547365188599, "advantage_std": 0.9998333230614662, "completion_length": 858.0625228881836, "epoch": 0.48, "grad_norm": 0.427143931388855, "kl": 0.016613006591796875, "lambda_div_used": 0.6, "learning_rate": 1.6837835672960831e-07, "loss": 0.0007, "reward": 0.3101103331428021, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3101103331428021, "reward_after_std": 0.6953030377626419, "reward_before_mean": 0.8771585412323475, "reward_before_std": 0.5664926655590534, "reward_change_max": 0.0, "reward_change_mean": -0.5670481845736504, "reward_change_min": -0.9017394334077835, "reward_change_std": 0.32143189013004303, "reward_std": 0.6953030750155449, "rewards/cosine_scaled_reward": -0.05100408475846052, "rewards/format_reward": 0.9791666716337204, "step": 420 }, { "advantage_max": 1.926459640264511, "advantage_mean": 2.220446049250313e-16, "advantage_min": -0.8092224150896072, "advantage_std": 0.9998452663421631, "completion_length": 1315.7292137145996, "epoch": 0.48114285714285715, "grad_norm": 0.4144383668899536, "kl": 0.064453125, "lambda_div_used": 0.6, "learning_rate": 1.6672287963562852e-07, "loss": 0.0026, "reward": 0.2203623978421092, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2203623978421092, "reward_after_std": 0.8387158066034317, "reward_before_mean": 0.7068217769265175, "reward_before_std": 0.7188771069049835, "reward_change_max": 0.0, "reward_change_mean": -0.4864593520760536, "reward_change_min": -0.8118696585297585, "reward_change_std": 0.30111537501215935, "reward_std": 0.8387158252298832, "rewards/cosine_scaled_reward": -0.10492247063666582, "rewards/format_reward": 0.9166666716337204, "step": 421 }, { "advantage_max": 1.8332590609788895, "advantage_mean": 1.2417635808503746e-09, "advantage_min": -0.9279729872941971, "advantage_std": 0.999835379421711, "completion_length": 1939.7500610351562, "epoch": 0.48228571428571426, "grad_norm": 0.7433092594146729, "kl": 0.13888168334960938, "lambda_div_used": 0.6, "learning_rate": 1.6508608292777203e-07, "loss": 0.0056, "reward": 0.1761605131905526, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1761605131905526, "reward_after_std": 0.7036119475960732, "reward_before_mean": 0.6759131057187915, "reward_before_std": 0.640310924500227, "reward_change_max": 0.0007759109139442444, "reward_change_mean": -0.4997525941580534, "reward_change_min": -0.8166663274168968, "reward_change_std": 0.31987714022397995, "reward_std": 0.7036119699478149, "rewards/cosine_scaled_reward": -0.04746011132374406, "rewards/format_reward": 0.7708333488553762, "step": 422 }, { "advantage_max": 1.9029418379068375, "advantage_mean": 2.7939678182153926e-08, "advantage_min": -0.7431788854300976, "advantage_std": 0.9998583421111107, "completion_length": 1889.770866394043, "epoch": 0.48342857142857143, "grad_norm": 0.4884932339191437, "kl": 0.11093902587890625, "lambda_div_used": 0.6, "learning_rate": 1.6346804638120098e-07, "loss": 0.0044, "reward": 0.06853078509448096, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.06853078509448096, "reward_after_std": 0.8814452588558197, "reward_before_mean": 0.4631076133809984, "reward_before_std": 0.8424768298864365, "reward_change_max": 0.000535815954208374, "reward_change_mean": -0.39457682403735816, "reward_change_min": -0.7278733029961586, "reward_change_std": 0.280312453629449, "reward_std": 0.88144526258111, "rewards/cosine_scaled_reward": -0.13302953727543354, "rewards/format_reward": 0.7291666734963655, "step": 423 }, { "advantage_max": 1.8147494047880173, "advantage_mean": 0.0, "advantage_min": -0.9146339148283005, "advantage_std": 0.9998487681150436, "completion_length": 1910.6458740234375, "epoch": 0.4845714285714286, "grad_norm": 0.826490581035614, "kl": 0.11475372314453125, "lambda_div_used": 0.6, "learning_rate": 1.6186884885673413e-07, "loss": 0.0046, "reward": 0.09315519593656063, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.09315519593656063, "reward_after_std": 0.744984857738018, "reward_before_mean": 0.5344462972134352, "reward_before_std": 0.7094641253352165, "reward_change_max": 0.0005524009466171265, "reward_change_mean": -0.44129109010100365, "reward_change_min": -0.8307461738586426, "reward_change_std": 0.3057961128652096, "reward_std": 0.744984857738018, "rewards/cosine_scaled_reward": -0.08694352209568024, "rewards/format_reward": 0.7083333432674408, "step": 424 }, { "advantage_max": 1.8650247901678085, "advantage_mean": -4.2840839542535036e-08, "advantage_min": -0.910727221518755, "advantage_std": 0.9998797848820686, "completion_length": 1219.7708702087402, "epoch": 0.4857142857142857, "grad_norm": 0.44416239857673645, "kl": 0.05071258544921875, "lambda_div_used": 0.6, "learning_rate": 1.6028856829700258e-07, "loss": 0.002, "reward": 0.8080948491115123, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.8080948491115123, "reward_after_std": 0.9254983216524124, "reward_before_mean": 1.611850606277585, "reward_before_std": 0.7530386643484235, "reward_change_max": 7.234513759613037e-06, "reward_change_mean": -0.8037557974457741, "reward_change_min": -1.2168358340859413, "reward_change_std": 0.4892400950193405, "reward_std": 0.925498329102993, "rewards/cosine_scaled_reward": 0.3475919794291258, "rewards/format_reward": 0.9166666716337204, "step": 425 }, { "advantage_max": 1.7878179401159286, "advantage_mean": 2.7318796169684134e-08, "advantage_min": -1.0395925492048264, "advantage_std": 0.9997829124331474, "completion_length": 1821.645866394043, "epoch": 0.4868571428571429, "grad_norm": 0.6422761678695679, "kl": 0.17049407958984375, "lambda_div_used": 0.6, "learning_rate": 1.5872728172265146e-07, "loss": 0.0068, "reward": 0.07293279469013214, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.07293279469013214, "reward_after_std": 0.5741061456501484, "reward_before_mean": 0.5401615208247676, "reward_before_std": 0.5329846888780594, "reward_change_max": 0.009062647819519043, "reward_change_mean": -0.46722870878875256, "reward_change_min": -0.7904520779848099, "reward_change_std": 0.30266605969518423, "reward_std": 0.5741061493754387, "rewards/cosine_scaled_reward": -0.07366926036775112, "rewards/format_reward": 0.6875000074505806, "step": 426 }, { "advantage_max": 1.8344089090824127, "advantage_mean": 1.5987704937714398e-08, "advantage_min": -0.8490063548088074, "advantage_std": 0.9998742416501045, "completion_length": 1730.166732788086, "epoch": 0.488, "grad_norm": 0.8043573498725891, "kl": 0.07287216186523438, "lambda_div_used": 0.6, "learning_rate": 1.5718506522858572e-07, "loss": 0.0029, "reward": 0.2885350910946727, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2885350910946727, "reward_after_std": 0.9019887298345566, "reward_before_mean": 0.8062578393146396, "reward_before_std": 0.8632199503481388, "reward_change_max": 0.0, "reward_change_mean": -0.5177227295935154, "reward_change_min": -0.9653673470020294, "reward_change_std": 0.3626882489770651, "reward_std": 0.9019887447357178, "rewards/cosine_scaled_reward": 0.017712251748889685, "rewards/format_reward": 0.7708333358168602, "step": 427 }, { "advantage_max": 1.918458417057991, "advantage_mean": 1.2262414250674425e-08, "advantage_min": -0.7912456393241882, "advantage_std": 0.9998344704508781, "completion_length": 1410.6042175292969, "epoch": 0.48914285714285716, "grad_norm": 0.46534183621406555, "kl": 0.03601837158203125, "lambda_div_used": 0.6, "learning_rate": 1.5566199398026147e-07, "loss": 0.0014, "reward": 0.03753029089421034, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.03753029089421034, "reward_after_std": 0.734732910990715, "reward_before_mean": 0.4424444120377302, "reward_before_std": 0.629564180970192, "reward_change_max": 0.0, "reward_change_mean": -0.40491411834955215, "reward_change_min": -0.6768765859305859, "reward_change_std": 0.24197638221085072, "reward_std": 0.7347329407930374, "rewards/cosine_scaled_reward": -0.21627781761344522, "rewards/format_reward": 0.8750000111758709, "step": 428 }, { "advantage_max": 1.879625290632248, "advantage_mean": -1.6142924996742636e-08, "advantage_min": -0.8501939885318279, "advantage_std": 0.9998295605182648, "completion_length": 955.3958740234375, "epoch": 0.49028571428571427, "grad_norm": 0.5915119647979736, "kl": 0.043731689453125, "lambda_div_used": 0.6, "learning_rate": 1.5415814221002265e-07, "loss": 0.0018, "reward": 0.22457869758363813, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.22457869758363813, "reward_after_std": 0.667672373354435, "reward_before_mean": 0.7487860321998596, "reward_before_std": 0.5584245286881924, "reward_change_max": 0.0, "reward_change_mean": -0.5242073722183704, "reward_change_min": -0.8736131340265274, "reward_change_std": 0.30754177272319794, "reward_std": 0.6676723919808865, "rewards/cosine_scaled_reward": -0.10477365460246801, "rewards/format_reward": 0.9583333358168602, "step": 429 }, { "advantage_max": 1.8657117784023285, "advantage_mean": -1.5832484767663857e-08, "advantage_min": -0.9031914100050926, "advantage_std": 0.9998542368412018, "completion_length": 1187.3958587646484, "epoch": 0.49142857142857144, "grad_norm": 0.532692551612854, "kl": 0.046665191650390625, "lambda_div_used": 0.6, "learning_rate": 1.5267358321348285e-07, "loss": 0.0019, "reward": 0.37671698722988367, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.37671698722988367, "reward_after_std": 0.7594957612454891, "reward_before_mean": 0.9692848455160856, "reward_before_std": 0.6301495917141438, "reward_change_max": 9.178370237350464e-05, "reward_change_mean": -0.5925678685307503, "reward_change_min": -0.9060627967119217, "reward_change_std": 0.3460345584899187, "reward_std": 0.7594957798719406, "rewards/cosine_scaled_reward": 0.07839240477187559, "rewards/format_reward": 0.8125000037252903, "step": 430 }, { "advantage_max": 1.8353245258331299, "advantage_mean": -9.934107592091124e-09, "advantage_min": -0.9341690689325333, "advantage_std": 0.9998679608106613, "completion_length": 1518.1875457763672, "epoch": 0.49257142857142855, "grad_norm": 1.6024953126907349, "kl": 0.146270751953125, "lambda_div_used": 0.6, "learning_rate": 1.5120838934595337e-07, "loss": 0.0058, "reward": 0.1624729260802269, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1624729260802269, "reward_after_std": 0.8999154008924961, "reward_before_mean": 0.6052434705197811, "reward_before_std": 0.8698120005428791, "reward_change_max": 0.0008520856499671936, "reward_change_mean": -0.4427705593407154, "reward_change_min": -0.7804036028683186, "reward_change_std": 0.3131858557462692, "reward_std": 0.8999154083430767, "rewards/cosine_scaled_reward": -0.08279493264853954, "rewards/format_reward": 0.7708333432674408, "step": 431 }, { "advantage_max": 1.9103060811758041, "advantage_mean": -6.208817238118058e-09, "advantage_min": -0.835049219429493, "advantage_std": 0.9998124614357948, "completion_length": 1616.4792022705078, "epoch": 0.4937142857142857, "grad_norm": 0.44959592819213867, "kl": 0.06036376953125, "lambda_div_used": 0.6, "learning_rate": 1.4976263201891613e-07, "loss": 0.0024, "reward": 0.005719345761463046, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.005719345761463046, "reward_after_std": 0.6313600055873394, "reward_before_mean": 0.4188762828707695, "reward_before_std": 0.5393286049365997, "reward_change_max": 0.0, "reward_change_mean": -0.41315691312775016, "reward_change_min": -0.6860866919159889, "reward_change_std": 0.2504292316734791, "reward_std": 0.6313600204885006, "rewards/cosine_scaled_reward": -0.19681187812238932, "rewards/format_reward": 0.8125000149011612, "step": 432 }, { "advantage_max": 1.892504170536995, "advantage_mean": 2.980232327587373e-08, "advantage_min": -0.8732018694281578, "advantage_std": 0.9998399764299393, "completion_length": 1564.5000457763672, "epoch": 0.4948571428571429, "grad_norm": 0.66018146276474, "kl": 0.07354354858398438, "lambda_div_used": 0.6, "learning_rate": 1.483363816965435e-07, "loss": 0.0029, "reward": 0.26046125683933496, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26046125683933496, "reward_after_std": 0.711750440299511, "reward_before_mean": 0.7982939779758453, "reward_before_std": 0.6088505052030087, "reward_change_max": 0.0005765184760093689, "reward_change_mean": -0.5378326866775751, "reward_change_min": -0.8633565232157707, "reward_change_std": 0.3263698350638151, "reward_std": 0.7117504626512527, "rewards/cosine_scaled_reward": 0.024146972224116325, "rewards/format_reward": 0.7500000111758709, "step": 433 }, { "advantage_max": 1.9090900868177414, "advantage_mean": 0.0, "advantage_min": -0.7441076934337616, "advantage_std": 0.9998107478022575, "completion_length": 1740.4167098999023, "epoch": 0.496, "grad_norm": 0.7447723150253296, "kl": 0.13619613647460938, "lambda_div_used": 0.6, "learning_rate": 1.469297078922642e-07, "loss": 0.0054, "reward": -0.00020221294835209846, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.00020221294835209846, "reward_after_std": 0.6341037414968014, "reward_before_mean": 0.40146185271441936, "reward_before_std": 0.5198134481906891, "reward_change_max": 0.0, "reward_change_mean": -0.4016640540212393, "reward_change_min": -0.6340990662574768, "reward_change_std": 0.2353166714310646, "reward_std": 0.6341037563979626, "rewards/cosine_scaled_reward": -0.20551909133791924, "rewards/format_reward": 0.8125000037252903, "step": 434 }, { "advantage_max": 1.9161070734262466, "advantage_mean": -1.707424759911369e-08, "advantage_min": -0.8011736907064915, "advantage_std": 0.9998152628540993, "completion_length": 1104.5833778381348, "epoch": 0.49714285714285716, "grad_norm": 0.6831105947494507, "kl": 0.09851837158203125, "lambda_div_used": 0.6, "learning_rate": 1.4554267916537495e-07, "loss": 0.0039, "reward": 0.1984256466384977, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.1984256466384977, "reward_after_std": 0.6855190061032772, "reward_before_mean": 0.701103962957859, "reward_before_std": 0.5424638222903013, "reward_change_max": 0.0, "reward_change_mean": -0.5026783309876919, "reward_change_min": -0.7911634966731071, "reward_change_std": 0.28394212760031223, "reward_std": 0.6855190135538578, "rewards/cosine_scaled_reward": -0.11819804133847356, "rewards/format_reward": 0.9375000149011612, "step": 435 }, { "advantage_max": 1.8080298602581024, "advantage_mean": -1.9247333504779363e-08, "advantage_min": -1.0045333839952946, "advantage_std": 0.9998283982276917, "completion_length": 1415.3958740234375, "epoch": 0.4982857142857143, "grad_norm": 0.5908387303352356, "kl": 0.10746002197265625, "lambda_div_used": 0.6, "learning_rate": 1.4417536311769885e-07, "loss": 0.0043, "reward": 0.40853736363351345, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.40853736363351345, "reward_after_std": 0.6946579478681087, "reward_before_mean": 1.03979467228055, "reward_before_std": 0.596217917278409, "reward_change_max": 0.0012014508247375488, "reward_change_mean": -0.6312573403120041, "reward_change_min": -0.9797411002218723, "reward_change_std": 0.388340774923563, "reward_std": 0.6946579590439796, "rewards/cosine_scaled_reward": 0.12406398542225361, "rewards/format_reward": 0.7916666716337204, "step": 436 }, { "advantage_max": 1.765413984656334, "advantage_mean": -1.4280280069556284e-08, "advantage_min": -1.140317179262638, "advantage_std": 0.9998202100396156, "completion_length": 1342.8125228881836, "epoch": 0.49942857142857144, "grad_norm": 0.8170318603515625, "kl": 0.040370941162109375, "lambda_div_used": 0.6, "learning_rate": 1.4282782639029128e-07, "loss": 0.0016, "reward": 0.2664998557884246, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2664998557884246, "reward_after_std": 0.6159840114414692, "reward_before_mean": 0.8322788365185261, "reward_before_std": 0.576800886541605, "reward_change_max": 0.0, "reward_change_mean": -0.5657790005207062, "reward_change_min": -0.9002040028572083, "reward_change_std": 0.34919700771570206, "reward_std": 0.6159840300679207, "rewards/cosine_scaled_reward": -0.06302724592387676, "rewards/format_reward": 0.9583333432674408, "step": 437 }, { "advantage_max": 1.8181038945913315, "advantage_mean": -8.692344288796505e-09, "advantage_min": -0.9430951476097107, "advantage_std": 0.9998539239168167, "completion_length": 1953.8750762939453, "epoch": 0.5005714285714286, "grad_norm": 1.0785415172576904, "kl": 0.2301177978515625, "lambda_div_used": 0.6, "learning_rate": 1.4150013466019114e-07, "loss": 0.0092, "reward": 0.12155490834265947, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12155490834265947, "reward_after_std": 0.8553712926805019, "reward_before_mean": 0.5592961621587165, "reward_before_std": 0.8434982839971781, "reward_change_max": 0.0007446259260177612, "reward_change_mean": -0.4377412758767605, "reward_change_min": -0.891018021851778, "reward_change_std": 0.33926537446677685, "reward_std": 0.8553713001310825, "rewards/cosine_scaled_reward": -0.09535192046314478, "rewards/format_reward": 0.7500000186264515, "step": 438 }, { "advantage_max": 1.7687881886959076, "advantage_mean": -6.208817904251873e-10, "advantage_min": -1.0331488847732544, "advantage_std": 0.9998689517378807, "completion_length": 1621.4375762939453, "epoch": 0.5017142857142857, "grad_norm": 1.3315192461013794, "kl": 0.14151763916015625, "lambda_div_used": 0.6, "learning_rate": 1.4019235263722034e-07, "loss": 0.0057, "reward": 0.04851225670427084, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04851225670427084, "reward_after_std": 0.8845059871673584, "reward_before_mean": 0.4421884883195162, "reward_before_std": 0.9212424159049988, "reward_change_max": 0.0003846213221549988, "reward_change_mean": -0.3936762325465679, "reward_change_min": -0.8846784085035324, "reward_change_std": 0.34067676588892937, "reward_std": 0.8845060132443905, "rewards/cosine_scaled_reward": -0.11223910190165043, "rewards/format_reward": 0.6666666828095913, "step": 439 }, { "advantage_max": 1.8706139773130417, "advantage_mean": 1.1796753074388988e-08, "advantage_min": -0.8981772735714912, "advantage_std": 0.9998108372092247, "completion_length": 1569.7292251586914, "epoch": 0.5028571428571429, "grad_norm": 0.855440080165863, "kl": 0.1434173583984375, "lambda_div_used": 0.6, "learning_rate": 1.3890454406082956e-07, "loss": 0.0057, "reward": -0.05658835871145129, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05658835871145129, "reward_after_std": 0.5816938430070877, "reward_before_mean": 0.33301460556685925, "reward_before_std": 0.5139974765479565, "reward_change_max": 0.0, "reward_change_mean": -0.3896029610186815, "reward_change_min": -0.681924182921648, "reward_change_std": 0.24834264814853668, "reward_std": 0.5816938765347004, "rewards/cosine_scaled_reward": -0.1980760432779789, "rewards/format_reward": 0.729166679084301, "step": 440 }, { "advantage_max": 1.8911541998386383, "advantage_mean": 1.4280279625467074e-08, "advantage_min": -0.8213992044329643, "advantage_std": 0.9998567774891853, "completion_length": 1801.3750267028809, "epoch": 0.504, "grad_norm": 0.8155668377876282, "kl": 0.165771484375, "lambda_div_used": 0.6, "learning_rate": 1.3763677169699217e-07, "loss": 0.0066, "reward": 0.07445507869124413, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07445507869124413, "reward_after_std": 0.7889985404908657, "reward_before_mean": 0.4961062502115965, "reward_before_std": 0.743747316300869, "reward_change_max": 0.0009514167904853821, "reward_change_mean": -0.42165115289390087, "reward_change_min": -0.721052560955286, "reward_change_std": 0.29154213331639767, "reward_std": 0.7889985628426075, "rewards/cosine_scaled_reward": -0.09569689631462097, "rewards/format_reward": 0.6875000093132257, "step": 441 }, { "advantage_max": 1.9226910918951035, "advantage_mean": -2.8560559028889543e-08, "advantage_min": -0.745913602411747, "advantage_std": 0.9998499751091003, "completion_length": 1091.2500228881836, "epoch": 0.5051428571428571, "grad_norm": 0.7159312963485718, "kl": 0.05721282958984375, "lambda_div_used": 0.6, "learning_rate": 1.3638909733514452e-07, "loss": 0.0023, "reward": 0.41800220077857375, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.41800220077857375, "reward_after_std": 0.7833468951284885, "reward_before_mean": 1.0278632938861847, "reward_before_std": 0.6174677358940244, "reward_change_max": 0.0, "reward_change_mean": -0.6098610982298851, "reward_change_min": -0.9643383026123047, "reward_change_std": 0.35795337706804276, "reward_std": 0.7833469174802303, "rewards/cosine_scaled_reward": 0.08684830274432898, "rewards/format_reward": 0.854166679084301, "step": 442 }, { "advantage_max": 1.8147583454847336, "advantage_mean": 6.208817349140361e-09, "advantage_min": -1.0209823548793793, "advantage_std": 0.9998364895582199, "completion_length": 1822.1459045410156, "epoch": 0.5062857142857143, "grad_norm": 0.8386198282241821, "kl": 0.11566162109375, "lambda_div_used": 0.6, "learning_rate": 1.351615817851748e-07, "loss": 0.0046, "reward": 0.030754336155951023, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.030754336155951023, "reward_after_std": 0.7065671682357788, "reward_before_mean": 0.4468862364301458, "reward_before_std": 0.6671516671776772, "reward_change_max": 0.0, "reward_change_mean": -0.4161319136619568, "reward_change_min": -0.7626941949129105, "reward_change_std": 0.2861163951456547, "reward_std": 0.7065671719610691, "rewards/cosine_scaled_reward": -0.13072354905307293, "rewards/format_reward": 0.7083333507180214, "step": 443 }, { "advantage_max": 1.9129046946763992, "advantage_mean": -9.313226190243995e-09, "advantage_min": -0.8238680362701416, "advantage_std": 0.9998748078942299, "completion_length": 1545.3750267028809, "epoch": 0.5074285714285715, "grad_norm": 1.1218396425247192, "kl": 0.19435882568359375, "lambda_div_used": 0.6, "learning_rate": 1.3395428487445914e-07, "loss": 0.0078, "reward": 0.03131588757969439, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03131588757969439, "reward_after_std": 0.9942633770406246, "reward_before_mean": 0.37214701221091673, "reward_before_std": 0.9374651424586773, "reward_change_max": 0.00407920777797699, "reward_change_mean": -0.3408311549574137, "reward_change_min": -0.6419351659715176, "reward_change_std": 0.25083165243268013, "reward_std": 0.9942633770406246, "rewards/cosine_scaled_reward": -0.1576764981728047, "rewards/format_reward": 0.6875000111758709, "step": 444 }, { "advantage_max": 1.858717828989029, "advantage_mean": -1.5522043095295146e-08, "advantage_min": -0.842341735959053, "advantage_std": 0.9998549446463585, "completion_length": 1924.5000762939453, "epoch": 0.5085714285714286, "grad_norm": 0.7524690628051758, "kl": 0.251220703125, "lambda_div_used": 0.6, "learning_rate": 1.3276726544494571e-07, "loss": 0.01, "reward": 0.0804061135277152, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0804061135277152, "reward_after_std": 0.7893403545022011, "reward_before_mean": 0.4991146745160222, "reward_before_std": 0.7196450419723988, "reward_change_max": 0.0, "reward_change_mean": -0.41870855912566185, "reward_change_min": -0.7721864506602287, "reward_change_std": 0.27942358888685703, "reward_std": 0.7893403694033623, "rewards/cosine_scaled_reward": -0.1462760092690587, "rewards/format_reward": 0.791666679084301, "step": 445 }, { "advantage_max": 1.9125205725431442, "advantage_mean": -1.521160272743849e-08, "advantage_min": -0.7717568129301071, "advantage_std": 0.9998368695378304, "completion_length": 1579.791748046875, "epoch": 0.5097142857142857, "grad_norm": 0.6899632215499878, "kl": 0.10594940185546875, "lambda_div_used": 0.6, "learning_rate": 1.316005813502869e-07, "loss": 0.0042, "reward": 0.3102727495133877, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3102727495133877, "reward_after_std": 0.8228458315134048, "reward_before_mean": 0.8454371113330126, "reward_before_std": 0.6917162649333477, "reward_change_max": 0.0012743175029754639, "reward_change_mean": -0.5351644065231085, "reward_change_min": -0.9024721682071686, "reward_change_std": 0.3305331449955702, "reward_std": 0.8228458426892757, "rewards/cosine_scaled_reward": 0.0685519129037857, "rewards/format_reward": 0.7083333414047956, "step": 446 }, { "advantage_max": 1.8417370170354843, "advantage_mean": -6.829698695476338e-09, "advantage_min": -0.935723327100277, "advantage_std": 0.9998374506831169, "completion_length": 1487.7917022705078, "epoch": 0.5108571428571429, "grad_norm": 0.8426020741462708, "kl": 0.1772003173828125, "lambda_div_used": 0.6, "learning_rate": 1.3045428945301953e-07, "loss": 0.0071, "reward": 0.23191683134064078, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23191683134064078, "reward_after_std": 0.7033054195344448, "reward_before_mean": 0.7608326952904463, "reward_before_std": 0.6380577906966209, "reward_change_max": 0.0, "reward_change_mean": -0.5289158523082733, "reward_change_min": -0.8764414638280869, "reward_change_std": 0.3322218470275402, "reward_std": 0.7033054418861866, "rewards/cosine_scaled_reward": -0.05708366571343504, "rewards/format_reward": 0.8750000149011612, "step": 447 }, { "advantage_max": 1.902267187833786, "advantage_mean": 6.208816516473092e-10, "advantage_min": -0.8541913852095604, "advantage_std": 0.9997800961136818, "completion_length": 1220.833366394043, "epoch": 0.512, "grad_norm": 0.9086900949478149, "kl": 0.12390899658203125, "lambda_div_used": 0.6, "learning_rate": 1.2932844562179352e-07, "loss": 0.0049, "reward": 0.2925412461627275, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2925412461627275, "reward_after_std": 0.5348220467567444, "reward_before_mean": 0.8818653598427773, "reward_before_std": 0.3927401080727577, "reward_change_max": 0.0, "reward_change_mean": -0.5893240831792355, "reward_change_min": -0.8397349342703819, "reward_change_std": 0.32815040461719036, "reward_std": 0.5348220616579056, "rewards/cosine_scaled_reward": -0.017400696873664856, "rewards/format_reward": 0.916666679084301, "step": 448 }, { "advantage_max": 1.905688613653183, "advantage_mean": -2.23517424569053e-08, "advantage_min": -0.841227937489748, "advantage_std": 0.9997895136475563, "completion_length": 1415.9167137145996, "epoch": 0.5131428571428571, "grad_norm": 0.8157268166542053, "kl": 0.20606422424316406, "lambda_div_used": 0.6, "learning_rate": 1.2822310472864885e-07, "loss": 0.0082, "reward": 0.01664125733077526, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.01664125733077526, "reward_after_std": 0.5663458779454231, "reward_before_mean": 0.4466824699193239, "reward_before_std": 0.44439556263387203, "reward_change_max": 0.000407978892326355, "reward_change_mean": -0.4300412107259035, "reward_change_min": -0.6489026509225368, "reward_change_std": 0.24280665069818497, "reward_std": 0.5663458835333586, "rewards/cosine_scaled_reward": -0.16207545064389706, "rewards/format_reward": 0.7708333432674408, "step": 449 }, { "advantage_max": 1.8794930279254913, "advantage_mean": 1.117587122845265e-08, "advantage_min": -0.8497578613460064, "advantage_std": 0.9998070001602173, "completion_length": 1445.083366394043, "epoch": 0.5142857142857142, "grad_norm": 0.7874720692634583, "kl": 0.15003204345703125, "lambda_div_used": 0.6, "learning_rate": 1.2713832064634125e-07, "loss": 0.006, "reward": 0.02464776113629341, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.02464776113629341, "reward_after_std": 0.5693525820970535, "reward_before_mean": 0.45805526059120893, "reward_before_std": 0.4447983056306839, "reward_change_max": 0.0023548901081085205, "reward_change_mean": -0.43340749107301235, "reward_change_min": -0.6489949226379395, "reward_change_std": 0.2607662323862314, "reward_std": 0.5693525932729244, "rewards/cosine_scaled_reward": -0.12513904832303524, "rewards/format_reward": 0.7083333395421505, "step": 450 }, { "advantage_max": 1.872328832745552, "advantage_mean": -1.6763806898190126e-08, "advantage_min": -0.8258812800049782, "advantage_std": 0.9998480677604675, "completion_length": 1233.708366394043, "epoch": 0.5154285714285715, "grad_norm": 0.7528899908065796, "kl": 0.09528350830078125, "lambda_div_used": 0.6, "learning_rate": 1.260741462457165e-07, "loss": 0.0038, "reward": 0.3256368708098307, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3256368708098307, "reward_after_std": 0.7790644764900208, "reward_before_mean": 0.8877322040498257, "reward_before_std": 0.6680051982402802, "reward_change_max": 0.0, "reward_change_mean": -0.562095332890749, "reward_change_min": -0.9134536460042, "reward_change_std": 0.35085158981382847, "reward_std": 0.7790645137429237, "rewards/cosine_scaled_reward": -0.00405057892203331, "rewards/format_reward": 0.8958333432674408, "step": 451 }, { "advantage_max": 1.9041462689638138, "advantage_mean": 1.0865430555284661e-08, "advantage_min": -0.9039032459259033, "advantage_std": 0.9998711198568344, "completion_length": 1906.7292098999023, "epoch": 0.5165714285714286, "grad_norm": 1.088655710220337, "kl": 0.24600601196289062, "lambda_div_used": 0.6, "learning_rate": 1.2503063339313356e-07, "loss": 0.0098, "reward": 0.29726181272417307, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.29726181272417307, "reward_after_std": 0.8609438054263592, "reward_before_mean": 0.8219169015064836, "reward_before_std": 0.7409865632653236, "reward_change_max": 0.0, "reward_change_mean": -0.5246550729498267, "reward_change_min": -0.8709829784929752, "reward_change_std": 0.33557169884443283, "reward_std": 0.8609438389539719, "rewards/cosine_scaled_reward": 0.07762509072199464, "rewards/format_reward": 0.6666666716337204, "step": 452 }, { "advantage_max": 1.86186121404171, "advantage_mean": 7.1401394796666295e-09, "advantage_min": -0.8760163262486458, "advantage_std": 0.9998539313673973, "completion_length": 1587.083396911621, "epoch": 0.5177142857142857, "grad_norm": 0.6533041596412659, "kl": 0.12093353271484375, "lambda_div_used": 0.6, "learning_rate": 1.2400783294793668e-07, "loss": 0.0048, "reward": 0.36189063219353557, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.36189063219353557, "reward_after_std": 0.7707811817526817, "reward_before_mean": 0.9481308050453663, "reward_before_std": 0.6685521900653839, "reward_change_max": 0.0, "reward_change_mean": -0.58624017983675, "reward_change_min": -0.9146820791065693, "reward_change_std": 0.3575539728626609, "reward_std": 0.7707811929285526, "rewards/cosine_scaled_reward": 0.036565386690199375, "rewards/format_reward": 0.8750000055879354, "step": 453 }, { "advantage_max": 1.799769252538681, "advantage_mean": 0.0, "advantage_min": -0.9815754368901253, "advantage_std": 0.9998062923550606, "completion_length": 1350.0416946411133, "epoch": 0.5188571428571429, "grad_norm": 0.8042285442352295, "kl": 0.10607337951660156, "lambda_div_used": 0.6, "learning_rate": 1.2300579475997657e-07, "loss": 0.0042, "reward": 0.11526673112530261, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11526673112530261, "reward_after_std": 0.6661871522665024, "reward_before_mean": 0.5921627879142761, "reward_before_std": 0.6296087387017906, "reward_change_max": 0.002816028892993927, "reward_change_mean": -0.4768960699439049, "reward_change_min": -0.8067386299371719, "reward_change_std": 0.33680446818470955, "reward_std": 0.6661871746182442, "rewards/cosine_scaled_reward": -0.11016861326061189, "rewards/format_reward": 0.8125000111758709, "step": 454 }, { "advantage_max": 1.9014418870210648, "advantage_mean": 7.450580929990736e-09, "advantage_min": -0.8852047994732857, "advantage_std": 0.9998085275292397, "completion_length": 1282.6250610351562, "epoch": 0.52, "grad_norm": 0.4326711893081665, "kl": 0.11649322509765625, "lambda_div_used": 0.6, "learning_rate": 1.220245676671809e-07, "loss": 0.0047, "reward": 0.08766961051151156, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08766961051151156, "reward_after_std": 0.5684734508395195, "reward_before_mean": 0.5595664214342833, "reward_before_std": 0.46307386830449104, "reward_change_max": 0.0, "reward_change_mean": -0.47189680114388466, "reward_change_min": -0.7259544879198074, "reward_change_std": 0.2745931725949049, "reward_std": 0.5684734769165516, "rewards/cosine_scaled_reward": -0.17855014093220234, "rewards/format_reward": 0.9166666679084301, "step": 455 }, { "advantage_max": 1.8154648691415787, "advantage_mean": 2.1730860444435507e-09, "advantage_min": -0.9932011067867279, "advantage_std": 0.9998342245817184, "completion_length": 1712.9584159851074, "epoch": 0.5211428571428571, "grad_norm": 1.1537747383117676, "kl": 0.2523918151855469, "lambda_div_used": 0.6, "learning_rate": 1.2106419949317388e-07, "loss": 0.0101, "reward": 0.06782927364110947, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06782927364110947, "reward_after_std": 0.7310365661978722, "reward_before_mean": 0.4991466961801052, "reward_before_std": 0.7056907415390015, "reward_change_max": 0.0028716325759887695, "reward_change_mean": -0.43131742626428604, "reward_change_min": -0.7868289947509766, "reward_change_std": 0.3076404817402363, "reward_std": 0.731036588549614, "rewards/cosine_scaled_reward": -0.10459333215840161, "rewards/format_reward": 0.7083333507180214, "step": 456 }, { "advantage_max": 1.8463391065597534, "advantage_mean": 2.980232327587373e-08, "advantage_min": -0.9132648408412933, "advantage_std": 0.9998122155666351, "completion_length": 1766.3750305175781, "epoch": 0.5222857142857142, "grad_norm": 0.8214160203933716, "kl": 0.2835235595703125, "lambda_div_used": 0.6, "learning_rate": 1.2012473704494537e-07, "loss": 0.0113, "reward": 0.053376014227978885, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.053376014227978885, "reward_after_std": 0.6058420166373253, "reward_before_mean": 0.49396978225558996, "reward_before_std": 0.5151708796620369, "reward_change_max": 0.0011105164885520935, "reward_change_mean": -0.4405937194824219, "reward_change_min": -0.7060803063213825, "reward_change_std": 0.268763717263937, "reward_std": 0.6058420278131962, "rewards/cosine_scaled_reward": -0.09676513634622097, "rewards/format_reward": 0.6875000149011612, "step": 457 }, { "advantage_max": 1.858269363641739, "advantage_mean": 4.190951752303107e-09, "advantage_min": -0.9696895703673363, "advantage_std": 0.9998143240809441, "completion_length": 1340.2292098999023, "epoch": 0.5234285714285715, "grad_norm": 0.8421010971069336, "kl": 0.12511825561523438, "lambda_div_used": 0.6, "learning_rate": 1.1920622611056974e-07, "loss": 0.005, "reward": 0.040771787986159325, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.040771787986159325, "reward_after_std": 0.6010152883827686, "reward_before_mean": 0.48337520100176334, "reward_before_std": 0.5329945832490921, "reward_change_max": 0.0, "reward_change_mean": -0.442603413015604, "reward_change_min": -0.7114094980061054, "reward_change_std": 0.2769723404198885, "reward_std": 0.6010153144598007, "rewards/cosine_scaled_reward": -0.14372907672077417, "rewards/format_reward": 0.770833345130086, "step": 458 }, { "advantage_max": 1.8294371962547302, "advantage_mean": 1.8626454822978644e-09, "advantage_min": -1.01992367208004, "advantage_std": 0.9998571202158928, "completion_length": 1208.854206085205, "epoch": 0.5245714285714286, "grad_norm": 0.8173410892486572, "kl": 0.0986328125, "lambda_div_used": 0.6, "learning_rate": 1.1830871145697412e-07, "loss": 0.0039, "reward": 0.2616848908364773, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2616848908364773, "reward_after_std": 0.8070877604186535, "reward_before_mean": 0.7860768726095557, "reward_before_std": 0.7491856962442398, "reward_change_max": 0.00043958425521850586, "reward_change_mean": -0.5243919845670462, "reward_change_min": -0.9147582352161407, "reward_change_std": 0.3446339722722769, "reward_std": 0.8070877604186535, "rewards/cosine_scaled_reward": -0.03404490277171135, "rewards/format_reward": 0.854166679084301, "step": 459 }, { "advantage_max": 1.7802351862192154, "advantage_mean": 1.117587172805301e-08, "advantage_min": -0.9591168016195297, "advantage_std": 0.9998372122645378, "completion_length": 1933.541748046875, "epoch": 0.5257142857142857, "grad_norm": 1.4948370456695557, "kl": 0.232696533203125, "lambda_div_used": 0.6, "learning_rate": 1.1743223682775649e-07, "loss": 0.0093, "reward": 0.17830261262133718, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.17830261262133718, "reward_after_std": 0.766702301800251, "reward_before_mean": 0.669447798281908, "reward_before_std": 0.7442214302718639, "reward_change_max": 0.0005079954862594604, "reward_change_mean": -0.49114519357681274, "reward_change_min": -0.9560124427080154, "reward_change_std": 0.3472681976854801, "reward_std": 0.7667023204267025, "rewards/cosine_scaled_reward": -0.050692775286734104, "rewards/format_reward": 0.7708333488553762, "step": 460 }, { "advantage_max": 1.8203076124191284, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.9270096272230148, "advantage_std": 0.9998652040958405, "completion_length": 1526.2917098999023, "epoch": 0.5268571428571428, "grad_norm": 0.8616439700126648, "kl": 0.14084625244140625, "lambda_div_used": 0.6, "learning_rate": 1.1657684494105386e-07, "loss": 0.0056, "reward": 0.27595112178096315, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.27595112178096315, "reward_after_std": 0.9150453321635723, "reward_before_mean": 0.7884750002995133, "reward_before_std": 0.9043990522623062, "reward_change_max": 0.0, "reward_change_mean": -0.5125238858163357, "reward_change_min": -0.9507169425487518, "reward_change_std": 0.3724978230893612, "reward_std": 0.9150453470647335, "rewards/cosine_scaled_reward": 0.04007083596661687, "rewards/format_reward": 0.7083333432674408, "step": 461 }, { "advantage_max": 1.8993992805480957, "advantage_mean": 6.208817682207268e-09, "advantage_min": -0.8489513024687767, "advantage_std": 0.9998081251978874, "completion_length": 1436.9375228881836, "epoch": 0.528, "grad_norm": 1.4473426342010498, "kl": 0.20296478271484375, "lambda_div_used": 0.6, "learning_rate": 1.1574257748745986e-07, "loss": 0.0081, "reward": -0.10620388202369213, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.10620388202369213, "reward_after_std": 0.6253377795219421, "reward_before_mean": 0.24274032982066274, "reward_before_std": 0.5555364396423101, "reward_change_max": 8.26418399810791e-05, "reward_change_mean": -0.34894421696662903, "reward_change_min": -0.612749133259058, "reward_change_std": 0.24038436450064182, "reward_std": 0.625337790697813, "rewards/cosine_scaled_reward": -0.21196317533031106, "rewards/format_reward": 0.6666666716337204, "step": 462 }, { "advantage_max": 1.7989549040794373, "advantage_mean": -6.6744786364481e-09, "advantage_min": -0.9759308695793152, "advantage_std": 0.9998603165149689, "completion_length": 1856.2292022705078, "epoch": 0.5291428571428571, "grad_norm": 1.4682304859161377, "kl": 0.24060440063476562, "lambda_div_used": 0.6, "learning_rate": 1.1492947512799328e-07, "loss": 0.0096, "reward": 0.2849512416869402, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2849512416869402, "reward_after_std": 0.801155474036932, "reward_before_mean": 0.8258302640169859, "reward_before_std": 0.7735779173672199, "reward_change_max": 0.0022415444254875183, "reward_change_mean": -0.5408790297806263, "reward_change_min": -0.9627332501113415, "reward_change_std": 0.37879977002739906, "reward_std": 0.8011555224657059, "rewards/cosine_scaled_reward": 0.10041513899341226, "rewards/format_reward": 0.6250000093132257, "step": 463 }, { "advantage_max": 1.870115026831627, "advantage_mean": -3.756334376880943e-08, "advantage_min": -0.872229591012001, "advantage_std": 0.9998425468802452, "completion_length": 1322.395866394043, "epoch": 0.5302857142857142, "grad_norm": 0.9846634268760681, "kl": 0.19464111328125, "lambda_div_used": 0.6, "learning_rate": 1.1413757749211602e-07, "loss": 0.0078, "reward": 0.3826366728171706, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3826366728171706, "reward_after_std": 0.6611596159636974, "reward_before_mean": 1.0033353762701154, "reward_before_std": 0.5336503200232983, "reward_change_max": 0.0006307139992713928, "reward_change_mean": -0.6206987667828798, "reward_change_min": -0.9831653386354446, "reward_change_std": 0.36680967546999454, "reward_std": 0.6611596457660198, "rewards/cosine_scaled_reward": 0.1162510234862566, "rewards/format_reward": 0.7708333432674408, "step": 464 }, { "advantage_max": 1.9198713898658752, "advantage_mean": -3.104408563547878e-09, "advantage_min": -0.7656259946525097, "advantage_std": 0.9998870715498924, "completion_length": 1667.8542213439941, "epoch": 0.5314285714285715, "grad_norm": 1.1186479330062866, "kl": 0.3111457824707031, "lambda_div_used": 0.6, "learning_rate": 1.1336692317580158e-07, "loss": 0.0124, "reward": 0.4131432604044676, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4131432604044676, "reward_after_std": 1.006252758204937, "reward_before_mean": 0.9700895072892308, "reward_before_std": 0.8827327527105808, "reward_change_max": 0.0, "reward_change_mean": -0.5569462459534407, "reward_change_min": -0.9653652012348175, "reward_change_std": 0.3402498383074999, "reward_std": 1.0062527880072594, "rewards/cosine_scaled_reward": 0.037128068739548326, "rewards/format_reward": 0.8958333395421505, "step": 465 }, { "advantage_max": 1.8933946043252945, "advantage_mean": 9.93410786964688e-09, "advantage_min": -0.8079804182052612, "advantage_std": 0.9998544976115227, "completion_length": 1693.1875534057617, "epoch": 0.5325714285714286, "grad_norm": 1.2470462322235107, "kl": 0.2385406494140625, "lambda_div_used": 0.6, "learning_rate": 1.1261754973965422e-07, "loss": 0.0095, "reward": 0.16381147410720587, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16381147410720587, "reward_after_std": 0.8152791000902653, "reward_before_mean": 0.621258151717484, "reward_before_std": 0.7254021167755127, "reward_change_max": 0.000746704638004303, "reward_change_mean": -0.4574466794729233, "reward_change_min": -0.8183608688414097, "reward_change_std": 0.31307646073400974, "reward_std": 0.8152791261672974, "rewards/cosine_scaled_reward": -0.001870934385806322, "rewards/format_reward": 0.6250000093132257, "step": 466 }, { "advantage_max": 1.8866389095783234, "advantage_mean": 7.450581429591097e-09, "advantage_min": -0.825259268283844, "advantage_std": 0.9998439028859138, "completion_length": 1645.6875267028809, "epoch": 0.5337142857142857, "grad_norm": 1.0692403316497803, "kl": 0.2185211181640625, "lambda_div_used": 0.6, "learning_rate": 1.1188949370707787e-07, "loss": 0.0088, "reward": 0.14308831095695496, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.14308831095695496, "reward_after_std": 0.8592479415237904, "reward_before_mean": 0.5868221651762724, "reward_before_std": 0.7930984199047089, "reward_change_max": 0.0006082803010940552, "reward_change_mean": -0.4437338560819626, "reward_change_min": -0.8282331451773643, "reward_change_std": 0.3037046445533633, "reward_std": 0.8592479638755322, "rewards/cosine_scaled_reward": -0.09200559067539871, "rewards/format_reward": 0.7708333507180214, "step": 467 }, { "advantage_max": 1.841501995921135, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.9024374559521675, "advantage_std": 0.9998460412025452, "completion_length": 1776.7500610351562, "epoch": 0.5348571428571428, "grad_norm": 1.1234499216079712, "kl": 0.2584228515625, "lambda_div_used": 0.6, "learning_rate": 1.1118279056249653e-07, "loss": 0.0103, "reward": 0.05111578106880188, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.05111578106880188, "reward_after_std": 0.7923570424318314, "reward_before_mean": 0.45699799386784434, "reward_before_std": 0.7673504799604416, "reward_change_max": 0.0007030144333839417, "reward_change_mean": -0.40588223095983267, "reward_change_min": -0.8021054789423943, "reward_change_std": 0.2961500799283385, "reward_std": 0.792357049882412, "rewards/cosine_scaled_reward": -0.1048343344591558, "rewards/format_reward": 0.6666666753590107, "step": 468 }, { "advantage_max": 1.872466504573822, "advantage_mean": -1.5522042651205936e-08, "advantage_min": -0.9310917481780052, "advantage_std": 0.9998057186603546, "completion_length": 1483.645866394043, "epoch": 0.536, "grad_norm": 0.9675341844558716, "kl": 0.2593994140625, "lambda_div_used": 0.6, "learning_rate": 1.1049747474962444e-07, "loss": 0.0104, "reward": 0.04530154122039676, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04530154122039676, "reward_after_std": 0.546064380556345, "reward_before_mean": 0.49883297085762024, "reward_before_std": 0.4377680625766516, "reward_change_max": 0.0004517883062362671, "reward_change_mean": -0.4535314626991749, "reward_change_min": -0.6864179968833923, "reward_change_std": 0.2757315170019865, "reward_std": 0.5460643954575062, "rewards/cosine_scaled_reward": -0.11516685970127583, "rewards/format_reward": 0.7291666772216558, "step": 469 }, { "advantage_max": 1.7256519347429276, "advantage_mean": 3.073364596151151e-08, "advantage_min": -0.8964690193533897, "advantage_std": 0.9998262897133827, "completion_length": 2194.7708740234375, "epoch": 0.5371428571428571, "grad_norm": 2.29399037361145, "kl": 0.50030517578125, "lambda_div_used": 0.6, "learning_rate": 1.0983357966978745e-07, "loss": 0.02, "reward": -0.03671526629477739, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03671526629477739, "reward_after_std": 0.7010341510176659, "reward_before_mean": 0.3439774438738823, "reward_before_std": 0.7186840288341045, "reward_change_max": 0.0023821890354156494, "reward_change_mean": -0.3806927101686597, "reward_change_min": -0.7756905145943165, "reward_change_std": 0.30107271298766136, "reward_std": 0.7010341733694077, "rewards/cosine_scaled_reward": -0.13009461481124163, "rewards/format_reward": 0.6041666679084301, "step": 470 }, { "advantage_max": 1.7769792973995209, "advantage_mean": 9.31322685637781e-10, "advantage_min": -1.0818905234336853, "advantage_std": 0.9998090341687202, "completion_length": 2019.2500457763672, "epoch": 0.5382857142857143, "grad_norm": 1.6096271276474, "kl": 0.419647216796875, "lambda_div_used": 0.6, "learning_rate": 1.0919113768029517e-07, "loss": 0.0168, "reward": 0.12106739962473512, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12106739962473512, "reward_after_std": 0.6331440471112728, "reward_before_mean": 0.6041710805147886, "reward_before_std": 0.61738595739007, "reward_change_max": 0.0002641230821609497, "reward_change_mean": -0.4831036403775215, "reward_change_min": -0.8042481653392315, "reward_change_std": 0.3272095564752817, "reward_std": 0.6331440769135952, "rewards/cosine_scaled_reward": -0.020831143483519554, "rewards/format_reward": 0.6458333432674408, "step": 471 }, { "advantage_max": 1.9175452291965485, "advantage_mean": 4.9670543234014986e-09, "advantage_min": -0.7889066264033318, "advantage_std": 0.999824769794941, "completion_length": 1967.4583892822266, "epoch": 0.5394285714285715, "grad_norm": 0.9014610648155212, "kl": 0.339630126953125, "lambda_div_used": 0.6, "learning_rate": 1.0857018009286381e-07, "loss": 0.0136, "reward": -0.0833109375089407, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.0833109375089407, "reward_after_std": 0.6962088979780674, "reward_before_mean": 0.2651079539209604, "reward_before_std": 0.6203143000602722, "reward_change_max": 0.0, "reward_change_mean": -0.34841887280344963, "reward_change_min": -0.6095849871635437, "reward_change_std": 0.23636100441217422, "reward_std": 0.6962089166045189, "rewards/cosine_scaled_reward": -0.2007793728262186, "rewards/format_reward": 0.6666666753590107, "step": 472 }, { "advantage_max": 1.7719455808401108, "advantage_mean": -9.934107647602275e-09, "advantage_min": -1.0141915827989578, "advantage_std": 0.9998443275690079, "completion_length": 1939.916732788086, "epoch": 0.5405714285714286, "grad_norm": 1.1787399053573608, "kl": 0.34352874755859375, "lambda_div_used": 0.6, "learning_rate": 1.0797073717209013e-07, "loss": 0.0137, "reward": 0.030010550282895565, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.030010550282895565, "reward_after_std": 0.7244163006544113, "reward_before_mean": 0.4455965477973223, "reward_before_std": 0.7282268181443214, "reward_change_max": 0.0021120458841323853, "reward_change_mean": -0.41558599285781384, "reward_change_min": -0.7554572410881519, "reward_change_std": 0.31058728136122227, "reward_std": 0.7244163267314434, "rewards/cosine_scaled_reward": -0.14178507030010223, "rewards/format_reward": 0.7291666753590107, "step": 473 }, { "advantage_max": 1.7680951356887817, "advantage_mean": -3.6011140736036396e-08, "advantage_min": -1.0582480803132057, "advantage_std": 0.999866709113121, "completion_length": 1825.8750381469727, "epoch": 0.5417142857142857, "grad_norm": 1.1968833208084106, "kl": 0.3557090759277344, "lambda_div_used": 0.6, "learning_rate": 1.0739283813397639e-07, "loss": 0.0142, "reward": 0.6759944395162165, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6759944395162165, "reward_after_std": 0.8907247744500637, "reward_before_mean": 1.4209766387939453, "reward_before_std": 0.8035724554210901, "reward_change_max": 0.0008310303092002869, "reward_change_mean": -0.7449822090566158, "reward_change_min": -1.1608989238739014, "reward_change_std": 0.4833258595317602, "reward_std": 0.8907247819006443, "rewards/cosine_scaled_reward": 0.31465496867895126, "rewards/format_reward": 0.7916666828095913, "step": 474 }, { "advantage_max": 1.8225472569465637, "advantage_mean": -5.5879357807597785e-09, "advantage_min": -0.8803718611598015, "advantage_std": 0.9998807609081268, "completion_length": 1464.9375381469727, "epoch": 0.5428571428571428, "grad_norm": 1.0400774478912354, "kl": 0.16465377807617188, "lambda_div_used": 0.6, "learning_rate": 1.068365111445064e-07, "loss": 0.0066, "reward": 0.40743301063776016, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.40743301063776016, "reward_after_std": 0.963828545063734, "reward_before_mean": 0.9846168980002403, "reward_before_std": 0.9466955624520779, "reward_change_max": 0.0, "reward_change_mean": -0.5771838165819645, "reward_change_min": -1.096062332391739, "reward_change_std": 0.4055010573938489, "reward_std": 0.963828556239605, "rewards/cosine_scaled_reward": 0.05480840336531401, "rewards/format_reward": 0.8750000055879354, "step": 475 }, { "advantage_max": 1.8622777611017227, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.9705834239721298, "advantage_std": 0.9998641908168793, "completion_length": 1534.2708740234375, "epoch": 0.544, "grad_norm": 0.9736862778663635, "kl": 0.16690826416015625, "lambda_div_used": 0.6, "learning_rate": 1.063017833182728e-07, "loss": 0.0067, "reward": 0.2875065142288804, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2875065142288804, "reward_after_std": 0.8677309937775135, "reward_before_mean": 0.8108938159421086, "reward_before_std": 0.8140701726078987, "reward_change_max": 0.0, "reward_change_mean": -0.5233872570097446, "reward_change_min": -0.9255200996994972, "reward_change_std": 0.3573854472488165, "reward_std": 0.8677310347557068, "rewards/cosine_scaled_reward": -0.021636446937918663, "rewards/format_reward": 0.8541666865348816, "step": 476 }, { "advantage_max": 1.8614348471164703, "advantage_mean": -1.614292433060882e-08, "advantage_min": -0.8287764713168144, "advantage_std": 0.9998591169714928, "completion_length": 1370.2500343322754, "epoch": 0.5451428571428572, "grad_norm": 0.8771942853927612, "kl": 0.14801025390625, "lambda_div_used": 0.6, "learning_rate": 1.0578868071715544e-07, "loss": 0.0059, "reward": 0.35155012272298336, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.35155012272298336, "reward_after_std": 0.872353557497263, "reward_before_mean": 0.9064036831259727, "reward_before_std": 0.7498830500990152, "reward_change_max": 0.0010341629385948181, "reward_change_mean": -0.5548535455018282, "reward_change_min": -0.925615169107914, "reward_change_std": 0.3508171420544386, "reward_std": 0.872353583574295, "rewards/cosine_scaled_reward": 0.05736849526874721, "rewards/format_reward": 0.791666679084301, "step": 477 }, { "advantage_max": 1.912682831287384, "advantage_mean": -4.967053324200776e-09, "advantage_min": -0.8365165963768959, "advantage_std": 0.9998528063297272, "completion_length": 1837.791732788086, "epoch": 0.5462857142857143, "grad_norm": 1.1443203687667847, "kl": 0.3565864562988281, "lambda_div_used": 0.6, "learning_rate": 1.0529722834905125e-07, "loss": 0.0142, "reward": 0.192441092338413, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.192441092338413, "reward_after_std": 0.827689416706562, "reward_before_mean": 0.6641485160216689, "reward_before_std": 0.7144726626574993, "reward_change_max": 0.0, "reward_change_mean": -0.4717074781656265, "reward_change_min": -0.7887590229511261, "reward_change_std": 0.3077526353299618, "reward_std": 0.8276894614100456, "rewards/cosine_scaled_reward": -0.0116757289506495, "rewards/format_reward": 0.687500013038516, "step": 478 }, { "advantage_max": 1.7926261574029922, "advantage_mean": 2.1730862664881556e-09, "advantage_min": -0.9882497265934944, "advantage_std": 0.9998070746660233, "completion_length": 2006.4792098999023, "epoch": 0.5474285714285714, "grad_norm": 1.2607219219207764, "kl": 0.40653228759765625, "lambda_div_used": 0.6, "learning_rate": 1.0482745016665526e-07, "loss": 0.0162, "reward": 0.013332958100363612, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.013332958100363612, "reward_after_std": 0.6395022124052048, "reward_before_mean": 0.43424001708626747, "reward_before_std": 0.6039809063076973, "reward_change_max": 0.0, "reward_change_mean": -0.42090705782175064, "reward_change_min": -0.743909303098917, "reward_change_std": 0.2912545781582594, "reward_std": 0.6395022161304951, "rewards/cosine_scaled_reward": -0.08496332913637161, "rewards/format_reward": 0.6041666679084301, "step": 479 }, { "advantage_max": 1.8559697270393372, "advantage_mean": 2.173086099954702e-09, "advantage_min": -0.834226630628109, "advantage_std": 0.999858595430851, "completion_length": 1779.0833740234375, "epoch": 0.5485714285714286, "grad_norm": 1.2886638641357422, "kl": 0.357391357421875, "lambda_div_used": 0.6, "learning_rate": 1.0437936906629334e-07, "loss": 0.0143, "reward": 0.138822834007442, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.138822834007442, "reward_after_std": 0.9744543358683586, "reward_before_mean": 0.557562917470932, "reward_before_std": 0.9577418360859156, "reward_change_max": 0.0003891661763191223, "reward_change_mean": -0.41874008253216743, "reward_change_min": -0.8996061198413372, "reward_change_std": 0.34219310991466045, "reward_std": 0.9744543917477131, "rewards/cosine_scaled_reward": -0.0858018803410232, "rewards/format_reward": 0.7291666772216558, "step": 480 }, { "advantage_max": 1.8721778094768524, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.9300287738442421, "advantage_std": 0.9998296350240707, "completion_length": 1719.6459045410156, "epoch": 0.5497142857142857, "grad_norm": 0.6540005803108215, "kl": 0.24334716796875, "lambda_div_used": 0.6, "learning_rate": 1.0395300688680625e-07, "loss": 0.0097, "reward": 0.038552841171622276, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.038552841171622276, "reward_after_std": 0.6775308847427368, "reward_before_mean": 0.4625336639583111, "reward_before_std": 0.6034595742821693, "reward_change_max": 0.0, "reward_change_mean": -0.4239808265119791, "reward_change_min": -0.7124708220362663, "reward_change_std": 0.2699653413146734, "reward_std": 0.6775308921933174, "rewards/cosine_scaled_reward": -0.12289983592927456, "rewards/format_reward": 0.7083333507180214, "step": 481 }, { "advantage_max": 1.7981848269701004, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.9321806207299232, "advantage_std": 0.9998157620429993, "completion_length": 1528.458366394043, "epoch": 0.5508571428571428, "grad_norm": 1.0169956684112549, "kl": 0.2531890869140625, "lambda_div_used": 0.6, "learning_rate": 1.0354838440848501e-07, "loss": 0.0101, "reward": 0.15139730274677277, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15139730274677277, "reward_after_std": 0.7655397467315197, "reward_before_mean": 0.6284934259019792, "reward_before_std": 0.7671113889664412, "reward_change_max": 0.0007704496383666992, "reward_change_mean": -0.477096114307642, "reward_change_min": -0.9491989016532898, "reward_change_std": 0.3575117578729987, "reward_std": 0.7655397653579712, "rewards/cosine_scaled_reward": -0.05033663008362055, "rewards/format_reward": 0.7291666865348816, "step": 482 }, { "advantage_max": 1.9271509051322937, "advantage_mean": -1.9247333948868572e-08, "advantage_min": -0.8351431041955948, "advantage_std": 0.9998330399394035, "completion_length": 1956.7917022705078, "epoch": 0.552, "grad_norm": 0.846644401550293, "kl": 0.28060150146484375, "lambda_div_used": 0.6, "learning_rate": 1.0316552135205837e-07, "loss": 0.0112, "reward": 0.07792708650231361, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07792708650231361, "reward_after_std": 0.8151919208467007, "reward_before_mean": 0.48914189636707306, "reward_before_std": 0.7106311805546284, "reward_change_max": 0.0, "reward_change_mean": -0.41121480986475945, "reward_change_min": -0.7100636810064316, "reward_change_std": 0.2673608586192131, "reward_std": 0.8151919543743134, "rewards/cosine_scaled_reward": -0.1200123907183297, "rewards/format_reward": 0.7291666753590107, "step": 483 }, { "advantage_max": 1.803183764219284, "advantage_mean": -1.8936892692833496e-08, "advantage_min": -0.9652373790740967, "advantage_std": 0.9998325034976006, "completion_length": 1344.0417022705078, "epoch": 0.5531428571428572, "grad_norm": 1.1684057712554932, "kl": 0.20900726318359375, "lambda_div_used": 0.6, "learning_rate": 1.0280443637773163e-07, "loss": 0.0084, "reward": 0.35994289815425873, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.35994289815425873, "reward_after_std": 0.8187261484563351, "reward_before_mean": 0.9432360231876373, "reward_before_std": 0.7991822604089975, "reward_change_max": 0.0, "reward_change_mean": -0.5832931138575077, "reward_change_min": -0.9661167785525322, "reward_change_std": 0.39387095533311367, "reward_std": 0.8187261670827866, "rewards/cosine_scaled_reward": 0.04453466390259564, "rewards/format_reward": 0.854166679084301, "step": 484 }, { "advantage_max": 1.8188410550355911, "advantage_mean": -2.0489097307674342e-08, "advantage_min": -0.9117425456643105, "advantage_std": 0.999856524169445, "completion_length": 1547.0834121704102, "epoch": 0.5542857142857143, "grad_norm": 1.74229097366333, "kl": 0.2944793701171875, "lambda_div_used": 0.6, "learning_rate": 1.0246514708427701e-07, "loss": 0.0118, "reward": 0.23888829350471497, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.23888829350471497, "reward_after_std": 0.7746790088713169, "reward_before_mean": 0.7610771879553795, "reward_before_std": 0.733045406639576, "reward_change_max": 0.000652313232421875, "reward_change_mean": -0.5221888944506645, "reward_change_min": -0.8894664272665977, "reward_change_std": 0.3509647063910961, "reward_std": 0.7746790125966072, "rewards/cosine_scaled_reward": -0.025711423717439175, "rewards/format_reward": 0.8125000149011612, "step": 485 }, { "advantage_max": 1.8529712557792664, "advantage_mean": -8.692343955729598e-09, "advantage_min": -0.9172825664281845, "advantage_std": 0.9998394474387169, "completion_length": 901.3125305175781, "epoch": 0.5554285714285714, "grad_norm": 1.0345560312271118, "kl": 0.11465072631835938, "lambda_div_used": 0.6, "learning_rate": 1.0214767000817596e-07, "loss": 0.0046, "reward": 0.291486918926239, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.291486918926239, "reward_after_std": 0.7153438255190849, "reward_before_mean": 0.8480473421514034, "reward_before_std": 0.5964875835925341, "reward_change_max": 0.0, "reward_change_mean": -0.5565604045987129, "reward_change_min": -0.9302693456411362, "reward_change_std": 0.3404785916209221, "reward_std": 0.7153438292443752, "rewards/cosine_scaled_reward": -0.02389301359653473, "rewards/format_reward": 0.8958333432674408, "step": 486 }, { "advantage_max": 1.8933127522468567, "advantage_mean": 1.6763806898190126e-08, "advantage_min": -0.8865720219910145, "advantage_std": 0.9998225793242455, "completion_length": 1088.0625286102295, "epoch": 0.5565714285714286, "grad_norm": 1.1974716186523438, "kl": 0.109893798828125, "lambda_div_used": 0.6, "learning_rate": 1.0185202062281336e-07, "loss": 0.0044, "reward": 0.4667796200737939, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4667796200737939, "reward_after_std": 0.7072942927479744, "reward_before_mean": 1.1232407530769706, "reward_before_std": 0.5628406452015042, "reward_change_max": 0.0, "reward_change_mean": -0.6564611494541168, "reward_change_min": -1.0229570604860783, "reward_change_std": 0.3858571834862232, "reward_std": 0.7072943113744259, "rewards/cosine_scaled_reward": 0.11370370723307133, "rewards/format_reward": 0.8958333432674408, "step": 487 }, { "advantage_max": 1.824654459953308, "advantage_mean": 1.862645193639878e-08, "advantage_min": -0.9079797342419624, "advantage_std": 0.9998256489634514, "completion_length": 1488.770866394043, "epoch": 0.5577142857142857, "grad_norm": 1.3172677755355835, "kl": 0.33730316162109375, "lambda_div_used": 0.6, "learning_rate": 1.0157821333772304e-07, "loss": 0.0135, "reward": 0.13362685590982437, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13362685590982437, "reward_after_std": 0.6905847005546093, "reward_before_mean": 0.6119700018316507, "reward_before_std": 0.6388088949024677, "reward_change_max": 0.0, "reward_change_mean": -0.47834310680627823, "reward_change_min": -0.8052879460155964, "reward_change_std": 0.30606131348758936, "reward_std": 0.6905847080051899, "rewards/cosine_scaled_reward": -0.07943168503697962, "rewards/format_reward": 0.7708333395421505, "step": 488 }, { "advantage_max": 1.9077175855636597, "advantage_mean": 1.490116174895917e-08, "advantage_min": -0.8542089462280273, "advantage_std": 0.999797873198986, "completion_length": 1636.25004196167, "epoch": 0.5588571428571428, "grad_norm": 1.2856013774871826, "kl": 0.2859954833984375, "lambda_div_used": 0.6, "learning_rate": 1.013262614978859e-07, "loss": 0.0114, "reward": -0.1691692327876808, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1691692327876808, "reward_after_std": 0.5174032747745514, "reward_before_mean": 0.1689762193709612, "reward_before_std": 0.4460158832371235, "reward_change_max": 0.0016556233167648315, "reward_change_mean": -0.3381454488262534, "reward_change_min": -0.5612492710351944, "reward_change_std": 0.20866003772243857, "reward_std": 0.517403282225132, "rewards/cosine_scaled_reward": -0.31134524568915367, "rewards/format_reward": 0.7916666846722364, "step": 489 }, { "advantage_max": 1.9346612095832825, "advantage_mean": 9.313225746154785e-09, "advantage_min": -0.7759344056248665, "advantage_std": 0.9998493641614914, "completion_length": 1638.666690826416, "epoch": 0.56, "grad_norm": 0.999034583568573, "kl": 0.37992095947265625, "lambda_div_used": 0.6, "learning_rate": 1.0109617738307911e-07, "loss": 0.0152, "reward": 0.2699229356367141, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2699229356367141, "reward_after_std": 0.8593488521873951, "reward_before_mean": 0.7801721151918173, "reward_before_std": 0.7295130789279938, "reward_change_max": 0.0, "reward_change_mean": -0.5102491900324821, "reward_change_min": -0.8690995946526527, "reward_change_std": 0.322820819914341, "reward_std": 0.8593488857150078, "rewards/cosine_scaled_reward": -0.01616394752636552, "rewards/format_reward": 0.8125000111758709, "step": 490 }, { "advantage_max": 1.8448296338319778, "advantage_mean": 2.1109978987077227e-08, "advantage_min": -0.8977131173014641, "advantage_std": 0.9998864755034447, "completion_length": 1702.6250686645508, "epoch": 0.5611428571428572, "grad_norm": 1.1440184116363525, "kl": 0.2804298400878906, "lambda_div_used": 0.6, "learning_rate": 1.0088797220727779e-07, "loss": 0.0112, "reward": 0.40567315742373466, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.40567315742373466, "reward_after_std": 0.9561508670449257, "reward_before_mean": 0.9794415173455491, "reward_before_std": 0.890728622674942, "reward_change_max": 0.00013072043657302856, "reward_change_mean": -0.5737683735787868, "reward_change_min": -1.022655088454485, "reward_change_std": 0.392275283113122, "reward_std": 0.9561508968472481, "rewards/cosine_scaled_reward": 0.10430408432148397, "rewards/format_reward": 0.7708333432674408, "step": 491 }, { "advantage_max": 1.8528670370578766, "advantage_mean": -8.265487849712372e-09, "advantage_min": -0.9619855433702469, "advantage_std": 0.9998434036970139, "completion_length": 1238.020881652832, "epoch": 0.5622857142857143, "grad_norm": 1.146223783493042, "kl": 0.2194671630859375, "lambda_div_used": 0.6, "learning_rate": 1.0070165611810855e-07, "loss": 0.0088, "reward": 0.17629980109632015, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17629980109632015, "reward_after_std": 0.7229134701192379, "reward_before_mean": 0.664001327008009, "reward_before_std": 0.6515200156718493, "reward_change_max": 0.003348402678966522, "reward_change_mean": -0.48770153522491455, "reward_change_min": -0.759284932166338, "reward_change_std": 0.31521399691700935, "reward_std": 0.7229134850203991, "rewards/cosine_scaled_reward": -0.02216600440442562, "rewards/format_reward": 0.7083333358168602, "step": 492 }, { "advantage_max": 1.845920279622078, "advantage_mean": -2.421438827227007e-08, "advantage_min": -0.9289551004767418, "advantage_std": 0.9998549744486809, "completion_length": 1406.0417022705078, "epoch": 0.5634285714285714, "grad_norm": 1.184366226196289, "kl": 0.219512939453125, "lambda_div_used": 0.6, "learning_rate": 1.005372381963547e-07, "loss": 0.0088, "reward": 0.3451455421745777, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3451455421745777, "reward_after_std": 0.794960230588913, "reward_before_mean": 0.914349190890789, "reward_before_std": 0.6987705379724503, "reward_change_max": 0.0, "reward_change_mean": -0.5692036896944046, "reward_change_min": -0.9593402594327927, "reward_change_std": 0.36596825532615185, "reward_std": 0.7949602715671062, "rewards/cosine_scaled_reward": 0.019674593582749367, "rewards/format_reward": 0.8750000149011612, "step": 493 }, { "advantage_max": 1.8998329788446426, "advantage_mean": -8.07146260939362e-09, "advantage_min": -0.7808930799365044, "advantage_std": 0.9998975768685341, "completion_length": 1432.9375381469727, "epoch": 0.5645714285714286, "grad_norm": 1.0612187385559082, "kl": 0.2517433166503906, "lambda_div_used": 0.6, "learning_rate": 1.0039472645551372e-07, "loss": 0.01, "reward": 0.3418361786752939, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3418361786752939, "reward_after_std": 1.0622280165553093, "reward_before_mean": 0.8451494723558426, "reward_before_std": 0.9670507088303566, "reward_change_max": 0.0, "reward_change_mean": -0.5033133029937744, "reward_change_min": -0.9100279286503792, "reward_change_std": 0.33215487375855446, "reward_std": 1.0622280314564705, "rewards/cosine_scaled_reward": -0.03575861267745495, "rewards/format_reward": 0.9166666865348816, "step": 494 }, { "advantage_max": 1.8994756937026978, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.8431757986545563, "advantage_std": 0.999841496348381, "completion_length": 1628.3542098999023, "epoch": 0.5657142857142857, "grad_norm": 1.2267202138900757, "kl": 0.2889556884765625, "lambda_div_used": 0.6, "learning_rate": 1.002741278414069e-07, "loss": 0.0116, "reward": 0.2072940650396049, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2072940650396049, "reward_after_std": 0.8287488594651222, "reward_before_mean": 0.689543791115284, "reward_before_std": 0.7430235873907804, "reward_change_max": 0.0, "reward_change_mean": -0.4822497144341469, "reward_change_min": -0.8137565180659294, "reward_change_std": 0.3069156575948, "reward_std": 0.828748881816864, "rewards/cosine_scaled_reward": -0.040644790045917034, "rewards/format_reward": 0.7708333376795053, "step": 495 }, { "advantage_max": 1.8756288141012192, "advantage_mean": -2.9414272129102415e-08, "advantage_min": -0.8122064806520939, "advantage_std": 0.9998170509934425, "completion_length": 1611.4375076293945, "epoch": 0.5668571428571428, "grad_norm": 0.8443682193756104, "kl": 0.2679557800292969, "lambda_div_used": 0.6, "learning_rate": 1.0017544823184055e-07, "loss": 0.0107, "reward": 0.3717600470408797, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3717600470408797, "reward_after_std": 0.6946693547070026, "reward_before_mean": 0.9796913452446461, "reward_before_std": 0.5551566991489381, "reward_change_max": 9.44286584854126e-05, "reward_change_mean": -0.6079313308000565, "reward_change_min": -0.9462003000080585, "reward_change_std": 0.37002974562346935, "reward_std": 0.694669384509325, "rewards/cosine_scaled_reward": 0.07317899935878813, "rewards/format_reward": 0.8333333395421505, "step": 496 }, { "advantage_max": 1.8541768789291382, "advantage_mean": -1.241763691872677e-09, "advantage_min": -0.912086084485054, "advantage_std": 0.9998577758669853, "completion_length": 1330.3542175292969, "epoch": 0.568, "grad_norm": 0.8634279370307922, "kl": 0.2939910888671875, "lambda_div_used": 0.6, "learning_rate": 1.0009869243631952e-07, "loss": 0.0117, "reward": 0.4358122395351529, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4358122395351529, "reward_after_std": 0.8337152674794197, "reward_before_mean": 1.0514396652579308, "reward_before_std": 0.7460405230522156, "reward_change_max": 0.0, "reward_change_mean": -0.6156274164095521, "reward_change_min": -1.005971860140562, "reward_change_std": 0.39167843107134104, "reward_std": 0.8337152861058712, "rewards/cosine_scaled_reward": 0.09863647632300854, "rewards/format_reward": 0.8541666772216558, "step": 497 }, { "advantage_max": 1.8234520852565765, "advantage_mean": 2.421438694000244e-08, "advantage_min": -0.9808195158839226, "advantage_std": 0.9998212158679962, "completion_length": 1489.4792251586914, "epoch": 0.5691428571428572, "grad_norm": 1.3871195316314697, "kl": 0.23724365234375, "lambda_div_used": 0.6, "learning_rate": 1.000438641958131e-07, "loss": 0.0095, "reward": 0.07590579707175493, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07590579707175493, "reward_after_std": 0.8010249249637127, "reward_before_mean": 0.49792688991874456, "reward_before_std": 0.7750258985906839, "reward_change_max": 0.001229986548423767, "reward_change_mean": -0.42202106304466724, "reward_change_min": -0.7636589221656322, "reward_change_std": 0.29837763123214245, "reward_std": 0.801024928689003, "rewards/cosine_scaled_reward": -0.09478656761348248, "rewards/format_reward": 0.6875000204890966, "step": 498 }, { "advantage_max": 1.8711726665496826, "advantage_mean": -1.7384688466570708e-08, "advantage_min": -0.9095111042261124, "advantage_std": 0.9998540803790092, "completion_length": 1537.0416946411133, "epoch": 0.5702857142857143, "grad_norm": 1.4712544679641724, "kl": 0.21712112426757812, "lambda_div_used": 0.6, "learning_rate": 1.0001096618257236e-07, "loss": 0.0087, "reward": 0.2286053616553545, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2286053616553545, "reward_after_std": 0.8291343785822392, "reward_before_mean": 0.7263824446126819, "reward_before_std": 0.7433257922530174, "reward_change_max": 0.0, "reward_change_mean": -0.4977770820260048, "reward_change_min": -0.8215654790401459, "reward_change_std": 0.3198694474995136, "reward_std": 0.8291344232857227, "rewards/cosine_scaled_reward": -0.04305878991726786, "rewards/format_reward": 0.8125000111758709, "step": 499 }, { "advantage_max": 1.8898231089115143, "advantage_mean": -9.934107980669182e-09, "advantage_min": -0.8911553248763084, "advantage_std": 0.9998495951294899, "completion_length": 1224.2291793823242, "epoch": 0.5714285714285714, "grad_norm": 1.1404680013656616, "kl": 0.22232437133789062, "lambda_div_used": 0.6, "learning_rate": 1e-07, "loss": 0.0089, "reward": 0.28170950431376696, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.28170950431376696, "reward_after_std": 0.8098886050283909, "reward_before_mean": 0.8126306012272835, "reward_before_std": 0.7228812500834465, "reward_change_max": 0.0, "reward_change_mean": -0.5309210903942585, "reward_change_min": -0.8794347941875458, "reward_change_std": 0.3362457137554884, "reward_std": 0.8098886422812939, "rewards/cosine_scaled_reward": -0.04160138592123985, "rewards/format_reward": 0.8958333507180214, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.0016669491224483837, "train_runtime": 56169.988, "train_samples_per_second": 0.427, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }