{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage_max": 1.6607913672924042, "advantage_mean": 3.042320539936583e-08, "advantage_min": -1.1215128377079964, "advantage_std": 0.9998299926519394, "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.1972947120666504, "kl": 0.0, "lambda_div_used": 0.7000000000000001, "learning_rate": 2e-08, "loss": -0.0, "reward": 0.1723687592893839, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1723687592893839, "reward_after_std": 0.7976016290485859, "reward_before_mean": 0.4897647276520729, "reward_before_std": 0.8290339298546314, "reward_change_max": 0.00042107701301574707, "reward_change_mean": -0.31739595998078585, "reward_change_min": -0.6219300664961338, "reward_change_std": 0.2523575215600431, "reward_std": 0.7976016625761986, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "advantage_max": 1.5881455093622208, "advantage_mean": 4.9670528801115665e-09, "advantage_min": -1.1069519817829132, "advantage_std": 0.9997488334774971, "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.18163850903511047, "kl": 0.0, "lambda_div_used": 0.7000000000000001, "learning_rate": 4e-08, "loss": 0.0, "reward": -0.018269629566930234, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.018269629566930234, "reward_after_std": 0.444029388949275, "reward_before_mean": 0.27539755403995514, "reward_before_std": 0.42092561535537243, "reward_change_max": 0.0009796768426895142, "reward_change_mean": -0.29366718512028456, "reward_change_min": -0.478233277797699, "reward_change_std": 0.19509424595162272, "reward_std": 0.44402940198779106, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "advantage_max": 1.6156881153583527, "advantage_mean": 6.301949470599588e-08, "advantage_min": -1.172730676829815, "advantage_std": 0.9996396973729134, "completion_length": 3375.4375, "epoch": 0.0034285714285714284, "grad_norm": 0.16647681593894958, "kl": 4.2244791984558105e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6e-08, "loss": 0.0, "reward": -0.439353309571743, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.439353309571743, "reward_after_std": 0.32052337378263474, "reward_before_mean": -0.28239025454968214, "reward_before_std": 0.3277525738812983, "reward_change_max": 0.002180330455303192, "reward_change_mean": -0.15696304757148027, "reward_change_min": -0.2843666225671768, "reward_change_std": 0.11916331853717566, "reward_std": 0.3205233830958605, "rewards/cosine_scaled_reward": -0.1828617942519486, "rewards/format_reward": 0.0833333358168602, "step": 3 }, { "advantage_max": 1.7174324095249176, "advantage_mean": -2.3593506148777976e-08, "advantage_min": -1.043996386229992, "advantage_std": 0.9998214840888977, "completion_length": 2284.8125762939453, "epoch": 0.004571428571428572, "grad_norm": 0.2447059452533722, "kl": 3.663450479507446e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 8e-08, "loss": 0.0, "reward": 0.18947136122733355, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.18947136122733355, "reward_after_std": 0.7922682836651802, "reward_before_mean": 0.5098046064376831, "reward_before_std": 0.7891745567321777, "reward_change_max": 0.0017295852303504944, "reward_change_mean": -0.32033321633934975, "reward_change_min": -0.6060966607183218, "reward_change_std": 0.243425321765244, "reward_std": 0.7922682948410511, "rewards/cosine_scaled_reward": -0.05759772006422281, "rewards/format_reward": 0.6250000055879354, "step": 4 }, { "advantage_max": 1.8149476051330566, "advantage_mean": -1.4280278959333259e-08, "advantage_min": -0.8999244049191475, "advantage_std": 0.9998219162225723, "completion_length": 3342.2083740234375, "epoch": 0.005714285714285714, "grad_norm": 0.17365455627441406, "kl": 4.491955041885376e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1e-07, "loss": 0.0, "reward": -0.33979691937565804, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.33979691937565804, "reward_after_std": 0.7315108627080917, "reward_before_mean": -0.2079819105565548, "reward_before_std": 0.7307381853461266, "reward_change_max": 0.0010365769267082214, "reward_change_mean": -0.13181501254439354, "reward_change_min": -0.2945617139339447, "reward_change_std": 0.11999980034306645, "reward_std": 0.7315108887851238, "rewards/cosine_scaled_reward": -0.23940762784332037, "rewards/format_reward": 0.2708333395421505, "step": 5 }, { "advantage_max": 1.8157245814800262, "advantage_mean": 3.228585049308563e-08, "advantage_min": -0.8999299108982086, "advantage_std": 0.9997979998588562, "completion_length": 3106.187545776367, "epoch": 0.006857142857142857, "grad_norm": 0.17503227293491364, "kl": 4.060566425323486e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2e-07, "loss": 0.0, "reward": -0.23407822172157466, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.23407822172157466, "reward_after_std": 0.6936947889626026, "reward_before_mean": -0.05999967269599438, "reward_before_std": 0.6732829138636589, "reward_change_max": 0.0016162022948265076, "reward_change_mean": -0.17407855205237865, "reward_change_min": -0.33925578370690346, "reward_change_std": 0.1386033445596695, "reward_std": 0.6936948224902153, "rewards/cosine_scaled_reward": -0.18624983666813932, "rewards/format_reward": 0.31250000558793545, "step": 6 }, { "advantage_max": 1.6683564186096191, "advantage_mean": 1.117587122845265e-08, "advantage_min": -1.0670213848352432, "advantage_std": 0.9998510107398033, "completion_length": 3045.5000610351562, "epoch": 0.008, "grad_norm": 0.1693323701620102, "kl": 1.975148916244507e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4e-07, "loss": 0.0, "reward": 0.12328216899186373, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12328216899186373, "reward_after_std": 0.8423587679862976, "reward_before_mean": 0.4237218517810106, "reward_before_std": 0.8992306850850582, "reward_change_max": 0.0007850527763366699, "reward_change_mean": -0.3004396799951792, "reward_change_min": -0.6309426687657833, "reward_change_std": 0.2723789247684181, "reward_std": 0.8423588238656521, "rewards/cosine_scaled_reward": -0.07980574667453766, "rewards/format_reward": 0.5833333395421505, "step": 7 }, { "advantage_max": 1.859576016664505, "advantage_mean": -9.313226301266297e-09, "advantage_min": -0.8094820007681847, "advantage_std": 0.9997851550579071, "completion_length": 2703.3125228881836, "epoch": 0.009142857142857144, "grad_norm": 0.17851409316062927, "kl": 2.454221248626709e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 0.46806809585541487, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.46806809585541487, "reward_after_std": 0.758019644767046, "reward_before_mean": 0.8838345343247056, "reward_before_std": 0.6388167105615139, "reward_change_max": 6.803125143051147e-05, "reward_change_mean": -0.4157663956284523, "reward_change_min": -0.6230667233467102, "reward_change_std": 0.23545266035944223, "reward_std": 0.7580196596682072, "rewards/cosine_scaled_reward": 0.20233390713110566, "rewards/format_reward": 0.4791666679084301, "step": 8 }, { "advantage_max": 1.6771964579820633, "advantage_mean": 9.934107203513065e-09, "advantage_min": -1.049280323088169, "advantage_std": 0.9998246431350708, "completion_length": 3010.291748046875, "epoch": 0.010285714285714285, "grad_norm": 0.21464405953884125, "kl": 3.267824649810791e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8e-07, "loss": 0.0, "reward": -0.1797618877608329, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1797618877608329, "reward_after_std": 0.8017399236559868, "reward_before_mean": 0.007110316306352615, "reward_before_std": 0.8378483131527901, "reward_change_max": 0.001242212951183319, "reward_change_mean": -0.18687221501022577, "reward_change_min": -0.4056093581020832, "reward_change_std": 0.1812688522040844, "reward_std": 0.8017399460077286, "rewards/cosine_scaled_reward": -0.15269484417513013, "rewards/format_reward": 0.3125000074505806, "step": 9 }, { "advantage_max": 1.7588439136743546, "advantage_mean": 5.898376453927767e-09, "advantage_min": -0.8439291417598724, "advantage_std": 0.9998617395758629, "completion_length": 2760.6041946411133, "epoch": 0.011428571428571429, "grad_norm": 0.19300341606140137, "kl": 2.6448629796504974e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.02827950823120773, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.02827950823120773, "reward_after_std": 0.9186790250241756, "reward_before_mean": 0.2741504404693842, "reward_before_std": 0.9365747645497322, "reward_change_max": 0.0016991794109344482, "reward_change_mean": -0.24587093386799097, "reward_change_min": -0.5156212039291859, "reward_change_std": 0.21967529505491257, "reward_std": 0.9186790362000465, "rewards/cosine_scaled_reward": -0.07125811779405922, "rewards/format_reward": 0.4166666679084301, "step": 10 }, { "advantage_max": 1.7310106009244919, "advantage_mean": 6.798655038942059e-08, "advantage_min": -0.9994613453745842, "advantage_std": 0.9997423142194748, "completion_length": 3429.1458740234375, "epoch": 0.012571428571428572, "grad_norm": 0.18800467252731323, "kl": 3.6150217056274414e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": -0.5139304362237453, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5139304362237453, "reward_after_std": 0.5909987725317478, "reward_before_mean": -0.4253815487027168, "reward_before_std": 0.6046973243355751, "reward_change_max": 0.0015730857849121094, "reward_change_mean": -0.08854888007044792, "reward_change_min": -0.20590653829276562, "reward_change_std": 0.08903698669746518, "reward_std": 0.5909987948834896, "rewards/cosine_scaled_reward": -0.25435744039714336, "rewards/format_reward": 0.0833333358168602, "step": 11 }, { "advantage_max": 1.7789526730775833, "advantage_mean": -8.692344621863413e-09, "advantage_min": -0.9305046126246452, "advantage_std": 0.9997878223657608, "completion_length": 2463.1250228881836, "epoch": 0.013714285714285714, "grad_norm": 0.21396146714687347, "kl": 3.5785138607025146e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 0.1051873336546123, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1051873336546123, "reward_after_std": 0.6017142459750175, "reward_before_mean": 0.42070355266332626, "reward_before_std": 0.5864638835191727, "reward_change_max": 0.0003143623471260071, "reward_change_mean": -0.31551621994003654, "reward_change_min": -0.5249358452856541, "reward_change_std": 0.21235592477023602, "reward_std": 0.601714264601469, "rewards/cosine_scaled_reward": -0.10214823856949806, "rewards/format_reward": 0.6250000018626451, "step": 12 }, { "advantage_max": 1.7444801926612854, "advantage_mean": 2.460243814228491e-08, "advantage_min": -0.9748427495360374, "advantage_std": 0.9998096376657486, "completion_length": 2783.166732788086, "epoch": 0.014857142857142857, "grad_norm": 0.1993982493877411, "kl": 2.54213809967041e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.6e-07, "loss": 0.0, "reward": 0.11293572559952736, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.11293572559952736, "reward_after_std": 0.7738058529794216, "reward_before_mean": 0.40740850754082203, "reward_before_std": 0.7732120640575886, "reward_change_max": 0.00036291033029556274, "reward_change_mean": -0.29447276424616575, "reward_change_min": -0.5720260292291641, "reward_change_std": 0.2302146479487419, "reward_std": 0.7738058604300022, "rewards/cosine_scaled_reward": -0.056712422519922256, "rewards/format_reward": 0.5208333395421505, "step": 13 }, { "advantage_max": 1.815529689192772, "advantage_mean": 1.2417638028949796e-09, "advantage_min": -0.8386121764779091, "advantage_std": 0.9998480901122093, "completion_length": 2916.541717529297, "epoch": 0.016, "grad_norm": 0.25046306848526, "kl": 3.520399332046509e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.8e-07, "loss": 0.0, "reward": 0.023446697276085615, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.023446697276085615, "reward_after_std": 0.9370927512645721, "reward_before_mean": 0.26226543355733156, "reward_before_std": 0.9471188820898533, "reward_change_max": 7.539987564086914e-06, "reward_change_mean": -0.2388187418691814, "reward_change_min": -0.559939157217741, "reward_change_std": 0.2085849130526185, "reward_std": 0.9370927736163139, "rewards/cosine_scaled_reward": -0.077200623229146, "rewards/format_reward": 0.4166666753590107, "step": 14 }, { "advantage_max": 1.8290736228227615, "advantage_mean": -6.208810687802213e-10, "advantage_min": -0.8647431060671806, "advantage_std": 0.9997366145253181, "completion_length": 2774.145851135254, "epoch": 0.017142857142857144, "grad_norm": 0.18985596299171448, "kl": 2.9733404517173767e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.02424021461047232, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.02424021461047232, "reward_after_std": 0.6245809756219387, "reward_before_mean": 0.30136442370712757, "reward_before_std": 0.572150208055973, "reward_change_max": 0.0018563717603683472, "reward_change_mean": -0.2771241939626634, "reward_change_min": -0.4524312876164913, "reward_change_std": 0.18269951874390244, "reward_std": 0.6245809886604548, "rewards/cosine_scaled_reward": -0.03681779658654705, "rewards/format_reward": 0.3750000037252903, "step": 15 }, { "advantage_max": 1.7455659359693527, "advantage_mean": 6.457170187434969e-08, "advantage_min": -0.8832721263170242, "advantage_std": 0.9997515827417374, "completion_length": 3506.312530517578, "epoch": 0.018285714285714287, "grad_norm": 0.17467094957828522, "kl": 4.1387975215911865e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.2e-07, "loss": 0.0, "reward": -0.5477899853140116, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.5477899853140116, "reward_after_std": 0.4932219237089157, "reward_before_mean": -0.4574399571865797, "reward_before_std": 0.5024523958563805, "reward_change_max": 0.0009342208504676819, "reward_change_mean": -0.09035002673044801, "reward_change_min": -0.2171798963099718, "reward_change_std": 0.09168374631553888, "reward_std": 0.4932219348847866, "rewards/cosine_scaled_reward": -0.2495533125475049, "rewards/format_reward": 0.0416666679084301, "step": 16 }, { "advantage_max": 1.7420411109924316, "advantage_mean": -2.980232371996294e-08, "advantage_min": -0.9265064224600792, "advantage_std": 0.9998435750603676, "completion_length": 2181.583381652832, "epoch": 0.019428571428571427, "grad_norm": 0.27064624428749084, "kl": 3.8176774978637695e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": 0.29008908569812775, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.29008908569812775, "reward_after_std": 0.8539934456348419, "reward_before_mean": 0.6364327892661095, "reward_before_std": 0.8340755216777325, "reward_change_max": 0.000418640673160553, "reward_change_mean": -0.3463437305763364, "reward_change_min": -0.632047176361084, "reward_change_std": 0.23590328451246023, "reward_std": 0.8539934530854225, "rewards/cosine_scaled_reward": 0.02654973231256008, "rewards/format_reward": 0.5833333414047956, "step": 17 }, { "advantage_max": 1.7018649876117706, "advantage_mean": 2.1109978765032622e-08, "advantage_min": -1.0383967086672783, "advantage_std": 0.9998053833842278, "completion_length": 3002.083335876465, "epoch": 0.02057142857142857, "grad_norm": 0.20723013579845428, "kl": 2.5073066353797913e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.6e-07, "loss": 0.0, "reward": -0.0296906647272408, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.0296906647272408, "reward_after_std": 0.6654968298971653, "reward_before_mean": 0.2298431594390422, "reward_before_std": 0.6792299374938011, "reward_change_max": 0.0009440481662750244, "reward_change_mean": -0.2595338122919202, "reward_change_min": -0.49312782287597656, "reward_change_std": 0.1969085903838277, "reward_std": 0.6654968522489071, "rewards/cosine_scaled_reward": -0.0621617641299963, "rewards/format_reward": 0.35416667349636555, "step": 18 }, { "advantage_max": 1.7161194682121277, "advantage_mean": 1.6142925329809543e-08, "advantage_min": -1.0108607560396194, "advantage_std": 0.9998414665460587, "completion_length": 2970.104217529297, "epoch": 0.021714285714285714, "grad_norm": 0.17796573042869568, "kl": 3.248453140258789e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 0.28523150458931923, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.28523150458931923, "reward_after_std": 0.8642893098294735, "reward_before_mean": 0.6326376395300031, "reward_before_std": 0.8732211291790009, "reward_change_max": 0.0008312985301017761, "reward_change_mean": -0.347406136803329, "reward_change_min": -0.6251656841486692, "reward_change_std": 0.2677363967522979, "reward_std": 0.8642893265932798, "rewards/cosine_scaled_reward": 0.0767354778945446, "rewards/format_reward": 0.479166679084301, "step": 19 }, { "advantage_max": 1.6717558950185776, "advantage_mean": -9.93410742555767e-09, "advantage_min": -1.0723228305578232, "advantage_std": 0.9998261108994484, "completion_length": 2553.312557220459, "epoch": 0.022857142857142857, "grad_norm": 0.20613417029380798, "kl": 2.1822750568389893e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.1660033669322729, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1660033669322729, "reward_after_std": 0.7932123467326164, "reward_before_mean": 0.4822961278259754, "reward_before_std": 0.829401932656765, "reward_change_max": 0.0012446194887161255, "reward_change_mean": -0.3162927683442831, "reward_change_min": -0.6362494938075542, "reward_change_std": 0.25846812035888433, "reward_std": 0.793212354183197, "rewards/cosine_scaled_reward": -0.0609352788887918, "rewards/format_reward": 0.6041666734963655, "step": 20 }, { "advantage_max": 1.8311471194028854, "advantage_mean": 6.208817349140361e-09, "advantage_min": -0.8520054742693901, "advantage_std": 0.9998204782605171, "completion_length": 2773.7083892822266, "epoch": 0.024, "grad_norm": 0.3204244077205658, "kl": 5.778670310974121e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": 0.0062417155131697655, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0062417155131697655, "reward_after_std": 0.7150941640138626, "reward_before_mean": 0.2617703266441822, "reward_before_std": 0.6540368758141994, "reward_change_max": 0.000498548150062561, "reward_change_mean": -0.25552862230688334, "reward_change_min": -0.4329958073794842, "reward_change_std": 0.1697560576722026, "reward_std": 0.7150941789150238, "rewards/cosine_scaled_reward": -0.07744817808270454, "rewards/format_reward": 0.41666667349636555, "step": 21 }, { "advantage_max": 1.8255604952573776, "advantage_mean": -8.692344288796505e-09, "advantage_min": -0.9599033072590828, "advantage_std": 0.9998328015208244, "completion_length": 1705.8750534057617, "epoch": 0.025142857142857144, "grad_norm": 0.25831398367881775, "kl": 2.4750828742980957e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.43544840905815363, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.43544840905815363, "reward_after_std": 0.6950667537748814, "reward_before_mean": 0.8501517660915852, "reward_before_std": 0.6053979620337486, "reward_change_max": 0.0, "reward_change_mean": -0.41470331884920597, "reward_change_min": -0.6296084709465504, "reward_change_std": 0.2352834241464734, "reward_std": 0.6950667649507523, "rewards/cosine_scaled_reward": -0.0020074760541319847, "rewards/format_reward": 0.8541666716337204, "step": 22 }, { "advantage_max": 1.8754753768444061, "advantage_mean": 1.8005570812107408e-08, "advantage_min": -0.7764259651303291, "advantage_std": 0.9997936338186264, "completion_length": 2557.7500534057617, "epoch": 0.026285714285714287, "grad_norm": 0.22300642728805542, "kl": 3.248080611228943e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.6e-07, "loss": 0.0, "reward": -0.12141754711046815, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.12141754711046815, "reward_after_std": 0.6048268787562847, "reward_before_mean": 0.10700802784413099, "reward_before_std": 0.577505748718977, "reward_change_max": 0.0003226622939109802, "reward_change_mean": -0.22842555586248636, "reward_change_min": -0.4473407156765461, "reward_change_std": 0.1617820616811514, "reward_std": 0.6048269048333168, "rewards/cosine_scaled_reward": -0.154829328879714, "rewards/format_reward": 0.4166666716337204, "step": 23 }, { "advantage_max": 1.6645357608795166, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -1.0173160433769226, "advantage_std": 0.9998892396688461, "completion_length": 2809.2708892822266, "epoch": 0.027428571428571427, "grad_norm": 0.20829880237579346, "kl": 2.4370849132537842e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.15873132180422544, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.15873132180422544, "reward_after_std": 1.0201821699738503, "reward_before_mean": 0.44553845963673666, "reward_before_std": 1.0898916274309158, "reward_change_max": 0.0005302950739860535, "reward_change_mean": -0.28680715151131153, "reward_change_min": -0.6231388412415981, "reward_change_std": 0.27841206826269627, "reward_std": 1.0201821774244308, "rewards/cosine_scaled_reward": -0.037647439166903496, "rewards/format_reward": 0.5208333469927311, "step": 24 }, { "advantage_max": 1.6873723566532135, "advantage_mean": 3.4769377821319836e-08, "advantage_min": -1.0170445293188095, "advantage_std": 0.9998362511396408, "completion_length": 2766.604202270508, "epoch": 0.02857142857142857, "grad_norm": 0.22878548502922058, "kl": 4.16487455368042e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.09160550683736801, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09160550683736801, "reward_after_std": 0.814907144755125, "reward_before_mean": 0.3796383887529373, "reward_before_std": 0.8612931333482265, "reward_change_max": 0.0008770972490310669, "reward_change_mean": -0.28803289122879505, "reward_change_min": -0.6174117531627417, "reward_change_std": 0.24560536537319422, "reward_std": 0.8149071894586086, "rewards/cosine_scaled_reward": -0.018514135852456093, "rewards/format_reward": 0.41666667722165585, "step": 25 }, { "advantage_max": 1.579800471663475, "advantage_mean": 3.725289299261192e-09, "advantage_min": -1.174089826643467, "advantage_std": 0.9997816830873489, "completion_length": 2946.4583587646484, "epoch": 0.029714285714285714, "grad_norm": 0.16254839301109314, "kl": 2.8759241104125977e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.2e-07, "loss": 0.0, "reward": 0.10168552584946156, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10168552584946156, "reward_after_std": 0.5445095933973789, "reward_before_mean": 0.42849843576550484, "reward_before_std": 0.5547886602580547, "reward_change_max": 0.0030766278505325317, "reward_change_mean": -0.32681294344365597, "reward_change_min": -0.5595498997718096, "reward_change_std": 0.2215216150507331, "reward_std": 0.5445096306502819, "rewards/cosine_scaled_reward": -0.004500776529312134, "rewards/format_reward": 0.4375, "step": 26 }, { "advantage_max": 1.765315592288971, "advantage_mean": 4.967053213178474e-09, "advantage_min": -0.9876908138394356, "advantage_std": 0.9997563660144806, "completion_length": 3034.437530517578, "epoch": 0.030857142857142857, "grad_norm": 0.18291035294532776, "kl": 2.619624137878418e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.4e-07, "loss": 0.0, "reward": -0.165783321717754, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.165783321717754, "reward_after_std": 0.6133995819836855, "reward_before_mean": 0.04999001696705818, "reward_before_std": 0.61482073366642, "reward_change_max": 0.000221937894821167, "reward_change_mean": -0.21577333379536867, "reward_change_min": -0.4704149141907692, "reward_change_std": 0.1753271510824561, "reward_std": 0.6133995968848467, "rewards/cosine_scaled_reward": -0.17292165895923972, "rewards/format_reward": 0.39583333767950535, "step": 27 }, { "advantage_max": 1.6552045345306396, "advantage_mean": -4.1599076183729267e-08, "advantage_min": -1.0491575673222542, "advantage_std": 0.9998111873865128, "completion_length": 2725.6250534057617, "epoch": 0.032, "grad_norm": 0.20827879011631012, "kl": 3.992021083831787e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.6e-07, "loss": 0.0, "reward": 0.1778556825593114, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1778556825593114, "reward_after_std": 0.6806318163871765, "reward_before_mean": 0.5107554253190756, "reward_before_std": 0.6749352067708969, "reward_change_max": 0.0004945173859596252, "reward_change_mean": -0.33289976697415113, "reward_change_min": -0.5996529888361692, "reward_change_std": 0.2332574538886547, "reward_std": 0.680631835013628, "rewards/cosine_scaled_reward": 0.015794383361935616, "rewards/format_reward": 0.47916666977107525, "step": 28 }, { "advantage_max": 1.7731102406978607, "advantage_mean": 2.1109978987077227e-08, "advantage_min": -1.0666637495160103, "advantage_std": 0.99976696819067, "completion_length": 3326.4584045410156, "epoch": 0.03314285714285714, "grad_norm": 0.1783670037984848, "kl": 3.111734986305237e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.8e-07, "loss": 0.0, "reward": -0.32537109963595867, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.32537109963595867, "reward_after_std": 0.5295126177370548, "reward_before_mean": -0.1593697527423501, "reward_before_std": 0.5184246599674225, "reward_change_max": 0.0005779862403869629, "reward_change_mean": -0.16600135434418917, "reward_change_min": -0.3253139555454254, "reward_change_std": 0.12970570009201765, "reward_std": 0.5295126214623451, "rewards/cosine_scaled_reward": -0.19426821160595864, "rewards/format_reward": 0.22916666977107525, "step": 29 }, { "advantage_max": 1.725986734032631, "advantage_mean": 7.761021969532322e-08, "advantage_min": -1.0261987745761871, "advantage_std": 0.9996884390711784, "completion_length": 2856.625045776367, "epoch": 0.03428571428571429, "grad_norm": 0.22694918513298035, "kl": 2.3245811462402344e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.07334983453620225, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07334983453620225, "reward_after_std": 0.7822481319308281, "reward_before_mean": 0.35768909752368927, "reward_before_std": 0.8035914115607738, "reward_change_max": 0.0011820197105407715, "reward_change_mean": -0.28433927800506353, "reward_change_min": -0.5090161636471748, "reward_change_std": 0.22235635644756258, "reward_std": 0.7822481654584408, "rewards/cosine_scaled_reward": -0.03990544652333483, "rewards/format_reward": 0.4375000111758709, "step": 30 }, { "advantage_max": 1.616593599319458, "advantage_mean": 4.532436603810197e-08, "advantage_min": -1.122175931930542, "advantage_std": 0.9997361823916435, "completion_length": 3045.2500610351562, "epoch": 0.03542857142857143, "grad_norm": 0.1838632971048355, "kl": 2.6345252990722656e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.2e-07, "loss": 0.0, "reward": -0.18626230396330357, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.18626230396330357, "reward_after_std": 0.5281820576637983, "reward_before_mean": 0.040904104709625244, "reward_before_std": 0.5716760288923979, "reward_change_max": 0.00021841377019882202, "reward_change_mean": -0.22716641426086426, "reward_change_min": -0.44321779906749725, "reward_change_std": 0.1879756571725011, "reward_std": 0.5281820632517338, "rewards/cosine_scaled_reward": -0.12538127042353153, "rewards/format_reward": 0.29166667722165585, "step": 31 }, { "advantage_max": 1.6113842129707336, "advantage_mean": 4.221995741904294e-08, "advantage_min": -0.9919478967785835, "advantage_std": 0.9997682124376297, "completion_length": 3211.2708435058594, "epoch": 0.036571428571428574, "grad_norm": 0.16190707683563232, "kl": 3.495067358016968e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.4e-07, "loss": 0.0, "reward": -0.19964229501783848, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19964229501783848, "reward_after_std": 0.7030479311943054, "reward_before_mean": -0.0024698860943317413, "reward_before_std": 0.7546122074127197, "reward_change_max": 0.0014301910996437073, "reward_change_mean": -0.1971724338363856, "reward_change_min": -0.45910441875457764, "reward_change_std": 0.19600312830880284, "reward_std": 0.7030479423701763, "rewards/cosine_scaled_reward": -0.12623494304716587, "rewards/format_reward": 0.2500000111758709, "step": 32 }, { "advantage_max": 1.7362781912088394, "advantage_mean": 2.6077032755367213e-08, "advantage_min": -1.0833544805645943, "advantage_std": 0.9997647777199745, "completion_length": 3288.1250610351562, "epoch": 0.037714285714285714, "grad_norm": 0.1477300524711609, "kl": 2.972036600112915e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.6e-07, "loss": 0.0, "reward": -0.08128016069531441, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08128016069531441, "reward_after_std": 0.670670498162508, "reward_before_mean": 0.157458019151818, "reward_before_std": 0.6790058668702841, "reward_change_max": 0.0, "reward_change_mean": -0.23873815266415477, "reward_change_min": -0.47829458490014076, "reward_change_std": 0.18856940185651183, "reward_std": 0.6706705018877983, "rewards/cosine_scaled_reward": -0.0671043461188674, "rewards/format_reward": 0.2916666753590107, "step": 33 }, { "advantage_max": 1.7444235980510712, "advantage_mean": -3.259628988949714e-08, "advantage_min": -1.0349977537989616, "advantage_std": 0.9997931867837906, "completion_length": 2577.8541870117188, "epoch": 0.038857142857142854, "grad_norm": 0.24762187898159027, "kl": 2.6561319828033447e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": 0.1865523054730147, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1865523054730147, "reward_after_std": 0.734912283718586, "reward_before_mean": 0.5138917192816734, "reward_before_std": 0.7215470131486654, "reward_change_max": 0.0006349757313728333, "reward_change_mean": -0.32733945874497294, "reward_change_min": -0.5636269450187683, "reward_change_std": 0.22762802941724658, "reward_std": 0.7349123004823923, "rewards/cosine_scaled_reward": -0.013887470122426748, "rewards/format_reward": 0.541666679084301, "step": 34 }, { "advantage_max": 1.7818120419979095, "advantage_mean": 4.221995775210985e-08, "advantage_min": -0.8760261386632919, "advantage_std": 0.9998079240322113, "completion_length": 3114.958366394043, "epoch": 0.04, "grad_norm": 0.1881113350391388, "kl": 3.4675002098083496e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 7e-07, "loss": 0.0, "reward": -0.1845263810828328, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1845263810828328, "reward_after_std": 0.8198233172297478, "reward_before_mean": -0.006346432492136955, "reward_before_std": 0.8238743580877781, "reward_change_max": 0.0004191100597381592, "reward_change_mean": -0.1781799392774701, "reward_change_min": -0.4374943785369396, "reward_change_std": 0.16398774064145982, "reward_std": 0.8198233619332314, "rewards/cosine_scaled_reward": -0.1281732227653265, "rewards/format_reward": 0.25000000186264515, "step": 35 }, { "advantage_max": 1.696033626794815, "advantage_mean": 3.2285850215529877e-08, "advantage_min": -0.9351080358028412, "advantage_std": 0.999752476811409, "completion_length": 3368.3541870117188, "epoch": 0.04114285714285714, "grad_norm": 0.16902585327625275, "kl": 3.435090184211731e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.2e-07, "loss": 0.0, "reward": -0.4779059775173664, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.4779059775173664, "reward_after_std": 0.49636591225862503, "reward_before_mean": -0.36104049161076546, "reward_before_std": 0.5082600563764572, "reward_change_max": 0.001483917236328125, "reward_change_mean": -0.11686548870056868, "reward_change_min": -0.2881821468472481, "reward_change_std": 0.1096858661621809, "reward_std": 0.4963659346103668, "rewards/cosine_scaled_reward": -0.2638535853475332, "rewards/format_reward": 0.16666667349636555, "step": 36 }, { "advantage_max": 1.6557658016681671, "advantage_mean": 1.8471230989192122e-08, "advantage_min": -1.0146674513816833, "advantage_std": 0.9997165277600288, "completion_length": 3431.2708740234375, "epoch": 0.04228571428571429, "grad_norm": 0.15887229144573212, "kl": 2.222880721092224e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.4e-07, "loss": 0.0, "reward": -0.5451079215854406, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.5451079215854406, "reward_after_std": 0.37246148101985455, "reward_before_mean": -0.4324076659977436, "reward_before_std": 0.39398239366710186, "reward_change_max": 0.0004478543996810913, "reward_change_mean": -0.1127002714201808, "reward_change_min": -0.25096222572028637, "reward_change_std": 0.10608560917899013, "reward_std": 0.3724614940583706, "rewards/cosine_scaled_reward": -0.2787038255482912, "rewards/format_reward": 0.12500000558793545, "step": 37 }, { "advantage_max": 1.6931967288255692, "advantage_mean": 7.004322000181418e-08, "advantage_min": -1.035066694021225, "advantage_std": 0.9997488483786583, "completion_length": 3174.4791870117188, "epoch": 0.04342857142857143, "grad_norm": 0.17361724376678467, "kl": 2.7738511562347412e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": -0.27881590090692043, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.27881590090692043, "reward_after_std": 0.5268222466111183, "reward_before_mean": -0.09247179701924324, "reward_before_std": 0.5383048206567764, "reward_change_max": 0.00025244802236557007, "reward_change_mean": -0.1863441038876772, "reward_change_min": -0.3567695301026106, "reward_change_std": 0.1453970018774271, "reward_std": 0.5268222652375698, "rewards/cosine_scaled_reward": -0.13998588593676686, "rewards/format_reward": 0.1875, "step": 38 }, { "advantage_max": 1.645759716629982, "advantage_mean": 6.984918643482274e-09, "advantage_min": -1.258909560739994, "advantage_std": 0.9997495338320732, "completion_length": 2880.645866394043, "epoch": 0.044571428571428574, "grad_norm": 0.19487108290195465, "kl": 8.773058652877808e-06, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": 0.04547960311174393, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04547960311174393, "reward_after_std": 0.4941613469272852, "reward_before_mean": 0.35694460570812225, "reward_before_std": 0.4939267039299011, "reward_change_max": 0.0006987825036048889, "reward_change_mean": -0.311465029604733, "reward_change_min": -0.5112638585269451, "reward_change_std": 0.2062849523499608, "reward_std": 0.4941613618284464, "rewards/cosine_scaled_reward": -0.05069436226040125, "rewards/format_reward": 0.4583333395421505, "step": 39 }, { "advantage_max": 1.6932796239852905, "advantage_mean": 1.1796751686610207e-08, "advantage_min": -1.1757134571671486, "advantage_std": 0.9996892586350441, "completion_length": 2494.2083740234375, "epoch": 0.045714285714285714, "grad_norm": 0.20470260083675385, "kl": 1.6691628843545914e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 8e-07, "loss": 0.0, "reward": -0.016887841746211052, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.016887841746211052, "reward_after_std": 0.5138958254829049, "reward_before_mean": 0.2634038657415658, "reward_before_std": 0.48501414991915226, "reward_change_max": 0.00026522576808929443, "reward_change_mean": -0.28029171470552683, "reward_change_min": -0.46240196004509926, "reward_change_std": 0.17986160283908248, "reward_std": 0.513895845040679, "rewards/cosine_scaled_reward": -0.13913138769567013, "rewards/format_reward": 0.5416666828095913, "step": 40 }, { "advantage_max": 1.6760390102863312, "advantage_mean": 7.792065762068923e-08, "advantage_min": -1.0733157843351364, "advantage_std": 0.9996796399354935, "completion_length": 2848.500030517578, "epoch": 0.046857142857142854, "grad_norm": 0.22284355759620667, "kl": 1.667812466621399e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": -0.1447519608773291, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1447519608773291, "reward_after_std": 0.7171404417604208, "reward_before_mean": 0.06463775411248207, "reward_before_std": 0.7331238826736808, "reward_change_max": 0.0006848946213722229, "reward_change_mean": -0.20938971149735153, "reward_change_min": -0.4455184154212475, "reward_change_std": 0.1833113746251911, "reward_std": 0.7171404454857111, "rewards/cosine_scaled_reward": -0.186431135982275, "rewards/format_reward": 0.4375000074505806, "step": 41 }, { "advantage_max": 1.6876756697893143, "advantage_mean": 2.4835268952472234e-08, "advantage_min": -0.9819491431117058, "advantage_std": 0.999714769423008, "completion_length": 2844.3333702087402, "epoch": 0.048, "grad_norm": 0.26894325017929077, "kl": 4.445016384124756e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": -0.31103704776614904, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.31103704776614904, "reward_after_std": 0.4100134428590536, "reward_before_mean": -0.12106682872399688, "reward_before_std": 0.40934691950678825, "reward_change_max": 0.000611230731010437, "reward_change_mean": -0.18997021205723286, "reward_change_min": -0.35201727971434593, "reward_change_std": 0.14233277086168528, "reward_std": 0.4100134577602148, "rewards/cosine_scaled_reward": -0.2376167606562376, "rewards/format_reward": 0.3541666679084301, "step": 42 }, { "advantage_max": 1.7010575532913208, "advantage_mean": 9.313225413087878e-09, "advantage_min": -1.018822930753231, "advantage_std": 0.9997816532850266, "completion_length": 2978.437545776367, "epoch": 0.04914285714285714, "grad_norm": 0.17909610271453857, "kl": 3.220885992050171e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": -0.13193300738930702, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.13193300738930702, "reward_after_std": 0.597914507612586, "reward_before_mean": 0.09899591468274593, "reward_before_std": 0.611473685130477, "reward_change_max": 0.0009703263640403748, "reward_change_mean": -0.2309289202094078, "reward_change_min": -0.47725389897823334, "reward_change_std": 0.1805841075256467, "reward_std": 0.5979145113378763, "rewards/cosine_scaled_reward": -0.09633538499474525, "rewards/format_reward": 0.2916666716337204, "step": 43 }, { "advantage_max": 1.6900798082351685, "advantage_mean": 9.313225635132483e-09, "advantage_min": -1.1357970833778381, "advantage_std": 0.9998387694358826, "completion_length": 2929.666702270508, "epoch": 0.05028571428571429, "grad_norm": 0.245319664478302, "kl": 5.491822957992554e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": 0.09997905418276787, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09997905418276787, "reward_after_std": 0.8079127669334412, "reward_before_mean": 0.38700914289802313, "reward_before_std": 0.8344130367040634, "reward_change_max": 0.0017061308026313782, "reward_change_mean": -0.2870300691574812, "reward_change_min": -0.5179458074271679, "reward_change_std": 0.22538639418780804, "reward_std": 0.8079127967357635, "rewards/cosine_scaled_reward": -0.014828769257292151, "rewards/format_reward": 0.41666667722165585, "step": 44 }, { "advantage_max": 1.5660130977630615, "advantage_mean": 5.960464499743523e-08, "advantage_min": -0.9861617833375931, "advantage_std": 0.999788261950016, "completion_length": 3407.8333740234375, "epoch": 0.05142857142857143, "grad_norm": 0.14619755744934082, "kl": 2.90796160697937e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9e-07, "loss": 0.0, "reward": -0.17518600821495056, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.17518600821495056, "reward_after_std": 0.762459272518754, "reward_before_mean": 0.026804424822330475, "reward_before_std": 0.8420644588768482, "reward_change_max": 0.0015473440289497375, "reward_change_mean": -0.20199044118635356, "reward_change_min": -0.5272135585546494, "reward_change_std": 0.2229527528397739, "reward_std": 0.7624592930078506, "rewards/cosine_scaled_reward": -0.09076445642858744, "rewards/format_reward": 0.2083333395421505, "step": 45 }, { "advantage_max": 1.7986343055963516, "advantage_mean": -1.3969839840477505e-08, "advantage_min": -0.9203949719667435, "advantage_std": 0.9997272342443466, "completion_length": 3176.5833435058594, "epoch": 0.052571428571428575, "grad_norm": 0.1975124180316925, "kl": 3.5181641578674316e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.2e-07, "loss": 0.0, "reward": -0.4810139127075672, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4810139127075672, "reward_after_std": 0.46965909004211426, "reward_before_mean": -0.3638890916481614, "reward_before_std": 0.46313183568418026, "reward_change_max": 0.0019199252128601074, "reward_change_mean": -0.11712482548318803, "reward_change_min": -0.2390049435198307, "reward_change_std": 0.09974978445097804, "reward_std": 0.46965910121798515, "rewards/cosine_scaled_reward": -0.27569454722106457, "rewards/format_reward": 0.18750000186264515, "step": 46 }, { "advantage_max": 1.7070032507181168, "advantage_mean": 8.692344621863413e-09, "advantage_min": -1.046562485396862, "advantage_std": 0.9998353719711304, "completion_length": 2953.583366394043, "epoch": 0.053714285714285714, "grad_norm": 0.22726935148239136, "kl": 2.8312206268310547e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": 0.035570548847317696, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.035570548847317696, "reward_after_std": 0.8251568898558617, "reward_before_mean": 0.29871960915625095, "reward_before_std": 0.8540180884301662, "reward_change_max": 0.0003110915422439575, "reward_change_mean": -0.26314909430220723, "reward_change_min": -0.49642592296004295, "reward_change_std": 0.20880132168531418, "reward_std": 0.8251569084823132, "rewards/cosine_scaled_reward": -0.02772352658212185, "rewards/format_reward": 0.35416667349636555, "step": 47 }, { "advantage_max": 1.7456063479185104, "advantage_mean": 6.084641057668705e-08, "advantage_min": -0.9159170165657997, "advantage_std": 0.9997773617506027, "completion_length": 2935.812545776367, "epoch": 0.054857142857142854, "grad_norm": 0.20156747102737427, "kl": 6.876140832901001e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.6e-07, "loss": 0.0, "reward": -0.14274100959300995, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.14274100959300995, "reward_after_std": 0.6637316886335611, "reward_before_mean": 0.0721710603684187, "reward_before_std": 0.6555942036211491, "reward_change_max": 0.0008794218301773071, "reward_change_mean": -0.2149120302638039, "reward_change_min": -0.42936922423541546, "reward_change_std": 0.16268173593562096, "reward_std": 0.6637317016720772, "rewards/cosine_scaled_reward": -0.14099781308323145, "rewards/format_reward": 0.35416667349636555, "step": 48 }, { "advantage_max": 1.7003394067287445, "advantage_mean": -1.8626452047421083e-08, "advantage_min": -0.914850402623415, "advantage_std": 0.9998288676142693, "completion_length": 2436.0208740234375, "epoch": 0.056, "grad_norm": 0.21987055242061615, "kl": 3.507547080516815e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.8e-07, "loss": 0.0, "reward": 0.08041839301586151, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08041839301586151, "reward_after_std": 0.8396259732544422, "reward_before_mean": 0.35583220049738884, "reward_before_std": 0.840287160128355, "reward_change_max": 0.0, "reward_change_mean": -0.27541382424533367, "reward_change_min": -0.5413349587470293, "reward_change_std": 0.2108948975801468, "reward_std": 0.8396259918808937, "rewards/cosine_scaled_reward": -0.10333390219602734, "rewards/format_reward": 0.5625000055879354, "step": 49 }, { "advantage_max": 1.6772018820047379, "advantage_mean": 1.9247333504779363e-08, "advantage_min": -1.1436176598072052, "advantage_std": 0.9997652471065521, "completion_length": 3029.270835876465, "epoch": 0.05714285714285714, "grad_norm": 0.1825171858072281, "kl": 3.2992102205753326e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.15730741992592812, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15730741992592812, "reward_after_std": 0.607163991779089, "reward_before_mean": 0.4929181206971407, "reward_before_std": 0.5782719552516937, "reward_change_max": 0.0, "reward_change_mean": -0.33561069518327713, "reward_change_min": -0.5604764055460691, "reward_change_std": 0.2263810858130455, "reward_std": 0.6071640066802502, "rewards/cosine_scaled_reward": 0.07979238592088223, "rewards/format_reward": 0.33333333395421505, "step": 50 }, { "advantage_max": 1.6508052051067352, "advantage_mean": 1.4901161526914564e-08, "advantage_min": -1.1475177481770515, "advantage_std": 0.9997635409235954, "completion_length": 2245.0208740234375, "epoch": 0.05828571428571429, "grad_norm": 0.22916845977306366, "kl": 8.122995495796204e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": 0.004832329228520393, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.004832329228520393, "reward_after_std": 0.4980671815574169, "reward_before_mean": 0.2986472975462675, "reward_before_std": 0.48453923873603344, "reward_change_max": 0.0009013116359710693, "reward_change_mean": -0.2938149394467473, "reward_change_min": -0.4876815639436245, "reward_change_std": 0.19188937451690435, "reward_std": 0.4980671852827072, "rewards/cosine_scaled_reward": -0.12150970660150051, "rewards/format_reward": 0.5416666679084301, "step": 51 }, { "advantage_max": 1.820540651679039, "advantage_mean": 2.9181441818515452e-08, "advantage_min": -0.8227907046675682, "advantage_std": 0.9998376965522766, "completion_length": 2812.2083854675293, "epoch": 0.05942857142857143, "grad_norm": 0.21117231249809265, "kl": 8.490681648254395e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": -0.0212899805046618, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0212899805046618, "reward_after_std": 0.8910897560417652, "reward_before_mean": 0.2077227346599102, "reward_before_std": 0.9012789577245712, "reward_change_max": 0.0003147795796394348, "reward_change_mean": -0.22901272028684616, "reward_change_min": -0.5391429048031569, "reward_change_std": 0.20690517546609044, "reward_std": 0.8910897895693779, "rewards/cosine_scaled_reward": -0.08363863360136747, "rewards/format_reward": 0.3750000037252903, "step": 52 }, { "advantage_max": 1.603944092988968, "advantage_mean": 2.173086188772544e-08, "advantage_min": -1.1513922810554504, "advantage_std": 0.9997826814651489, "completion_length": 2806.625030517578, "epoch": 0.060571428571428575, "grad_norm": 0.22769437730312347, "kl": 0.000128135085105896, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": 0.2369850054383278, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2369850054383278, "reward_after_std": 0.6251183226704597, "reward_before_mean": 0.6029936671257019, "reward_before_std": 0.6293339114636183, "reward_change_max": 0.0, "reward_change_mean": -0.36600867845118046, "reward_change_min": -0.6412353664636612, "reward_change_std": 0.25781027879565954, "reward_std": 0.6251183375716209, "rewards/cosine_scaled_reward": 0.030663497745990753, "rewards/format_reward": 0.5416666697710752, "step": 53 }, { "advantage_max": 1.6730604618787766, "advantage_mean": -3.6632022637483885e-08, "advantage_min": -1.1187285035848618, "advantage_std": 0.9998723268508911, "completion_length": 2772.062545776367, "epoch": 0.061714285714285715, "grad_norm": 0.18292006850242615, "kl": 3.987550735473633e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.3220935259014368, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3220935259014368, "reward_after_std": 0.914025504142046, "reward_before_mean": 0.680418030358851, "reward_before_std": 0.9553813710808754, "reward_change_max": 0.0002489909529685974, "reward_change_mean": -0.35832452960312366, "reward_change_min": -0.6526261102408171, "reward_change_std": 0.2787675419822335, "reward_std": 0.914025541394949, "rewards/cosine_scaled_reward": 0.06937567051500082, "rewards/format_reward": 0.5416666828095913, "step": 54 }, { "advantage_max": 1.639712780714035, "advantage_mean": 2.7939677571531263e-08, "advantage_min": -1.0915622636675835, "advantage_std": 0.999813862144947, "completion_length": 3026.812530517578, "epoch": 0.06285714285714286, "grad_norm": 0.19612926244735718, "kl": 4.6916306018829346e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.07077167462557554, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.07077167462557554, "reward_after_std": 0.7388047780841589, "reward_before_mean": 0.3599805114790797, "reward_before_std": 0.7721150405704975, "reward_change_max": 0.0, "reward_change_mean": -0.2892088247463107, "reward_change_min": -0.5648720655590296, "reward_change_std": 0.2283388590440154, "reward_std": 0.7388047929853201, "rewards/cosine_scaled_reward": 0.013323572697117925, "rewards/format_reward": 0.33333334140479565, "step": 55 }, { "advantage_max": 1.6640954315662384, "advantage_mean": -2.048909719665204e-08, "advantage_min": -1.0538555011153221, "advantage_std": 0.9997746124863625, "completion_length": 3033.0000228881836, "epoch": 0.064, "grad_norm": 0.1757792979478836, "kl": 3.0547380447387695e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": -0.005534622818231583, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.005534622818231583, "reward_after_std": 0.5717477016150951, "reward_before_mean": 0.27818619227036834, "reward_before_std": 0.5735124535858631, "reward_change_max": 0.0, "reward_change_mean": -0.28372081089764833, "reward_change_min": -0.5253443699330091, "reward_change_std": 0.21497930027544498, "reward_std": 0.5717477202415466, "rewards/cosine_scaled_reward": -0.027573585510253906, "rewards/format_reward": 0.3333333395421505, "step": 56 }, { "advantage_max": 1.655980110168457, "advantage_mean": 2.7318796780306798e-08, "advantage_min": -1.0294505134224892, "advantage_std": 0.9997557923197746, "completion_length": 3271.2291870117188, "epoch": 0.06514285714285714, "grad_norm": 0.12939302623271942, "kl": 1.4789402484893799e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": -0.13052963837981224, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13052963837981224, "reward_after_std": 0.7320014405995607, "reward_before_mean": 0.08981280773878098, "reward_before_std": 0.793254129588604, "reward_change_max": 0.0015274062752723694, "reward_change_mean": -0.22034245822578669, "reward_change_min": -0.5432382598519325, "reward_change_std": 0.22339811036363244, "reward_std": 0.7320014480501413, "rewards/cosine_scaled_reward": -0.11134359892457724, "rewards/format_reward": 0.3125, "step": 57 }, { "advantage_max": 1.7863916009664536, "advantage_mean": 5.2774945524802774e-09, "advantage_min": -0.9740641564130783, "advantage_std": 0.999830387532711, "completion_length": 2345.687545776367, "epoch": 0.06628571428571428, "grad_norm": 0.21642234921455383, "kl": 0.00025500357151031494, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.992983438818915e-07, "loss": 0.0, "reward": 0.0725353374145925, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0725353374145925, "reward_after_std": 0.8109330330044031, "reward_before_mean": 0.3429924047086388, "reward_before_std": 0.7900348640978336, "reward_change_max": 0.0004199966788291931, "reward_change_mean": -0.27045707777142525, "reward_change_min": -0.4974858798086643, "reward_change_std": 0.20084482524544, "reward_std": 0.8109330497682095, "rewards/cosine_scaled_reward": -0.13058713916689157, "rewards/format_reward": 0.604166679084301, "step": 58 }, { "advantage_max": 1.6153929084539413, "advantage_mean": 5.494803234640244e-08, "advantage_min": -1.2002663016319275, "advantage_std": 0.9997520595788956, "completion_length": 2710.062515258789, "epoch": 0.06742857142857143, "grad_norm": 0.18601581454277039, "kl": 2.874433994293213e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": 0.016920048743486404, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.016920048743486404, "reward_after_std": 0.5601688083261251, "reward_before_mean": 0.31204191595315933, "reward_before_std": 0.5862467214465141, "reward_change_max": 0.0023939386010169983, "reward_change_mean": -0.2951218015514314, "reward_change_min": -0.5066243596374989, "reward_change_std": 0.21398248756304383, "reward_std": 0.5601688250899315, "rewards/cosine_scaled_reward": -0.041895726695656776, "rewards/format_reward": 0.3958333395421505, "step": 59 }, { "advantage_max": 1.7348787784576416, "advantage_mean": 9.809931356130619e-08, "advantage_min": -0.9890720546245575, "advantage_std": 0.9996658861637115, "completion_length": 3001.4583435058594, "epoch": 0.06857142857142857, "grad_norm": 0.1735920011997223, "kl": 2.6339665055274963e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": -0.3645767252892256, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3645767252892256, "reward_after_std": 0.31776400841772556, "reward_before_mean": -0.1823885552585125, "reward_before_std": 0.2956855818629265, "reward_change_max": 0.0006458982825279236, "reward_change_mean": -0.18218815978616476, "reward_change_min": -0.3113494608551264, "reward_change_std": 0.11784279346466064, "reward_std": 0.31776401959359646, "rewards/cosine_scaled_reward": -0.2266109511256218, "rewards/format_reward": 0.27083333395421505, "step": 60 }, { "advantage_max": 1.6786080598831177, "advantage_mean": 3.414849436556011e-08, "advantage_min": -1.119702823460102, "advantage_std": 0.9997992143034935, "completion_length": 2769.0209045410156, "epoch": 0.06971428571428571, "grad_norm": 0.18313685059547424, "kl": 0.00015373528003692627, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": 0.17589345946907997, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17589345946907997, "reward_after_std": 0.6231853775680065, "reward_before_mean": 0.5198908429592848, "reward_before_std": 0.6389935575425625, "reward_change_max": 0.0, "reward_change_mean": -0.34399732667952776, "reward_change_min": -0.5837460160255432, "reward_change_std": 0.23666787100955844, "reward_std": 0.6231853924691677, "rewards/cosine_scaled_reward": -0.03172127902507782, "rewards/format_reward": 0.5833333507180214, "step": 61 }, { "advantage_max": 1.669366866350174, "advantage_mean": -2.483526828633842e-09, "advantage_min": -0.9924981147050858, "advantage_std": 0.9998872131109238, "completion_length": 2773.750045776367, "epoch": 0.07085714285714285, "grad_norm": 0.1696423441171646, "kl": 0.00029055215418338776, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": 0.23453661100938916, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23453661100938916, "reward_after_std": 1.0506018809974194, "reward_before_mean": 0.5446470603346825, "reward_before_std": 1.1166776195168495, "reward_change_max": 0.000665523111820221, "reward_change_mean": -0.31011044047772884, "reward_change_min": -0.7151584103703499, "reward_change_std": 0.2924557887017727, "reward_std": 1.050601914525032, "rewards/cosine_scaled_reward": 0.02232352737337351, "rewards/format_reward": 0.5000000186264515, "step": 62 }, { "advantage_max": 1.6369207352399826, "advantage_mean": -2.1730858223989458e-09, "advantage_min": -1.1514763534069061, "advantage_std": 0.9997848942875862, "completion_length": 2316.5208892822266, "epoch": 0.072, "grad_norm": 0.19967959821224213, "kl": 0.0006964057683944702, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.981479793771866e-07, "loss": 0.0, "reward": 0.25051923716819147, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.25051923716819147, "reward_after_std": 0.7038908246904612, "reward_before_mean": 0.6116887461394072, "reward_before_std": 0.7206394355744123, "reward_change_max": 0.00013612210750579834, "reward_change_mean": -0.3611695375293493, "reward_change_min": -0.66447863727808, "reward_change_std": 0.2597482795827091, "reward_std": 0.7038908563554287, "rewards/cosine_scaled_reward": -0.00665562879294157, "rewards/format_reward": 0.6250000149011612, "step": 63 }, { "advantage_max": 1.7015090882778168, "advantage_mean": -8.692344288796505e-09, "advantage_min": -0.9915741682052612, "advantage_std": 0.9997834339737892, "completion_length": 2962.8958892822266, "epoch": 0.07314285714285715, "grad_norm": 0.21716347336769104, "kl": 0.00016808509826660156, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": -0.05920291552320123, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05920291552320123, "reward_after_std": 0.7344120070338249, "reward_before_mean": 0.18000058270990849, "reward_before_std": 0.7569667212665081, "reward_change_max": 0.0007149055600166321, "reward_change_mean": -0.23920350428670645, "reward_change_min": -0.5351427774876356, "reward_change_std": 0.20876519661396742, "reward_std": 0.7344120163470507, "rewards/cosine_scaled_reward": -0.07666637981310487, "rewards/format_reward": 0.33333334140479565, "step": 64 }, { "advantage_max": 1.7702575773000717, "advantage_mean": 5.836288285987479e-08, "advantage_min": -0.8710718601942062, "advantage_std": 0.999729834496975, "completion_length": 2697.9166984558105, "epoch": 0.07428571428571429, "grad_norm": 0.2333095371723175, "kl": 0.00011762231588363647, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": -0.17378600779920816, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17378600779920816, "reward_after_std": 0.5362526774406433, "reward_before_mean": 0.05147118680179119, "reward_before_std": 0.5402431320399046, "reward_change_max": 0.0002131238579750061, "reward_change_mean": -0.225257174577564, "reward_change_min": -0.4259749613702297, "reward_change_std": 0.16993668302893639, "reward_std": 0.5362526811659336, "rewards/cosine_scaled_reward": -0.1825977684929967, "rewards/format_reward": 0.41666666977107525, "step": 65 }, { "advantage_max": 1.8111245334148407, "advantage_mean": -1.1796755183812735e-08, "advantage_min": -0.7910459190607071, "advantage_std": 0.9998306408524513, "completion_length": 2073.7916870117188, "epoch": 0.07542857142857143, "grad_norm": 0.3012547791004181, "kl": 0.00011523813009262085, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "reward": 0.2881050488795154, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.2881050488795154, "reward_after_std": 0.7458667606115341, "reward_before_mean": 0.6470101624727249, "reward_before_std": 0.6932319095358253, "reward_change_max": 0.0010689571499824524, "reward_change_mean": -0.35890511190518737, "reward_change_min": -0.6494046673178673, "reward_change_std": 0.25436983490362763, "reward_std": 0.7458667755126953, "rewards/cosine_scaled_reward": 0.05267174355685711, "rewards/format_reward": 0.5416666734963655, "step": 66 }, { "advantage_max": 1.7939701974391937, "advantage_mean": 5.153318427542075e-08, "advantage_min": -0.8784763589501381, "advantage_std": 0.9997288808226585, "completion_length": 3337.8125, "epoch": 0.07657142857142857, "grad_norm": 0.1328975111246109, "kl": 0.00017693452537059784, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.968344786479415e-07, "loss": 0.0, "reward": -0.5479356572031975, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5479356572031975, "reward_after_std": 0.40658218413591385, "reward_before_mean": -0.44514430686831474, "reward_before_std": 0.4042688310146332, "reward_change_max": 0.0020049139857292175, "reward_change_mean": -0.10279132472351193, "reward_change_min": -0.23496553674340248, "reward_change_std": 0.09160416387021542, "reward_std": 0.40658219903707504, "rewards/cosine_scaled_reward": -0.28507216461002827, "rewards/format_reward": 0.125, "step": 67 }, { "advantage_max": 1.723268672823906, "advantage_mean": -2.2351742123838392e-08, "advantage_min": -0.919980626553297, "advantage_std": 0.9998259618878365, "completion_length": 1948.0833587646484, "epoch": 0.07771428571428571, "grad_norm": 0.305684357881546, "kl": 0.000670015811920166, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.964516155915151e-07, "loss": 0.0, "reward": 0.28808481246232986, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.28808481246232986, "reward_after_std": 0.7557829841971397, "reward_before_mean": 0.6472232132218778, "reward_before_std": 0.7139394320547581, "reward_change_max": 0.0006256401538848877, "reward_change_mean": -0.35913838632404804, "reward_change_min": -0.6378680765628815, "reward_change_std": 0.2483411794528365, "reward_std": 0.7557829916477203, "rewards/cosine_scaled_reward": 0.0006949367234483361, "rewards/format_reward": 0.6458333358168602, "step": 68 }, { "advantage_max": 1.729750543832779, "advantage_mean": -1.8626451825376478e-08, "advantage_min": -0.9881731793284416, "advantage_std": 0.9997735396027565, "completion_length": 2395.3125228881836, "epoch": 0.07885714285714286, "grad_norm": 0.2285999357700348, "kl": 0.00042310357093811035, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.960469931131936e-07, "loss": 0.0, "reward": 0.011252993252128363, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.011252993252128363, "reward_after_std": 0.6043797954916954, "reward_before_mean": 0.2924608364701271, "reward_before_std": 0.6024846043437719, "reward_change_max": 0.0012515783309936523, "reward_change_mean": -0.2812078380957246, "reward_change_min": -0.5379021652042866, "reward_change_std": 0.2072286745533347, "reward_std": 0.6043797992169857, "rewards/cosine_scaled_reward": -0.12460293434560299, "rewards/format_reward": 0.541666679084301, "step": 69 }, { "advantage_max": 1.6841091215610504, "advantage_mean": 2.0178656051639265e-08, "advantage_min": -1.074761189520359, "advantage_std": 0.9997704774141312, "completion_length": 3123.9792098999023, "epoch": 0.08, "grad_norm": 0.19950328767299652, "kl": 0.0007490180432796478, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": -0.1935001676902175, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1935001676902175, "reward_after_std": 0.6410425715148449, "reward_before_mean": 0.0074747540056705475, "reward_before_std": 0.6485170591622591, "reward_change_max": 0.0011179223656654358, "reward_change_mean": -0.20097492635250092, "reward_change_min": -0.4275518637150526, "reward_change_std": 0.1724476171657443, "reward_std": 0.6410425901412964, "rewards/cosine_scaled_reward": -0.1733459592796862, "rewards/format_reward": 0.3541666753590107, "step": 70 }, { "advantage_max": 1.5801240652799606, "advantage_mean": -3.4148484040485982e-09, "advantage_min": -1.1414336860179901, "advantage_std": 0.9997307136654854, "completion_length": 2690.062530517578, "epoch": 0.08114285714285714, "grad_norm": 0.26009178161621094, "kl": 0.0004898197948932648, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.951725498333448e-07, "loss": 0.0, "reward": -0.003988415002822876, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.003988415002822876, "reward_after_std": 0.5181414168328047, "reward_before_mean": 0.28938897512853146, "reward_before_std": 0.5445828661322594, "reward_change_max": 0.0002566725015640259, "reward_change_mean": -0.29337740410119295, "reward_change_min": -0.5023646540939808, "reward_change_std": 0.20717438496649265, "reward_std": 0.5181414242833853, "rewards/cosine_scaled_reward": -0.05322218965739012, "rewards/format_reward": 0.39583333395421505, "step": 71 }, { "advantage_max": 1.772824466228485, "advantage_mean": 1.3038516488705909e-08, "advantage_min": -0.9934405237436295, "advantage_std": 0.9998027831315994, "completion_length": 2903.1250610351562, "epoch": 0.08228571428571428, "grad_norm": 0.27986767888069153, "kl": 0.0005079209804534912, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.947027716509488e-07, "loss": 0.0, "reward": -0.13487945345696062, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.13487945345696062, "reward_after_std": 0.6645340472459793, "reward_before_mean": 0.08111082226969302, "reward_before_std": 0.6453656032681465, "reward_change_max": 0.00031088292598724365, "reward_change_mean": -0.21599026769399643, "reward_change_min": -0.4061376452445984, "reward_change_std": 0.15622367896139622, "reward_std": 0.6645340621471405, "rewards/cosine_scaled_reward": -0.1573612610809505, "rewards/format_reward": 0.3958333432674408, "step": 72 }, { "advantage_max": 1.813662901520729, "advantage_mean": 4.7187011964489045e-08, "advantage_min": -0.8960592672228813, "advantage_std": 0.9997815191745758, "completion_length": 3430.875030517578, "epoch": 0.08342857142857144, "grad_norm": 0.19496390223503113, "kl": 0.00014146510511636734, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "reward": -0.2878233203664422, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2878233203664422, "reward_after_std": 0.7369754761457443, "reward_before_mean": -0.1345462128520012, "reward_before_std": 0.7610294371843338, "reward_change_max": 0.00028471648693084717, "reward_change_mean": -0.15327709051780403, "reward_change_min": -0.32139917090535164, "reward_change_std": 0.13741923682391644, "reward_std": 0.7369754910469055, "rewards/cosine_scaled_reward": -0.15060644689947367, "rewards/format_reward": 0.16666667349636555, "step": 73 }, { "advantage_max": 1.7698954343795776, "advantage_mean": -6.705522592742597e-08, "advantage_min": -0.8610250949859619, "advantage_std": 0.9997531995177269, "completion_length": 3099.0000610351562, "epoch": 0.08457142857142858, "grad_norm": 0.16673633456230164, "kl": 0.0004587322473526001, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.93698216681727e-07, "loss": 0.0, "reward": 0.011684894561767578, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.011684894561767578, "reward_after_std": 0.737697672098875, "reward_before_mean": 0.2787039801478386, "reward_before_std": 0.7473894041031599, "reward_change_max": 0.0005998089909553528, "reward_change_mean": -0.2670190935023129, "reward_change_min": -0.5640953052788973, "reward_change_std": 0.2240947454702109, "reward_std": 0.737697672098875, "rewards/cosine_scaled_reward": -0.016898008063435555, "rewards/format_reward": 0.31250000558793545, "step": 74 }, { "advantage_max": 1.812890887260437, "advantage_mean": 2.3593506481844884e-08, "advantage_min": -0.9130612201988697, "advantage_std": 0.9997989013791084, "completion_length": 3063.5209045410156, "epoch": 0.08571428571428572, "grad_norm": 0.17492325603961945, "kl": 0.0006076022982597351, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "reward": 0.07526686601340771, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07526686601340771, "reward_after_std": 0.6792824864387512, "reward_before_mean": 0.36507416516542435, "reward_before_std": 0.6259769611060619, "reward_change_max": 0.00024361908435821533, "reward_change_mean": -0.2898072935640812, "reward_change_min": -0.4996296912431717, "reward_change_std": 0.198143930407241, "reward_std": 0.6792825050652027, "rewards/cosine_scaled_reward": 0.015870411414653063, "rewards/format_reward": 0.33333334140479565, "step": 75 }, { "advantage_max": 1.7067972123622894, "advantage_mean": 1.862645149230957e-08, "advantage_min": -1.0221246927976608, "advantage_std": 0.999797098338604, "completion_length": 2828.2292098999023, "epoch": 0.08685714285714285, "grad_norm": 0.17829923331737518, "kl": 0.00010164221748709679, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": -0.12922447035089135, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.12922447035089135, "reward_after_std": 0.6604071147739887, "reward_before_mean": 0.0960783651098609, "reward_before_std": 0.6799979507923126, "reward_change_max": 0.00018787384033203125, "reward_change_mean": -0.22530285641551018, "reward_change_min": -0.470570569857955, "reward_change_std": 0.18850323092192411, "reward_std": 0.6604071296751499, "rewards/cosine_scaled_reward": -0.1498774797655642, "rewards/format_reward": 0.3958333358168602, "step": 76 }, { "advantage_max": 1.723575621843338, "advantage_mean": 2.545615063187512e-08, "advantage_min": -1.105763018131256, "advantage_std": 0.9997577667236328, "completion_length": 3023.812530517578, "epoch": 0.088, "grad_norm": 0.16036880016326904, "kl": 0.0004429072141647339, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": -0.07509020436555147, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07509020436555147, "reward_after_std": 0.5212648697197437, "reward_before_mean": 0.18559409055160359, "reward_before_std": 0.5110939890146255, "reward_change_max": 0.00029993802309036255, "reward_change_mean": -0.2606842555105686, "reward_change_min": -0.420489102602005, "reward_change_std": 0.1720300316810608, "reward_std": 0.5212648846209049, "rewards/cosine_scaled_reward": -0.10511963814496994, "rewards/format_reward": 0.39583333767950535, "step": 77 }, { "advantage_max": 1.6002394706010818, "advantage_mean": 2.235174201281609e-08, "advantage_min": -1.0966233238577843, "advantage_std": 0.9997987076640129, "completion_length": 3212.041717529297, "epoch": 0.08914285714285715, "grad_norm": 0.1576608568429947, "kl": 0.00013444339856505394, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "reward": -0.03098016418516636, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.03098016418516636, "reward_after_std": 0.6884403191506863, "reward_before_mean": 0.2296357899904251, "reward_before_std": 0.7215537466108799, "reward_change_max": 0.00030860304832458496, "reward_change_mean": -0.26061592996120453, "reward_change_min": -0.5376428719609976, "reward_change_std": 0.21406401693820953, "reward_std": 0.688440341502428, "rewards/cosine_scaled_reward": -0.051848778035491705, "rewards/format_reward": 0.3333333395421505, "step": 78 }, { "advantage_max": 1.8595923781394958, "advantage_mean": 2.048909680807398e-08, "advantage_min": -0.8024958111345768, "advantage_std": 0.9997843727469444, "completion_length": 2289.37504196167, "epoch": 0.09028571428571429, "grad_norm": 0.22345860302448273, "kl": 0.000730365514755249, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.908088623197048e-07, "loss": 0.0, "reward": 0.01609797467244789, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.01609797467244789, "reward_after_std": 0.5654987283051014, "reward_before_mean": 0.29465459007769823, "reward_before_std": 0.47823043167591095, "reward_change_max": 0.0004724934697151184, "reward_change_mean": -0.2785566207021475, "reward_change_min": -0.4156152084469795, "reward_change_std": 0.159152552485466, "reward_std": 0.5654987320303917, "rewards/cosine_scaled_reward": -0.11308937147259712, "rewards/format_reward": 0.520833333954215, "step": 79 }, { "advantage_max": 1.641204446554184, "advantage_mean": 4.284083887640122e-08, "advantage_min": -1.104140542447567, "advantage_std": 0.9997965767979622, "completion_length": 3268.104202270508, "epoch": 0.09142857142857143, "grad_norm": 0.18267425894737244, "kl": 0.0005040690302848816, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": -0.10733454604633152, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10733454604633152, "reward_after_std": 0.6708246134221554, "reward_before_mean": 0.13074208237230778, "reward_before_std": 0.7244840562343597, "reward_change_max": 5.655735731124878e-05, "reward_change_mean": -0.23807661328464746, "reward_change_min": -0.4838180225342512, "reward_change_std": 0.21340421494096518, "reward_std": 0.6708246357738972, "rewards/cosine_scaled_reward": -0.10129562206566334, "rewards/format_reward": 0.3333333395421505, "step": 80 }, { "advantage_max": 1.7489165961742401, "advantage_mean": 3.864988729063157e-08, "advantage_min": -0.8702133595943451, "advantage_std": 0.9997365474700928, "completion_length": 2905.3333740234375, "epoch": 0.09257142857142857, "grad_norm": 0.23969367146492004, "kl": 0.002313464879989624, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.895025252503755e-07, "loss": 0.0001, "reward": -0.2649361342191696, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2649361342191696, "reward_after_std": 0.5374672412872314, "reward_before_mean": -0.07541222497820854, "reward_before_std": 0.5437739994376898, "reward_change_max": 0.00048619508743286133, "reward_change_mean": -0.1895239173900336, "reward_change_min": -0.4257840495556593, "reward_change_std": 0.15698988223448396, "reward_std": 0.5374672636389732, "rewards/cosine_scaled_reward": -0.19395611807703972, "rewards/format_reward": 0.3125000037252903, "step": 81 }, { "advantage_max": 1.75922991335392, "advantage_mean": 3.849466811978175e-08, "advantage_min": -0.8853632733225822, "advantage_std": 0.9997850656509399, "completion_length": 2894.9166717529297, "epoch": 0.09371428571428571, "grad_norm": 0.17825722694396973, "kl": 0.0007796967402100563, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.888172094375033e-07, "loss": 0.0, "reward": -0.044029101729393005, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.044029101729393005, "reward_after_std": 0.6101236864924431, "reward_before_mean": 0.2116898074746132, "reward_before_std": 0.554393008351326, "reward_change_max": 0.0009601488709449768, "reward_change_mean": -0.25571888452395797, "reward_change_min": -0.4296169802546501, "reward_change_std": 0.16489798668771982, "reward_std": 0.6101237051188946, "rewards/cosine_scaled_reward": -0.0712384432554245, "rewards/format_reward": 0.35416666977107525, "step": 82 }, { "advantage_max": 1.7100812792778015, "advantage_mean": -2.483526828633842e-09, "advantage_min": -0.9794905483722687, "advantage_std": 0.9997932314872742, "completion_length": 2652.2291717529297, "epoch": 0.09485714285714286, "grad_norm": 0.23454752564430237, "kl": 0.000807344913482666, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.881105062929221e-07, "loss": 0.0, "reward": -0.10935159772634506, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.10935159772634506, "reward_after_std": 0.6078389920294285, "reward_before_mean": 0.12803274020552635, "reward_before_std": 0.6211237497627735, "reward_change_max": 0.0008202046155929565, "reward_change_mean": -0.237384338863194, "reward_change_min": -0.46528430469334126, "reward_change_std": 0.1859043724834919, "reward_std": 0.6078390143811703, "rewards/cosine_scaled_reward": -0.12348363362252712, "rewards/format_reward": 0.3750000037252903, "step": 83 }, { "advantage_max": 1.6017490476369858, "advantage_mean": 9.934109090892207e-09, "advantage_min": -1.1636109501123428, "advantage_std": 0.9997934252023697, "completion_length": 3062.875045776367, "epoch": 0.096, "grad_norm": 0.16458596289157867, "kl": 0.0003489255905151367, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": 0.23049863800406456, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23049863800406456, "reward_after_std": 0.7958911098539829, "reward_before_mean": 0.5794772133231163, "reward_before_std": 0.8653192967176437, "reward_change_max": 0.0006882175803184509, "reward_change_mean": -0.34897857904434204, "reward_change_min": -0.6671846639364958, "reward_change_std": 0.2844910528510809, "reward_std": 0.7958911247551441, "rewards/cosine_scaled_reward": 0.07098859921097755, "rewards/format_reward": 0.43750000558793545, "step": 84 }, { "advantage_max": 1.7872411012649536, "advantage_mean": 4.718701118733293e-08, "advantage_min": -0.9174601286649704, "advantage_std": 0.9997250586748123, "completion_length": 3009.687545776367, "epoch": 0.09714285714285714, "grad_norm": 0.13440193235874176, "kl": 0.00025300681591033936, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.866330768241983e-07, "loss": 0.0, "reward": -0.22818884439766407, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.22818884439766407, "reward_after_std": 0.5634497385472059, "reward_before_mean": -0.03151026368141174, "reward_before_std": 0.535164738073945, "reward_change_max": 0.001543976366519928, "reward_change_mean": -0.196678570471704, "reward_change_min": -0.38646116480231285, "reward_change_std": 0.14546805899590254, "reward_std": 0.5634497571736574, "rewards/cosine_scaled_reward": -0.20325513370335102, "rewards/format_reward": 0.3750000037252903, "step": 85 }, { "advantage_max": 1.6266282349824905, "advantage_mean": 3.985284613428064e-08, "advantage_min": -1.1330338940024376, "advantage_std": 0.9997888952493668, "completion_length": 3026.6458740234375, "epoch": 0.09828571428571428, "grad_norm": 0.17491480708122253, "kl": 0.0012866854667663574, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.85862422507884e-07, "loss": 0.0001, "reward": -0.06933249346911907, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.06933249346911907, "reward_after_std": 0.6919153667986393, "reward_before_mean": 0.17476091790013015, "reward_before_std": 0.7231006026268005, "reward_change_max": 0.001809820532798767, "reward_change_mean": -0.24409340415149927, "reward_change_min": -0.47525056079030037, "reward_change_std": 0.203478641808033, "reward_std": 0.6919153667986393, "rewards/cosine_scaled_reward": -0.11053621396422386, "rewards/format_reward": 0.3958333432674408, "step": 86 }, { "advantage_max": 1.7204211950302124, "advantage_mean": 2.545615074289742e-08, "advantage_min": -0.8401723429560661, "advantage_std": 0.9998236000537872, "completion_length": 2917.6459045410156, "epoch": 0.09942857142857142, "grad_norm": 0.2169748693704605, "kl": 0.0011854320764541626, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.850705248720068e-07, "loss": 0.0, "reward": -0.15189265552908182, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15189265552908182, "reward_after_std": 0.7466767616569996, "reward_before_mean": 0.05327833816409111, "reward_before_std": 0.7826491482555866, "reward_change_max": 0.0012126415967941284, "reward_change_mean": -0.20517097879201174, "reward_change_min": -0.5490577183663845, "reward_change_std": 0.20065120910294354, "reward_std": 0.7466767616569996, "rewards/cosine_scaled_reward": -0.1712775026098825, "rewards/format_reward": 0.3958333469927311, "step": 87 }, { "advantage_max": 1.8208003491163254, "advantage_mean": -2.1109979264632983e-08, "advantage_min": -0.8185875862836838, "advantage_std": 0.9998869821429253, "completion_length": 2878.187530517578, "epoch": 0.10057142857142858, "grad_norm": 0.20829077064990997, "kl": 0.0016402900218963623, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.8425742251254e-07, "loss": 0.0001, "reward": 0.08122656709747389, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.08122656709747389, "reward_after_std": 1.133527971804142, "reward_before_mean": 0.3133980662096292, "reward_before_std": 1.1508859395980835, "reward_change_max": 0.0, "reward_change_mean": -0.2321715233847499, "reward_change_min": -0.5831114239990711, "reward_change_std": 0.22239528130739927, "reward_std": 1.133528009057045, "rewards/cosine_scaled_reward": -0.06205096747726202, "rewards/format_reward": 0.43750001676380634, "step": 88 }, { "advantage_max": 1.7402677088975906, "advantage_mean": 7.76102143662527e-09, "advantage_min": -0.9676889851689339, "advantage_std": 0.9998382776975632, "completion_length": 3328.1875610351562, "epoch": 0.10171428571428572, "grad_norm": 0.2292921543121338, "kl": 0.0014015436172485352, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.83423155058946e-07, "loss": 0.0001, "reward": -0.11148460092954338, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.11148460092954338, "reward_after_std": 0.8901860229671001, "reward_before_mean": 0.08984524011611938, "reward_before_std": 0.9413608275353909, "reward_change_max": 0.0010383576154708862, "reward_change_mean": -0.20132984593510628, "reward_change_min": -0.49834184907376766, "reward_change_std": 0.21247117966413498, "reward_std": 0.8901860415935516, "rewards/cosine_scaled_reward": -0.10091071901842952, "rewards/format_reward": 0.2916666716337204, "step": 89 }, { "advantage_max": 1.685807541012764, "advantage_mean": 5.339583031283013e-08, "advantage_min": -0.9631387665867805, "advantage_std": 0.9997355118393898, "completion_length": 2410.8541870117188, "epoch": 0.10285714285714286, "grad_norm": 0.2840408384799957, "kl": 0.0023888349533081055, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.825677631722435e-07, "loss": 0.0001, "reward": -0.19426595163531601, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19426595163531601, "reward_after_std": 0.4587608836591244, "reward_before_mean": 0.028027180582284927, "reward_before_std": 0.4346531294286251, "reward_change_max": 0.00036888569593429565, "reward_change_mean": -0.22229311801493168, "reward_change_min": -0.377274714410305, "reward_change_std": 0.14629555866122246, "reward_std": 0.45876089110970497, "rewards/cosine_scaled_reward": -0.22556974878534675, "rewards/format_reward": 0.4791666716337204, "step": 90 }, { "advantage_max": 1.7024072706699371, "advantage_mean": 1.179675274132208e-08, "advantage_min": -1.0454597994685173, "advantage_std": 0.9998550862073898, "completion_length": 3143.312530517578, "epoch": 0.104, "grad_norm": 0.17777808010578156, "kl": 0.0011315643787384033, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.816912885430258e-07, "loss": 0.0, "reward": 0.019155754358507693, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.019155754358507693, "reward_after_std": 0.8894457370042801, "reward_before_mean": 0.26651920937001705, "reward_before_std": 0.9211246967315674, "reward_change_max": 0.0009043514728546143, "reward_change_mean": -0.24736345745623112, "reward_change_min": -0.5050514657050371, "reward_change_std": 0.21295447181910276, "reward_std": 0.8894457705318928, "rewards/cosine_scaled_reward": -0.054240401834249496, "rewards/format_reward": 0.37500001303851604, "step": 91 }, { "advantage_max": 1.8360530883073807, "advantage_mean": 1.8626452158443385e-08, "advantage_min": -0.7955534011125565, "advantage_std": 0.9998010694980621, "completion_length": 2562.8125534057617, "epoch": 0.10514285714285715, "grad_norm": 0.24126262962818146, "kl": 0.0009981989860534668, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.807937738894303e-07, "loss": 0.0, "reward": 0.0586411589756608, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0586411589756608, "reward_after_std": 0.7283506281673908, "reward_before_mean": 0.33699838956817985, "reward_before_std": 0.6955078411847353, "reward_change_max": 0.0006359219551086426, "reward_change_mean": -0.2783571844920516, "reward_change_min": -0.535007182508707, "reward_change_std": 0.20023613749071956, "reward_std": 0.7283506728708744, "rewards/cosine_scaled_reward": -0.1023341715335846, "rewards/format_reward": 0.5416666679084301, "step": 92 }, { "advantage_max": 1.548183873295784, "advantage_mean": 9.934108091691485e-09, "advantage_min": -1.1905573010444641, "advantage_std": 0.9997426718473434, "completion_length": 3577.0208435058594, "epoch": 0.10628571428571429, "grad_norm": 0.16404443979263306, "kl": 0.0016289949417114258, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.798752629550546e-07, "loss": 0.0001, "reward": -0.47817371785640717, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.47817371785640717, "reward_after_std": 0.5007516071200371, "reward_before_mean": -0.3584257047623396, "reward_before_std": 0.5362331233918667, "reward_change_max": 0.0026210248470306396, "reward_change_mean": -0.11974802287295461, "reward_change_min": -0.26703036576509476, "reward_change_std": 0.11669965367764235, "reward_std": 0.5007516108453274, "rewards/cosine_scaled_reward": -0.20004619285464287, "rewards/format_reward": 0.0416666679084301, "step": 93 }, { "advantage_max": 1.7747317552566528, "advantage_mean": 5.8362882304763275e-08, "advantage_min": -0.9935160875320435, "advantage_std": 0.9997159689664841, "completion_length": 2936.270866394043, "epoch": 0.10742857142857143, "grad_norm": 0.18288840353488922, "kl": 0.0019444599747657776, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.78935800506826e-07, "loss": 0.0001, "reward": -0.08148485980927944, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08148485980927944, "reward_after_std": 0.578640497289598, "reward_before_mean": 0.1689160345122218, "reward_before_std": 0.5636005159467459, "reward_change_max": 0.0006143450736999512, "reward_change_mean": -0.25040087942034006, "reward_change_min": -0.45593926683068275, "reward_change_std": 0.19041610066778958, "reward_std": 0.5786405233666301, "rewards/cosine_scaled_reward": -0.0717919934540987, "rewards/format_reward": 0.31250000186264515, "step": 94 }, { "advantage_max": 1.6343782097101212, "advantage_mean": 1.3659398168108794e-08, "advantage_min": -1.233926348388195, "advantage_std": 0.999689556658268, "completion_length": 3428.479217529297, "epoch": 0.10857142857142857, "grad_norm": 0.14903585612773895, "kl": 0.00032570958137512207, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "reward": -0.25682489573955536, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.25682489573955536, "reward_after_std": 0.4950319044291973, "reward_before_mean": -0.0534893162548542, "reward_before_std": 0.5203002355992794, "reward_change_max": 0.0, "reward_change_mean": -0.20333559694699943, "reward_change_min": -0.3770237062126398, "reward_change_std": 0.16128122247755527, "reward_std": 0.49503191001713276, "rewards/cosine_scaled_reward": -0.14132798463106155, "rewards/format_reward": 0.22916667349636555, "step": 95 }, { "advantage_max": 1.7309832870960236, "advantage_mean": 1.0554989704480988e-07, "advantage_min": -1.0128647983074188, "advantage_std": 0.9997375980019569, "completion_length": 2689.020866394043, "epoch": 0.10971428571428571, "grad_norm": 0.19174990057945251, "kl": 0.0033934414386749268, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.769942052400235e-07, "loss": 0.0001, "reward": 0.009245343506336212, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.009245343506336212, "reward_after_std": 0.559165321290493, "reward_before_mean": 0.2946573458611965, "reward_before_std": 0.5430589374154806, "reward_change_max": 0.001197025179862976, "reward_change_mean": -0.285411992110312, "reward_change_min": -0.4874095767736435, "reward_change_std": 0.19107454037293792, "reward_std": 0.5591653324663639, "rewards/cosine_scaled_reward": -0.071421317756176, "rewards/format_reward": 0.4375, "step": 96 }, { "advantage_max": 1.6804829388856888, "advantage_mean": -6.239861560786153e-08, "advantage_min": -1.020699881017208, "advantage_std": 0.9997079521417618, "completion_length": 3359.625030517578, "epoch": 0.11085714285714286, "grad_norm": 0.15624156594276428, "kl": 0.0007499158382415771, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.759921670520634e-07, "loss": 0.0, "reward": -0.1554597243666649, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1554597243666649, "reward_after_std": 0.5989795615896583, "reward_before_mean": 0.07574700936675072, "reward_before_std": 0.6590539943426847, "reward_change_max": 0.0017822310328483582, "reward_change_mean": -0.23120674211531878, "reward_change_min": -0.44350939244031906, "reward_change_std": 0.19939671899192035, "reward_std": 0.5989795653149486, "rewards/cosine_scaled_reward": -0.07670984230935574, "rewards/format_reward": 0.2291666753590107, "step": 97 }, { "advantage_max": 1.6190086752176285, "advantage_mean": 1.862645238048799e-08, "advantage_min": -1.2423430234193802, "advantage_std": 0.9997780472040176, "completion_length": 2969.7708740234375, "epoch": 0.112, "grad_norm": 0.164066880941391, "kl": 0.0005201101303100586, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.749693666068663e-07, "loss": 0.0, "reward": -0.11222604289650917, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.11222604289650917, "reward_after_std": 0.6267273426055908, "reward_before_mean": 0.1267369776032865, "reward_before_std": 0.6630882322788239, "reward_change_max": 0.0, "reward_change_mean": -0.23896300233900547, "reward_change_min": -0.45961318351328373, "reward_change_std": 0.19679743982851505, "reward_std": 0.6267273500561714, "rewards/cosine_scaled_reward": -0.12413152866065502, "rewards/format_reward": 0.3750000074505806, "step": 98 }, { "advantage_max": 1.7954564839601517, "advantage_mean": -2.7318795892128378e-08, "advantage_min": -0.924432247877121, "advantage_std": 0.9997765943408012, "completion_length": 2861.125015258789, "epoch": 0.11314285714285714, "grad_norm": 0.17843745648860931, "kl": 0.0010409355163574219, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "reward": -0.026294399052858353, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.026294399052858353, "reward_after_std": 0.7253436986356974, "reward_before_mean": 0.21599197760224342, "reward_before_std": 0.6580101121217012, "reward_change_max": 0.0005289614200592041, "reward_change_mean": -0.24228637106716633, "reward_change_min": -0.3682333882898092, "reward_change_std": 0.14797560684382915, "reward_std": 0.7253437153995037, "rewards/cosine_scaled_reward": -0.0378373465500772, "rewards/format_reward": 0.2916666679084301, "step": 99 }, { "advantage_max": 1.6659599244594574, "advantage_mean": 7.202227914060444e-08, "advantage_min": -1.227808453142643, "advantage_std": 0.9996214956045151, "completion_length": 2829.791702270508, "epoch": 0.11428571428571428, "grad_norm": 0.1729695200920105, "kl": 0.0014210939407348633, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.728616793536587e-07, "loss": 0.0001, "reward": 0.12535643577575684, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12535643577575684, "reward_after_std": 0.596457451581955, "reward_before_mean": 0.4535660892724991, "reward_before_std": 0.6117115598171949, "reward_change_max": 4.544854164123535e-07, "reward_change_mean": -0.32820960134267807, "reward_change_min": -0.5949569642543793, "reward_change_std": 0.23030776623636484, "reward_std": 0.5964574702084064, "rewards/cosine_scaled_reward": 0.018449692055583, "rewards/format_reward": 0.4166666716337204, "step": 100 }, { "advantage_max": 1.6771378815174103, "advantage_mean": 3.756334540638839e-08, "advantage_min": -1.0258858352899551, "advantage_std": 0.9997877031564713, "completion_length": 2763.8333740234375, "epoch": 0.11542857142857142, "grad_norm": 0.22144387662410736, "kl": 0.0012489557266235352, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.717768952713511e-07, "loss": 0.0, "reward": -0.002344711683690548, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.002344711683690548, "reward_after_std": 0.5791723318397999, "reward_before_mean": 0.27940622717142105, "reward_before_std": 0.5787831321358681, "reward_change_max": 0.0008510574698448181, "reward_change_mean": -0.28175091254524887, "reward_change_min": -0.4957294128835201, "reward_change_std": 0.20934505388140678, "reward_std": 0.5791723504662514, "rewards/cosine_scaled_reward": -0.03738022409379482, "rewards/format_reward": 0.35416667349636555, "step": 101 }, { "advantage_max": 1.8033748865127563, "advantage_mean": -1.3969838952299085e-08, "advantage_min": -0.9170250967144966, "advantage_std": 0.9998318701982498, "completion_length": 2385.770896911621, "epoch": 0.11657142857142858, "grad_norm": 0.22797729074954987, "kl": 0.003253161907196045, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.706715543782064e-07, "loss": 0.0001, "reward": 0.10309968190267682, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.10309968190267682, "reward_after_std": 0.7670713812112808, "reward_before_mean": 0.3925341051071882, "reward_before_std": 0.7520890105515718, "reward_change_max": 0.0002071782946586609, "reward_change_mean": -0.2894344194792211, "reward_change_min": -0.5455930791795254, "reward_change_std": 0.21182263363152742, "reward_std": 0.7670714221894741, "rewards/cosine_scaled_reward": -0.1266496255993843, "rewards/format_reward": 0.6458333414047956, "step": 102 }, { "advantage_max": 1.636709839105606, "advantage_mean": 3.663202197135007e-08, "advantage_min": -1.1363722458481789, "advantage_std": 0.9998014122247696, "completion_length": 2906.8333740234375, "epoch": 0.11771428571428572, "grad_norm": 0.2754819989204407, "kl": 0.002112448215484619, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.695457105469804e-07, "loss": 0.0001, "reward": 0.0014424961991608143, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0014424961991608143, "reward_after_std": 0.7158971205353737, "reward_before_mean": 0.2669691275805235, "reward_before_std": 0.74494394287467, "reward_change_max": 0.002191774547100067, "reward_change_mean": -0.26552660297602415, "reward_change_min": -0.497951403260231, "reward_change_std": 0.20551541727036238, "reward_std": 0.7158971298485994, "rewards/cosine_scaled_reward": -0.06443211250007153, "rewards/format_reward": 0.39583334513008595, "step": 103 }, { "advantage_max": 1.7317698448896408, "advantage_mean": 9.934107536579972e-09, "advantage_min": -0.9683221206068993, "advantage_std": 0.9997662752866745, "completion_length": 2858.4791870117188, "epoch": 0.11885714285714286, "grad_norm": 0.3066559135913849, "kl": 0.012755632400512695, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.683994186497132e-07, "loss": 0.0005, "reward": -0.17632766626775265, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17632766626775265, "reward_after_std": 0.615046065300703, "reward_before_mean": 0.034148991107940674, "reward_before_std": 0.6209501847624779, "reward_change_max": 0.0013972148299217224, "reward_change_mean": -0.2104766578413546, "reward_change_min": -0.4065582901239395, "reward_change_std": 0.16453414456918836, "reward_std": 0.6150460727512836, "rewards/cosine_scaled_reward": -0.16000884026288986, "rewards/format_reward": 0.35416666977107525, "step": 104 }, { "advantage_max": 1.7005452513694763, "advantage_mean": -7.217749509180749e-09, "advantage_min": -0.9641866832971573, "advantage_std": 0.9998493865132332, "completion_length": 2649.0208740234375, "epoch": 0.12, "grad_norm": 0.19496333599090576, "kl": 0.0011529326438903809, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.672327345550543e-07, "loss": 0.0, "reward": 0.23285237979143858, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.23285237979143858, "reward_after_std": 0.9236329235136509, "reward_before_mean": 0.5560791362076998, "reward_before_std": 0.9535301281139255, "reward_change_max": 0.0009224936366081238, "reward_change_mean": -0.3232267568819225, "reward_change_min": -0.6001363098621368, "reward_change_std": 0.2598576219752431, "reward_std": 0.923632949590683, "rewards/cosine_scaled_reward": 0.03845623089000583, "rewards/format_reward": 0.4791666716337204, "step": 105 }, { "advantage_max": 1.7395610213279724, "advantage_mean": -1.1175870895385742e-08, "advantage_min": -0.8789972476661205, "advantage_std": 0.999837689101696, "completion_length": 2158.31254196167, "epoch": 0.12114285714285715, "grad_norm": 0.17631974816322327, "kl": 0.0020558834075927734, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.66045715125541e-07, "loss": 0.0001, "reward": 0.43815290927886963, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.43815290927886963, "reward_after_std": 0.820628497749567, "reward_before_mean": 0.844319261610508, "reward_before_std": 0.7749473014846444, "reward_change_max": 0.0, "reward_change_mean": -0.40616634115576744, "reward_change_min": -0.7530931998044252, "reward_change_std": 0.29046634398400784, "reward_std": 0.8206285163760185, "rewards/cosine_scaled_reward": 0.10965961031615734, "rewards/format_reward": 0.6250000018626451, "step": 106 }, { "advantage_max": 1.7683300226926804, "advantage_mean": 4.3772161228972095e-08, "advantage_min": -0.9992892891168594, "advantage_std": 0.9997427016496658, "completion_length": 3117.4791870117188, "epoch": 0.12228571428571429, "grad_norm": 0.18865761160850525, "kl": 0.0014286041259765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.648384182148252e-07, "loss": 0.0001, "reward": -0.11298406589776278, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11298406589776278, "reward_after_std": 0.5871306788176298, "reward_before_mean": 0.12983395779156126, "reward_before_std": 0.6119651421904564, "reward_change_max": 7.678568363189697e-05, "reward_change_mean": -0.2428179858252406, "reward_change_min": -0.4364739805459976, "reward_change_std": 0.18585507525131106, "reward_std": 0.587130693718791, "rewards/cosine_scaled_reward": -0.1329997219145298, "rewards/format_reward": 0.3958333507180214, "step": 107 }, { "advantage_max": 1.6300202906131744, "advantage_mean": 1.5522043428362053e-08, "advantage_min": -1.1117057800292969, "advantage_std": 0.9998191893100739, "completion_length": 2839.666717529297, "epoch": 0.12342857142857143, "grad_norm": 0.20808304846286774, "kl": 0.001534193754196167, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.636109026648554e-07, "loss": 0.0001, "reward": 0.08168945461511612, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08168945461511612, "reward_after_std": 0.8422982320189476, "reward_before_mean": 0.35761653259396553, "reward_before_std": 0.8752856999635696, "reward_change_max": 0.0006015077233314514, "reward_change_mean": -0.2759270705282688, "reward_change_min": -0.5596902929246426, "reward_change_std": 0.2283379090949893, "reward_std": 0.8422982841730118, "rewards/cosine_scaled_reward": -0.019108395092189312, "rewards/format_reward": 0.3958333432674408, "step": 108 }, { "advantage_max": 1.7969225347042084, "advantage_mean": -4.408260395605268e-08, "advantage_min": -0.9135128036141396, "advantage_std": 0.9997855126857758, "completion_length": 3086.3958740234375, "epoch": 0.12457142857142857, "grad_norm": 0.18924620747566223, "kl": 0.0005153417587280273, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.623632283030077e-07, "loss": 0.0, "reward": -0.11098742205649614, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11098742205649614, "reward_after_std": 0.585125271230936, "reward_before_mean": 0.12289483658969402, "reward_before_std": 0.5465646982192993, "reward_change_max": 0.0010496899485588074, "reward_change_mean": -0.23388232104480267, "reward_change_min": -0.39980729669332504, "reward_change_std": 0.15995712112635374, "reward_std": 0.5851252935826778, "rewards/cosine_scaled_reward": -0.08438592031598091, "rewards/format_reward": 0.2916666716337204, "step": 109 }, { "advantage_max": 1.7251650094985962, "advantage_mean": 2.483527050678447e-09, "advantage_min": -0.9533622413873672, "advantage_std": 0.9997881427407265, "completion_length": 2699.3750228881836, "epoch": 0.12571428571428572, "grad_norm": 0.2168547809123993, "kl": 0.0008405148983001709, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "reward": -0.12120639532804489, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.12120639532804489, "reward_after_std": 0.5944926850497723, "reward_before_mean": 0.11219263076782227, "reward_before_std": 0.5862014722079039, "reward_change_max": 0.0005781054496765137, "reward_change_mean": -0.23339902609586716, "reward_change_min": -0.4640258774161339, "reward_change_std": 0.1791122118011117, "reward_std": 0.5944926962256432, "rewards/cosine_scaled_reward": -0.15223703160881996, "rewards/format_reward": 0.41666666977107525, "step": 110 }, { "advantage_max": 1.6638637781143188, "advantage_mean": 6.33299379604324e-08, "advantage_min": -1.1016447097063065, "advantage_std": 0.9997721016407013, "completion_length": 3214.500030517578, "epoch": 0.12685714285714286, "grad_norm": 0.18918636441230774, "kl": 0.0017311573028564453, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.598076473627796e-07, "loss": 0.0001, "reward": -0.162625964730978, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.162625964730978, "reward_after_std": 0.7703050579875708, "reward_before_mean": 0.038309202529489994, "reward_before_std": 0.8199625238776207, "reward_change_max": 0.0, "reward_change_mean": -0.20093515701591969, "reward_change_min": -0.5358131155371666, "reward_change_std": 0.2113886373117566, "reward_std": 0.7703050971031189, "rewards/cosine_scaled_reward": -0.10584540944546461, "rewards/format_reward": 0.25000000558793545, "step": 111 }, { "advantage_max": 1.6523158550262451, "advantage_mean": -1.4901160749758446e-08, "advantage_min": -1.0043475404381752, "advantage_std": 0.9997996687889099, "completion_length": 3108.5416870117188, "epoch": 0.128, "grad_norm": 0.1536342054605484, "kl": 0.0007609724998474121, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.58499865339809e-07, "loss": 0.0, "reward": -0.08307351113762707, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08307351113762707, "reward_after_std": 0.604747511446476, "reward_before_mean": 0.16323358565568924, "reward_before_std": 0.5882431715726852, "reward_change_max": 0.0014370977878570557, "reward_change_mean": -0.24630710575729609, "reward_change_min": -0.44425584748387337, "reward_change_std": 0.17876359913498163, "reward_std": 0.6047475337982178, "rewards/cosine_scaled_reward": -0.10588321159593761, "rewards/format_reward": 0.3750000074505806, "step": 112 }, { "advantage_max": 1.6927157193422318, "advantage_mean": -2.7318795670083773e-08, "advantage_min": -1.0044822320342064, "advantage_std": 0.9998141229152679, "completion_length": 2595.7083702087402, "epoch": 0.12914285714285714, "grad_norm": 0.20856420695781708, "kl": 0.0011587142944335938, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.571721736097088e-07, "loss": 0.0, "reward": 0.07129224203526974, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07129224203526974, "reward_after_std": 0.7336039766669273, "reward_before_mean": 0.3595566302537918, "reward_before_std": 0.751370508223772, "reward_change_max": 0.0006392970681190491, "reward_change_mean": -0.28826442174613476, "reward_change_min": -0.5697160176932812, "reward_change_std": 0.22509220149368048, "reward_std": 0.7336040064692497, "rewards/cosine_scaled_reward": -0.05980501789599657, "rewards/format_reward": 0.4791666753590107, "step": 113 }, { "advantage_max": 1.6434798538684845, "advantage_mean": 1.2417642469841894e-09, "advantage_min": -1.1047632172703743, "advantage_std": 0.9997706264257431, "completion_length": 2544.9792098999023, "epoch": 0.13028571428571428, "grad_norm": 0.1979716271162033, "kl": 0.0017580986022949219, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.55824636882301e-07, "loss": 0.0001, "reward": -0.04609719692962244, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.04609719692962244, "reward_after_std": 0.496824212372303, "reward_before_mean": 0.2309152688831091, "reward_before_std": 0.4938830081373453, "reward_change_max": 0.0004075542092323303, "reward_change_mean": -0.27701246459037066, "reward_change_min": -0.47630488499999046, "reward_change_std": 0.1878509558737278, "reward_std": 0.496824212372303, "rewards/cosine_scaled_reward": -0.18662571161985397, "rewards/format_reward": 0.6041666753590107, "step": 114 }, { "advantage_max": 1.668518453836441, "advantage_mean": 3.849466900796017e-08, "advantage_min": -0.9183512553572655, "advantage_std": 0.9997554123401642, "completion_length": 2669.208366394043, "epoch": 0.13142857142857142, "grad_norm": 0.19642752408981323, "kl": 0.0015791654586791992, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.54457320834625e-07, "loss": 0.0001, "reward": -0.14574788976460695, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14574788976460695, "reward_after_std": 0.49735596403479576, "reward_before_mean": 0.09368688985705376, "reward_before_std": 0.4808156006038189, "reward_change_max": 0.001104116439819336, "reward_change_mean": -0.2394347987137735, "reward_change_min": -0.46826227754354477, "reward_change_std": 0.18234600871801376, "reward_std": 0.49735597148537636, "rewards/cosine_scaled_reward": -0.13023989088833332, "rewards/format_reward": 0.3541666679084301, "step": 115 }, { "advantage_max": 1.7265245020389557, "advantage_mean": 9.623666868963099e-08, "advantage_min": -0.9298912286758423, "advantage_std": 0.9997406750917435, "completion_length": 3273.6666870117188, "epoch": 0.13257142857142856, "grad_norm": 0.17037241160869598, "kl": 0.0019521713256835938, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.530702921077358e-07, "loss": 0.0001, "reward": -0.383125726133585, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.383125726133585, "reward_after_std": 0.4773464947938919, "reward_before_mean": -0.22625800222158432, "reward_before_std": 0.49769464135169983, "reward_change_max": 0.0025099217891693115, "reward_change_mean": -0.1568676927126944, "reward_change_min": -0.3555114157497883, "reward_change_std": 0.1372272982262075, "reward_std": 0.4773465134203434, "rewards/cosine_scaled_reward": -0.2068790066987276, "rewards/format_reward": 0.18750000186264515, "step": 116 }, { "advantage_max": 1.829407513141632, "advantage_mean": -2.1730860666480112e-08, "advantage_min": -0.8818910084664822, "advantage_std": 0.9997041150927544, "completion_length": 3200.4375610351562, "epoch": 0.1337142857142857, "grad_norm": 0.18679514527320862, "kl": 0.0017189979553222656, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.516636183034564e-07, "loss": 0.0001, "reward": -0.3048145342618227, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.3048145342618227, "reward_after_std": 0.7093480145558715, "reward_before_mean": -0.15804192423820496, "reward_before_std": 0.7109875828027725, "reward_change_max": 0.0019478797912597656, "reward_change_mean": -0.14677260722965002, "reward_change_min": -0.3250487558543682, "reward_change_std": 0.13174744765274227, "reward_std": 0.7093480592593551, "rewards/cosine_scaled_reward": -0.19360430166125298, "rewards/format_reward": 0.22916666977107525, "step": 117 }, { "advantage_max": 1.704234093427658, "advantage_mean": 2.9181441485448545e-08, "advantage_min": -1.0465329363942146, "advantage_std": 0.9998274445533752, "completion_length": 3257.1666870117188, "epoch": 0.13485714285714287, "grad_norm": 0.15395978093147278, "kl": 0.0013146400451660156, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": 0.08495918568223715, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08495918568223715, "reward_after_std": 0.906765516847372, "reward_before_mean": 0.35544082429260015, "reward_before_std": 0.9444892201572657, "reward_change_max": 0.0008550956845283508, "reward_change_mean": -0.2704816050827503, "reward_change_min": -0.6102131735533476, "reward_change_std": 0.23540864512324333, "reward_std": 0.9067655466496944, "rewards/cosine_scaled_reward": 0.021470395382493734, "rewards/format_reward": 0.31250001303851604, "step": 118 }, { "advantage_max": 1.6215424090623856, "advantage_mean": 4.470348535789981e-08, "advantage_min": -1.0004944801330566, "advantage_std": 0.99982850253582, "completion_length": 2470.5833740234375, "epoch": 0.136, "grad_norm": 0.2119596004486084, "kl": 0.0038170814514160156, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.487916106540465e-07, "loss": 0.0002, "reward": 0.3729916112497449, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3729916112497449, "reward_after_std": 0.6704384796321392, "reward_before_mean": 0.7786594908684492, "reward_before_std": 0.660895586013794, "reward_change_max": 0.0, "reward_change_mean": -0.4056678432971239, "reward_change_min": -0.7041777707636356, "reward_change_std": 0.27028635516762733, "reward_std": 0.6704384945333004, "rewards/cosine_scaled_reward": 0.10807974357157946, "rewards/format_reward": 0.5625000037252903, "step": 119 }, { "advantage_max": 1.7663956731557846, "advantage_mean": 3.663202230441698e-08, "advantage_min": -1.032372571527958, "advantage_std": 0.9997234866023064, "completion_length": 2499.6458587646484, "epoch": 0.13714285714285715, "grad_norm": 0.26399776339530945, "kl": 0.0019845962524414062, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.473264167865171e-07, "loss": 0.0001, "reward": -0.10639402689412236, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10639402689412236, "reward_after_std": 0.4683174900710583, "reward_before_mean": 0.1477334424853325, "reward_before_std": 0.4303309554234147, "reward_change_max": 0.0015648677945137024, "reward_change_mean": -0.2541274814866483, "reward_change_min": -0.4159995745867491, "reward_change_std": 0.1737190696876496, "reward_std": 0.4683174919337034, "rewards/cosine_scaled_reward": -0.15529993548989296, "rewards/format_reward": 0.4583333395421505, "step": 120 }, { "advantage_max": 1.8353542536497116, "advantage_mean": 5.277494996569487e-09, "advantage_min": -0.9251411855220795, "advantage_std": 0.9998060166835785, "completion_length": 1701.1250190734863, "epoch": 0.1382857142857143, "grad_norm": 0.2281099408864975, "kl": 0.003339052200317383, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.458418577899774e-07, "loss": 0.0001, "reward": 0.2946602776646614, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2946602776646614, "reward_after_std": 0.6927468255162239, "reward_before_mean": 0.6606933157891035, "reward_before_std": 0.6180010866373777, "reward_change_max": 0.0, "reward_change_mean": -0.36603305861353874, "reward_change_min": -0.5772724207490683, "reward_change_std": 0.22722634207457304, "reward_std": 0.6927468329668045, "rewards/cosine_scaled_reward": -0.04465334676206112, "rewards/format_reward": 0.7500000074505806, "step": 121 }, { "advantage_max": 1.7002272754907608, "advantage_mean": 9.934107647602275e-09, "advantage_min": -0.9179603233933449, "advantage_std": 0.999838799238205, "completion_length": 2917.708366394043, "epoch": 0.13942857142857143, "grad_norm": 0.18572425842285156, "kl": 0.0010418891906738281, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.443380060197385e-07, "loss": 0.0, "reward": 0.012051770463585854, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.012051770463585854, "reward_after_std": 0.8977667167782784, "reward_before_mean": 0.2602110430598259, "reward_before_std": 0.9553406834602356, "reward_change_max": 0.0020881518721580505, "reward_change_mean": -0.24815926584415138, "reward_change_min": -0.6369654312729836, "reward_change_std": 0.2480492745526135, "reward_std": 0.8977667540311813, "rewards/cosine_scaled_reward": -0.06781115615740418, "rewards/format_reward": 0.39583334140479565, "step": 122 }, { "advantage_max": 1.6201731711626053, "advantage_mean": 3.1044089521259366e-09, "advantage_min": -1.0070946142077446, "advantage_std": 0.99980229139328, "completion_length": 2936.5833587646484, "epoch": 0.14057142857142857, "grad_norm": 0.16751940548419952, "kl": 0.0016289949417114258, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.428149347714143e-07, "loss": 0.0001, "reward": -0.04834012687206268, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.04834012687206268, "reward_after_std": 0.6940950267016888, "reward_before_mean": 0.20856638252735138, "reward_before_std": 0.7591890692710876, "reward_change_max": 0.00046505779027938843, "reward_change_mean": -0.25690650660544634, "reward_change_min": -0.5357954353094101, "reward_change_std": 0.2297231680713594, "reward_std": 0.6940950490534306, "rewards/cosine_scaled_reward": -0.09363348968327045, "rewards/format_reward": 0.3958333395421505, "step": 123 }, { "advantage_max": 1.825218215584755, "advantage_mean": 1.986821529520455e-08, "advantage_min": -0.8632050305604935, "advantage_std": 0.9998418614268303, "completion_length": 2236.770866394043, "epoch": 0.1417142857142857, "grad_norm": 0.22649620473384857, "kl": 0.005209922790527344, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.412727182773486e-07, "loss": 0.0002, "reward": 0.25160552957095206, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.25160552957095206, "reward_after_std": 0.8396368101239204, "reward_before_mean": 0.5823488412424922, "reward_before_std": 0.7940035983920097, "reward_change_max": 7.907301187515259e-05, "reward_change_mean": -0.3307433230802417, "reward_change_min": -0.5853971652686596, "reward_change_std": 0.2142821978777647, "reward_std": 0.8396368362009525, "rewards/cosine_scaled_reward": 0.009924418991431594, "rewards/format_reward": 0.5625000074505806, "step": 124 }, { "advantage_max": 1.7880681157112122, "advantage_mean": -2.4214388494314676e-08, "advantage_min": -0.9557374641299248, "advantage_std": 0.9998224824666977, "completion_length": 2819.6666870117188, "epoch": 0.14285714285714285, "grad_norm": 0.17020586133003235, "kl": 0.0011887550354003906, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.397114317029974e-07, "loss": 0.0, "reward": 0.18851646315306425, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18851646315306425, "reward_after_std": 0.7697963416576385, "reward_before_mean": 0.5041636296864453, "reward_before_std": 0.7168074026703835, "reward_change_max": 0.00021360069513320923, "reward_change_mean": -0.31564717646688223, "reward_change_min": -0.5297516994178295, "reward_change_std": 0.21122274547815323, "reward_std": 0.76979636028409, "rewards/cosine_scaled_reward": 0.06458180956542492, "rewards/format_reward": 0.37500000186264515, "step": 125 }, { "advantage_max": 1.7075690478086472, "advantage_mean": 1.1796753685011652e-08, "advantage_min": -0.8932720050215721, "advantage_std": 0.9998270645737648, "completion_length": 2850.791717529297, "epoch": 0.144, "grad_norm": 0.17294757068157196, "kl": 0.0010404586791992188, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.381311511432658e-07, "loss": 0.0, "reward": 0.026090468280017376, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.026090468280017376, "reward_after_std": 0.8577112816274166, "reward_before_mean": 0.28807709738612175, "reward_before_std": 0.920661810785532, "reward_change_max": 0.00035068392753601074, "reward_change_mean": -0.2619865979067981, "reward_change_min": -0.6276376582682133, "reward_change_std": 0.25567311397753656, "reward_std": 0.8577113188803196, "rewards/cosine_scaled_reward": -0.08512814342975616, "rewards/format_reward": 0.45833333767950535, "step": 126 }, { "advantage_max": 1.6422581225633621, "advantage_mean": 4.346172111091562e-08, "advantage_min": -1.0609121546149254, "advantage_std": 0.9998015984892845, "completion_length": 3061.979232788086, "epoch": 0.14514285714285713, "grad_norm": 0.1751047670841217, "kl": 0.0015621185302734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.36531953618799e-07, "loss": 0.0001, "reward": -0.26915243885014206, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.26915243885014206, "reward_after_std": 0.6073360815644264, "reward_before_mean": -0.08590502664446831, "reward_before_std": 0.6498651243746281, "reward_change_max": 0.0012364760041236877, "reward_change_mean": -0.18324739765375853, "reward_change_min": -0.4379800371825695, "reward_change_std": 0.17618474457412958, "reward_std": 0.6073361039161682, "rewards/cosine_scaled_reward": -0.2096191830933094, "rewards/format_reward": 0.33333334140479565, "step": 127 }, { "advantage_max": 1.5232260525226593, "advantage_mean": -1.6996637097754785e-08, "advantage_min": -1.2536739706993103, "advantage_std": 0.9998079761862755, "completion_length": 2886.666702270508, "epoch": 0.1462857142857143, "grad_norm": 0.1872163563966751, "kl": 0.0024051666259765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.34913917072228e-07, "loss": 0.0001, "reward": 0.2702927812933922, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2702927812933922, "reward_after_std": 0.7844690792262554, "reward_before_mean": 0.6392449736595154, "reward_before_std": 0.8807711731642485, "reward_change_max": 0.0012902989983558655, "reward_change_mean": -0.3689522175118327, "reward_change_min": -0.6815567724406719, "reward_change_std": 0.30271041486412287, "reward_std": 0.7844691053032875, "rewards/cosine_scaled_reward": 0.121705811470747, "rewards/format_reward": 0.3958333432674408, "step": 128 }, { "advantage_max": 1.8235354870557785, "advantage_mean": 6.550302189545221e-08, "advantage_min": -0.9512067809700966, "advantage_std": 0.9996916577219963, "completion_length": 3393.312530517578, "epoch": 0.14742857142857144, "grad_norm": 0.19768068194389343, "kl": 0.004204392433166504, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.332771203643714e-07, "loss": 0.0002, "reward": -0.33976896293461323, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.33976896293461323, "reward_after_std": 0.5787487234920263, "reward_before_mean": -0.18687636405229568, "reward_before_std": 0.5698404628783464, "reward_change_max": 0.0004341527819633484, "reward_change_mean": -0.1528926098253578, "reward_change_min": -0.3150724731385708, "reward_change_std": 0.12029975117184222, "reward_std": 0.5787487458437681, "rewards/cosine_scaled_reward": -0.1559381727129221, "rewards/format_reward": 0.12500000186264515, "step": 129 }, { "advantage_max": 1.7205127328634262, "advantage_mean": 2.1109979431166437e-08, "advantage_min": -1.0131467059254646, "advantage_std": 0.9998162314295769, "completion_length": 3001.9375762939453, "epoch": 0.14857142857142858, "grad_norm": 0.17083537578582764, "kl": 0.002476215362548828, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": -0.08935129642486572, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08935129642486572, "reward_after_std": 0.6507752612233162, "reward_before_mean": 0.14665965549647808, "reward_before_std": 0.6409893110394478, "reward_change_max": 0.0011742040514945984, "reward_change_mean": -0.23601093608886003, "reward_change_min": -0.43137823790311813, "reward_change_std": 0.1740275491029024, "reward_std": 0.6507752649486065, "rewards/cosine_scaled_reward": -0.09333684341982007, "rewards/format_reward": 0.3333333358168602, "step": 130 }, { "advantage_max": 1.6520789712667465, "advantage_mean": -5.122273216695561e-09, "advantage_min": -1.0345544703304768, "advantage_std": 0.9998089522123337, "completion_length": 2968.4584045410156, "epoch": 0.14971428571428572, "grad_norm": 0.2010752111673355, "kl": 0.004048824310302734, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.299475664759068e-07, "loss": 0.0002, "reward": 0.10589881427586079, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10589881427586079, "reward_after_std": 0.779745314270258, "reward_before_mean": 0.40487060509622097, "reward_before_std": 0.8145109578035772, "reward_change_max": 0.0004974156618118286, "reward_change_mean": -0.29897179640829563, "reward_change_min": -0.6234648525714874, "reward_change_std": 0.26533777453005314, "reward_std": 0.7797453217208385, "rewards/cosine_scaled_reward": 0.046185292303562164, "rewards/format_reward": 0.31250000186264515, "step": 131 }, { "advantage_max": 1.6282098293304443, "advantage_mean": -1.148631220693197e-08, "advantage_min": -1.203570768237114, "advantage_std": 0.9997874647378922, "completion_length": 2738.729202270508, "epoch": 0.15085714285714286, "grad_norm": 0.17747002840042114, "kl": 0.0018082857131958008, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, "reward": 0.022863969206809998, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.022863969206809998, "reward_after_std": 0.6254726573824883, "reward_before_mean": 0.30920008942484856, "reward_before_std": 0.6438378132879734, "reward_change_max": 0.0, "reward_change_mean": -0.28633613139390945, "reward_change_min": -0.49352785758674145, "reward_change_std": 0.21022523380815983, "reward_std": 0.6254726871848106, "rewards/cosine_scaled_reward": -0.032899949699640274, "rewards/format_reward": 0.3750000111758709, "step": 132 }, { "advantage_max": 1.7653182744979858, "advantage_mean": 1.5025338073737515e-07, "advantage_min": -0.9128772616386414, "advantage_std": 0.9996593818068504, "completion_length": 3411.687530517578, "epoch": 0.152, "grad_norm": 0.1533278077840805, "kl": 0.0017893314361572266, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.265439410565328e-07, "loss": 0.0001, "reward": -0.3110358901321888, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3110358901321888, "reward_after_std": 0.621335350908339, "reward_before_mean": -0.15247543808072805, "reward_before_std": 0.616786933504045, "reward_change_max": 0.001486368477344513, "reward_change_mean": -0.1585604448337108, "reward_change_min": -0.3544439058750868, "reward_change_std": 0.13797544175758958, "reward_std": 0.6213353676721454, "rewards/cosine_scaled_reward": -0.16998771950602531, "rewards/format_reward": 0.18750000186264515, "step": 133 }, { "advantage_max": 1.6762676239013672, "advantage_mean": 3.1044085080367267e-09, "advantage_min": -1.02804596722126, "advantage_std": 0.9998197704553604, "completion_length": 2377.291702270508, "epoch": 0.15314285714285714, "grad_norm": 0.2139447033405304, "kl": 0.0031991004943847656, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.248145583195447e-07, "loss": 0.0001, "reward": 0.20286109019070864, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.20286109019070864, "reward_after_std": 0.8197793439030647, "reward_before_mean": 0.5282937306910753, "reward_before_std": 0.8379609473049641, "reward_change_max": 0.00047829002141952515, "reward_change_mean": -0.3254326405003667, "reward_change_min": -0.624567674472928, "reward_change_std": 0.2450050301849842, "reward_std": 0.8197793886065483, "rewards/cosine_scaled_reward": -0.027519810013473034, "rewards/format_reward": 0.5833333414047956, "step": 134 }, { "advantage_max": 1.801884487271309, "advantage_mean": -6.736566748877237e-08, "advantage_min": -0.8994268290698528, "advantage_std": 0.9998299553990364, "completion_length": 2069.7083435058594, "epoch": 0.15428571428571428, "grad_norm": 0.2806594669818878, "kl": 0.0039038658142089844, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.230669076497687e-07, "loss": 0.0002, "reward": 0.7120445097534684, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7120445097534684, "reward_after_std": 0.8061739876866341, "reward_before_mean": 1.217892847955227, "reward_before_std": 0.7093977243639529, "reward_change_max": 0.0, "reward_change_mean": -0.5058483928442001, "reward_change_min": -0.7590719014406204, "reward_change_std": 0.3119040550664067, "reward_std": 0.8061739951372147, "rewards/cosine_scaled_reward": 0.2964464204851538, "rewards/format_reward": 0.6250000111758709, "step": 135 }, { "advantage_max": 1.6649291664361954, "advantage_mean": -2.4059166436884993e-08, "advantage_min": -1.0710264444351196, "advantage_std": 0.9997969418764114, "completion_length": 2885.437545776367, "epoch": 0.15542857142857142, "grad_norm": 0.1821409910917282, "kl": 0.002956390380859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.213010742252327e-07, "loss": 0.0001, "reward": 0.16429415345191956, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16429415345191956, "reward_after_std": 0.8656391948461533, "reward_before_mean": 0.46965679712593555, "reward_before_std": 0.9076335597783327, "reward_change_max": 0.0012604892253875732, "reward_change_mean": -0.30536266695708036, "reward_change_min": -0.5674776807427406, "reward_change_std": 0.23894102359190583, "reward_std": 0.8656392116099596, "rewards/cosine_scaled_reward": 0.026495054364204407, "rewards/format_reward": 0.4166666753590107, "step": 136 }, { "advantage_max": 1.7792393267154694, "advantage_mean": 4.594525115919623e-08, "advantage_min": -0.8564295172691345, "advantage_std": 0.9996432065963745, "completion_length": 3005.979202270508, "epoch": 0.15657142857142858, "grad_norm": 0.16614751517772675, "kl": 0.0016880035400390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.195171441101668e-07, "loss": 0.0001, "reward": -0.2532076071947813, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2532076071947813, "reward_after_std": 0.5896189948543906, "reward_before_mean": -0.06770487129688263, "reward_before_std": 0.5853857873007655, "reward_change_max": 0.0006206557154655457, "reward_change_mean": -0.18550273345317692, "reward_change_min": -0.38860990293323994, "reward_change_std": 0.1535409848438576, "reward_std": 0.5896190209314227, "rewards/cosine_scaled_reward": -0.1796857751905918, "rewards/format_reward": 0.29166667349636555, "step": 137 }, { "advantage_max": 1.7774585783481598, "advantage_mean": 1.5522042928761692e-08, "advantage_min": -0.9052527472376823, "advantage_std": 0.9998042657971382, "completion_length": 2607.0417098999023, "epoch": 0.15771428571428572, "grad_norm": 0.18709927797317505, "kl": 0.0021820068359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.177152042508077e-07, "loss": 0.0001, "reward": -0.0209913490107283, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.0209913490107283, "reward_after_std": 0.5939326249063015, "reward_before_mean": 0.2456446960568428, "reward_before_std": 0.5537993088364601, "reward_change_max": 0.0011073648929595947, "reward_change_mean": -0.2666360158473253, "reward_change_min": -0.483868595212698, "reward_change_std": 0.187117257155478, "reward_std": 0.5939326323568821, "rewards/cosine_scaled_reward": -0.13759432919323444, "rewards/format_reward": 0.5208333432674408, "step": 138 }, { "advantage_max": 1.7712155729532242, "advantage_mean": 5.712111783573448e-08, "advantage_min": -0.8953900262713432, "advantage_std": 0.9997160732746124, "completion_length": 3208.8750610351562, "epoch": 0.15885714285714286, "grad_norm": 0.18067364394664764, "kl": 0.0030040740966796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.158953424711624e-07, "loss": 0.0001, "reward": -0.14805123955011368, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": -0.14805123955011368, "reward_after_std": 0.7463845647871494, "reward_before_mean": 0.05662687122821808, "reward_before_std": 0.7650268105790019, "reward_change_max": 0.0, "reward_change_mean": -0.20467811857815832, "reward_change_min": -0.5247853584587574, "reward_change_std": 0.19177173380739987, "reward_std": 0.7463846132159233, "rewards/cosine_scaled_reward": -0.1383532276377082, "rewards/format_reward": 0.3333333358168602, "step": 139 }, { "advantage_max": 1.8531812131404877, "advantage_mean": 1.1175870895385742e-08, "advantage_min": -0.870878055691719, "advantage_std": 0.9998489245772362, "completion_length": 2808.104217529297, "epoch": 0.16, "grad_norm": 0.3052091896533966, "kl": 0.005020618438720703, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.140576474687263e-07, "loss": 0.0002, "reward": 0.04824933037161827, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04824933037161827, "reward_after_std": 0.8385294415056705, "reward_before_mean": 0.3010829398408532, "reward_before_std": 0.7895087189972401, "reward_change_max": 0.0015497952699661255, "reward_change_mean": -0.2528335778042674, "reward_change_min": -0.46051032468676567, "reward_change_std": 0.17168503161519766, "reward_std": 0.8385294824838638, "rewards/cosine_scaled_reward": -0.026541877537965775, "rewards/format_reward": 0.35416667349636555, "step": 140 }, { "advantage_max": 1.7642945349216461, "advantage_mean": 4.159907629475157e-08, "advantage_min": -0.8846809454262257, "advantage_std": 0.9998798370361328, "completion_length": 2980.9375610351562, "epoch": 0.16114285714285714, "grad_norm": 0.24328534305095673, "kl": 0.004295825958251953, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.122022088101613e-07, "loss": 0.0002, "reward": -0.09271654859185219, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09271654859185219, "reward_after_std": 1.0476576872169971, "reward_before_mean": 0.09144638106226921, "reward_before_std": 1.095714807510376, "reward_change_max": 0.000997297465801239, "reward_change_mean": -0.18416290963068604, "reward_change_min": -0.5330093465745449, "reward_change_std": 0.20969668123871088, "reward_std": 1.0476577542722225, "rewards/cosine_scaled_reward": -0.13136015739291906, "rewards/format_reward": 0.354166679084301, "step": 141 }, { "advantage_max": 1.6684113442897797, "advantage_mean": 2.1109978320943412e-08, "advantage_min": -0.9383320510387421, "advantage_std": 0.9998245537281036, "completion_length": 2897.6459045410156, "epoch": 0.16228571428571428, "grad_norm": 0.18924222886562347, "kl": 0.003300189971923828, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.103291169269299e-07, "loss": 0.0001, "reward": -0.01705992827191949, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.01705992827191949, "reward_after_std": 0.7630139328539371, "reward_before_mean": 0.23858460783958435, "reward_before_std": 0.8049006313085556, "reward_change_max": 0.001080736517906189, "reward_change_mean": -0.2556445375084877, "reward_change_min": -0.5574557054787874, "reward_change_std": 0.23587777838110924, "reward_std": 0.7630139626562595, "rewards/cosine_scaled_reward": -0.14112437143921852, "rewards/format_reward": 0.5208333395421505, "step": 142 }, { "advantage_max": 1.6403550207614899, "advantage_mean": 3.1044088966147854e-09, "advantage_min": -1.0210207104682922, "advantage_std": 0.9997807443141937, "completion_length": 2891.375030517578, "epoch": 0.16342857142857142, "grad_norm": 0.21474313735961914, "kl": 0.0063076019287109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.084384631108882e-07, "loss": 0.0003, "reward": -0.20011389954015613, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.20011389954015613, "reward_after_std": 0.5853808727115393, "reward_before_mean": 0.00993584532989189, "reward_before_std": 0.6135505214333534, "reward_change_max": 0.0020471736788749695, "reward_change_mean": -0.21004975493997335, "reward_change_min": -0.44171823002398014, "reward_change_std": 0.18350199330598116, "reward_std": 0.5853809006512165, "rewards/cosine_scaled_reward": -0.22419874742627144, "rewards/format_reward": 0.45833334140479565, "step": 143 }, { "advantage_max": 1.7889351844787598, "advantage_mean": 2.8560561415869046e-08, "advantage_min": -0.8820901550352573, "advantage_std": 0.999851867556572, "completion_length": 2961.5000228881836, "epoch": 0.16457142857142856, "grad_norm": 0.2291809469461441, "kl": 0.0053441524505615234, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.065303395098358e-07, "loss": 0.0002, "reward": 0.027923045679926872, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.027923045679926872, "reward_after_std": 0.9694040082395077, "reward_before_mean": 0.2654368379153311, "reward_before_std": 1.0005079247057438, "reward_change_max": 0.0005721151828765869, "reward_change_mean": -0.23751378356246278, "reward_change_min": -0.5377595163881779, "reward_change_std": 0.21641618467401713, "reward_std": 0.9694040454924107, "rewards/cosine_scaled_reward": -0.033948251977562904, "rewards/format_reward": 0.3333333395421505, "step": 144 }, { "advantage_max": 1.8446239680051804, "advantage_mean": -9.002783740719167e-09, "advantage_min": -0.9158533997833729, "advantage_std": 0.9997996985912323, "completion_length": 2237.375045776367, "epoch": 0.1657142857142857, "grad_norm": 0.2505512535572052, "kl": 0.005219697952270508, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.046048391230247e-07, "loss": 0.0002, "reward": 0.2820494510233402, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2820494510233402, "reward_after_std": 0.6724843680858612, "reward_before_mean": 0.6460502780973911, "reward_before_std": 0.6055637616664171, "reward_change_max": 0.0002672523260116577, "reward_change_mean": -0.3640008121728897, "reward_change_min": -0.5924087129533291, "reward_change_std": 0.2218690738081932, "reward_std": 0.672484390437603, "rewards/cosine_scaled_reward": 0.03135844843927771, "rewards/format_reward": 0.5833333414047956, "step": 145 }, { "advantage_max": 1.771018609404564, "advantage_mean": -2.4835264955669345e-09, "advantage_min": -0.9762936607003212, "advantage_std": 0.9997522756457329, "completion_length": 2690.3958740234375, "epoch": 0.16685714285714287, "grad_norm": 0.20695403218269348, "kl": 0.0024411678314208984, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": -0.3023377712816, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.3023377712816, "reward_after_std": 0.48150240257382393, "reward_before_mean": -0.12040900066494942, "reward_before_std": 0.4715173728764057, "reward_change_max": 0.0003331676125526428, "reward_change_mean": -0.18192878179252148, "reward_change_min": -0.35417483001947403, "reward_change_std": 0.1391107146628201, "reward_std": 0.4815024062991142, "rewards/cosine_scaled_reward": -0.2997878435999155, "rewards/format_reward": 0.47916667349636555, "step": 146 }, { "advantage_max": 1.7762034386396408, "advantage_mean": 3.725290298461914e-09, "advantage_min": -1.0109000727534294, "advantage_std": 0.9998396635055542, "completion_length": 2681.791717529297, "epoch": 0.168, "grad_norm": 0.23290328681468964, "kl": 0.0038328170776367188, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.007020842191634e-07, "loss": 0.0002, "reward": 0.11544580478221178, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.11544580478221178, "reward_after_std": 0.8492474108934402, "reward_before_mean": 0.3966683419421315, "reward_before_std": 0.8318487480282784, "reward_change_max": 0.0005847439169883728, "reward_change_mean": -0.2812225092202425, "reward_change_min": -0.5237756464630365, "reward_change_std": 0.20360548980534077, "reward_std": 0.8492474406957626, "rewards/cosine_scaled_reward": -0.030832513701170683, "rewards/format_reward": 0.4583333395421505, "step": 147 }, { "advantage_max": 1.7309150248765945, "advantage_mean": -3.166496748141512e-08, "advantage_min": -0.9409750513732433, "advantage_std": 0.9997960329055786, "completion_length": 2401.416717529297, "epoch": 0.16914285714285715, "grad_norm": 0.17279323935508728, "kl": 0.002333402633666992, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.987250199168808e-07, "loss": 0.0001, "reward": 0.20989026129245758, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20989026129245758, "reward_after_std": 0.6291894465684891, "reward_before_mean": 0.559065692126751, "reward_before_std": 0.5892521552741528, "reward_change_max": 0.00042691081762313843, "reward_change_mean": -0.3491754289716482, "reward_change_min": -0.5810739956796169, "reward_change_std": 0.2249550400301814, "reward_std": 0.62918945774436, "rewards/cosine_scaled_reward": -0.05380050651729107, "rewards/format_reward": 0.666666679084301, "step": 148 }, { "advantage_max": 1.5458619594573975, "advantage_mean": 1.4901162082026076e-08, "advantage_min": -1.1815932020545006, "advantage_std": 0.9998332262039185, "completion_length": 2644.3333892822266, "epoch": 0.1702857142857143, "grad_norm": 0.20652993023395538, "kl": 0.0053501129150390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.967309592491052e-07, "loss": 0.0002, "reward": 0.2713468788861064, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2713468788861064, "reward_after_std": 0.7613490857183933, "reward_before_mean": 0.6372056715190411, "reward_before_std": 0.8292011804878712, "reward_change_max": 0.00029724836349487305, "reward_change_mean": -0.3658587923273444, "reward_change_min": -0.6828206889331341, "reward_change_std": 0.2869762387126684, "reward_std": 0.7613491117954254, "rewards/cosine_scaled_reward": 0.006102843210101128, "rewards/format_reward": 0.625000013038516, "step": 149 }, { "advantage_max": 1.6237319260835648, "advantage_mean": 2.483526606589237e-09, "advantage_min": -1.017818108201027, "advantage_std": 0.9998415634036064, "completion_length": 2881.3125534057617, "epoch": 0.17142857142857143, "grad_norm": 0.18204692006111145, "kl": 0.004920005798339844, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.9471999940354e-07, "loss": 0.0002, "reward": -0.004406252293847501, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.004406252293847501, "reward_after_std": 0.8845288045704365, "reward_before_mean": 0.24131087586283684, "reward_before_std": 0.9448149167001247, "reward_change_max": 0.0015275925397872925, "reward_change_mean": -0.24571714829653502, "reward_change_min": -0.5798435471951962, "reward_change_std": 0.23720442783087492, "reward_std": 0.8845288455486298, "rewards/cosine_scaled_reward": -0.07726123073371127, "rewards/format_reward": 0.39583334140479565, "step": 150 }, { "advantage_max": 1.7477609366178513, "advantage_mean": 2.3593506592867186e-08, "advantage_min": -1.0146696493029594, "advantage_std": 0.9998183324933052, "completion_length": 2472.229232788086, "epoch": 0.17257142857142857, "grad_norm": 0.23976579308509827, "kl": 0.004656791687011719, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.926922383915315e-07, "loss": 0.0002, "reward": 0.29189296532422304, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.29189296532422304, "reward_after_std": 0.819140899926424, "reward_before_mean": 0.6456237062811852, "reward_before_std": 0.790733078494668, "reward_change_max": 0.0, "reward_change_mean": -0.353730708360672, "reward_change_min": -0.6472217217087746, "reward_change_std": 0.25632510613650084, "reward_std": 0.8191409334540367, "rewards/cosine_scaled_reward": 0.05197850498370826, "rewards/format_reward": 0.541666679084301, "step": 151 }, { "advantage_max": 1.7117934972047806, "advantage_mean": 4.842877565636172e-08, "advantage_min": -0.8732845857739449, "advantage_std": 0.999762549996376, "completion_length": 2872.500011444092, "epoch": 0.1737142857142857, "grad_norm": 0.2292066067457199, "kl": 0.0038803815841674805, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.906477750432903e-07, "loss": 0.0002, "reward": -0.3233077572658658, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.3233077572658658, "reward_after_std": 0.48260749503970146, "reward_before_mean": -0.1462894231081009, "reward_before_std": 0.48653700202703476, "reward_change_max": 0.0008822306990623474, "reward_change_mean": -0.17701831832528114, "reward_change_min": -0.3990900721400976, "reward_change_std": 0.14951521158218384, "reward_std": 0.48260750249028206, "rewards/cosine_scaled_reward": -0.22939471807330847, "rewards/format_reward": 0.3125, "step": 152 }, { "advantage_max": 1.7327706962823868, "advantage_mean": 4.3461721888071736e-08, "advantage_min": -1.0526022017002106, "advantage_std": 0.9997944980859756, "completion_length": 3011.062530517578, "epoch": 0.17485714285714285, "grad_norm": 0.26520031690597534, "kl": 0.005156517028808594, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.88586709003076e-07, "loss": 0.0002, "reward": -0.23992188030388206, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23992188030388206, "reward_after_std": 0.6819894313812256, "reward_before_mean": -0.06268766801804304, "reward_before_std": 0.6885003410279751, "reward_change_max": 0.00020004808902740479, "reward_change_mean": -0.17723419680260122, "reward_change_min": -0.3584374338388443, "reward_change_std": 0.14762821584008634, "reward_std": 0.6819894574582577, "rewards/cosine_scaled_reward": -0.15634385170415044, "rewards/format_reward": 0.2500000074505806, "step": 153 }, { "advantage_max": 1.7009950578212738, "advantage_mean": -1.862645104822036e-08, "advantage_min": -0.8887446373701096, "advantage_std": 0.9998434856534004, "completion_length": 3437.1459045410156, "epoch": 0.176, "grad_norm": 0.15206857025623322, "kl": 0.002620697021484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.865091407243394e-07, "loss": 0.0001, "reward": 0.022732499055564404, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.022732499055564404, "reward_after_std": 1.0319015718996525, "reward_before_mean": 0.2582432683557272, "reward_before_std": 1.1024227440357208, "reward_change_max": 0.0006276369094848633, "reward_change_mean": -0.23551079258322716, "reward_change_min": -0.6142942979931831, "reward_change_std": 0.2546743592247367, "reward_std": 1.0319016017019749, "rewards/cosine_scaled_reward": -0.027128373738378286, "rewards/format_reward": 0.3125000037252903, "step": 154 }, { "advantage_max": 1.6061706393957138, "advantage_mean": 6.661338147750939e-16, "advantage_min": -1.1612521186470985, "advantage_std": 0.9998240694403648, "completion_length": 2568.895866394043, "epoch": 0.17714285714285713, "grad_norm": 0.23418724536895752, "kl": 0.0033125877380371094, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.844151714648274e-07, "loss": 0.0001, "reward": 0.3138754814863205, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3138754814863205, "reward_after_std": 0.8139069825410843, "reward_before_mean": 0.6861309893429279, "reward_before_std": 0.8666851222515106, "reward_change_max": 2.2761523723602295e-05, "reward_change_mean": -0.37225551158189774, "reward_change_min": -0.675981055945158, "reward_change_std": 0.28465048875659704, "reward_std": 0.8139070123434067, "rewards/cosine_scaled_reward": 0.10348214022815228, "rewards/format_reward": 0.4791666753590107, "step": 155 }, { "advantage_max": 1.6984343826770782, "advantage_mean": 3.725290431688677e-08, "advantage_min": -0.9817985594272614, "advantage_std": 0.9997797906398773, "completion_length": 2838.833351135254, "epoch": 0.1782857142857143, "grad_norm": 0.1789286583662033, "kl": 0.002118349075317383, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.823049032816478e-07, "loss": 0.0001, "reward": -0.04328125435858965, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.04328125435858965, "reward_after_std": 0.6757789999246597, "reward_before_mean": 0.20892359875142574, "reward_before_std": 0.6871361993253231, "reward_change_max": 0.0006064549088478088, "reward_change_mean": -0.25220485124737024, "reward_change_min": -0.5162005349993706, "reward_change_std": 0.20435158256441355, "reward_std": 0.6757790241390467, "rewards/cosine_scaled_reward": -0.051788204873446375, "rewards/format_reward": 0.3125000074505806, "step": 156 }, { "advantage_max": 1.738521471619606, "advantage_mean": 4.656612939690774e-08, "advantage_min": -1.0321544483304024, "advantage_std": 0.9997901543974876, "completion_length": 2895.7083740234375, "epoch": 0.17942857142857144, "grad_norm": 0.20847547054290771, "kl": 0.004949331283569336, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.801784390262943e-07, "loss": 0.0002, "reward": -0.07134869415313005, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07134869415313005, "reward_after_std": 0.7075528223067522, "reward_before_mean": 0.16578519972972572, "reward_before_std": 0.7076399326324463, "reward_change_max": 0.0002654939889907837, "reward_change_mean": -0.23713390016928315, "reward_change_min": -0.42857304587960243, "reward_change_std": 0.17433835892006755, "reward_std": 0.7075528409332037, "rewards/cosine_scaled_reward": -0.11502407429179584, "rewards/format_reward": 0.3958333507180214, "step": 157 }, { "advantage_max": 1.8929852694272995, "advantage_mean": 1.8005570812107408e-08, "advantage_min": -0.840268399566412, "advantage_std": 0.9998440369963646, "completion_length": 3047.7709045410156, "epoch": 0.18057142857142858, "grad_norm": 0.16761554777622223, "kl": 0.0032958984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.780358823396352e-07, "loss": 0.0001, "reward": 0.2771665162872523, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2771665162872523, "reward_after_std": 0.8393938392400742, "reward_before_mean": 0.6094142701476812, "reward_before_std": 0.7368073351681232, "reward_change_max": 0.000417560338973999, "reward_change_mean": -0.33224774803966284, "reward_change_min": -0.5177211537957191, "reward_change_std": 0.20231639966368675, "reward_std": 0.8393938541412354, "rewards/cosine_scaled_reward": 0.09637379320338368, "rewards/format_reward": 0.41666667722165585, "step": 158 }, { "advantage_max": 1.7092038244009018, "advantage_mean": 3.2285850659619086e-08, "advantage_min": -1.0310008600354195, "advantage_std": 0.9997366070747375, "completion_length": 2762.6875, "epoch": 0.18171428571428572, "grad_norm": 0.17658032476902008, "kl": 0.00368499755859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.758773376468604e-07, "loss": 0.0001, "reward": -0.2589376363903284, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2589376363903284, "reward_after_std": 0.4805982671678066, "reward_before_mean": -0.05937272682785988, "reward_before_std": 0.47807489708065987, "reward_change_max": 0.0009552910923957825, "reward_change_mean": -0.19956490956246853, "reward_change_min": -0.40817511081695557, "reward_change_std": 0.14949493948370218, "reward_std": 0.4805982783436775, "rewards/cosine_scaled_reward": -0.2171863690018654, "rewards/format_reward": 0.375, "step": 159 }, { "advantage_max": 1.6917408555746078, "advantage_mean": 1.3348957161873898e-08, "advantage_min": -0.992470346391201, "advantage_std": 0.9998021721839905, "completion_length": 2795.979202270508, "epoch": 0.18285714285714286, "grad_norm": 0.20086175203323364, "kl": 0.0056705474853515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.737029101523929e-07, "loss": 0.0002, "reward": -0.03473919133330128, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.03473919133330128, "reward_after_std": 0.789188040420413, "reward_before_mean": 0.20616674236953259, "reward_before_std": 0.8110079057514668, "reward_change_max": 0.0001484006643295288, "reward_change_mean": -0.24090593494474888, "reward_change_min": -0.5440912507474422, "reward_change_std": 0.21322820242494345, "reward_std": 0.7891880441457033, "rewards/cosine_scaled_reward": -0.0844166330061853, "rewards/format_reward": 0.37500000186264515, "step": 160 }, { "advantage_max": 1.5803005397319794, "advantage_mean": 1.0554990159672428e-08, "advantage_min": -1.1769323199987411, "advantage_std": 0.9998519346117973, "completion_length": 2502.062545776367, "epoch": 0.184, "grad_norm": 0.2479405552148819, "kl": 0.011577606201171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.715127058347614e-07, "loss": 0.0005, "reward": 0.29036849178373814, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.29036849178373814, "reward_after_std": 0.9104583151638508, "reward_before_mean": 0.6383865168318152, "reward_before_std": 0.9597002416849136, "reward_change_max": 5.9388577938079834e-05, "reward_change_mean": -0.3480180446058512, "reward_change_min": -0.6826099902391434, "reward_change_std": 0.2846729140728712, "reward_std": 0.910458330065012, "rewards/cosine_scaled_reward": 0.06919325143098831, "rewards/format_reward": 0.5000000167638063, "step": 161 }, { "advantage_max": 1.5485426783561707, "advantage_mean": 5.277494719013731e-08, "advantage_min": -1.1469803005456924, "advantage_std": 0.999797448515892, "completion_length": 2919.750045776367, "epoch": 0.18514285714285714, "grad_norm": 0.2067309468984604, "kl": 0.008340835571289062, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.693068314414344e-07, "loss": 0.0003, "reward": -0.0706309461966157, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0706309461966157, "reward_after_std": 0.7202874831855297, "reward_before_mean": 0.17403563857078552, "reward_before_std": 0.7770990468561649, "reward_change_max": 0.0013139024376869202, "reward_change_mean": -0.2446665894240141, "reward_change_min": -0.5037376508116722, "reward_change_std": 0.21818586625158787, "reward_std": 0.72028748691082, "rewards/cosine_scaled_reward": -0.10048217233270407, "rewards/format_reward": 0.37500000931322575, "step": 162 }, { "advantage_max": 1.7131734490394592, "advantage_mean": -4.594524882772788e-08, "advantage_min": -1.019066572189331, "advantage_std": 0.9997795969247818, "completion_length": 2453.333381652832, "epoch": 0.18628571428571428, "grad_norm": 0.25771090388298035, "kl": 0.0038504600524902344, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.670853944836176e-07, "loss": 0.0002, "reward": 0.49204983934760094, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.49204983934760094, "reward_after_std": 0.6561245676130056, "reward_before_mean": 0.9415503926575184, "reward_before_std": 0.623434953391552, "reward_change_max": 0.00024286657571792603, "reward_change_mean": -0.4495005705393851, "reward_change_min": -0.7392841130495071, "reward_change_std": 0.2843097187578678, "reward_std": 0.6561245918273926, "rewards/cosine_scaled_reward": 0.1686918642371893, "rewards/format_reward": 0.6041666716337204, "step": 163 }, { "advantage_max": 1.7787838876247406, "advantage_mean": 4.7187013296756675e-08, "advantage_min": -0.9464346244931221, "advantage_std": 0.9998352006077766, "completion_length": 2421.1875762939453, "epoch": 0.18742857142857142, "grad_norm": 0.24368895590305328, "kl": 0.0044498443603515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.648485032310144e-07, "loss": 0.0002, "reward": 0.2344353199005127, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2344353199005127, "reward_after_std": 0.7934345155954361, "reward_before_mean": 0.566842844709754, "reward_before_std": 0.7604377698153257, "reward_change_max": 0.000662878155708313, "reward_change_mean": -0.3324074991978705, "reward_change_min": -0.6209581308066845, "reward_change_std": 0.239043434150517, "reward_std": 0.7934345453977585, "rewards/cosine_scaled_reward": 0.01258809631690383, "rewards/format_reward": 0.5416666734963655, "step": 164 }, { "advantage_max": 1.7160216122865677, "advantage_mean": 4.5634808820693706e-08, "advantage_min": -1.0043220072984695, "advantage_std": 0.9997565597295761, "completion_length": 2584.7083892822266, "epoch": 0.18857142857142858, "grad_norm": 0.23223291337490082, "kl": 0.0044155120849609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.625962667065487e-07, "loss": 0.0002, "reward": -0.2676575245568529, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2676575245568529, "reward_after_std": 0.5935316011309624, "reward_before_mean": -0.09024508250877261, "reward_before_std": 0.581473633646965, "reward_change_max": 0.00023662298917770386, "reward_change_mean": -0.17741243075579405, "reward_change_min": -0.3212465066462755, "reward_change_std": 0.12872007954865694, "reward_std": 0.5935316234827042, "rewards/cosine_scaled_reward": -0.23262254672590643, "rewards/format_reward": 0.3750000074505806, "step": 165 }, { "advantage_max": 1.6435736864805222, "advantage_mean": -1.1796753351944744e-08, "advantage_min": -1.1509440392255783, "advantage_std": 0.9997942671179771, "completion_length": 2934.666717529297, "epoch": 0.18971428571428572, "grad_norm": 0.19854706525802612, "kl": 0.003127574920654297, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.603287946810513e-07, "loss": 0.0001, "reward": -0.0017978083342313766, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0017978083342313766, "reward_after_std": 0.7677610591053963, "reward_before_mean": 0.2567120725288987, "reward_before_std": 0.8104214817285538, "reward_change_max": 0.0012328997254371643, "reward_change_mean": -0.25850985338911414, "reward_change_min": -0.5152819417417049, "reward_change_std": 0.22118300944566727, "reward_std": 0.7677610758692026, "rewards/cosine_scaled_reward": -0.03831063862890005, "rewards/format_reward": 0.3333333395421505, "step": 166 }, { "advantage_max": 1.7621634602546692, "advantage_mean": -1.2417632477834672e-09, "advantage_min": -0.9529619812965393, "advantage_std": 0.9998401030898094, "completion_length": 2483.8542404174805, "epoch": 0.19085714285714286, "grad_norm": 0.2032567411661148, "kl": 0.0027036666870117188, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.580461976679099e-07, "loss": 0.0001, "reward": 0.055609733797609806, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.055609733797609806, "reward_after_std": 0.8073126636445522, "reward_before_mean": 0.32204285881016403, "reward_before_std": 0.7914337478578091, "reward_change_max": 0.0, "reward_change_mean": -0.2664331328123808, "reward_change_min": -0.5186327453702688, "reward_change_std": 0.19630906637758017, "reward_std": 0.8073126897215843, "rewards/cosine_scaled_reward": -0.12022857554256916, "rewards/format_reward": 0.5625000167638063, "step": 167 }, { "advantage_max": 1.661664754152298, "advantage_mean": 3.4924596922780715e-08, "advantage_min": -1.1289391070604324, "advantage_std": 0.9997860416769981, "completion_length": 2825.0208435058594, "epoch": 0.192, "grad_norm": 0.16721251606941223, "kl": 0.00345611572265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.557485869176825e-07, "loss": 0.0001, "reward": 0.2044251561164856, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2044251561164856, "reward_after_std": 0.6206236407160759, "reward_before_mean": 0.5572252720594406, "reward_before_std": 0.6114452034235001, "reward_change_max": 8.478760719299316e-05, "reward_change_mean": -0.35280010662972927, "reward_change_min": -0.5640292279422283, "reward_change_std": 0.2297749798744917, "reward_std": 0.6206236593425274, "rewards/cosine_scaled_reward": 0.02861264254897833, "rewards/format_reward": 0.5000000111758709, "step": 168 }, { "advantage_max": 1.755803495645523, "advantage_mean": -3.725290398381986e-08, "advantage_min": -1.0638891533017159, "advantage_std": 0.9998467043042183, "completion_length": 2066.6875534057617, "epoch": 0.19314285714285714, "grad_norm": 0.33004748821258545, "kl": 0.0041844844818115234, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.534360744126753e-07, "loss": 0.0002, "reward": 0.6655769548378885, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6655769548378885, "reward_after_std": 0.7399082817137241, "reward_before_mean": 1.1608899924904108, "reward_before_std": 0.6558673195540905, "reward_change_max": 0.0, "reward_change_mean": -0.49531308002769947, "reward_change_min": -0.7641774192452431, "reward_change_std": 0.29795828089118004, "reward_std": 0.7399082891643047, "rewards/cosine_scaled_reward": 0.21586166881024837, "rewards/format_reward": 0.7291666716337204, "step": 169 }, { "advantage_max": 1.7788434475660324, "advantage_mean": 6.829697918320221e-09, "advantage_min": -0.8895556852221489, "advantage_std": 0.9998018741607666, "completion_length": 2374.458366394043, "epoch": 0.19428571428571428, "grad_norm": 0.19749268889427185, "kl": 0.003981590270996094, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.511087728614862e-07, "loss": 0.0002, "reward": 0.19533677399158478, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.19533677399158478, "reward_after_std": 0.6278506964445114, "reward_before_mean": 0.5367435729131103, "reward_before_std": 0.5629333853721619, "reward_change_max": 0.00014884024858474731, "reward_change_mean": -0.3414067728444934, "reward_change_min": -0.6151969563215971, "reward_change_std": 0.23563461285084486, "reward_std": 0.6278507374227047, "rewards/cosine_scaled_reward": 0.018371770158410072, "rewards/format_reward": 0.5000000074505806, "step": 170 }, { "advantage_max": 1.5723862498998642, "advantage_mean": -4.440892098500626e-16, "advantage_min": -1.1505431681871414, "advantage_std": 0.9998114183545113, "completion_length": 2558.6666870117188, "epoch": 0.19542857142857142, "grad_norm": 0.19266659021377563, "kl": 0.0031795501708984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.487667956935087e-07, "loss": 0.0001, "reward": 0.21401724219322205, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.21401724219322205, "reward_after_std": 0.7456048391759396, "reward_before_mean": 0.5556304566562176, "reward_before_std": 0.7742940001189709, "reward_change_max": 0.00022812187671661377, "reward_change_mean": -0.34161321073770523, "reward_change_min": -0.6149733066558838, "reward_change_std": 0.2527832752093673, "reward_std": 0.7456048503518105, "rewards/cosine_scaled_reward": 0.02781522087752819, "rewards/format_reward": 0.5000000111758709, "step": 171 }, { "advantage_max": 1.7205638736486435, "advantage_mean": -1.9868215184182247e-08, "advantage_min": -0.9265585131943226, "advantage_std": 0.9998242631554604, "completion_length": 2866.6041984558105, "epoch": 0.19657142857142856, "grad_norm": 0.22508394718170166, "kl": 0.004887580871582031, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.464102570534061e-07, "loss": 0.0002, "reward": 0.11025743559002876, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11025743559002876, "reward_after_std": 0.852999858558178, "reward_before_mean": 0.39235235191881657, "reward_before_std": 0.845063797198236, "reward_change_max": 0.0006053447723388672, "reward_change_mean": -0.2820949163287878, "reward_change_min": -0.5408760532736778, "reward_change_std": 0.22555957734584808, "reward_std": 0.8529998771846294, "rewards/cosine_scaled_reward": 0.02950950153172016, "rewards/format_reward": 0.3333333358168602, "step": 172 }, { "advantage_max": 1.847335934638977, "advantage_mean": -1.6763807009212428e-08, "advantage_min": -0.9138921350240707, "advantage_std": 0.9998233467340469, "completion_length": 1595.3541870117188, "epoch": 0.1977142857142857, "grad_norm": 0.24679189920425415, "kl": 0.003414154052734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.440392717955475e-07, "loss": 0.0001, "reward": 0.21174699254333973, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.21174699254333973, "reward_after_std": 0.7577876187860966, "reward_before_mean": 0.5373185947537422, "reward_before_std": 0.6953968647867441, "reward_change_max": 0.00207403302192688, "reward_change_mean": -0.32557161804288626, "reward_change_min": -0.5288458727300167, "reward_change_std": 0.20955347921699286, "reward_std": 0.7577876411378384, "rewards/cosine_scaled_reward": -0.11675737611949444, "rewards/format_reward": 0.7708333395421505, "step": 173 }, { "advantage_max": 1.6314794272184372, "advantage_mean": 3.7563345628433e-08, "advantage_min": -1.247012808918953, "advantage_std": 0.9998359009623528, "completion_length": 2359.916732788086, "epoch": 0.19885714285714284, "grad_norm": 0.25068995356559753, "kl": 0.008630752563476562, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.416539554784089e-07, "loss": 0.0003, "reward": 0.1418452576326672, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1418452576326672, "reward_after_std": 0.7283108457922935, "reward_before_mean": 0.45502501539886, "reward_before_std": 0.7445112951099873, "reward_change_max": 0.0009864121675491333, "reward_change_mean": -0.31317971367388964, "reward_change_min": -0.5779304951429367, "reward_change_std": 0.23148463666439056, "reward_std": 0.7283108867704868, "rewards/cosine_scaled_reward": -0.07457084278576076, "rewards/format_reward": 0.6041666846722364, "step": 174 }, { "advantage_max": 1.6474027633666992, "advantage_mean": -1.4280279403422469e-08, "advantage_min": -1.065418280661106, "advantage_std": 0.9997929632663727, "completion_length": 2647.5833740234375, "epoch": 0.2, "grad_norm": 0.20481029152870178, "kl": 0.0050411224365234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.392544243589427e-07, "loss": 0.0002, "reward": 0.10538380034267902, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.10538380034267902, "reward_after_std": 0.7414525561034679, "reward_before_mean": 0.4085885342210531, "reward_before_std": 0.7780010476708412, "reward_change_max": 0.000210687518119812, "reward_change_mean": -0.30320476926863194, "reward_change_min": -0.6048127487301826, "reward_change_std": 0.24645437160506845, "reward_std": 0.7414525710046291, "rewards/cosine_scaled_reward": -0.02487239707261324, "rewards/format_reward": 0.4583333395421505, "step": 175 }, { "advantage_max": 1.6558598428964615, "advantage_mean": -8.07146260939362e-09, "advantage_min": -1.1213047727942467, "advantage_std": 0.9998004138469696, "completion_length": 2123.875045776367, "epoch": 0.20114285714285715, "grad_norm": 0.23461802303791046, "kl": 0.004420280456542969, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.368407953869103e-07, "loss": 0.0002, "reward": 0.2901948895305395, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2901948895305395, "reward_after_std": 0.7168191187083721, "reward_before_mean": 0.6617438420653343, "reward_before_std": 0.7364391945302486, "reward_change_max": 0.0010713860392570496, "reward_change_mean": -0.3715489520691335, "reward_change_min": -0.6301031894981861, "reward_change_std": 0.2616481054574251, "reward_std": 0.7168191466480494, "rewards/cosine_scaled_reward": -0.002461428754031658, "rewards/format_reward": 0.6666666697710752, "step": 176 }, { "advantage_max": 1.7093915194272995, "advantage_mean": -2.793967834868738e-09, "advantage_min": -1.0653635561466217, "advantage_std": 0.9998228698968887, "completion_length": 2524.6250534057617, "epoch": 0.2022857142857143, "grad_norm": 0.2250983715057373, "kl": 0.005145072937011719, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.344131861991828e-07, "loss": 0.0002, "reward": 0.33880720753222704, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.33880720753222704, "reward_after_std": 0.7156060859560966, "reward_before_mean": 0.7245945755857974, "reward_before_std": 0.6927108000963926, "reward_change_max": 0.0008964464068412781, "reward_change_mean": -0.38578732684254646, "reward_change_min": -0.6814925745129585, "reward_change_std": 0.2651552055031061, "reward_std": 0.7156060971319675, "rewards/cosine_scaled_reward": 0.028963930439203978, "rewards/format_reward": 0.6666666753590107, "step": 177 }, { "advantage_max": 1.708361804485321, "advantage_mean": -1.6142925773898753e-08, "advantage_min": -0.9775385111570358, "advantage_std": 0.9998570010066032, "completion_length": 2525.0625915527344, "epoch": 0.20342857142857143, "grad_norm": 0.19316639006137848, "kl": 0.0074977874755859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.319717151140072e-07, "loss": 0.0003, "reward": 0.1444633398205042, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1444633398205042, "reward_after_std": 0.9142027124762535, "reward_before_mean": 0.43146845512092113, "reward_before_std": 0.9240044802427292, "reward_change_max": 0.0003721490502357483, "reward_change_mean": -0.28700513765215874, "reward_change_min": -0.5878622345626354, "reward_change_std": 0.23241629172116518, "reward_std": 0.9142027199268341, "rewards/cosine_scaled_reward": -0.04468244104646146, "rewards/format_reward": 0.5208333395421505, "step": 178 }, { "advantage_max": 1.7899749428033829, "advantage_mean": 3.601114034745834e-08, "advantage_min": -0.8923960477113724, "advantage_std": 0.9997547343373299, "completion_length": 2693.0417098999023, "epoch": 0.20457142857142857, "grad_norm": 0.18073520064353943, "kl": 0.0033159255981445312, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.295165011252396e-07, "loss": 0.0001, "reward": -0.20887351408600807, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.20887351408600807, "reward_after_std": 0.5476625636219978, "reward_before_mean": -0.0019117742776870728, "reward_before_std": 0.5281481985002756, "reward_change_max": 9.660422801971436e-05, "reward_change_mean": -0.20696171931922436, "reward_change_min": -0.41723362915217876, "reward_change_std": 0.15670202346518636, "reward_std": 0.5476625710725784, "rewards/cosine_scaled_reward": -0.19887255877256393, "rewards/format_reward": 0.3958333395421505, "step": 179 }, { "advantage_max": 1.8884473145008087, "advantage_mean": -1.6142924996742636e-08, "advantage_min": -0.7513148039579391, "advantage_std": 0.9998113960027695, "completion_length": 1993.3333549499512, "epoch": 0.2057142857142857, "grad_norm": 0.2384978085756302, "kl": 0.0056743621826171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.270476638965461e-07, "loss": 0.0002, "reward": 0.335174560546875, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.335174560546875, "reward_after_std": 0.7826052382588387, "reward_before_mean": 0.7005536345532164, "reward_before_std": 0.6965623302385211, "reward_change_max": 0.0, "reward_change_mean": -0.3653790873941034, "reward_change_min": -0.6039573587477207, "reward_change_std": 0.22388500557281077, "reward_std": 0.7826052382588387, "rewards/cosine_scaled_reward": 0.02736013988032937, "rewards/format_reward": 0.645833333954215, "step": 180 }, { "advantage_max": 1.769053503870964, "advantage_mean": 1.4901160194646934e-08, "advantage_min": -0.8874433152377605, "advantage_std": 0.9998081848025322, "completion_length": 3000.7083587646484, "epoch": 0.20685714285714285, "grad_norm": 0.24826426804065704, "kl": 0.005057334899902344, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.245653237555705e-07, "loss": 0.0002, "reward": 0.061417024582624435, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.061417024582624435, "reward_after_std": 0.6935353614389896, "reward_before_mean": 0.34182374289957806, "reward_before_std": 0.6251449626870453, "reward_change_max": 0.0, "reward_change_mean": -0.28040668182075024, "reward_change_min": -0.42512313835322857, "reward_change_std": 0.17812555376440287, "reward_std": 0.6935353688895702, "rewards/cosine_scaled_reward": 0.014661841792985797, "rewards/format_reward": 0.31250000186264515, "step": 181 }, { "advantage_max": 1.757993906736374, "advantage_mean": 8.69234495493032e-09, "advantage_min": -0.8931112438440323, "advantage_std": 0.9998567774891853, "completion_length": 2112.250030517578, "epoch": 0.208, "grad_norm": 0.15252088010311127, "kl": 0.0021615028381347656, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.220696016880687e-07, "loss": 0.0001, "reward": 0.2109930714359507, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2109930714359507, "reward_after_std": 0.8981235288083553, "reward_before_mean": 0.5247037280350924, "reward_before_std": 0.8953145183622837, "reward_change_max": 0.0, "reward_change_mean": -0.3137106457725167, "reward_change_min": -0.5981119312345982, "reward_change_std": 0.24134177062660456, "reward_std": 0.898123562335968, "rewards/cosine_scaled_reward": -0.08139814250171185, "rewards/format_reward": 0.6875000055879354, "step": 182 }, { "advantage_max": 1.6220180839300156, "advantage_mean": -1.552204320631745e-08, "advantage_min": -1.0699148625135422, "advantage_std": 0.9998834803700447, "completion_length": 1983.7083587646484, "epoch": 0.20914285714285713, "grad_norm": 0.3158145546913147, "kl": 0.010114669799804688, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.195606193320136e-07, "loss": 0.0004, "reward": 0.5140761295333505, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5140761295333505, "reward_after_std": 0.9779587611556053, "reward_before_mean": 0.9352103937417269, "reward_before_std": 1.0185614228248596, "reward_change_max": 0.0004429146647453308, "reward_change_mean": -0.42113425582647324, "reward_change_min": -0.8102418407797813, "reward_change_std": 0.3226440940052271, "reward_std": 0.9779587686061859, "rewards/cosine_scaled_reward": 0.06135518569499254, "rewards/format_reward": 0.8125000074505806, "step": 183 }, { "advantage_max": 1.6859700828790665, "advantage_mean": 1.8626448716752009e-09, "advantage_min": -1.0676620677113533, "advantage_std": 0.9997839480638504, "completion_length": 2427.25008392334, "epoch": 0.2102857142857143, "grad_norm": 0.20175513625144958, "kl": 0.005039215087890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.170384989716657e-07, "loss": 0.0002, "reward": -0.05496148514794186, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05496148514794186, "reward_after_std": 0.5272735208272934, "reward_before_mean": 0.21152281388640404, "reward_before_std": 0.5178352911025286, "reward_change_max": 0.0011545643210411072, "reward_change_mean": -0.26648432202637196, "reward_change_min": -0.4563158415257931, "reward_change_std": 0.18578255455940962, "reward_std": 0.5272735357284546, "rewards/cosine_scaled_reward": -0.17548860050737858, "rewards/format_reward": 0.5625000055879354, "step": 184 }, { "advantage_max": 1.7627823650836945, "advantage_mean": 2.017865632919502e-08, "advantage_min": -0.921674445271492, "advantage_std": 0.9997727647423744, "completion_length": 2029.7083549499512, "epoch": 0.21142857142857144, "grad_norm": 0.22637981176376343, "kl": 0.0041866302490234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.145033635316128e-07, "loss": 0.0002, "reward": -0.03227381221950054, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.03227381221950054, "reward_after_std": 0.6277635786682367, "reward_before_mean": 0.2307340381667018, "reward_before_std": 0.6286961147561669, "reward_change_max": 0.0004841610789299011, "reward_change_mean": -0.2630078513175249, "reward_change_min": -0.5276074483990669, "reward_change_std": 0.20393808418884873, "reward_std": 0.6277635879814625, "rewards/cosine_scaled_reward": -0.19713297532871366, "rewards/format_reward": 0.6250000074505806, "step": 185 }, { "advantage_max": 1.6061919927597046, "advantage_mean": 1.7074252012250213e-09, "advantage_min": -1.1070566922426224, "advantage_std": 0.9998181238770485, "completion_length": 2501.6250762939453, "epoch": 0.21257142857142858, "grad_norm": 0.2071540653705597, "kl": 0.0051727294921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.119553365707802e-07, "loss": 0.0002, "reward": 0.09124485030770302, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09124485030770302, "reward_after_std": 0.6487777419388294, "reward_before_mean": 0.4001100370660424, "reward_before_std": 0.670202262699604, "reward_change_max": 0.0011777132749557495, "reward_change_mean": -0.30886520724743605, "reward_change_min": -0.5774688478559256, "reward_change_std": 0.22890846990048885, "reward_std": 0.6487777531147003, "rewards/cosine_scaled_reward": -0.039528317749500275, "rewards/format_reward": 0.47916667349636555, "step": 186 }, { "advantage_max": 1.6829168498516083, "advantage_mean": -1.2883296318655368e-08, "advantage_min": -1.1902817785739899, "advantage_std": 0.9997801929712296, "completion_length": 1949.7917175292969, "epoch": 0.21371428571428572, "grad_norm": 0.2297697514295578, "kl": 0.006892204284667969, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.093945422764069e-07, "loss": 0.0003, "reward": 0.13866344715643208, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.13866344715643208, "reward_after_std": 0.5238078869879246, "reward_before_mean": 0.4772673398256302, "reward_before_std": 0.5128211658447981, "reward_change_max": 1.4215707778930664e-05, "reward_change_mean": -0.33860391844064, "reward_change_min": -0.5362261161208153, "reward_change_std": 0.21345719881355762, "reward_std": 0.5238079018890858, "rewards/cosine_scaled_reward": -0.11553299427032471, "rewards/format_reward": 0.7083333414047956, "step": 187 }, { "advantage_max": 1.7234875112771988, "advantage_mean": 8.661300032741792e-08, "advantage_min": -0.9957601577043533, "advantage_std": 0.9997361674904823, "completion_length": 2821.291702270508, "epoch": 0.21485714285714286, "grad_norm": 0.194006085395813, "kl": 0.005023956298828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.068211054579943e-07, "loss": 0.0002, "reward": -0.2130349650979042, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2130349650979042, "reward_after_std": 0.4473920725286007, "reward_before_mean": 0.005325134843587875, "reward_before_std": 0.41845056042075157, "reward_change_max": 0.0010787621140480042, "reward_change_mean": -0.21836009714752436, "reward_change_min": -0.36716089956462383, "reward_change_std": 0.1442157169803977, "reward_std": 0.447392076253891, "rewards/cosine_scaled_reward": -0.18483743316028267, "rewards/format_reward": 0.37500000558793545, "step": 188 }, { "advantage_max": 1.648227721452713, "advantage_mean": -3.011276394904172e-08, "advantage_min": -1.1830167844891548, "advantage_std": 0.9997998625040054, "completion_length": 2226.1458892822266, "epoch": 0.216, "grad_norm": 0.21700221300125122, "kl": 0.004192352294921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.04235151541222e-07, "loss": 0.0002, "reward": 0.1258898489177227, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1258898489177227, "reward_after_std": 0.6013828478753567, "reward_before_mean": 0.4480108686257154, "reward_before_std": 0.597816426306963, "reward_change_max": 0.0032317787408828735, "reward_change_mean": -0.3221210283227265, "reward_change_min": -0.5646117441356182, "reward_change_std": 0.2199263358488679, "reward_std": 0.601382851600647, "rewards/cosine_scaled_reward": -0.09891124721616507, "rewards/format_reward": 0.645833345130086, "step": 189 }, { "advantage_max": 1.7906949818134308, "advantage_mean": -5.8983763706610404e-08, "advantage_min": -0.8948409780859947, "advantage_std": 0.9998382180929184, "completion_length": 1786.2292175292969, "epoch": 0.21714285714285714, "grad_norm": 0.21094462275505066, "kl": 0.0045032501220703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.01636806561836e-07, "loss": 0.0002, "reward": 0.4733063979074359, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4733063979074359, "reward_after_std": 0.8122390434145927, "reward_before_mean": 0.8912005809834227, "reward_before_std": 0.7499916590750217, "reward_change_max": 0.0, "reward_change_mean": -0.4178942274302244, "reward_change_min": -0.7554305158555508, "reward_change_std": 0.27918168157339096, "reward_std": 0.8122390657663345, "rewards/cosine_scaled_reward": 0.049766961485147476, "rewards/format_reward": 0.7916666716337204, "step": 190 }, { "advantage_max": 1.7209120094776154, "advantage_mean": -1.1175870451296532e-08, "advantage_min": -1.0205633342266083, "advantage_std": 0.9998108744621277, "completion_length": 1704.187515258789, "epoch": 0.21828571428571428, "grad_norm": 0.23883959650993347, "kl": 0.0055179595947265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.990261971595048e-07, "loss": 0.0002, "reward": 0.37219422310590744, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.37219422310590744, "reward_after_std": 0.7899254895746708, "reward_before_mean": 0.7583240692038089, "reward_before_std": 0.7712064934894443, "reward_change_max": 0.0002664923667907715, "reward_change_mean": -0.3861298616975546, "reward_change_min": -0.6892862729728222, "reward_change_std": 0.2698373291641474, "reward_std": 0.7899255082011223, "rewards/cosine_scaled_reward": 0.00416203960776329, "rewards/format_reward": 0.7500000037252903, "step": 191 }, { "advantage_max": 1.6131475567817688, "advantage_mean": 2.1730859944835146e-08, "advantage_min": -1.1713385730981827, "advantage_std": 0.9998152405023575, "completion_length": 2661.416717529297, "epoch": 0.21942857142857142, "grad_norm": 0.19996660947799683, "kl": 0.0050182342529296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.964034505716476e-07, "loss": 0.0002, "reward": -0.09057431854307652, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.09057431854307652, "reward_after_std": 0.6735001057386398, "reward_before_mean": 0.15036606090143323, "reward_before_std": 0.7253024652600288, "reward_change_max": 0.0015541240572929382, "reward_change_mean": -0.24094037897884846, "reward_change_min": -0.5180305242538452, "reward_change_std": 0.21543918922543526, "reward_std": 0.6735001131892204, "rewards/cosine_scaled_reward": -0.16440030559897423, "rewards/format_reward": 0.479166679084301, "step": 192 }, { "advantage_max": 1.597271353006363, "advantage_mean": 1.8626451825376478e-08, "advantage_min": -1.1252245754003525, "advantage_std": 0.999815858900547, "completion_length": 2672.666702270508, "epoch": 0.22057142857142858, "grad_norm": 4.665724277496338, "kl": 0.10103225708007812, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.93768694627233e-07, "loss": 0.0041, "reward": -0.010438046418130398, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.010438046418130398, "reward_after_std": 0.7259911634027958, "reward_before_mean": 0.2530776970088482, "reward_before_std": 0.7804213501513004, "reward_change_max": 0.0007387250661849976, "reward_change_mean": -0.2635157126933336, "reward_change_min": -0.5600268393754959, "reward_change_std": 0.23291568830609322, "reward_std": 0.7259911671280861, "rewards/cosine_scaled_reward": -0.1026278231292963, "rewards/format_reward": 0.45833334513008595, "step": 193 }, { "advantage_max": 1.6629322618246078, "advantage_mean": 1.024455043019401e-08, "advantage_min": -1.0662253573536873, "advantage_std": 0.9998521134257317, "completion_length": 2609.666717529297, "epoch": 0.22171428571428572, "grad_norm": 0.17539848387241364, "kl": 0.005194664001464844, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.911220577405484e-07, "loss": 0.0002, "reward": 0.6561014717444777, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6561014717444777, "reward_after_std": 0.8593751005828381, "reward_before_mean": 1.1453020740300417, "reward_before_std": 0.872301047667861, "reward_change_max": 0.0008367374539375305, "reward_change_mean": -0.48920058365911245, "reward_change_min": -0.8176311627030373, "reward_change_std": 0.34396423678845167, "reward_std": 0.8593751192092896, "rewards/cosine_scaled_reward": 0.22890102118253708, "rewards/format_reward": 0.6875000223517418, "step": 194 }, { "advantage_max": 1.765949085354805, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -1.0046433061361313, "advantage_std": 0.9998738616704941, "completion_length": 2081.250068664551, "epoch": 0.22285714285714286, "grad_norm": 0.17416663467884064, "kl": 0.0044994354248046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.884636689049422e-07, "loss": 0.0002, "reward": 0.28990895487368107, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.28990895487368107, "reward_after_std": 0.9091884158551693, "reward_before_mean": 0.6280412841588259, "reward_before_std": 0.9021401889622211, "reward_change_max": 0.00036738812923431396, "reward_change_mean": -0.33813233859837055, "reward_change_min": -0.6659745052456856, "reward_change_std": 0.25673703476786613, "reward_std": 0.9091884270310402, "rewards/cosine_scaled_reward": -0.04014602582901716, "rewards/format_reward": 0.7083333414047956, "step": 195 }, { "advantage_max": 1.6617747992277145, "advantage_mean": 7.140139368644327e-09, "advantage_min": -1.1848780363798141, "advantage_std": 0.999791108071804, "completion_length": 3116.687545776367, "epoch": 0.224, "grad_norm": 0.18944311141967773, "kl": 0.005228996276855469, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.857936576865356e-07, "loss": 0.0002, "reward": 0.06021673604846001, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06021673604846001, "reward_after_std": 0.6161251626908779, "reward_before_mean": 0.36034686118364334, "reward_before_std": 0.6228807531297207, "reward_change_max": 0.0, "reward_change_mean": -0.3001301297917962, "reward_change_min": -0.5282945893704891, "reward_change_std": 0.2124737584963441, "reward_std": 0.6161251924932003, "rewards/cosine_scaled_reward": -0.038576578721404076, "rewards/format_reward": 0.4375000037252903, "step": 196 }, { "advantage_max": 1.7562730759382248, "advantage_mean": -2.2351742234860694e-08, "advantage_min": -0.9214370250701904, "advantage_std": 0.9998482018709183, "completion_length": 1383.583351135254, "epoch": 0.22514285714285714, "grad_norm": 0.26324012875556946, "kl": 0.005023956298828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.831121542179086e-07, "loss": 0.0002, "reward": 0.4635090157389641, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4635090157389641, "reward_after_std": 0.9525328483432531, "reward_before_mean": 0.8630753671750426, "reward_before_std": 0.9506349777802825, "reward_change_max": 0.00024419277906417847, "reward_change_mean": -0.3995663672685623, "reward_change_min": -0.804127898067236, "reward_change_std": 0.3014351148158312, "reward_std": 0.952532859519124, "rewards/cosine_scaled_reward": -0.005962323164567351, "rewards/format_reward": 0.8750000111758709, "step": 197 }, { "advantage_max": 1.7840017676353455, "advantage_mean": -3.290673239453312e-08, "advantage_min": -0.984473280608654, "advantage_std": 0.9998098015785217, "completion_length": 1875.1042022705078, "epoch": 0.22628571428571428, "grad_norm": 0.23212411999702454, "kl": 0.006099700927734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.804192891917571e-07, "loss": 0.0002, "reward": 0.32404271978884935, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32404271978884935, "reward_after_std": 0.6649378295987844, "reward_before_mean": 0.7054564189165831, "reward_before_std": 0.6262098029255867, "reward_change_max": 0.00044227391481399536, "reward_change_mean": -0.3814137037843466, "reward_change_min": -0.6207973323762417, "reward_change_std": 0.23640513373538852, "reward_std": 0.664937837049365, "rewards/cosine_scaled_reward": -0.011855116579681635, "rewards/format_reward": 0.7291666697710752, "step": 198 }, { "advantage_max": 1.8807825297117233, "advantage_mean": -7.45058070794613e-09, "advantage_min": -0.7725731208920479, "advantage_std": 0.9998340010643005, "completion_length": 1730.5000381469727, "epoch": 0.22742857142857142, "grad_norm": 0.21435077488422394, "kl": 0.005886077880859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.777151938545235e-07, "loss": 0.0002, "reward": 0.2156513766385615, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2156513766385615, "reward_after_std": 0.7489030882716179, "reward_before_mean": 0.5398490279912949, "reward_before_std": 0.6581693179905415, "reward_change_max": 0.0, "reward_change_mean": -0.3241976536810398, "reward_change_min": -0.5322713330388069, "reward_change_std": 0.18413777090609074, "reward_std": 0.7489030919969082, "rewards/cosine_scaled_reward": -0.1675754910102114, "rewards/format_reward": 0.8750000149011612, "step": 199 }, { "advantage_max": 1.6572798639535904, "advantage_mean": -9.623666974434286e-09, "advantage_min": -1.108552247285843, "advantage_std": 0.9998734667897224, "completion_length": 1517.520881652832, "epoch": 0.22857142857142856, "grad_norm": 0.3045530617237091, "kl": 0.0044193267822265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.75e-07, "loss": 0.0002, "reward": 0.6410710557247512, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6410710557247512, "reward_after_std": 0.9314690865576267, "reward_before_mean": 1.1083662956953049, "reward_before_std": 0.9360203258693218, "reward_change_max": 0.0, "reward_change_mean": -0.46729524061083794, "reward_change_min": -0.8295398950576782, "reward_change_std": 0.3195909596979618, "reward_std": 0.9314690865576267, "rewards/cosine_scaled_reward": 0.11668314225971699, "rewards/format_reward": 0.8750000223517418, "step": 200 }, { "advantage_max": 1.808216169476509, "advantage_mean": 2.793967557312982e-09, "advantage_min": -0.8820472247898579, "advantage_std": 0.9998187273740768, "completion_length": 2081.1458892822266, "epoch": 0.2297142857142857, "grad_norm": 0.21598877012729645, "kl": 0.0034742355346679688, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.72273839962904e-07, "loss": 0.0001, "reward": 0.6985563724301755, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6985563724301755, "reward_after_std": 0.7065941356122494, "reward_before_mean": 1.2103144563734531, "reward_before_std": 0.6006112582981586, "reward_change_max": 0.0006928369402885437, "reward_change_mean": -0.5117580210790038, "reward_change_min": -0.7841762602329254, "reward_change_std": 0.3112507052719593, "reward_std": 0.70659414306283, "rewards/cosine_scaled_reward": 0.25099053978919983, "rewards/format_reward": 0.708333333954215, "step": 201 }, { "advantage_max": 1.8314096629619598, "advantage_mean": -2.2351742234860694e-08, "advantage_min": -0.9325228035449982, "advantage_std": 0.9998162910342216, "completion_length": 1540.708339691162, "epoch": 0.23085714285714284, "grad_norm": 0.21634282171726227, "kl": 0.00432586669921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.695368466124296e-07, "loss": 0.0002, "reward": 0.6391191184520721, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6391191184520721, "reward_after_std": 0.6194901466369629, "reward_before_mean": 1.139224898070097, "reward_before_std": 0.4869413049891591, "reward_change_max": 0.0, "reward_change_mean": -0.5001057712361217, "reward_change_min": -0.7225706353783607, "reward_change_std": 0.28183376882225275, "reward_std": 0.6194901615381241, "rewards/cosine_scaled_reward": 0.18419576808810234, "rewards/format_reward": 0.770833333954215, "step": 202 }, { "advantage_max": 1.7919741123914719, "advantage_mean": 9.74008224075007e-09, "advantage_min": -0.987497553229332, "advantage_std": 0.9998574629426003, "completion_length": 2103.2084350585938, "epoch": 0.232, "grad_norm": 0.24814803898334503, "kl": 0.0065250396728515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.667891533457718e-07, "loss": 0.0003, "reward": 0.28046993631869555, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.28046993631869555, "reward_after_std": 0.9375440664589405, "reward_before_mean": 0.608580538071692, "reward_before_std": 0.9169996790587902, "reward_change_max": 0.0007397383451461792, "reward_change_mean": -0.32811061944812536, "reward_change_min": -0.6576522812247276, "reward_change_std": 0.25996890291571617, "reward_std": 0.9375441148877144, "rewards/cosine_scaled_reward": -0.03945971839129925, "rewards/format_reward": 0.6875000111758709, "step": 203 }, { "advantage_max": 1.6612145751714706, "advantage_mean": -2.4214386662446685e-08, "advantage_min": -1.1346538737416267, "advantage_std": 0.999829389154911, "completion_length": 1538.0625305175781, "epoch": 0.23314285714285715, "grad_norm": 0.2627336084842682, "kl": 0.005649566650390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.640308940816239e-07, "loss": 0.0002, "reward": 0.5036372705362737, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5036372705362737, "reward_after_std": 0.7205771654844284, "reward_before_mean": 0.9513525278307498, "reward_before_std": 0.7078847829252481, "reward_change_max": 0.0014368817210197449, "reward_change_mean": -0.4477152395993471, "reward_change_min": -0.7495695874094963, "reward_change_std": 0.3069461267441511, "reward_std": 0.7205771878361702, "rewards/cosine_scaled_reward": 0.05900955572724342, "rewards/format_reward": 0.8333333507180214, "step": 204 }, { "advantage_max": 1.7795833498239517, "advantage_mean": -2.0178655746327934e-08, "advantage_min": -0.9798993840813637, "advantage_std": 0.9998395889997482, "completion_length": 1963.7500610351562, "epoch": 0.2342857142857143, "grad_norm": 0.25391820073127747, "kl": 0.0049304962158203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.612622032536507e-07, "loss": 0.0002, "reward": 0.4566840205807239, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4566840205807239, "reward_after_std": 0.7728042900562286, "reward_before_mean": 0.8691596183925867, "reward_before_std": 0.7022025138139725, "reward_change_max": 0.00012721121311187744, "reward_change_mean": -0.41247556265443563, "reward_change_min": -0.6653988249599934, "reward_change_std": 0.2569505739957094, "reward_std": 0.7728042975068092, "rewards/cosine_scaled_reward": 0.06999644916504622, "rewards/format_reward": 0.7291666753590107, "step": 205 }, { "advantage_max": 1.7442635595798492, "advantage_mean": -7.450581374079945e-09, "advantage_min": -1.0118940994143486, "advantage_std": 0.9997678473591805, "completion_length": 2631.875045776367, "epoch": 0.23542857142857143, "grad_norm": 0.24104242026805878, "kl": 0.0054798126220703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.584832158039378e-07, "loss": 0.0002, "reward": -0.1666237860918045, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.1666237860918045, "reward_after_std": 0.6209865715354681, "reward_before_mean": 0.043679025024175644, "reward_before_std": 0.6138331200927496, "reward_change_max": 0.00023755431175231934, "reward_change_mean": -0.2103028108831495, "reward_change_min": -0.40402938798069954, "reward_change_std": 0.1589896511286497, "reward_std": 0.6209866013377905, "rewards/cosine_scaled_reward": -0.2177438314538449, "rewards/format_reward": 0.47916667722165585, "step": 206 }, { "advantage_max": 1.7491918504238129, "advantage_mean": 6.208818459363386e-10, "advantage_min": -1.0733982175588608, "advantage_std": 0.9998262375593185, "completion_length": 2135.3333740234375, "epoch": 0.23657142857142857, "grad_norm": 0.26506415009498596, "kl": 0.007083892822265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.556940671764124e-07, "loss": 0.0003, "reward": 0.2579227043315768, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2579227043315768, "reward_after_std": 0.7553893104195595, "reward_before_mean": 0.6061990465968847, "reward_before_std": 0.7481972500681877, "reward_change_max": 0.000323563814163208, "reward_change_mean": -0.34827633760869503, "reward_change_min": -0.6217250227928162, "reward_change_std": 0.24982457607984543, "reward_std": 0.7553893364965916, "rewards/cosine_scaled_reward": -0.08231715299189091, "rewards/format_reward": 0.7708333507180214, "step": 207 }, { "advantage_max": 1.9023619145154953, "advantage_mean": -6.54448147341391e-08, "advantage_min": -0.8643072620034218, "advantage_std": 0.9998050183057785, "completion_length": 1508.5208892822266, "epoch": 0.2377142857142857, "grad_norm": 0.2220260202884674, "kl": 0.005397796630859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.528948933102438e-07, "loss": 0.0002, "reward": 0.3743312137085013, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3743312137085013, "reward_after_std": 0.5354681648313999, "reward_before_mean": 0.7853333353996277, "reward_before_std": 0.41513026878237724, "reward_change_max": 0.0, "reward_change_mean": -0.41100213676691055, "reward_change_min": -0.5858609657734632, "reward_change_std": 0.22129827551543713, "reward_std": 0.5354681760072708, "rewards/cosine_scaled_reward": -0.024000000208616257, "rewards/format_reward": 0.8333333414047956, "step": 208 }, { "advantage_max": 1.7572353929281235, "advantage_mean": -9.934107647602275e-09, "advantage_min": -1.0508848205208778, "advantage_std": 0.9998808801174164, "completion_length": 1723.68754196167, "epoch": 0.23885714285714285, "grad_norm": 0.31209099292755127, "kl": 0.006572723388671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.500858306332172e-07, "loss": 0.0003, "reward": 0.4929328802973032, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4929328802973032, "reward_after_std": 0.9200324267148972, "reward_before_mean": 0.9008718383265659, "reward_before_std": 0.8839384727180004, "reward_change_max": 0.0002880990505218506, "reward_change_mean": -0.40793896839022636, "reward_change_min": -0.7071980945765972, "reward_change_std": 0.28069434128701687, "reward_std": 0.9200324341654778, "rewards/cosine_scaled_reward": 0.07543591904686764, "rewards/format_reward": 0.7500000037252903, "step": 209 }, { "advantage_max": 1.855013519525528, "advantage_mean": -2.8560559472978753e-08, "advantage_min": -0.9087839424610138, "advantage_std": 0.9998267069458961, "completion_length": 1806.8958892822266, "epoch": 0.24, "grad_norm": 0.1629388928413391, "kl": 0.0041351318359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.472670160550848e-07, "loss": 0.0002, "reward": 0.28397317486815155, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.28397317486815155, "reward_after_std": 0.7189972102642059, "reward_before_mean": 0.6398107409477234, "reward_before_std": 0.6449246574193239, "reward_change_max": 0.0005335360765457153, "reward_change_mean": -0.3558375835418701, "reward_change_min": -0.5656214728951454, "reward_change_std": 0.2113625300116837, "reward_std": 0.7189972251653671, "rewards/cosine_scaled_reward": -0.07592796720564365, "rewards/format_reward": 0.791666679084301, "step": 210 }, { "advantage_max": 1.8298995792865753, "advantage_mean": -2.7318796114172983e-08, "advantage_min": -0.9699450358748436, "advantage_std": 0.9997979700565338, "completion_length": 1768.3333587646484, "epoch": 0.24114285714285713, "grad_norm": 0.2429257035255432, "kl": 0.0068950653076171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.444385869608921e-07, "loss": 0.0003, "reward": 0.3679083255119622, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3679083255119622, "reward_after_std": 0.5517916530370712, "reward_before_mean": 0.7749578189104795, "reward_before_std": 0.4289670567959547, "reward_change_max": 0.0006231963634490967, "reward_change_mean": -0.40704949386417866, "reward_change_min": -0.5773862935602665, "reward_change_std": 0.22743167914450169, "reward_std": 0.5517916902899742, "rewards/cosine_scaled_reward": 0.043728915974497795, "rewards/format_reward": 0.6875000055879354, "step": 211 }, { "advantage_max": 1.7164371311664581, "advantage_mean": 1.8626452713554897e-08, "advantage_min": -0.9786007851362228, "advantage_std": 0.9997889846563339, "completion_length": 1601.1667022705078, "epoch": 0.2422857142857143, "grad_norm": 0.2649551033973694, "kl": 0.0053863525390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.416006812042827e-07, "loss": 0.0002, "reward": 0.5316742798313498, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5316742798313498, "reward_after_std": 0.6969167459756136, "reward_before_mean": 0.9865418933331966, "reward_before_std": 0.6379282623529434, "reward_change_max": 0.0023511871695518494, "reward_change_mean": -0.454867587890476, "reward_change_min": -0.733961433172226, "reward_change_std": 0.2893480281345546, "reward_std": 0.6969167813658714, "rewards/cosine_scaled_reward": 0.09743759734556079, "rewards/format_reward": 0.7916666716337204, "step": 212 }, { "advantage_max": 1.7899350970983505, "advantage_mean": -1.5211602755194065e-08, "advantage_min": -0.9111004024744034, "advantage_std": 0.9998993426561356, "completion_length": 1980.3750457763672, "epoch": 0.24342857142857144, "grad_norm": 0.34746071696281433, "kl": 0.009075164794921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.387534371007797e-07, "loss": 0.0004, "reward": 0.3948659114539623, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3948659114539623, "reward_after_std": 1.0813026502728462, "reward_before_mean": 0.7464291974902153, "reward_before_std": 1.0565481334924698, "reward_change_max": 0.0008080750703811646, "reward_change_mean": -0.35156326554715633, "reward_change_min": -0.6812599115073681, "reward_change_std": 0.26903059892356396, "reward_std": 1.0813026651740074, "rewards/cosine_scaled_reward": 0.029464581864885986, "rewards/format_reward": 0.6875000074505806, "step": 213 }, { "advantage_max": 1.7154824137687683, "advantage_mean": 1.5056382085543163e-08, "advantage_min": -1.0866163074970245, "advantage_std": 0.9998372048139572, "completion_length": 2199.375045776367, "epoch": 0.24457142857142858, "grad_norm": 0.24130062758922577, "kl": 0.0061092376708984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.358969934210438e-07, "loss": 0.0002, "reward": 0.3310183547437191, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3310183547437191, "reward_after_std": 0.7900069244205952, "reward_before_mean": 0.7001559771597385, "reward_before_std": 0.7506730165332556, "reward_change_max": 0.0006579458713531494, "reward_change_mean": -0.3691375870257616, "reward_change_min": -0.6290277764201164, "reward_change_std": 0.26337078493088484, "reward_std": 0.7900069244205952, "rewards/cosine_scaled_reward": 0.016744631924666464, "rewards/format_reward": 0.6666666828095913, "step": 214 }, { "advantage_max": 1.7432467490434647, "advantage_mean": -2.0023435798321998e-08, "advantage_min": -1.077702857553959, "advantage_std": 0.9997599869966507, "completion_length": 1566.0000228881836, "epoch": 0.24571428571428572, "grad_norm": 0.22881284356117249, "kl": 0.00359344482421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.330314893841101e-07, "loss": 0.0001, "reward": 0.2557785410899669, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2557785410899669, "reward_after_std": 0.45268482342362404, "reward_before_mean": 0.6378269121050835, "reward_before_std": 0.3827780820429325, "reward_change_max": 0.00026692450046539307, "reward_change_mean": -0.3820483675226569, "reward_change_min": -0.5721809715032578, "reward_change_std": 0.21542515978217125, "reward_std": 0.45268482342362404, "rewards/cosine_scaled_reward": -0.10816989466547966, "rewards/format_reward": 0.8541666716337204, "step": 215 }, { "advantage_max": 1.782645806670189, "advantage_mean": -1.1175871006408045e-08, "advantage_min": -0.9347731694579124, "advantage_std": 0.9998399987816811, "completion_length": 1313.1458778381348, "epoch": 0.24685714285714286, "grad_norm": 0.2895472049713135, "kl": 0.005374908447265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.301570646506027e-07, "loss": 0.0002, "reward": 0.5556383287766948, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5556383287766948, "reward_after_std": 0.7719503156840801, "reward_before_mean": 1.007195319980383, "reward_before_std": 0.7181936427950859, "reward_change_max": 0.0006204098463058472, "reward_change_mean": -0.4515569722279906, "reward_change_min": -0.7585007883608341, "reward_change_std": 0.2962316516786814, "reward_std": 0.7719503194093704, "rewards/cosine_scaled_reward": 0.08693097531795502, "rewards/format_reward": 0.833333333954215, "step": 216 }, { "advantage_max": 1.7308773547410965, "advantage_mean": -2.2351742345882997e-08, "advantage_min": -0.9450201913714409, "advantage_std": 0.999861940741539, "completion_length": 1677.333366394043, "epoch": 0.248, "grad_norm": 0.220436692237854, "kl": 0.0054531097412109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.27273859315928e-07, "loss": 0.0002, "reward": 0.5233246088027954, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5233246088027954, "reward_after_std": 0.9267925694584846, "reward_before_mean": 0.9467302523553371, "reward_before_std": 0.916178110986948, "reward_change_max": 0.0010505840182304382, "reward_change_mean": -0.4234056528657675, "reward_change_min": -0.7419403791427612, "reward_change_std": 0.28369833342731, "reward_std": 0.9267925843596458, "rewards/cosine_scaled_reward": 0.09836512617766857, "rewards/format_reward": 0.7500000055879354, "step": 217 }, { "advantage_max": 1.8217461854219437, "advantage_mean": -1.2417635808503746e-09, "advantage_min": -0.9110037386417389, "advantage_std": 0.9998432099819183, "completion_length": 1789.2291793823242, "epoch": 0.24914285714285714, "grad_norm": 0.23365142941474915, "kl": 0.006023406982421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.243820139034464e-07, "loss": 0.0002, "reward": 0.16982462257146835, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16982462257146835, "reward_after_std": 0.7668844200670719, "reward_before_mean": 0.48138661216944456, "reward_before_std": 0.7267553135752678, "reward_change_max": 0.0, "reward_change_mean": -0.3115620091557503, "reward_change_min": -0.5759349763393402, "reward_change_std": 0.2177075995132327, "reward_std": 0.7668844312429428, "rewards/cosine_scaled_reward": -0.1343067018315196, "rewards/format_reward": 0.7500000074505806, "step": 218 }, { "advantage_max": 1.771141156554222, "advantage_mean": -1.893689272058907e-08, "advantage_min": -1.0041688904166222, "advantage_std": 0.9998351633548737, "completion_length": 1520.6458549499512, "epoch": 0.2502857142857143, "grad_norm": 0.2651885747909546, "kl": 0.0072784423828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.214816693576234e-07, "loss": 0.0003, "reward": 0.5208356026560068, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5208356026560068, "reward_after_std": 0.7574468180537224, "reward_before_mean": 0.9618058688938618, "reward_before_std": 0.7059715762734413, "reward_change_max": 0.0, "reward_change_mean": -0.4409702867269516, "reward_change_min": -0.7351334057748318, "reward_change_std": 0.2692116089165211, "reward_std": 0.7574468441307545, "rewards/cosine_scaled_reward": 0.07465293304994702, "rewards/format_reward": 0.812500013038516, "step": 219 }, { "advantage_max": 1.7675309777259827, "advantage_mean": -6.208817460162663e-09, "advantage_min": -0.9326722472906113, "advantage_std": 0.9997783303260803, "completion_length": 1808.1666851043701, "epoch": 0.25142857142857145, "grad_norm": 0.25858521461486816, "kl": 0.00669097900390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.185729670371604e-07, "loss": 0.0003, "reward": -0.03782558673992753, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.03782558673992753, "reward_after_std": 0.5307250507175922, "reward_before_mean": 0.22734039183706045, "reward_before_std": 0.49058748222887516, "reward_change_max": 0.0006260201334953308, "reward_change_mean": -0.2651659846305847, "reward_change_min": -0.46392854675650597, "reward_change_std": 0.16752388421446085, "reward_std": 0.530725060030818, "rewards/cosine_scaled_reward": -0.2613298185169697, "rewards/format_reward": 0.7500000074505806, "step": 220 }, { "advantage_max": 1.849363923072815, "advantage_mean": 8.226683889667186e-09, "advantage_min": -0.8677055388689041, "advantage_std": 0.9998321458697319, "completion_length": 1643.6250343322754, "epoch": 0.25257142857142856, "grad_norm": 0.22296759486198425, "kl": 0.0048961639404296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.156560487081051e-07, "loss": 0.0002, "reward": 0.465535756200552, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.465535756200552, "reward_after_std": 0.69632688164711, "reward_before_mean": 0.8917008936405182, "reward_before_std": 0.5923177506774664, "reward_change_max": 0.0, "reward_change_mean": -0.4261651113629341, "reward_change_min": -0.688133642077446, "reward_change_std": 0.25779683608561754, "reward_std": 0.6963269002735615, "rewards/cosine_scaled_reward": 0.060433757957071066, "rewards/format_reward": 0.7708333432674408, "step": 221 }, { "advantage_max": 1.7082463651895523, "advantage_mean": 3.6011140736036396e-08, "advantage_min": -1.0366277173161507, "advantage_std": 0.9998287856578827, "completion_length": 1782.1666870117188, "epoch": 0.2537142857142857, "grad_norm": 0.23950423300266266, "kl": 0.0060272216796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.127310565369415e-07, "loss": 0.0002, "reward": 0.3626019451767206, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3626019451767206, "reward_after_std": 0.6691025421023369, "reward_before_mean": 0.7616823986172676, "reward_before_std": 0.6462901309132576, "reward_change_max": 2.7194619178771973e-05, "reward_change_mean": -0.39908043551258743, "reward_change_min": -0.6558765843510628, "reward_change_std": 0.2545696459710598, "reward_std": 0.6691025458276272, "rewards/cosine_scaled_reward": 0.016257859766483307, "rewards/format_reward": 0.7291666697710752, "step": 222 }, { "advantage_max": 1.657853752374649, "advantage_mean": 1.8005570368018198e-08, "advantage_min": -1.1626385524868965, "advantage_std": 0.9998083114624023, "completion_length": 1793.7917098999023, "epoch": 0.25485714285714284, "grad_norm": 0.20337367057800293, "kl": 0.0051937103271484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.097981330836616e-07, "loss": 0.0002, "reward": 0.2881290102377534, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2881290102377534, "reward_after_std": 0.6012133844196796, "reward_before_mean": 0.6712304223328829, "reward_before_std": 0.58708431199193, "reward_change_max": 0.0011701211333274841, "reward_change_mean": -0.3831014111638069, "reward_change_min": -0.5992018133401871, "reward_change_std": 0.24398234859108925, "reward_std": 0.6012133918702602, "rewards/cosine_scaled_reward": -0.018551473505795002, "rewards/format_reward": 0.7083333358168602, "step": 223 }, { "advantage_max": 1.6988431364297867, "advantage_mean": -9.934107980669182e-09, "advantage_min": -0.9599725678563118, "advantage_std": 0.9998431652784348, "completion_length": 2086.041732788086, "epoch": 0.256, "grad_norm": 0.19765803217887878, "kl": 0.0055084228515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.068574212948169e-07, "loss": 0.0002, "reward": 0.30418982403352857, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.30418982403352857, "reward_after_std": 0.8525899536907673, "reward_before_mean": 0.6595669016242027, "reward_before_std": 0.8552987053990364, "reward_change_max": 0.0, "reward_change_mean": -0.3553770836442709, "reward_change_min": -0.7288083583116531, "reward_change_std": 0.2651802469044924, "reward_std": 0.8525899797677994, "rewards/cosine_scaled_reward": -0.05563322547823191, "rewards/format_reward": 0.7708333432674408, "step": 224 }, { "advantage_max": 1.6905263662338257, "advantage_mean": 2.0489097363185493e-08, "advantage_min": -1.1092445850372314, "advantage_std": 0.999829076230526, "completion_length": 2680.625045776367, "epoch": 0.2571428571428571, "grad_norm": 0.2719421088695526, "kl": 0.012577056884765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.039090644965509e-07, "loss": 0.0005, "reward": -0.06987423868849874, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.06987423868849874, "reward_after_std": 0.7944876663386822, "reward_before_mean": 0.15667898394167423, "reward_before_std": 0.8274992145597935, "reward_change_max": 0.00026056915521621704, "reward_change_mean": -0.22655323334038258, "reward_change_min": -0.5153910927474499, "reward_change_std": 0.2113372590392828, "reward_std": 0.7944876700639725, "rewards/cosine_scaled_reward": -0.1612438365118578, "rewards/format_reward": 0.4791666828095913, "step": 225 }, { "advantage_max": 1.6395408809185028, "advantage_mean": -4.346172088887101e-09, "advantage_min": -1.205371432006359, "advantage_std": 0.9998501390218735, "completion_length": 1881.666732788086, "epoch": 0.2582857142857143, "grad_norm": 0.20754872262477875, "kl": 0.00574493408203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.009532063876148e-07, "loss": 0.0002, "reward": 0.43648668099194765, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.43648668099194765, "reward_after_std": 0.8036997690796852, "reward_before_mean": 0.8469499461352825, "reward_before_std": 0.7947820425033569, "reward_change_max": 4.5202672481536865e-05, "reward_change_mean": -0.41046320647001266, "reward_change_min": -0.6600200831890106, "reward_change_std": 0.2689579091966152, "reward_std": 0.8036997728049755, "rewards/cosine_scaled_reward": 0.048474946059286594, "rewards/format_reward": 0.7500000149011612, "step": 226 }, { "advantage_max": 1.7682945430278778, "advantage_mean": -2.545615118698663e-08, "advantage_min": -0.9086620286107063, "advantage_std": 0.999826617538929, "completion_length": 1243.5208740234375, "epoch": 0.25942857142857145, "grad_norm": 0.2935648560523987, "kl": 0.0074939727783203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.979899910323624e-07, "loss": 0.0003, "reward": 0.4100653901696205, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4100653901696205, "reward_after_std": 0.7520852331072092, "reward_before_mean": 0.8095783777534962, "reward_before_std": 0.6977898720651865, "reward_change_max": 0.0, "reward_change_mean": -0.39951300621032715, "reward_change_min": -0.7274561859667301, "reward_change_std": 0.252217099070549, "reward_std": 0.752085255458951, "rewards/cosine_scaled_reward": -0.07437748275697231, "rewards/format_reward": 0.9583333358168602, "step": 227 }, { "advantage_max": 1.7306746989488602, "advantage_mean": 3.973643097898716e-08, "advantage_min": -1.0087413154542446, "advantage_std": 0.9997887387871742, "completion_length": 1414.6875381469727, "epoch": 0.26057142857142856, "grad_norm": 0.26180991530418396, "kl": 0.005126953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.950195628537299e-07, "loss": 0.0002, "reward": 0.559282305650413, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.559282305650413, "reward_after_std": 0.7350594010204077, "reward_before_mean": 1.0196346994489431, "reward_before_std": 0.6797581622377038, "reward_change_max": 0.00039236247539520264, "reward_change_mean": -0.46035236120224, "reward_change_min": -0.7457386590540409, "reward_change_std": 0.2904507303610444, "reward_std": 0.7350594233721495, "rewards/cosine_scaled_reward": 0.10356733575463295, "rewards/format_reward": 0.8125000074505806, "step": 228 }, { "advantage_max": 1.8085173517465591, "advantage_mean": 1.583248399050774e-08, "advantage_min": -0.9795521348714828, "advantage_std": 0.9997864812612534, "completion_length": 1644.6041946411133, "epoch": 0.26171428571428573, "grad_norm": 0.25798800587654114, "kl": 0.0072021484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.920420666261961e-07, "loss": 0.0003, "reward": 0.30033984655165114, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.30033984655165114, "reward_after_std": 0.5501540526747704, "reward_before_mean": 0.6844759145751595, "reward_before_std": 0.4642267283052206, "reward_change_max": 0.0001852661371231079, "reward_change_mean": -0.38413603976368904, "reward_change_min": -0.5813822597265244, "reward_change_std": 0.2193038398399949, "reward_std": 0.5501540638506413, "rewards/cosine_scaled_reward": -0.07442873902618885, "rewards/format_reward": 0.8333333469927311, "step": 229 }, { "advantage_max": 1.7984435707330704, "advantage_mean": -8.071462387349015e-09, "advantage_min": -0.9476852715015411, "advantage_std": 0.9997808933258057, "completion_length": 1950.0833892822266, "epoch": 0.26285714285714284, "grad_norm": 0.3283679485321045, "kl": 0.0067119598388671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.890576474687263e-07, "loss": 0.0003, "reward": -0.08325734292156994, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.08325734292156994, "reward_after_std": 0.5276125986129045, "reward_before_mean": 0.16819528955966234, "reward_before_std": 0.4903040248900652, "reward_change_max": 0.0013059750199317932, "reward_change_mean": -0.2514526303857565, "reward_change_min": -0.4294059984385967, "reward_change_std": 0.16498796828091145, "reward_std": 0.5276126060634851, "rewards/cosine_scaled_reward": -0.28048570454120636, "rewards/format_reward": 0.729166679084301, "step": 230 }, { "advantage_max": 1.7109387069940567, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -1.1373762860894203, "advantage_std": 0.9998445957899094, "completion_length": 1855.104248046875, "epoch": 0.264, "grad_norm": 0.20254160463809967, "kl": 0.0066585540771484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.860664508377001e-07, "loss": 0.0003, "reward": 0.4411482270807028, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4411482270807028, "reward_after_std": 0.7419208399951458, "reward_before_mean": 0.8538082093000412, "reward_before_std": 0.6892174817621708, "reward_change_max": 0.0015272125601768494, "reward_change_mean": -0.4126600045710802, "reward_change_min": -0.6403000093996525, "reward_change_std": 0.2542659565806389, "reward_std": 0.7419208474457264, "rewards/cosine_scaled_reward": 0.03107076697051525, "rewards/format_reward": 0.7916666846722364, "step": 231 }, { "advantage_max": 1.8143134117126465, "advantage_mean": 2.173086377510458e-09, "advantage_min": -0.8859074115753174, "advantage_std": 0.99986182898283, "completion_length": 2088.333366394043, "epoch": 0.2651428571428571, "grad_norm": 0.25557824969291687, "kl": 0.0066165924072265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.83068622519821e-07, "loss": 0.0003, "reward": 0.13319048564881086, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13319048564881086, "reward_after_std": 0.782651349902153, "reward_before_mean": 0.4277662206441164, "reward_before_std": 0.7301117442548275, "reward_change_max": 0.0006254464387893677, "reward_change_mean": -0.2945757247507572, "reward_change_min": -0.5024777874350548, "reward_change_std": 0.19493667595088482, "reward_std": 0.7826513797044754, "rewards/cosine_scaled_reward": -0.1402835687622428, "rewards/format_reward": 0.7083333395421505, "step": 232 }, { "advantage_max": 1.8585294634103775, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.8971026092767715, "advantage_std": 0.999821811914444, "completion_length": 1434.93754196167, "epoch": 0.2662857142857143, "grad_norm": 0.3193257749080658, "kl": 0.006378173828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.800643086250121e-07, "loss": 0.0003, "reward": 0.18415822181850672, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18415822181850672, "reward_after_std": 0.7197468094527721, "reward_before_mean": 0.5022184662520885, "reward_before_std": 0.65783516690135, "reward_change_max": 0.005561359226703644, "reward_change_mean": -0.31806023977696896, "reward_change_min": -0.5661965161561966, "reward_change_std": 0.20751821622252464, "reward_std": 0.7197468131780624, "rewards/cosine_scaled_reward": -0.17597410455346107, "rewards/format_reward": 0.8541666939854622, "step": 233 }, { "advantage_max": 1.7506544291973114, "advantage_mean": 2.4214388383292373e-08, "advantage_min": -0.9426228627562523, "advantage_std": 0.9998092576861382, "completion_length": 2045.208381652832, "epoch": 0.2674285714285714, "grad_norm": 0.2814824879169464, "kl": 0.00868988037109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.770536555792944e-07, "loss": 0.0003, "reward": 0.31502785813063383, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.31502785813063383, "reward_after_std": 0.6782049648463726, "reward_before_mean": 0.6948503362946212, "reward_before_std": 0.6401338074356318, "reward_change_max": 0.0, "reward_change_mean": -0.37982242554426193, "reward_change_min": -0.6463275514543056, "reward_change_std": 0.2551281042397022, "reward_std": 0.6782049834728241, "rewards/cosine_scaled_reward": 0.024508466944098473, "rewards/format_reward": 0.6458333414047956, "step": 234 }, { "advantage_max": 1.7867973893880844, "advantage_mean": -2.0489097307674342e-08, "advantage_min": -0.941958375275135, "advantage_std": 0.9998260587453842, "completion_length": 1557.5000534057617, "epoch": 0.26857142857142857, "grad_norm": 0.2836053967475891, "kl": 0.007953643798828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.740368101176495e-07, "loss": 0.0003, "reward": 0.6468500215560198, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6468500215560198, "reward_after_std": 0.6875507459044456, "reward_before_mean": 1.1402056589722633, "reward_before_std": 0.5763269923627377, "reward_change_max": 0.0, "reward_change_mean": -0.49335562624037266, "reward_change_min": -0.7646256927400827, "reward_change_std": 0.28742435295134783, "reward_std": 0.6875507608056068, "rewards/cosine_scaled_reward": 0.16385281505063176, "rewards/format_reward": 0.8125, "step": 235 }, { "advantage_max": 1.7209309339523315, "advantage_mean": -3.601114018092488e-08, "advantage_min": -0.9355932921171188, "advantage_std": 0.9997842013835907, "completion_length": 2091.5416870117188, "epoch": 0.26971428571428574, "grad_norm": 0.2015104740858078, "kl": 0.00628662109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.710139192768694e-07, "loss": 0.0003, "reward": 0.22984511405229568, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22984511405229568, "reward_after_std": 0.709774911403656, "reward_before_mean": 0.5813790038228035, "reward_before_std": 0.7154958508908749, "reward_change_max": 0.0, "reward_change_mean": -0.35153392143547535, "reward_change_min": -0.6476014740765095, "reward_change_std": 0.25439552776515484, "reward_std": 0.7097749188542366, "rewards/cosine_scaled_reward": -0.0530604999512434, "rewards/format_reward": 0.6875000074505806, "step": 236 }, { "advantage_max": 1.7595865577459335, "advantage_mean": -2.6077033199456423e-08, "advantage_min": -0.9901612102985382, "advantage_std": 0.9998205602169037, "completion_length": 1709.583366394043, "epoch": 0.27085714285714285, "grad_norm": 0.21263200044631958, "kl": 0.0065212249755859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.679851303883891e-07, "loss": 0.0003, "reward": 0.46819635666906834, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.46819635666906834, "reward_after_std": 0.7337536811828613, "reward_before_mean": 0.8961526490747929, "reward_before_std": 0.681405933573842, "reward_change_max": 0.0, "reward_change_mean": -0.42795631662011147, "reward_change_min": -0.7515837773680687, "reward_change_std": 0.2844035355374217, "reward_std": 0.7337537072598934, "rewards/cosine_scaled_reward": 0.05224300303962082, "rewards/format_reward": 0.7916666865348816, "step": 237 }, { "advantage_max": 1.7369954288005829, "advantage_mean": -1.1835557922612594e-08, "advantage_min": -1.0499492287635803, "advantage_std": 0.9998564645648003, "completion_length": 1461.3958587646484, "epoch": 0.272, "grad_norm": 0.22223296761512756, "kl": 0.0070953369140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.649505910711058e-07, "loss": 0.0003, "reward": 0.5198564641177654, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5198564641177654, "reward_after_std": 0.8789115846157074, "reward_before_mean": 0.9473402900621295, "reward_before_std": 0.8605465069413185, "reward_change_max": 0.0, "reward_change_mean": -0.42748381942510605, "reward_change_min": -0.7568193078041077, "reward_change_std": 0.28529735654592514, "reward_std": 0.8789115995168686, "rewards/cosine_scaled_reward": 0.03617013603798114, "rewards/format_reward": 0.8750000111758709, "step": 238 }, { "advantage_max": 1.7351737469434738, "advantage_mean": 2.235174390019523e-08, "advantage_min": -0.9487668685615063, "advantage_std": 0.999817244708538, "completion_length": 1483.0000457763672, "epoch": 0.27314285714285713, "grad_norm": 0.24678769707679749, "kl": 0.0050640106201171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.619104492241847e-07, "loss": 0.0002, "reward": 0.7381034195423126, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7381034195423126, "reward_after_std": 0.5955033414065838, "reward_before_mean": 1.280349224805832, "reward_before_std": 0.4761668825522065, "reward_change_max": 0.0007323846220970154, "reward_change_mean": -0.5422458099201322, "reward_change_min": -0.7931827120482922, "reward_change_std": 0.3161123748868704, "reward_std": 0.5955033525824547, "rewards/cosine_scaled_reward": 0.265174625441432, "rewards/format_reward": 0.7500000055879354, "step": 239 }, { "advantage_max": 1.7912371009588242, "advantage_mean": -1.738468857759301e-08, "advantage_min": -0.9993407279253006, "advantage_std": 0.999750941991806, "completion_length": 1787.5625228881836, "epoch": 0.2742857142857143, "grad_norm": 0.2600703835487366, "kl": 0.007839202880859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.588648530198504e-07, "loss": 0.0003, "reward": -0.08032464499410708, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08032464499410708, "reward_after_std": 0.41133350878953934, "reward_before_mean": 0.1878053043037653, "reward_before_std": 0.3531003165990114, "reward_change_max": 0.0003596469759941101, "reward_change_mean": -0.2681299690157175, "reward_change_min": -0.41517274081707, "reward_change_std": 0.15671829506754875, "reward_std": 0.41133351996541023, "rewards/cosine_scaled_reward": -0.2810973487794399, "rewards/format_reward": 0.7500000149011612, "step": 240 }, { "advantage_max": 1.767946794629097, "advantage_mean": -9.313223525708736e-10, "advantage_min": -0.9240212738513947, "advantage_std": 0.9998046532273293, "completion_length": 1953.9166870117188, "epoch": 0.2754285714285714, "grad_norm": 0.22386980056762695, "kl": 0.008207321166992188, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.558139508961654e-07, "loss": 0.0003, "reward": -0.01403034944087267, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.01403034944087267, "reward_after_std": 0.5795079059898853, "reward_before_mean": 0.2563190895016305, "reward_before_std": 0.5416142418980598, "reward_change_max": 0.0009088218212127686, "reward_change_mean": -0.27034944854676723, "reward_change_min": -0.47141827642917633, "reward_change_std": 0.17613562010228634, "reward_std": 0.5795079097151756, "rewards/cosine_scaled_reward": -0.23642379604279995, "rewards/format_reward": 0.7291666716337204, "step": 241 }, { "advantage_max": 1.7918582111597061, "advantage_mean": 2.8560560028090265e-08, "advantage_min": -0.8681273683905602, "advantage_std": 0.9997833594679832, "completion_length": 1473.1458549499512, "epoch": 0.2765714285714286, "grad_norm": 0.2848529517650604, "kl": 0.010305404663085938, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.527578915497951e-07, "loss": 0.0004, "reward": 0.3877807483077049, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3877807483077049, "reward_after_std": 0.7101338040083647, "reward_before_mean": 0.7856805149931461, "reward_before_std": 0.6383706177584827, "reward_change_max": 0.00012966245412826538, "reward_change_mean": -0.3978997580707073, "reward_change_min": -0.665248416364193, "reward_change_std": 0.2482099225744605, "reward_std": 0.710133820772171, "rewards/cosine_scaled_reward": -0.04465975146740675, "rewards/format_reward": 0.8750000074505806, "step": 242 }, { "advantage_max": 1.7156173139810562, "advantage_mean": -9.623666752389681e-09, "advantage_min": -1.0441725701093674, "advantage_std": 0.9998472258448601, "completion_length": 1773.4375228881836, "epoch": 0.2777142857142857, "grad_norm": 0.21271540224552155, "kl": 0.005889892578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.496968239287603e-07, "loss": 0.0002, "reward": 0.5646212995052338, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5646212995052338, "reward_after_std": 0.7931492328643799, "reward_before_mean": 1.0185755155980587, "reward_before_std": 0.7427886761724949, "reward_change_max": 0.0, "reward_change_mean": -0.453954191878438, "reward_change_min": -0.7664834596216679, "reward_change_std": 0.2892959900200367, "reward_std": 0.7931492626667023, "rewards/cosine_scaled_reward": 0.10303773730993271, "rewards/format_reward": 0.8125000149011612, "step": 243 }, { "advantage_max": 1.7865074276924133, "advantage_mean": -3.1044087189791014e-08, "advantage_min": -0.8982498571276665, "advantage_std": 0.9998435750603676, "completion_length": 1897.7916984558105, "epoch": 0.27885714285714286, "grad_norm": 0.24355360865592957, "kl": 0.00751495361328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.466308972251785e-07, "loss": 0.0003, "reward": 0.483418392483145, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.483418392483145, "reward_after_std": 0.7782931216061115, "reward_before_mean": 0.9102065991610289, "reward_before_std": 0.7269596271216869, "reward_change_max": 6.21899962425232e-05, "reward_change_mean": -0.42678820062428713, "reward_change_min": -0.729209691286087, "reward_change_std": 0.2835246790200472, "reward_std": 0.7782931476831436, "rewards/cosine_scaled_reward": 0.10093661397695541, "rewards/format_reward": 0.7083333395421505, "step": 244 }, { "advantage_max": 1.6835286319255829, "advantage_mean": -3.244107016353581e-08, "advantage_min": -1.0740430131554604, "advantage_std": 0.9998616874217987, "completion_length": 2091.1875610351562, "epoch": 0.28, "grad_norm": 0.22807635366916656, "kl": 0.00653076171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.435602608679916e-07, "loss": 0.0003, "reward": 0.49553263932466507, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.49553263932466507, "reward_after_std": 0.9236143790185452, "reward_before_mean": 0.9156752824783325, "reward_before_std": 0.9481416214257479, "reward_change_max": 0.0, "reward_change_mean": -0.42014264315366745, "reward_change_min": -0.8106455057859421, "reward_change_std": 0.32762840017676353, "reward_std": 0.9236144013702869, "rewards/cosine_scaled_reward": 0.08283763099461794, "rewards/format_reward": 0.7500000074505806, "step": 245 }, { "advantage_max": 1.830370932817459, "advantage_mean": -1.8626454822978644e-09, "advantage_min": -0.9631890580058098, "advantage_std": 0.9998647123575211, "completion_length": 1740.145866394043, "epoch": 0.28114285714285714, "grad_norm": 0.19543612003326416, "kl": 0.009716033935546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.404850645156841e-07, "loss": 0.0004, "reward": 0.446297419257462, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.446297419257462, "reward_after_std": 0.9271194748580456, "reward_before_mean": 0.835994508874137, "reward_before_std": 0.8750112801790237, "reward_change_max": 0.0, "reward_change_mean": -0.38969707675278187, "reward_change_min": -0.6727043651044369, "reward_change_std": 0.2569092642515898, "reward_std": 0.927119504660368, "rewards/cosine_scaled_reward": -0.009086107485927641, "rewards/format_reward": 0.8541666828095913, "step": 246 }, { "advantage_max": 1.812906727194786, "advantage_mean": 1.3348957328407351e-08, "advantage_min": -0.8430109173059464, "advantage_std": 0.9998029470443726, "completion_length": 2167.645851135254, "epoch": 0.2822857142857143, "grad_norm": 0.2304369956254959, "kl": 0.010379791259765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.374054580489873e-07, "loss": 0.0004, "reward": -0.04499432723969221, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.04499432723969221, "reward_after_std": 0.7093330919742584, "reward_before_mean": 0.1989488247781992, "reward_before_std": 0.6923120841383934, "reward_change_max": 0.0005119815468788147, "reward_change_mean": -0.24394315527752042, "reward_change_min": -0.46251858957111835, "reward_change_std": 0.18785839760676026, "reward_std": 0.7093331031501293, "rewards/cosine_scaled_reward": -0.20260892622172832, "rewards/format_reward": 0.6041666679084301, "step": 247 }, { "advantage_max": 1.761580914258957, "advantage_mean": -2.7939678126642775e-08, "advantage_min": -0.8895273953676224, "advantage_std": 0.9998041912913322, "completion_length": 1705.4166870117188, "epoch": 0.2834285714285714, "grad_norm": 0.26865696907043457, "kl": 0.009264945983886719, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.343215915635761e-07, "loss": 0.0004, "reward": 0.5432401020079851, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5432401020079851, "reward_after_std": 0.6029433831572533, "reward_before_mean": 1.011788947507739, "reward_before_std": 0.4950931016355753, "reward_change_max": 0.00010482966899871826, "reward_change_mean": -0.46854883804917336, "reward_change_min": -0.7035993114113808, "reward_change_std": 0.27063075825572014, "reward_std": 0.6029433980584145, "rewards/cosine_scaled_reward": 0.1621444746851921, "rewards/format_reward": 0.6875, "step": 248 }, { "advantage_max": 1.7205699533224106, "advantage_mean": -2.3593505260599557e-08, "advantage_min": -0.98729008436203, "advantage_std": 0.99983149766922, "completion_length": 1893.2500305175781, "epoch": 0.2845714285714286, "grad_norm": 0.2509145140647888, "kl": 0.008714675903320312, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.31233615362752e-07, "loss": 0.0003, "reward": 0.5128680039197206, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5128680039197206, "reward_after_std": 0.7195250578224659, "reward_before_mean": 0.9610644094645977, "reward_before_std": 0.6817304203286767, "reward_change_max": 0.0005414485931396484, "reward_change_mean": -0.44819645024836063, "reward_change_min": -0.733538419008255, "reward_change_std": 0.28826362919062376, "reward_std": 0.7195250727236271, "rewards/cosine_scaled_reward": 0.14719888754189014, "rewards/format_reward": 0.6666666772216558, "step": 249 }, { "advantage_max": 1.7117508351802826, "advantage_mean": 1.1796752463766325e-08, "advantage_min": -0.9972748905420303, "advantage_std": 0.999837726354599, "completion_length": 1853.2500610351562, "epoch": 0.2857142857142857, "grad_norm": 0.2886704206466675, "kl": 0.01222991943359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.281416799501187e-07, "loss": 0.0005, "reward": 0.33519203681498766, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.33519203681498766, "reward_after_std": 0.7724294364452362, "reward_before_mean": 0.7103296183049679, "reward_before_std": 0.7524272501468658, "reward_change_max": 0.0, "reward_change_mean": -0.3751375786960125, "reward_change_min": -0.6786920055747032, "reward_change_std": 0.25570056959986687, "reward_std": 0.772429458796978, "rewards/cosine_scaled_reward": -0.05108520283829421, "rewards/format_reward": 0.8125000149011612, "step": 250 }, { "advantage_max": 1.948138415813446, "advantage_mean": -1.73846881335038e-08, "advantage_min": -0.7720078602433205, "advantage_std": 0.9998499900102615, "completion_length": 1489.6042022705078, "epoch": 0.28685714285714287, "grad_norm": 0.2549422085285187, "kl": 0.013996124267578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.25045936022246e-07, "loss": 0.0006, "reward": 0.30879892292432487, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.30879892292432487, "reward_after_std": 0.8464222475886345, "reward_before_mean": 0.64876519003883, "reward_before_std": 0.733924706466496, "reward_change_max": 0.0, "reward_change_mean": -0.33996627293527126, "reward_change_min": -0.5379550084471703, "reward_change_std": 0.2002180265262723, "reward_std": 0.8464222624897957, "rewards/cosine_scaled_reward": -0.08186741732060909, "rewards/format_reward": 0.8125000111758709, "step": 251 }, { "advantage_max": 1.7325638681650162, "advantage_mean": 3.2906732450044274e-08, "advantage_min": -1.0570075288414955, "advantage_std": 0.9997806400060654, "completion_length": 2135.0625610351562, "epoch": 0.288, "grad_norm": 0.2550097405910492, "kl": 0.01044464111328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.219465344613258e-07, "loss": 0.0004, "reward": 0.012848891317844391, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.012848891317844391, "reward_after_std": 0.5318264309316874, "reward_before_mean": 0.3033873178064823, "reward_before_std": 0.4942729715257883, "reward_change_max": 0.00042698532342910767, "reward_change_mean": -0.29053843207657337, "reward_change_min": -0.49809412099421024, "reward_change_std": 0.1977363796904683, "reward_std": 0.5318264402449131, "rewards/cosine_scaled_reward": -0.16080632619559765, "rewards/format_reward": 0.6250000111758709, "step": 252 }, { "advantage_max": 1.8799777328968048, "advantage_mean": 2.483526828633842e-09, "advantage_min": -0.8240553066134453, "advantage_std": 0.9998625963926315, "completion_length": 2007.62504196167, "epoch": 0.28914285714285715, "grad_norm": 0.3555978536605835, "kl": 0.01372528076171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.188436263278172e-07, "loss": 0.0005, "reward": 0.07007137686014175, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07007137686014175, "reward_after_std": 0.8272786363959312, "reward_before_mean": 0.33053629170171916, "reward_before_std": 0.7710927985608578, "reward_change_max": 0.0, "reward_change_mean": -0.2604649029672146, "reward_change_min": -0.4797077141702175, "reward_change_std": 0.17783019971102476, "reward_std": 0.8272786512970924, "rewards/cosine_scaled_reward": -0.16806519869714975, "rewards/format_reward": 0.6666666772216558, "step": 253 }, { "advantage_max": 1.801082119345665, "advantage_mean": -1.8626453157644107e-09, "advantage_min": -0.8402153700590134, "advantage_std": 0.999859981238842, "completion_length": 2428.729232788086, "epoch": 0.29028571428571426, "grad_norm": 0.2833425998687744, "kl": 0.013912200927734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.157373628530852e-07, "loss": 0.0006, "reward": 0.2539786659181118, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2539786659181118, "reward_after_std": 0.9438169002532959, "reward_before_mean": 0.5755032449960709, "reward_before_std": 0.9321219827979803, "reward_change_max": 0.0013218075037002563, "reward_change_mean": -0.32152455020695925, "reward_change_min": -0.7000974081456661, "reward_change_std": 0.26952244341373444, "reward_std": 0.9438169039785862, "rewards/cosine_scaled_reward": -0.003915064735338092, "rewards/format_reward": 0.5833333432674408, "step": 254 }, { "advantage_max": 1.847363829612732, "advantage_mean": -4.967053768289986e-09, "advantage_min": -0.8652100116014481, "advantage_std": 0.9998600110411644, "completion_length": 2457.1458892822266, "epoch": 0.2914285714285714, "grad_norm": 0.2822459936141968, "kl": 0.012584686279296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.126278954320294e-07, "loss": 0.0005, "reward": 0.058952754363417625, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.058952754363417625, "reward_after_std": 0.9104130305349827, "reward_before_mean": 0.3073294563218951, "reward_before_std": 0.8749484308063984, "reward_change_max": 0.002128191292285919, "reward_change_mean": -0.2483767168596387, "reward_change_min": -0.503126535564661, "reward_change_std": 0.19132153410464525, "reward_std": 0.9104130640625954, "rewards/cosine_scaled_reward": -0.13800194393843412, "rewards/format_reward": 0.583333345130086, "step": 255 }, { "advantage_max": 1.6621895879507065, "advantage_mean": 2.9181441929537755e-08, "advantage_min": -1.0839653462171555, "advantage_std": 0.9998313784599304, "completion_length": 1737.7083892822266, "epoch": 0.2925714285714286, "grad_norm": 0.24610646069049835, "kl": 0.0100860595703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.095153756157051e-07, "loss": 0.0004, "reward": 0.46933331596665084, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.46933331596665084, "reward_after_std": 0.8646598383784294, "reward_before_mean": 0.8876928202807903, "reward_before_std": 0.894544892013073, "reward_change_max": 0.0, "reward_change_mean": -0.4183594724163413, "reward_change_min": -0.7175531946122646, "reward_change_std": 0.30656644608825445, "reward_std": 0.8646598570048809, "rewards/cosine_scaled_reward": 0.0584297482855618, "rewards/format_reward": 0.7708333414047956, "step": 256 }, { "advantage_max": 1.6426567435264587, "advantage_mean": -1.1175871561519557e-08, "advantage_min": -1.0939588397741318, "advantage_std": 0.999855637550354, "completion_length": 2350.229202270508, "epoch": 0.2937142857142857, "grad_norm": 0.278710275888443, "kl": 0.00988006591796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.06399955103937e-07, "loss": 0.0004, "reward": 0.39300163090229034, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.39300163090229034, "reward_after_std": 0.9901136755943298, "reward_before_mean": 0.7709381179884076, "reward_before_std": 1.052088800817728, "reward_change_max": 3.172457218170166e-05, "reward_change_mean": -0.3779364926740527, "reward_change_min": -0.806621216237545, "reward_change_std": 0.3292911360040307, "reward_std": 0.9901137053966522, "rewards/cosine_scaled_reward": 0.10421905107796192, "rewards/format_reward": 0.5625000055879354, "step": 257 }, { "advantage_max": 1.714397206902504, "advantage_mean": -2.4835269951672956e-08, "advantage_min": -0.9847019612789154, "advantage_std": 0.999831810593605, "completion_length": 2281.250045776367, "epoch": 0.2948571428571429, "grad_norm": 0.30255088210105896, "kl": 0.009405136108398438, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.032817857379256e-07, "loss": 0.0004, "reward": 0.23631289353215834, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.23631289353215834, "reward_after_std": 0.8485419265925884, "reward_before_mean": 0.5696935877203941, "reward_before_std": 0.8634561467915773, "reward_change_max": 0.0004423484206199646, "reward_change_mean": -0.3333806870505214, "reward_change_min": -0.683384831994772, "reward_change_std": 0.26979376189410686, "reward_std": 0.8485419563949108, "rewards/cosine_scaled_reward": -0.05890321545302868, "rewards/format_reward": 0.6875000111758709, "step": 258 }, { "advantage_max": 1.8956716507673264, "advantage_mean": -3.104401180564764e-10, "advantage_min": -0.805887870490551, "advantage_std": 0.9998586103320122, "completion_length": 1569.7917289733887, "epoch": 0.296, "grad_norm": 0.30994725227355957, "kl": 0.009660720825195312, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.001610194928464e-07, "loss": 0.0004, "reward": 0.53939204569906, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.53939204569906, "reward_after_std": 0.8799764476716518, "reward_before_mean": 0.9608602588996291, "reward_before_std": 0.7651578206568956, "reward_change_max": 0.0004586055874824524, "reward_change_mean": -0.4214681852608919, "reward_change_min": -0.6685535982251167, "reward_change_std": 0.24400191474705935, "reward_std": 0.879976499825716, "rewards/cosine_scaled_reward": 0.07418010605033487, "rewards/format_reward": 0.8125000018626451, "step": 259 }, { "advantage_max": 1.7556828409433365, "advantage_mean": -2.7939678570731985e-08, "advantage_min": -1.0517635755240917, "advantage_std": 0.9998450726270676, "completion_length": 1492.6250648498535, "epoch": 0.29714285714285715, "grad_norm": 0.35956430435180664, "kl": 0.008182525634765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.97037808470444e-07, "loss": 0.0003, "reward": 0.7097536413930357, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7097536413930357, "reward_after_std": 0.7402579858899117, "reward_before_mean": 1.2236465960741043, "reward_before_std": 0.6539434809237719, "reward_change_max": 8.148699998855591e-05, "reward_change_mean": -0.5138929300010204, "reward_change_min": -0.7763529941439629, "reward_change_std": 0.3142371028661728, "reward_std": 0.740257989615202, "rewards/cosine_scaled_reward": 0.18473994603846222, "rewards/format_reward": 0.854166679084301, "step": 260 }, { "advantage_max": 1.7108137905597687, "advantage_mean": 2.7939678515220834e-08, "advantage_min": -1.0510932132601738, "advantage_std": 0.9997649118304253, "completion_length": 2236.416717529297, "epoch": 0.29828571428571427, "grad_norm": 0.2295432835817337, "kl": 0.009693145751953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.939123048916173e-07, "loss": 0.0004, "reward": 0.07659879606217146, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07659879606217146, "reward_after_std": 0.603269699960947, "reward_before_mean": 0.3794133588671684, "reward_before_std": 0.5740582179278135, "reward_change_max": 0.0011111795902252197, "reward_change_mean": -0.30281454883515835, "reward_change_min": -0.548656553030014, "reward_change_std": 0.19812428951263428, "reward_std": 0.6032697055488825, "rewards/cosine_scaled_reward": -0.11237666616216302, "rewards/format_reward": 0.6041666716337204, "step": 261 }, { "advantage_max": 1.739681914448738, "advantage_mean": -3.7252906315288215e-09, "advantage_min": -1.0619397088885307, "advantage_std": 0.999794527888298, "completion_length": 2125.833381652832, "epoch": 0.29942857142857143, "grad_norm": 0.2567846179008484, "kl": 0.01434326171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.907846610890011e-07, "loss": 0.0006, "reward": -0.11946423561312258, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11946423561312258, "reward_after_std": 0.6138969361782074, "reward_before_mean": 0.10810671746730804, "reward_before_std": 0.5971795227378607, "reward_change_max": 0.0005362033843994141, "reward_change_mean": -0.22757095284759998, "reward_change_min": -0.43056872859597206, "reward_change_std": 0.16507507860660553, "reward_std": 0.613896943628788, "rewards/cosine_scaled_reward": -0.23761332035064697, "rewards/format_reward": 0.5833333544433117, "step": 262 }, { "advantage_max": 1.7383368462324142, "advantage_mean": -8.071462664904772e-09, "advantage_min": -1.0678929574787617, "advantage_std": 0.9997862949967384, "completion_length": 1477.8958892822266, "epoch": 0.30057142857142854, "grad_norm": 0.25512686371803284, "kl": 0.008062362670898438, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.87655029499542e-07, "loss": 0.0003, "reward": 0.0012842966243624687, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0012842966243624687, "reward_after_std": 0.4931007996201515, "reward_before_mean": 0.29169657081365585, "reward_before_std": 0.4645478054881096, "reward_change_max": 0.00045380741357803345, "reward_change_mean": -0.29041226767003536, "reward_change_min": -0.4666724391281605, "reward_change_std": 0.18420977145433426, "reward_std": 0.493100818246603, "rewards/cosine_scaled_reward": -0.28123506531119347, "rewards/format_reward": 0.8541666865348816, "step": 263 }, { "advantage_max": 1.7824346870183945, "advantage_mean": 9.313225801665936e-09, "advantage_min": -0.9875854030251503, "advantage_std": 0.9998397901654243, "completion_length": 1735.7708740234375, "epoch": 0.3017142857142857, "grad_norm": 0.2410227209329605, "kl": 0.008129119873046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.845235626570683e-07, "loss": 0.0003, "reward": 0.41272473614662886, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.41272473614662886, "reward_after_std": 0.8832961153239012, "reward_before_mean": 0.7959709726274014, "reward_before_std": 0.8308148887008429, "reward_change_max": 0.0, "reward_change_mean": -0.38324626721441746, "reward_change_min": -0.6677347272634506, "reward_change_std": 0.25016341730952263, "reward_std": 0.8832961358129978, "rewards/cosine_scaled_reward": -0.018681177403777838, "rewards/format_reward": 0.8333333432674408, "step": 264 }, { "advantage_max": 1.7545087188482285, "advantage_mean": -7.450580596923828e-09, "advantage_min": -1.0084333643317223, "advantage_std": 0.9998309835791588, "completion_length": 1624.8958740234375, "epoch": 0.3028571428571429, "grad_norm": 0.3049719035625458, "kl": 0.013256072998046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.813904131848564e-07, "loss": 0.0005, "reward": 0.4534339467063546, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4534339467063546, "reward_after_std": 0.7054571099579334, "reward_before_mean": 0.8818895220756531, "reward_before_std": 0.6590976864099503, "reward_change_max": 0.0, "reward_change_mean": -0.4284555483609438, "reward_change_min": -0.7248011864721775, "reward_change_std": 0.27159628458321095, "reward_std": 0.7054571248590946, "rewards/cosine_scaled_reward": 0.013861406594514847, "rewards/format_reward": 0.8541666772216558, "step": 265 }, { "advantage_max": 1.7123200744390488, "advantage_mean": 4.967053546245381e-09, "advantage_min": -1.02156962454319, "advantage_std": 0.999795064330101, "completion_length": 1811.1042175292969, "epoch": 0.304, "grad_norm": 0.23162756860256195, "kl": 0.008726119995117188, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.78255733788191e-07, "loss": 0.0003, "reward": 0.15333101525902748, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.15333101525902748, "reward_after_std": 0.623324491083622, "reward_before_mean": 0.48525767773389816, "reward_before_std": 0.6184410713613033, "reward_change_max": 0.0012048259377479553, "reward_change_mean": -0.3319266433827579, "reward_change_min": -0.57975123077631, "reward_change_std": 0.23415389098227024, "reward_std": 0.6233245246112347, "rewards/cosine_scaled_reward": -0.1323711797595024, "rewards/format_reward": 0.7500000074505806, "step": 266 }, { "advantage_max": 1.7950815856456757, "advantage_mean": 1.3659397724019584e-08, "advantage_min": -0.9563385173678398, "advantage_std": 0.999781459569931, "completion_length": 2192.291679382324, "epoch": 0.30514285714285716, "grad_norm": 0.23464372754096985, "kl": 0.01287078857421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.751196772469237e-07, "loss": 0.0005, "reward": -0.10196404467569664, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.10196404467569664, "reward_after_std": 0.5876062475144863, "reward_before_mean": 0.13426795229315758, "reward_before_std": 0.5441399849951267, "reward_change_max": 0.0010639578104019165, "reward_change_mean": -0.23623200878500938, "reward_change_min": -0.4007794000208378, "reward_change_std": 0.1526002623140812, "reward_std": 0.5876062549650669, "rewards/cosine_scaled_reward": -0.2036993596702814, "rewards/format_reward": 0.5416666772216558, "step": 267 }, { "advantage_max": 1.7288957834243774, "advantage_mean": -3.725290298461914e-09, "advantage_min": -1.0056947842240334, "advantage_std": 0.9998733922839165, "completion_length": 1718.1875534057617, "epoch": 0.3062857142857143, "grad_norm": 0.2916905879974365, "kl": 0.01799774169921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.71982396408026e-07, "loss": 0.0007, "reward": 0.3987836604937911, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3987836604937911, "reward_after_std": 1.0241619944572449, "reward_before_mean": 0.7649654969573021, "reward_before_std": 1.042592279613018, "reward_change_max": 0.0005851238965988159, "reward_change_mean": -0.36618183366954327, "reward_change_min": -0.7570423036813736, "reward_change_std": 0.29365725815296173, "reward_std": 1.024162009358406, "rewards/cosine_scaled_reward": -0.02376725198701024, "rewards/format_reward": 0.8125000111758709, "step": 268 }, { "advantage_max": 1.7609665095806122, "advantage_mean": 1.4280279847511679e-08, "advantage_min": -1.0161311775445938, "advantage_std": 0.9998008832335472, "completion_length": 1846.4583892822266, "epoch": 0.30742857142857144, "grad_norm": 0.2923096716403961, "kl": 0.0104522705078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.688440441781398e-07, "loss": 0.0004, "reward": 0.014135362580418587, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.014135362580418587, "reward_after_std": 0.5908937342464924, "reward_before_mean": 0.2940690014511347, "reward_before_std": 0.5578373521566391, "reward_change_max": 0.0016152411699295044, "reward_change_mean": -0.27993362897541374, "reward_change_min": -0.4698670580983162, "reward_change_std": 0.1804506159387529, "reward_std": 0.5908937491476536, "rewards/cosine_scaled_reward": -0.20713217929005623, "rewards/format_reward": 0.7083333414047956, "step": 269 }, { "advantage_max": 1.7730845659971237, "advantage_mean": -3.2906732450044274e-08, "advantage_min": -1.0470278412103653, "advantage_std": 0.9998434036970139, "completion_length": 1666.9167098999023, "epoch": 0.30857142857142855, "grad_norm": 0.20551292598247528, "kl": 0.010869979858398438, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.657047735161255e-07, "loss": 0.0004, "reward": 0.675954706966877, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.675954706966877, "reward_after_std": 0.8003102131187916, "reward_before_mean": 1.1688923090696335, "reward_before_std": 0.7315810192376375, "reward_change_max": 0.0002498328685760498, "reward_change_mean": -0.492937583476305, "reward_change_min": -0.7724864110350609, "reward_change_std": 0.30114105716347694, "reward_std": 0.8003102131187916, "rewards/cosine_scaled_reward": 0.12611281359568238, "rewards/format_reward": 0.916666679084301, "step": 270 }, { "advantage_max": 1.7657735347747803, "advantage_mean": -4.097819428228178e-08, "advantage_min": -1.0376286134123802, "advantage_std": 0.9998616874217987, "completion_length": 1451.437557220459, "epoch": 0.3097142857142857, "grad_norm": 0.32915830612182617, "kl": 0.011127471923828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.625647374256061e-07, "loss": 0.0004, "reward": 0.8456198088824749, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8456198088824749, "reward_after_std": 0.8662345372140408, "reward_before_mean": 1.3880500681698322, "reward_before_std": 0.7929088193923235, "reward_change_max": 0.0, "reward_change_mean": -0.542430279776454, "reward_change_min": -0.8362083323299885, "reward_change_std": 0.32424600422382355, "reward_std": 0.8662345595657825, "rewards/cosine_scaled_reward": 0.2565250180196017, "rewards/format_reward": 0.8750000111758709, "step": 271 }, { "advantage_max": 1.7442324459552765, "advantage_mean": -2.2972623303640916e-08, "advantage_min": -0.9717377126216888, "advantage_std": 0.9998524338006973, "completion_length": 2025.2708587646484, "epoch": 0.31085714285714283, "grad_norm": 0.2377343773841858, "kl": 0.014011383056640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.594240889475106e-07, "loss": 0.0006, "reward": 0.434994975104928, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.434994975104928, "reward_after_std": 0.8914618045091629, "reward_before_mean": 0.8303525106748566, "reward_before_std": 0.8790302947163582, "reward_change_max": 0.00046882033348083496, "reward_change_mean": -0.39535755664110184, "reward_change_min": -0.7980300933122635, "reward_change_std": 0.28830082062631845, "reward_std": 0.8914618417620659, "rewards/cosine_scaled_reward": 0.061009590805042535, "rewards/format_reward": 0.7083333395421505, "step": 272 }, { "advantage_max": 1.8797968626022339, "advantage_mean": -2.9491882463927865e-08, "advantage_min": -0.8802257254719734, "advantage_std": 0.9998422265052795, "completion_length": 1598.3542098999023, "epoch": 0.312, "grad_norm": 0.28581616282463074, "kl": 0.010746002197265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.562829811526154e-07, "loss": 0.0004, "reward": 0.511997282737866, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.511997282737866, "reward_after_std": 0.6605219431221485, "reward_before_mean": 0.9574417360126972, "reward_before_std": 0.5232751611620188, "reward_change_max": 0.0, "reward_change_mean": -0.44544440880417824, "reward_change_min": -0.6586003005504608, "reward_change_std": 0.24995562713593245, "reward_std": 0.6605219617486, "rewards/cosine_scaled_reward": 0.07247084565460682, "rewards/format_reward": 0.8125000055879354, "step": 273 }, { "advantage_max": 1.6871041059494019, "advantage_mean": -2.3593506037755674e-08, "advantage_min": -1.1058197170495987, "advantage_std": 0.999884732067585, "completion_length": 1223.8958740234375, "epoch": 0.31314285714285717, "grad_norm": 0.3119834065437317, "kl": 0.011409759521484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.531415671340826e-07, "loss": 0.0005, "reward": 0.7157558053731918, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7157558053731918, "reward_after_std": 0.9508935250341892, "reward_before_mean": 1.2031367234885693, "reward_before_std": 0.9290197528898716, "reward_change_max": 0.0003786534070968628, "reward_change_mean": -0.4873809367418289, "reward_change_min": -0.8486338406801224, "reward_change_std": 0.32481664046645164, "reward_std": 0.9508935511112213, "rewards/cosine_scaled_reward": 0.1328183552250266, "rewards/format_reward": 0.9375000074505806, "step": 274 }, { "advantage_max": 1.8595446348190308, "advantage_mean": -3.011276428210863e-08, "advantage_min": -0.803222294896841, "advantage_std": 0.9998753815889359, "completion_length": 1697.8333435058594, "epoch": 0.3142857142857143, "grad_norm": 0.22085727751255035, "kl": 0.010059356689453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.5e-07, "loss": 0.0004, "reward": 0.6540884003043175, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6540884003043175, "reward_after_std": 0.8756862543523312, "reward_before_mean": 1.1225057151168585, "reward_before_std": 0.7728333584964275, "reward_change_max": 0.0, "reward_change_mean": -0.4684173297137022, "reward_change_min": -0.7583707384765148, "reward_change_std": 0.2782782781869173, "reward_std": 0.8756862618029118, "rewards/cosine_scaled_reward": 0.14458617568016052, "rewards/format_reward": 0.8333333414047956, "step": 275 }, { "advantage_max": 1.7695360034704208, "advantage_mean": 5.898376342905465e-09, "advantage_min": -0.9236005023121834, "advantage_std": 0.999855250120163, "completion_length": 1539.9167098999023, "epoch": 0.31542857142857145, "grad_norm": 0.382769376039505, "kl": 0.01567840576171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.468584328659172e-07, "loss": 0.0006, "reward": 0.18820764060365036, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.18820764060365036, "reward_after_std": 0.8256092742085457, "reward_before_mean": 0.4984450116753578, "reward_before_std": 0.80574531853199, "reward_change_max": 0.0008641183376312256, "reward_change_mean": -0.3102373657748103, "reward_change_min": -0.5691503696143627, "reward_change_std": 0.2160546649247408, "reward_std": 0.8256092742085457, "rewards/cosine_scaled_reward": -0.14661084674298763, "rewards/format_reward": 0.7916666772216558, "step": 276 }, { "advantage_max": 1.7275479137897491, "advantage_mean": 1.536682314728921e-08, "advantage_min": -1.023403838276863, "advantage_std": 0.9998606666922569, "completion_length": 1587.7083740234375, "epoch": 0.31657142857142856, "grad_norm": 0.45101556181907654, "kl": 0.012126922607421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.437170188473847e-07, "loss": 0.0005, "reward": 0.47736689331941307, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.47736689331941307, "reward_after_std": 0.8368259742856026, "reward_before_mean": 0.8933232873678207, "reward_before_std": 0.8181468620896339, "reward_change_max": 0.0009065493941307068, "reward_change_mean": -0.4159563574939966, "reward_change_min": -0.7502005323767662, "reward_change_std": 0.28646659292280674, "reward_std": 0.8368259966373444, "rewards/cosine_scaled_reward": 0.06124496250413358, "rewards/format_reward": 0.7708333414047956, "step": 277 }, { "advantage_max": 1.8014727234840393, "advantage_mean": -3.849466756467024e-08, "advantage_min": -0.9295470267534256, "advantage_std": 0.999811939895153, "completion_length": 1624.145866394043, "epoch": 0.3177142857142857, "grad_norm": 0.4834090769290924, "kl": 0.012338638305664062, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.405759110524894e-07, "loss": 0.0005, "reward": 0.49347927141934633, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.49347927141934633, "reward_after_std": 0.6437284983694553, "reward_before_mean": 0.9337407140992582, "reward_before_std": 0.5646857023239136, "reward_change_max": 0.0, "reward_change_mean": -0.4402614226564765, "reward_change_min": -0.6591823175549507, "reward_change_std": 0.25457138381898403, "reward_std": 0.6437284983694553, "rewards/cosine_scaled_reward": 0.05020365957170725, "rewards/format_reward": 0.8333333395421505, "step": 278 }, { "advantage_max": 1.7749723047018051, "advantage_mean": 6.208820124697922e-10, "advantage_min": -0.9562231674790382, "advantage_std": 0.9998540133237839, "completion_length": 1889.5417175292969, "epoch": 0.31885714285714284, "grad_norm": 0.2553693950176239, "kl": 0.01265716552734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.37435262574394e-07, "loss": 0.0005, "reward": 0.384612288326025, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.384612288326025, "reward_after_std": 0.8437690027058125, "reward_before_mean": 0.7663194974884391, "reward_before_std": 0.8090336602181196, "reward_change_max": 0.0024261847138404846, "reward_change_mean": -0.38170722872018814, "reward_change_min": -0.6650576516985893, "reward_change_std": 0.2538069412112236, "reward_std": 0.8437690064311028, "rewards/cosine_scaled_reward": -0.023090248927474022, "rewards/format_reward": 0.8125000074505806, "step": 279 }, { "advantage_max": 1.7013973146677017, "advantage_mean": -2.2972624746930848e-08, "advantage_min": -1.0674389824271202, "advantage_std": 0.9999038651585579, "completion_length": 1719.43754196167, "epoch": 0.32, "grad_norm": 0.3466682732105255, "kl": 0.01338958740234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.342952264838747e-07, "loss": 0.0005, "reward": 0.8902607094496489, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8902607094496489, "reward_after_std": 1.0842677131295204, "reward_before_mean": 1.4259627908468246, "reward_before_std": 1.0771494135260582, "reward_change_max": 0.0006999745965003967, "reward_change_mean": -0.5357020907104015, "reward_change_min": -1.0063834339380264, "reward_change_std": 0.3776916489005089, "reward_std": 1.0842677503824234, "rewards/cosine_scaled_reward": 0.33798138273414224, "rewards/format_reward": 0.7500000149011612, "step": 280 }, { "advantage_max": 1.8399807959794998, "advantage_mean": 1.2417635808503746e-09, "advantage_min": -0.8623168915510178, "advantage_std": 0.9998367950320244, "completion_length": 2537.6458587646484, "epoch": 0.3211428571428571, "grad_norm": 0.2045326828956604, "kl": 0.01361083984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.311559558218603e-07, "loss": 0.0005, "reward": 0.012729940935969353, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.012729940935969353, "reward_after_std": 0.8245834782719612, "reward_before_mean": 0.25922183008515276, "reward_before_std": 0.794154167175293, "reward_change_max": 0.0015305504202842712, "reward_change_mean": -0.2464919025078416, "reward_change_min": -0.5046870838850737, "reward_change_std": 0.19383440725505352, "reward_std": 0.8245835117995739, "rewards/cosine_scaled_reward": -0.13080575612548273, "rewards/format_reward": 0.520833345130086, "step": 281 }, { "advantage_max": 1.747948780655861, "advantage_mean": -1.9868214629070735e-08, "advantage_min": -1.0267510414123535, "advantage_std": 0.9998375326395035, "completion_length": 1526.6666946411133, "epoch": 0.3222857142857143, "grad_norm": 0.26954999566078186, "kl": 0.0120849609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.28017603591974e-07, "loss": 0.0005, "reward": 0.4195130467414856, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4195130467414856, "reward_after_std": 0.6681225784122944, "reward_before_mean": 0.8378233400289901, "reward_before_std": 0.6094602532684803, "reward_change_max": 0.0, "reward_change_mean": -0.4183103181421757, "reward_change_min": -0.6764197871088982, "reward_change_std": 0.2652474418282509, "reward_std": 0.6681225821375847, "rewards/cosine_scaled_reward": 0.03349499590694904, "rewards/format_reward": 0.7708333395421505, "step": 282 }, { "advantage_max": 1.5678325295448303, "advantage_mean": 1.7384688855148767e-08, "advantage_min": -1.2273004427552223, "advantage_std": 0.9998588636517525, "completion_length": 2126.7084045410156, "epoch": 0.32342857142857145, "grad_norm": 0.22718200087547302, "kl": 0.01085662841796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.248803227530763e-07, "loss": 0.0004, "reward": 0.6442702282220125, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6442702282220125, "reward_after_std": 0.8793414235115051, "reward_before_mean": 1.127575246617198, "reward_before_std": 0.9262066558003426, "reward_change_max": 0.0, "reward_change_mean": -0.48330499418079853, "reward_change_min": -0.8393308259546757, "reward_change_std": 0.3402592074126005, "reward_std": 0.8793414533138275, "rewards/cosine_scaled_reward": 0.17837094888091087, "rewards/format_reward": 0.7708333488553762, "step": 283 }, { "advantage_max": 1.7835517078638077, "advantage_mean": 1.179675268581093e-08, "advantage_min": -0.9026581086218357, "advantage_std": 0.9998344406485558, "completion_length": 1361.6042022705078, "epoch": 0.32457142857142857, "grad_norm": 0.29662030935287476, "kl": 0.009703636169433594, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.21744266211809e-07, "loss": 0.0004, "reward": 0.2081410437822342, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2081410437822342, "reward_after_std": 0.7301465757191181, "reward_before_mean": 0.5356057696044445, "reward_before_std": 0.6986973285675049, "reward_change_max": 0.004074104130268097, "reward_change_mean": -0.3274647332727909, "reward_change_min": -0.6071773357689381, "reward_change_std": 0.22288612741976976, "reward_std": 0.7301465906202793, "rewards/cosine_scaled_reward": -0.13844711519777775, "rewards/format_reward": 0.8125000074505806, "step": 284 }, { "advantage_max": 1.7680955827236176, "advantage_mean": -1.7384688799637615e-08, "advantage_min": -0.9570752456784248, "advantage_std": 0.9998417645692825, "completion_length": 1142.9375534057617, "epoch": 0.32571428571428573, "grad_norm": 0.25314438343048096, "kl": 0.01023101806640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.186095868151436e-07, "loss": 0.0004, "reward": 0.4247536053881049, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4247536053881049, "reward_after_std": 0.7850625067949295, "reward_before_mean": 0.8296149596571922, "reward_before_std": 0.740747943520546, "reward_change_max": 0.0, "reward_change_mean": -0.40486135333776474, "reward_change_min": -0.6949414350092411, "reward_change_std": 0.260646503418684, "reward_std": 0.7850625216960907, "rewards/cosine_scaled_reward": -0.053942530415952206, "rewards/format_reward": 0.9375000074505806, "step": 285 }, { "advantage_max": 1.7359804213047028, "advantage_mean": -3.1044087189791014e-08, "advantage_min": -1.0563341975212097, "advantage_std": 0.9998172670602798, "completion_length": 1429.125015258789, "epoch": 0.32685714285714285, "grad_norm": 0.9064041376113892, "kl": 0.01177215576171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.154764373429315e-07, "loss": 0.0005, "reward": 0.3319121412932873, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3319121412932873, "reward_after_std": 0.6911419406533241, "reward_before_mean": 0.71196573600173, "reward_before_std": 0.6536110751330853, "reward_change_max": 0.0, "reward_change_mean": -0.38005358912050724, "reward_change_min": -0.6438889019191265, "reward_change_std": 0.23880964796990156, "reward_std": 0.6911419592797756, "rewards/cosine_scaled_reward": -0.06068380922079086, "rewards/format_reward": 0.8333333432674408, "step": 286 }, { "advantage_max": 1.867331638932228, "advantage_mean": -2.731879555906147e-08, "advantage_min": -0.8202384114265442, "advantage_std": 0.999826192855835, "completion_length": 1234.4375381469727, "epoch": 0.328, "grad_norm": 0.523858904838562, "kl": 0.019672393798828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.123449705004581e-07, "loss": 0.0008, "reward": 0.43224646942690015, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.43224646942690015, "reward_after_std": 0.704168938100338, "reward_before_mean": 0.8389612957835197, "reward_before_std": 0.5989695750176907, "reward_change_max": 0.0, "reward_change_mean": -0.40671483241021633, "reward_change_min": -0.6221253648400307, "reward_change_std": 0.24588334187865257, "reward_std": 0.7041689455509186, "rewards/cosine_scaled_reward": 0.023647296242415905, "rewards/format_reward": 0.7916666697710752, "step": 287 }, { "advantage_max": 1.7957639396190643, "advantage_mean": -2.4524829278504967e-08, "advantage_min": -1.0449253246188164, "advantage_std": 0.9998297840356827, "completion_length": 1705.8334045410156, "epoch": 0.3291428571428571, "grad_norm": 0.3404673933982849, "kl": 0.0146484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.09215338910999e-07, "loss": 0.0006, "reward": 0.1506073004566133, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1506073004566133, "reward_after_std": 0.7189630642533302, "reward_before_mean": 0.4591069910675287, "reward_before_std": 0.6755936108529568, "reward_change_max": 0.0007332637906074524, "reward_change_mean": -0.3084996845573187, "reward_change_min": -0.522847905755043, "reward_change_std": 0.2017457876354456, "reward_std": 0.718963086605072, "rewards/cosine_scaled_reward": -0.14544653287157416, "rewards/format_reward": 0.7500000149011612, "step": 288 }, { "advantage_max": 1.7939541935920715, "advantage_mean": -4.967053435223079e-09, "advantage_min": -0.943301260471344, "advantage_std": 0.9997204095125198, "completion_length": 1765.395866394043, "epoch": 0.3302857142857143, "grad_norm": 0.4641970694065094, "kl": 0.018299102783203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.060876951083828e-07, "loss": 0.0007, "reward": 0.17819978203624487, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17819978203624487, "reward_after_std": 0.48670842684805393, "reward_before_mean": 0.5280370216351002, "reward_before_std": 0.40649663284420967, "reward_change_max": 0.00031722337007522583, "reward_change_mean": -0.3498372109606862, "reward_change_min": -0.5378180183470249, "reward_change_std": 0.206516417209059, "reward_std": 0.48670843057334423, "rewards/cosine_scaled_reward": -0.06931484490633011, "rewards/format_reward": 0.6666666679084301, "step": 289 }, { "advantage_max": 1.7855763733386993, "advantage_mean": -3.0423204844254315e-08, "advantage_min": -0.9341821298003197, "advantage_std": 0.9998474344611168, "completion_length": 1121.645866394043, "epoch": 0.3314285714285714, "grad_norm": 0.3233749568462372, "kl": 0.010059356689453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.02962191529556e-07, "loss": 0.0004, "reward": 0.628340994939208, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.628340994939208, "reward_after_std": 0.7716374322772026, "reward_before_mean": 1.1041506007313728, "reward_before_std": 0.6896773837506771, "reward_change_max": 0.0, "reward_change_mean": -0.47580961883068085, "reward_change_min": -0.7892936766147614, "reward_change_std": 0.28673662804067135, "reward_std": 0.7716374434530735, "rewards/cosine_scaled_reward": 0.07290861825458705, "rewards/format_reward": 0.9583333358168602, "step": 290 }, { "advantage_max": 1.7877116948366165, "advantage_mean": -3.725290420586447e-08, "advantage_min": -0.8816217482089996, "advantage_std": 0.9998772963881493, "completion_length": 1248.4375228881836, "epoch": 0.3325714285714286, "grad_norm": 0.2256891280412674, "kl": 0.009935379028320312, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.998389805071536e-07, "loss": 0.0004, "reward": 0.5594779495149851, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5594779495149851, "reward_after_std": 0.8831817731261253, "reward_before_mean": 0.996112409979105, "reward_before_std": 0.8126354180276394, "reward_change_max": 0.0, "reward_change_mean": -0.43663448095321655, "reward_change_min": -0.7538793534040451, "reward_change_std": 0.27136605605483055, "reward_std": 0.8831818252801895, "rewards/cosine_scaled_reward": 0.01888952637091279, "rewards/format_reward": 0.9583333358168602, "step": 291 }, { "advantage_max": 1.829864040017128, "advantage_mean": 7.761021492136422e-09, "advantage_min": -0.8813152462244034, "advantage_std": 0.9997895732522011, "completion_length": 1878.145896911621, "epoch": 0.33371428571428574, "grad_norm": 0.2704259157180786, "kl": 0.012691497802734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.967182142620745e-07, "loss": 0.0005, "reward": 0.04974047373980284, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04974047373980284, "reward_after_std": 0.5577663220465183, "reward_before_mean": 0.3450439879670739, "reward_before_std": 0.5048244828358293, "reward_change_max": 0.0005193427205085754, "reward_change_mean": -0.2953035105019808, "reward_change_min": -0.4872824549674988, "reward_change_std": 0.19074784219264984, "reward_std": 0.5577663332223892, "rewards/cosine_scaled_reward": -0.22331134881824255, "rewards/format_reward": 0.7916666716337204, "step": 292 }, { "advantage_max": 1.7227300852537155, "advantage_mean": 7.450581263057643e-09, "advantage_min": -1.1504777669906616, "advantage_std": 0.9997950792312622, "completion_length": 1006.0000228881836, "epoch": 0.33485714285714285, "grad_norm": 0.302920937538147, "kl": 0.008930206298828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.93600044896063e-07, "loss": 0.0004, "reward": 0.4407608639448881, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4407608639448881, "reward_after_std": 0.525139145553112, "reward_before_mean": 0.8833719491958618, "reward_before_std": 0.46765924617648125, "reward_change_max": 0.0, "reward_change_mean": -0.44261104613542557, "reward_change_min": -0.6612563654780388, "reward_change_std": 0.25007418915629387, "reward_std": 0.5251391530036926, "rewards/cosine_scaled_reward": -0.047897398471832275, "rewards/format_reward": 0.9791666716337204, "step": 293 }, { "advantage_max": 1.6939482390880585, "advantage_mean": 1.4901161637936866e-08, "advantage_min": -0.9664500877261162, "advantage_std": 0.9998406693339348, "completion_length": 1792.3542022705078, "epoch": 0.336, "grad_norm": 0.29325243830680847, "kl": 0.01251983642578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.904846243842949e-07, "loss": 0.0005, "reward": 0.36595712695270777, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.36595712695270777, "reward_after_std": 0.8944174610078335, "reward_before_mean": 0.739785697311163, "reward_before_std": 0.9162412490695715, "reward_change_max": 0.0005852505564689636, "reward_change_mean": -0.3738285740837455, "reward_change_min": -0.7336220294237137, "reward_change_std": 0.284961283672601, "reward_std": 0.8944174610078335, "rewards/cosine_scaled_reward": 0.005309512373059988, "rewards/format_reward": 0.7291666865348816, "step": 294 }, { "advantage_max": 1.6340957880020142, "advantage_mean": -7.45058070794613e-09, "advantage_min": -1.1434592232108116, "advantage_std": 0.9998557269573212, "completion_length": 1682.395896911621, "epoch": 0.33714285714285713, "grad_norm": 0.4144446849822998, "kl": 0.016773223876953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.873721045679706e-07, "loss": 0.0007, "reward": 0.5403357809409499, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5403357809409499, "reward_after_std": 0.9011359810829163, "reward_before_mean": 0.9824107913300395, "reward_before_std": 0.9457733444869518, "reward_change_max": 0.00047722458839416504, "reward_change_mean": -0.4420750178396702, "reward_change_min": -0.8015450723469257, "reward_change_std": 0.32963609881699085, "reward_std": 0.9011360183358192, "rewards/cosine_scaled_reward": 0.07453872635960579, "rewards/format_reward": 0.8333333507180214, "step": 295 }, { "advantage_max": 1.7640704810619354, "advantage_mean": -1.4280279514444771e-08, "advantage_min": -0.9065175354480743, "advantage_std": 0.9998138546943665, "completion_length": 1610.1667251586914, "epoch": 0.3382857142857143, "grad_norm": 0.27682623267173767, "kl": 0.011745452880859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.842626371469149e-07, "loss": 0.0005, "reward": 0.37279043020680547, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.37279043020680547, "reward_after_std": 0.7672993745654821, "reward_before_mean": 0.7593379318714142, "reward_before_std": 0.728296990506351, "reward_change_max": 0.0, "reward_change_mean": -0.386547539383173, "reward_change_min": -0.6754231676459312, "reward_change_std": 0.2540635820478201, "reward_std": 0.7672993969172239, "rewards/cosine_scaled_reward": -0.05783102835994214, "rewards/format_reward": 0.8750000149011612, "step": 296 }, { "advantage_max": 1.8145205974578857, "advantage_mean": 1.6065314745183912e-08, "advantage_min": -0.9419127553701401, "advantage_std": 0.9998451545834541, "completion_length": 2349.375045776367, "epoch": 0.3394285714285714, "grad_norm": 0.23118162155151367, "kl": 0.01995849609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.811563736721829e-07, "loss": 0.0008, "reward": 0.05298507632687688, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.05298507632687688, "reward_after_std": 0.7990232817828655, "reward_before_mean": 0.31469991244375706, "reward_before_std": 0.7786304727196693, "reward_change_max": 0.0009455159306526184, "reward_change_mean": -0.2617148347198963, "reward_change_min": -0.5071679316461086, "reward_change_std": 0.19112743251025677, "reward_std": 0.7990233078598976, "rewards/cosine_scaled_reward": -0.1343167219310999, "rewards/format_reward": 0.5833333432674408, "step": 297 }, { "advantage_max": 1.7282511591911316, "advantage_mean": 1.2417645800510968e-09, "advantage_min": -1.0884372144937515, "advantage_std": 0.9998100697994232, "completion_length": 1669.9167098999023, "epoch": 0.3405714285714286, "grad_norm": 0.2794896066188812, "kl": 0.012470245361328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.780534655386743e-07, "loss": 0.0005, "reward": 0.284396500675939, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.284396500675939, "reward_after_std": 0.605756513774395, "reward_before_mean": 0.6611570566892624, "reward_before_std": 0.5663269981741905, "reward_change_max": 0.0, "reward_change_mean": -0.37676048278808594, "reward_change_min": -0.5789894834160805, "reward_change_std": 0.2259794371202588, "reward_std": 0.6057565286755562, "rewards/cosine_scaled_reward": -0.05483816470950842, "rewards/format_reward": 0.7708333432674408, "step": 298 }, { "advantage_max": 1.6793813556432724, "advantage_mean": 2.1730859889323995e-09, "advantage_min": -1.078734129667282, "advantage_std": 0.9998118504881859, "completion_length": 1641.3125457763672, "epoch": 0.3417142857142857, "grad_norm": 0.3441999554634094, "kl": 0.015369415283203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.749540639777539e-07, "loss": 0.0006, "reward": 0.3187793163815513, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3187793163815513, "reward_after_std": 0.6048445105552673, "reward_before_mean": 0.7112625315785408, "reward_before_std": 0.5736962109804153, "reward_change_max": 0.00048673897981643677, "reward_change_mean": -0.39248319156467915, "reward_change_min": -0.6510103233158588, "reward_change_std": 0.2509506158530712, "reward_std": 0.6048445180058479, "rewards/cosine_scaled_reward": -0.008952075615525246, "rewards/format_reward": 0.7291666828095913, "step": 299 }, { "advantage_max": 1.7159940302371979, "advantage_mean": -2.607703264434491e-08, "advantage_min": -1.0400254726409912, "advantage_std": 0.9997934773564339, "completion_length": 1725.7083740234375, "epoch": 0.34285714285714286, "grad_norm": 0.38895562291145325, "kl": 0.01593780517578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.7185832004988133e-07, "loss": 0.0006, "reward": 0.3178744805045426, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3178744805045426, "reward_after_std": 0.5978541970252991, "reward_before_mean": 0.7093947567045689, "reward_before_std": 0.5653713010251522, "reward_change_max": 0.0005860105156898499, "reward_change_mean": -0.3915202720090747, "reward_change_min": -0.6351774521172047, "reward_change_std": 0.2517464295960963, "reward_std": 0.5978542380034924, "rewards/cosine_scaled_reward": -0.009885963052511215, "rewards/format_reward": 0.729166679084301, "step": 300 }, { "advantage_max": 1.7751188725233078, "advantage_mean": 3.725290520506519e-09, "advantage_min": -1.0983238443732262, "advantage_std": 0.9998164921998978, "completion_length": 1744.6458740234375, "epoch": 0.344, "grad_norm": 0.3116256892681122, "kl": 0.01949310302734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.68766384637248e-07, "loss": 0.0008, "reward": 0.27887736656703055, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.27887736656703055, "reward_after_std": 0.6220687516033649, "reward_before_mean": 0.644267542520538, "reward_before_std": 0.5633281767368317, "reward_change_max": 0.0, "reward_change_mean": -0.3653901796787977, "reward_change_min": -0.5764031857252121, "reward_change_std": 0.22195403650403023, "reward_std": 0.6220687516033649, "rewards/cosine_scaled_reward": -0.10494956793263555, "rewards/format_reward": 0.8541666753590107, "step": 301 }, { "advantage_max": 1.760393887758255, "advantage_mean": 1.1796753462967047e-08, "advantage_min": -0.9932909980416298, "advantage_std": 0.9998266994953156, "completion_length": 1877.708381652832, "epoch": 0.34514285714285714, "grad_norm": 0.29929548501968384, "kl": 0.020648956298828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.656784084364238e-07, "loss": 0.0008, "reward": 0.27130209654569626, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.27130209654569626, "reward_after_std": 0.7204683609306812, "reward_before_mean": 0.6241901405155659, "reward_before_std": 0.6644104095175862, "reward_change_max": 0.0022316426038742065, "reward_change_mean": -0.35288800951093435, "reward_change_min": -0.6139239706099033, "reward_change_std": 0.23199661634862423, "reward_std": 0.720468383282423, "rewards/cosine_scaled_reward": -0.0004049413837492466, "rewards/format_reward": 0.6250000111758709, "step": 302 }, { "advantage_max": 1.7996810972690582, "advantage_mean": 0.0, "advantage_min": -0.8987217247486115, "advantage_std": 0.9998162016272545, "completion_length": 1373.125015258789, "epoch": 0.3462857142857143, "grad_norm": 0.2996969521045685, "kl": 0.017475128173828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.6259454195101267e-07, "loss": 0.0007, "reward": 0.3065176494419575, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3065176494419575, "reward_after_std": 0.7000116594135761, "reward_before_mean": 0.6733813043683767, "reward_before_std": 0.6450838912278414, "reward_change_max": 0.0, "reward_change_mean": -0.3668636828660965, "reward_change_min": -0.6291668303310871, "reward_change_std": 0.22623361833393574, "reward_std": 0.7000116631388664, "rewards/cosine_scaled_reward": -0.10080935060977936, "rewards/format_reward": 0.8750000055879354, "step": 303 }, { "advantage_max": 1.790680468082428, "advantage_mean": 4.967053990334591e-09, "advantage_min": -0.9446305185556412, "advantage_std": 0.9998354762792587, "completion_length": 1486.2292175292969, "epoch": 0.3474285714285714, "grad_norm": 0.2798292338848114, "kl": 0.01465606689453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.59514935484316e-07, "loss": 0.0006, "reward": 0.2333367036189884, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2333367036189884, "reward_after_std": 0.7814841605722904, "reward_before_mean": 0.5685579031705856, "reward_before_std": 0.7611190304160118, "reward_change_max": 0.0006494522094726562, "reward_change_mean": -0.3352212137542665, "reward_change_min": -0.6023150086402893, "reward_change_std": 0.23202185425907373, "reward_std": 0.781484205275774, "rewards/cosine_scaled_reward": -0.11155438236892223, "rewards/format_reward": 0.791666679084301, "step": 304 }, { "advantage_max": 1.8818738013505936, "advantage_mean": 9.934107758624577e-09, "advantage_min": -0.767436645925045, "advantage_std": 0.9998392090201378, "completion_length": 1492.9167175292969, "epoch": 0.3485714285714286, "grad_norm": 0.3052330017089844, "kl": 0.012569427490234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.5643973913200837e-07, "loss": 0.0005, "reward": 0.15397980774287134, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.15397980774287134, "reward_after_std": 0.727553766220808, "reward_before_mean": 0.4598623961210251, "reward_before_std": 0.6448974795639515, "reward_change_max": 0.0004153028130531311, "reward_change_mean": -0.30588257126510143, "reward_change_min": -0.5289436429738998, "reward_change_std": 0.18547483533620834, "reward_std": 0.7275537736713886, "rewards/cosine_scaled_reward": -0.19715214520692825, "rewards/format_reward": 0.8541666697710752, "step": 305 }, { "advantage_max": 1.808417722582817, "advantage_mean": -5.898376487234458e-08, "advantage_min": -0.9077035076916218, "advantage_std": 0.9998652338981628, "completion_length": 1246.395866394043, "epoch": 0.3497142857142857, "grad_norm": 0.27321967482566833, "kl": 0.0164947509765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.5336910277482155e-07, "loss": 0.0007, "reward": 0.7775358557701111, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7775358557701111, "reward_after_std": 0.8869066424667835, "reward_before_mean": 1.2912888433784246, "reward_before_std": 0.7921518832445145, "reward_change_max": 0.0, "reward_change_mean": -0.5137529708445072, "reward_change_min": -0.8009754791855812, "reward_change_std": 0.3069030549377203, "reward_std": 0.8869066424667835, "rewards/cosine_scaled_reward": 0.2289777360856533, "rewards/format_reward": 0.8333333358168602, "step": 306 }, { "advantage_max": 1.8169881999492645, "advantage_mean": -1.3659397946064189e-08, "advantage_min": -0.8610949888825417, "advantage_std": 0.9998676031827927, "completion_length": 1330.3125305175781, "epoch": 0.35085714285714287, "grad_norm": 0.2523382604122162, "kl": 0.010395050048828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.503031760712397e-07, "loss": 0.0004, "reward": 0.3598046926781535, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3598046926781535, "reward_after_std": 0.9233333840966225, "reward_before_mean": 0.7185269333422184, "reward_before_std": 0.883537333458662, "reward_change_max": 0.0, "reward_change_mean": -0.35872222762554884, "reward_change_min": -0.6905819624662399, "reward_change_std": 0.24804281257092953, "reward_std": 0.923333402723074, "rewards/cosine_scaled_reward": -0.07823655195534229, "rewards/format_reward": 0.8750000055879354, "step": 307 }, { "advantage_max": 1.7350960969924927, "advantage_mean": 1.6763807009212428e-08, "advantage_min": -1.086825430393219, "advantage_std": 0.9998591542243958, "completion_length": 2199.104217529297, "epoch": 0.352, "grad_norm": 0.21316513419151306, "kl": 0.01902008056640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.4724210845020494e-07, "loss": 0.0008, "reward": 0.2847254048101604, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2847254048101604, "reward_after_std": 0.8662929609417915, "reward_before_mean": 0.6271185912191868, "reward_before_std": 0.8565546348690987, "reward_change_max": 0.0008182600140571594, "reward_change_mean": -0.34239312261343, "reward_change_min": -0.5910441391170025, "reward_change_std": 0.23887865617871284, "reward_std": 0.8662929721176624, "rewards/cosine_scaled_reward": -0.009357405244372785, "rewards/format_reward": 0.6458333507180214, "step": 308 }, { "advantage_max": 1.8370120525360107, "advantage_mean": 1.4280279958533981e-08, "advantage_min": -0.9587069824337959, "advantage_std": 0.9998479038476944, "completion_length": 1737.8750381469727, "epoch": 0.35314285714285715, "grad_norm": 0.2074069380760193, "kl": 0.013385772705078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.441860491038345e-07, "loss": 0.0005, "reward": 0.32241479866206646, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32241479866206646, "reward_after_std": 0.8825575299561024, "reward_before_mean": 0.6710044899955392, "reward_before_std": 0.8154950402677059, "reward_change_max": 0.0, "reward_change_mean": -0.3485896661877632, "reward_change_min": -0.6016444638371468, "reward_change_std": 0.22454170510172844, "reward_std": 0.8825575634837151, "rewards/cosine_scaled_reward": -0.08116443594917655, "rewards/format_reward": 0.8333333488553762, "step": 309 }, { "advantage_max": 1.8830011785030365, "advantage_mean": -1.6142925107764938e-08, "advantage_min": -0.8540446609258652, "advantage_std": 0.9998420104384422, "completion_length": 1216.270866394043, "epoch": 0.35428571428571426, "grad_norm": 0.28493112325668335, "kl": 0.016704559326171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.4113514698014953e-07, "loss": 0.0007, "reward": 0.296893642982468, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.296893642982468, "reward_after_std": 0.7870561331510544, "reward_before_mean": 0.6431922949850559, "reward_before_std": 0.7002072893083096, "reward_change_max": 0.0, "reward_change_mean": -0.3462986573576927, "reward_change_min": -0.5790475532412529, "reward_change_std": 0.2089572511613369, "reward_std": 0.7870561443269253, "rewards/cosine_scaled_reward": -0.14715386601164937, "rewards/format_reward": 0.9375000149011612, "step": 310 }, { "advantage_max": 1.6857483983039856, "advantage_mean": -9.313225746154785e-09, "advantage_min": -1.016469158232212, "advantage_std": 0.9998601451516151, "completion_length": 1275.1458587646484, "epoch": 0.3554285714285714, "grad_norm": 0.3985695540904999, "kl": 0.0138702392578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.3808955077581546e-07, "loss": 0.0006, "reward": 0.5198321924544871, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5198321924544871, "reward_after_std": 0.8920879140496254, "reward_before_mean": 0.9477369291707873, "reward_before_std": 0.8948461581021547, "reward_change_max": 0.0012291297316551208, "reward_change_mean": -0.4279047343879938, "reward_change_min": -0.7817993424832821, "reward_change_std": 0.3022057879716158, "reward_std": 0.8920879177749157, "rewards/cosine_scaled_reward": 0.02595178346382454, "rewards/format_reward": 0.8958333432674408, "step": 311 }, { "advantage_max": 1.9115204960107803, "advantage_mean": -5.401671043792078e-08, "advantage_min": -0.7505245059728622, "advantage_std": 0.9998109638690948, "completion_length": 1247.1041946411133, "epoch": 0.3565714285714286, "grad_norm": 0.2590179741382599, "kl": 0.015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.350494089288943e-07, "loss": 0.0006, "reward": 0.6923795252414493, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6923795252414493, "reward_after_std": 0.6369202136993408, "reward_before_mean": 1.2022217735648155, "reward_before_std": 0.4634255887940526, "reward_change_max": 0.0, "reward_change_mean": -0.5098422933369875, "reward_change_min": -0.7307217977941036, "reward_change_std": 0.26922959461808205, "reward_std": 0.6369202248752117, "rewards/cosine_scaled_reward": 0.16361087746918201, "rewards/format_reward": 0.875, "step": 312 }, { "advantage_max": 1.8144911378622055, "advantage_mean": 3.818422725232473e-08, "advantage_min": -0.9053501859307289, "advantage_std": 0.9998280107975006, "completion_length": 1968.770881652832, "epoch": 0.3577142857142857, "grad_norm": 0.28470155596733093, "kl": 0.0232696533203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.3201486961161093e-07, "loss": 0.0009, "reward": 0.3348238281905651, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3348238281905651, "reward_after_std": 0.8333890065550804, "reward_before_mean": 0.6938161626458168, "reward_before_std": 0.7581512234173715, "reward_change_max": 0.00035356730222702026, "reward_change_mean": -0.3589923260733485, "reward_change_min": -0.6206830907613039, "reward_change_std": 0.24508488830178976, "reward_std": 0.8333890214562416, "rewards/cosine_scaled_reward": 0.02399141527712345, "rewards/format_reward": 0.6458333432674408, "step": 313 }, { "advantage_max": 1.7774415910243988, "advantage_mean": -2.4835269951672956e-08, "advantage_min": -1.0406172201037407, "advantage_std": 0.999833308160305, "completion_length": 1696.4792137145996, "epoch": 0.3588571428571429, "grad_norm": 0.29339587688446045, "kl": 0.020687103271484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.2898608072313045e-07, "loss": 0.0008, "reward": 0.3877815520390868, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3877815520390868, "reward_after_std": 0.8010883741080761, "reward_before_mean": 0.7714847847819328, "reward_before_std": 0.7351839672774076, "reward_change_max": 9.253621101379395e-06, "reward_change_mean": -0.38370330166071653, "reward_change_min": -0.6009878478944302, "reward_change_std": 0.23118219152092934, "reward_std": 0.8010884150862694, "rewards/cosine_scaled_reward": 0.031575741712003946, "rewards/format_reward": 0.7083333488553762, "step": 314 }, { "advantage_max": 1.8261048942804337, "advantage_mean": -3.725290470546483e-08, "advantage_min": -0.8346613347530365, "advantage_std": 0.9998562261462212, "completion_length": 2023.104232788086, "epoch": 0.36, "grad_norm": 0.25029265880584717, "kl": 0.033145904541015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.2596318988235037e-07, "loss": 0.0013, "reward": 0.48478072602301836, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.48478072602301836, "reward_after_std": 0.744693573564291, "reward_before_mean": 0.9110602084547281, "reward_before_std": 0.6513064093887806, "reward_change_max": 0.004788905382156372, "reward_change_mean": -0.4262795224785805, "reward_change_min": -0.6993596404790878, "reward_change_std": 0.26825306564569473, "reward_std": 0.7446935772895813, "rewards/cosine_scaled_reward": 0.07011344470083714, "rewards/format_reward": 0.7708333376795053, "step": 315 }, { "advantage_max": 1.7795376926660538, "advantage_mean": 0.0, "advantage_min": -0.97539883852005, "advantage_std": 0.9998418316245079, "completion_length": 2271.3333587646484, "epoch": 0.36114285714285715, "grad_norm": 0.6578998565673828, "kl": 0.02964019775390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.2294634442070553e-07, "loss": 0.0012, "reward": -0.07873509085038677, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07873509085038677, "reward_after_std": 0.6976997889578342, "reward_before_mean": 0.15203151758760214, "reward_before_std": 0.6933026351034641, "reward_change_max": 0.0005459263920783997, "reward_change_mean": -0.23076661862432957, "reward_change_min": -0.4783244729042053, "reward_change_std": 0.1863260120153427, "reward_std": 0.697699811309576, "rewards/cosine_scaled_reward": -0.24690090990043245, "rewards/format_reward": 0.6458333469927311, "step": 316 }, { "advantage_max": 1.7338401824235916, "advantage_mean": 1.2417635253392234e-09, "advantage_min": -0.9999021217226982, "advantage_std": 0.9998418316245079, "completion_length": 1979.5000534057617, "epoch": 0.36228571428571427, "grad_norm": 0.607611894607544, "kl": 0.02954864501953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1993569137498776e-07, "loss": 0.0012, "reward": 0.3126319421789958, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3126319421789958, "reward_after_std": 0.7558201961219311, "reward_before_mean": 0.6755854524672031, "reward_before_std": 0.713578600436449, "reward_change_max": 0.00012539327144622803, "reward_change_mean": -0.3629534859210253, "reward_change_min": -0.6226933412253857, "reward_change_std": 0.24551275558769703, "reward_std": 0.7558202259242535, "rewards/cosine_scaled_reward": 0.01487604295834899, "rewards/format_reward": 0.6458333488553762, "step": 317 }, { "advantage_max": 1.8692965060472488, "advantage_mean": -9.313224968998668e-09, "advantage_min": -0.8072339072823524, "advantage_std": 0.9997968673706055, "completion_length": 1533.8333435058594, "epoch": 0.36342857142857143, "grad_norm": 0.27196571230888367, "kl": 0.030017852783203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1693137748017915e-07, "loss": 0.0012, "reward": 0.31871563801541924, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.31871563801541924, "reward_after_std": 0.6673868671059608, "reward_before_mean": 0.6963669992983341, "reward_before_std": 0.5849269581958652, "reward_change_max": 0.0, "reward_change_mean": -0.3776513673365116, "reward_change_min": -0.6035145148634911, "reward_change_std": 0.2232550047338009, "reward_std": 0.6673868894577026, "rewards/cosine_scaled_reward": -0.0788998359348625, "rewards/format_reward": 0.8541666716337204, "step": 318 }, { "advantage_max": 1.7901747077703476, "advantage_mean": 1.1102230246251565e-16, "advantage_min": -0.9457738026976585, "advantage_std": 0.9998234286904335, "completion_length": 1601.4792175292969, "epoch": 0.36457142857142855, "grad_norm": 0.351672887802124, "kl": 0.017047882080078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1393354916230005e-07, "loss": 0.0007, "reward": 0.16944484133273363, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.16944484133273363, "reward_after_std": 0.7269079945981503, "reward_before_mean": 0.4864118620753288, "reward_before_std": 0.6883664382621646, "reward_change_max": 0.0006643906235694885, "reward_change_mean": -0.31696700118482113, "reward_change_min": -0.5574754904955626, "reward_change_std": 0.217126595787704, "reward_std": 0.7269080020487309, "rewards/cosine_scaled_reward": -0.16304408200085163, "rewards/format_reward": 0.812500013038516, "step": 319 }, { "advantage_max": 1.7996726334095001, "advantage_mean": -2.4835268508383024e-08, "advantage_min": -0.9349027052521706, "advantage_std": 0.9998023137450218, "completion_length": 1184.6250381469727, "epoch": 0.3657142857142857, "grad_norm": 0.5360416769981384, "kl": 0.029510498046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1094235253127374e-07, "loss": 0.0012, "reward": 0.38623422128148377, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.38623422128148377, "reward_after_std": 0.6377896293997765, "reward_before_mean": 0.7908014450222254, "reward_before_std": 0.5643230387941003, "reward_change_max": 0.0, "reward_change_mean": -0.4045672472566366, "reward_change_min": -0.6618893034756184, "reward_change_std": 0.24159432388842106, "reward_std": 0.6377896443009377, "rewards/cosine_scaled_reward": -0.07334926631301641, "rewards/format_reward": 0.9375000074505806, "step": 320 }, { "advantage_max": 1.8955932259559631, "advantage_mean": -1.614292521878724e-08, "advantage_min": -0.7440028414130211, "advantage_std": 0.9998475313186646, "completion_length": 1137.5625228881836, "epoch": 0.3668571428571429, "grad_norm": 0.32535532116889954, "kl": 0.017612457275390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.079579333738039e-07, "loss": 0.0007, "reward": 0.6462033367715776, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6462033367715776, "reward_after_std": 0.7865514345467091, "reward_before_mean": 1.1191453486680984, "reward_before_std": 0.6479927506297827, "reward_change_max": 0.00030688196420669556, "reward_change_mean": -0.4729420058429241, "reward_change_min": -0.7272159568965435, "reward_change_std": 0.27080815471708775, "reward_std": 0.7865514345467091, "rewards/cosine_scaled_reward": 0.08040599687956274, "rewards/format_reward": 0.9583333358168602, "step": 321 }, { "advantage_max": 1.8173749148845673, "advantage_mean": 1.0632600133675396e-08, "advantage_min": -0.9653920978307724, "advantage_std": 0.9997997060418129, "completion_length": 2001.7708587646484, "epoch": 0.368, "grad_norm": 0.5481053590774536, "kl": 0.043506622314453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.0498043714627006e-07, "loss": 0.0017, "reward": -0.11665507091674954, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11665507091674954, "reward_after_std": 0.6440123803913593, "reward_before_mean": 0.10169446747750044, "reward_before_std": 0.6042440123856068, "reward_change_max": 5.2444636821746826e-05, "reward_change_mean": -0.21834953594952822, "reward_change_min": -0.3703489415347576, "reward_change_std": 0.15231270249933004, "reward_std": 0.6440124064683914, "rewards/cosine_scaled_reward": -0.2408194406889379, "rewards/format_reward": 0.583333345130086, "step": 322 }, { "advantage_max": 1.5935311019420624, "advantage_mean": -8.071462942460528e-09, "advantage_min": -1.3468536585569382, "advantage_std": 0.999786026775837, "completion_length": 1932.4583740234375, "epoch": 0.36914285714285716, "grad_norm": 0.4607272744178772, "kl": 0.040294647216796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.020100089676376e-07, "loss": 0.0016, "reward": 0.11036273371428251, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11036273371428251, "reward_after_std": 0.5299815498292446, "reward_before_mean": 0.4396150344982743, "reward_before_std": 0.5286357495933771, "reward_change_max": 0.0002903193235397339, "reward_change_mean": -0.32925233617424965, "reward_change_min": -0.5357945971190929, "reward_change_std": 0.2163169952109456, "reward_std": 0.5299815647304058, "rewards/cosine_scaled_reward": -0.07185914600268006, "rewards/format_reward": 0.5833333488553762, "step": 323 }, { "advantage_max": 1.7652272433042526, "advantage_mean": -2.359350537162186e-08, "advantage_min": -0.9613615199923515, "advantage_std": 0.9998388364911079, "completion_length": 1644.4167404174805, "epoch": 0.3702857142857143, "grad_norm": 0.43445903062820435, "kl": 0.028423309326171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.9904679361238526e-07, "loss": 0.0011, "reward": 0.1769910454750061, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.1769910454750061, "reward_after_std": 0.8244478814303875, "reward_before_mean": 0.48550825845450163, "reward_before_std": 0.8139250371605158, "reward_change_max": 0.0014047995209693909, "reward_change_mean": -0.308517225086689, "reward_change_min": -0.6248892694711685, "reward_change_std": 0.23547564446926117, "reward_std": 0.8244479261338711, "rewards/cosine_scaled_reward": -0.09057921264320612, "rewards/format_reward": 0.6666666828095913, "step": 324 }, { "advantage_max": 1.8289382308721542, "advantage_mean": -1.1175870562318835e-08, "advantage_min": -0.9289100766181946, "advantage_std": 0.9998611584305763, "completion_length": 1989.5625114440918, "epoch": 0.37142857142857144, "grad_norm": 0.4403512477874756, "kl": 0.028411865234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.9609093550344907e-07, "loss": 0.0011, "reward": 0.22919806372374296, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.22919806372374296, "reward_after_std": 0.7629183158278465, "reward_before_mean": 0.5600673872977495, "reward_before_std": 0.7000786624848843, "reward_change_max": 0.00031850486993789673, "reward_change_mean": -0.3308693375438452, "reward_change_min": -0.5438312329351902, "reward_change_std": 0.21417414862662554, "reward_std": 0.7629183530807495, "rewards/cosine_scaled_reward": -0.042882971465587616, "rewards/format_reward": 0.6458333563059568, "step": 325 }, { "advantage_max": 1.6854747831821442, "advantage_mean": -6.208817238118058e-09, "advantage_min": -0.9491921663284302, "advantage_std": 0.9998669177293777, "completion_length": 1334.0000267028809, "epoch": 0.37257142857142855, "grad_norm": 0.46693912148475647, "kl": 0.024730682373046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.931425787051832e-07, "loss": 0.001, "reward": 0.3675832669250667, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3675832669250667, "reward_after_std": 0.8719189018011093, "reward_before_mean": 0.7395303957164288, "reward_before_std": 0.8456853181123734, "reward_change_max": 0.0005331039428710938, "reward_change_mean": -0.3719471115618944, "reward_change_min": -0.6407735645771027, "reward_change_std": 0.24785283766686916, "reward_std": 0.8719189167022705, "rewards/cosine_scaled_reward": -0.04690148448571563, "rewards/format_reward": 0.8333333395421505, "step": 326 }, { "advantage_max": 1.8275423794984818, "advantage_mean": -3.601113968132452e-08, "advantage_min": -0.8045339435338974, "advantage_std": 0.9998384490609169, "completion_length": 1592.208366394043, "epoch": 0.3737142857142857, "grad_norm": 0.29715391993522644, "kl": 0.01522064208984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.902018669163384e-07, "loss": 0.0006, "reward": 0.5296492774505168, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5296492774505168, "reward_after_std": 0.8084785155951977, "reward_before_mean": 0.9661593809723854, "reward_before_std": 0.7392793595790863, "reward_change_max": 0.000494934618473053, "reward_change_mean": -0.4365101332659833, "reward_change_min": -0.764576144516468, "reward_change_std": 0.2705193728907034, "reward_std": 0.8084785379469395, "rewards/cosine_scaled_reward": 0.04557968117296696, "rewards/format_reward": 0.875, "step": 327 }, { "advantage_max": 1.6745270639657974, "advantage_mean": -9.313226079221693e-09, "advantage_min": -1.210839420557022, "advantage_std": 0.9998301565647125, "completion_length": 1980.6250457763672, "epoch": 0.37485714285714283, "grad_norm": 0.5265143513679504, "kl": 0.03392791748046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.872689434630585e-07, "loss": 0.0014, "reward": 0.020316538168117404, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.020316538168117404, "reward_after_std": 0.6579835563898087, "reward_before_mean": 0.2947034602984786, "reward_before_std": 0.6620885357260704, "reward_change_max": 0.00039255619049072266, "reward_change_mean": -0.2743869163095951, "reward_change_min": -0.5088735148310661, "reward_change_std": 0.2032665442675352, "reward_std": 0.657983586192131, "rewards/cosine_scaled_reward": -0.17556495312601328, "rewards/format_reward": 0.6458333544433117, "step": 328 }, { "advantage_max": 1.6396190077066422, "advantage_mean": -6.208818015274176e-09, "advantage_min": -1.220699205994606, "advantage_std": 0.9998419806361198, "completion_length": 1220.4167175292969, "epoch": 0.376, "grad_norm": 0.5542118549346924, "kl": 0.027927398681640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.843439512918949e-07, "loss": 0.0011, "reward": 0.49249889561906457, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.49249889561906457, "reward_after_std": 0.7204984985291958, "reward_before_mean": 0.931340447627008, "reward_before_std": 0.7178945392370224, "reward_change_max": 0.00020537525415420532, "reward_change_mean": -0.4388415180146694, "reward_change_min": -0.73739018663764, "reward_change_std": 0.28611912578344345, "reward_std": 0.7204985357820988, "rewards/cosine_scaled_reward": 0.038586877286434174, "rewards/format_reward": 0.854166679084301, "step": 329 }, { "advantage_max": 1.7546139061450958, "advantage_mean": -2.8560560139112567e-08, "advantage_min": -1.0114218667149544, "advantage_std": 0.9998717978596687, "completion_length": 1826.6250686645508, "epoch": 0.37714285714285717, "grad_norm": 0.38264378905296326, "kl": 0.05272674560546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.8142703296283953e-07, "loss": 0.0021, "reward": 0.38759896298870444, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.38759896298870444, "reward_after_std": 0.9311463423073292, "reward_before_mean": 0.7545649614185095, "reward_before_std": 0.9111193716526031, "reward_change_max": 0.00023794174194335938, "reward_change_mean": -0.36696598306298256, "reward_change_min": -0.6920629777014256, "reward_change_std": 0.2625411916524172, "reward_std": 0.9311463460326195, "rewards/cosine_scaled_reward": -0.008134204195812345, "rewards/format_reward": 0.7708333488553762, "step": 330 }, { "advantage_max": 1.7087284177541733, "advantage_mean": 2.0799537508997545e-08, "advantage_min": -1.0848042890429497, "advantage_std": 0.9998006969690323, "completion_length": 1892.6042022705078, "epoch": 0.3782857142857143, "grad_norm": 0.6822070479393005, "kl": 0.03733062744140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.785183306423767e-07, "loss": 0.0015, "reward": 0.02184966392815113, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.02184966392815113, "reward_after_std": 0.591773048043251, "reward_before_mean": 0.30426534451544285, "reward_before_std": 0.5678266994655132, "reward_change_max": 2.5078654289245605e-05, "reward_change_mean": -0.2824156917631626, "reward_change_min": -0.49101157300174236, "reward_change_std": 0.1870785802602768, "reward_std": 0.5917730703949928, "rewards/cosine_scaled_reward": -0.1499506589025259, "rewards/format_reward": 0.6041666753590107, "step": 331 }, { "advantage_max": 1.776439443230629, "advantage_mean": -1.738468902168222e-08, "advantage_min": -0.9789760857820511, "advantage_std": 0.99981539696455, "completion_length": 1564.1250457763672, "epoch": 0.37942857142857145, "grad_norm": 0.39930421113967896, "kl": 0.021694183349609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.7561798609655373e-07, "loss": 0.0009, "reward": 0.16615198040381074, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16615198040381074, "reward_after_std": 0.5799468718469143, "reward_before_mean": 0.5001294352114201, "reward_before_std": 0.5344355627894402, "reward_change_max": 0.0, "reward_change_mean": -0.33397745341062546, "reward_change_min": -0.5477711912244558, "reward_change_std": 0.20790472254157066, "reward_std": 0.5799469090998173, "rewards/cosine_scaled_reward": -0.14576863683760166, "rewards/format_reward": 0.7916666734963655, "step": 332 }, { "advantage_max": 1.876511350274086, "advantage_mean": 4.967053657267684e-09, "advantage_min": -0.8139053508639336, "advantage_std": 0.9998176023364067, "completion_length": 1287.5417098999023, "epoch": 0.38057142857142856, "grad_norm": 0.2758875787258148, "kl": 0.02108001708984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.72726140684072e-07, "loss": 0.0008, "reward": 0.40404096292331815, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.40404096292331815, "reward_after_std": 0.7441972568631172, "reward_before_mean": 0.7986128572374582, "reward_before_std": 0.6491032605990767, "reward_change_max": 0.0, "reward_change_mean": -0.3945719040930271, "reward_change_min": -0.6370886974036694, "reward_change_std": 0.23393233679234982, "reward_std": 0.7441972848027945, "rewards/cosine_scaled_reward": -0.06944357417523861, "rewards/format_reward": 0.9375, "step": 333 }, { "advantage_max": 1.7530118376016617, "advantage_mean": 2.3360674616945687e-08, "advantage_min": -1.0081253498792648, "advantage_std": 0.9998287037014961, "completion_length": 2588.479263305664, "epoch": 0.38171428571428573, "grad_norm": 0.9488856792449951, "kl": 0.064056396484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.6984293534939737e-07, "loss": 0.0026, "reward": -0.14747369661927223, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14747369661927223, "reward_after_std": 0.6475025825202465, "reward_before_mean": 0.06826394773088396, "reward_before_std": 0.6546566933393478, "reward_change_max": 0.0009675323963165283, "reward_change_mean": -0.21573764830827713, "reward_change_min": -0.4678405858576298, "reward_change_std": 0.17771738301962614, "reward_std": 0.6475025936961174, "rewards/cosine_scaled_reward": -0.21586802694946527, "rewards/format_reward": 0.5000000074505806, "step": 334 }, { "advantage_max": 1.8158823996782303, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -1.0010618343949318, "advantage_std": 0.9998266473412514, "completion_length": 1407.9791946411133, "epoch": 0.38285714285714284, "grad_norm": 0.3690642714500427, "kl": 0.020839691162109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.6696851061588994e-07, "loss": 0.0008, "reward": 0.5123026471119374, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5123026471119374, "reward_after_std": 0.8610937166959047, "reward_before_mean": 0.932284346781671, "reward_before_std": 0.7961460901424289, "reward_change_max": 0.00041369348764419556, "reward_change_mean": -0.4199816770851612, "reward_change_min": -0.6663753166794777, "reward_change_std": 0.2613986600190401, "reward_std": 0.8610937315970659, "rewards/cosine_scaled_reward": 0.02864214894361794, "rewards/format_reward": 0.8750000074505806, "step": 335 }, { "advantage_max": 1.6519970148801804, "advantage_mean": -5.7431560618326216e-08, "advantage_min": -1.1527896374464035, "advantage_std": 0.999854676425457, "completion_length": 1741.9167098999023, "epoch": 0.384, "grad_norm": 0.4452887177467346, "kl": 0.036556243896484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.641030065789562e-07, "loss": 0.0015, "reward": 0.5018006656318903, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5018006656318903, "reward_after_std": 0.9337717443704605, "reward_before_mean": 0.9186398200690746, "reward_before_std": 0.9587063193321228, "reward_change_max": 0.00037522614002227783, "reward_change_mean": -0.41683919727802277, "reward_change_min": -0.8269048631191254, "reward_change_std": 0.31330749951303005, "reward_std": 0.9337717592716217, "rewards/cosine_scaled_reward": 0.13640324026346207, "rewards/format_reward": 0.6458333395421505, "step": 336 }, { "advantage_max": 1.8493523597717285, "advantage_mean": -1.1486311873865063e-08, "advantage_min": -0.8343974202871323, "advantage_std": 0.9998639076948166, "completion_length": 1862.2917175292969, "epoch": 0.3851428571428571, "grad_norm": 0.4439464807510376, "kl": 0.048858642578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.612465628992203e-07, "loss": 0.002, "reward": 0.3141383654437959, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3141383654437959, "reward_after_std": 0.9224808253347874, "reward_before_mean": 0.6509024668484926, "reward_before_std": 0.8570667169988155, "reward_change_max": 0.0016486793756484985, "reward_change_mean": -0.336764108389616, "reward_change_min": -0.5929896347224712, "reward_change_std": 0.22016599588096142, "reward_std": 0.9224808253347874, "rewards/cosine_scaled_reward": -0.08079877495765686, "rewards/format_reward": 0.8125000111758709, "step": 337 }, { "advantage_max": 1.8014024198055267, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.9077040404081345, "advantage_std": 0.9998239055275917, "completion_length": 1413.145851135254, "epoch": 0.3862857142857143, "grad_norm": 0.6317712068557739, "kl": 0.027873992919921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.5839931879571725e-07, "loss": 0.0011, "reward": 0.39002217911183834, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.39002217911183834, "reward_after_std": 0.6700347550213337, "reward_before_mean": 0.7876107357442379, "reward_before_std": 0.585445012897253, "reward_change_max": 0.0, "reward_change_mean": -0.39758857898414135, "reward_change_min": -0.6161500923335552, "reward_change_std": 0.23368200194090605, "reward_std": 0.6700347736477852, "rewards/cosine_scaled_reward": -0.022861299104988575, "rewards/format_reward": 0.8333333432674408, "step": 338 }, { "advantage_max": 1.5985001772642136, "advantage_mean": -1.862645371275562e-09, "advantage_min": -1.2936531230807304, "advantage_std": 0.9998006597161293, "completion_length": 1982.9376068115234, "epoch": 0.38742857142857146, "grad_norm": 0.5438936948776245, "kl": 0.0607452392578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.555614130391079e-07, "loss": 0.0024, "reward": -0.01311766542494297, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.01311766542494297, "reward_after_std": 0.5598406568169594, "reward_before_mean": 0.26114371605217457, "reward_before_std": 0.5639406815171242, "reward_change_max": 0.001089140772819519, "reward_change_mean": -0.27426140010356903, "reward_change_min": -0.4871658943593502, "reward_change_std": 0.19464866816997528, "reward_std": 0.55984066426754, "rewards/cosine_scaled_reward": -0.20276147779077291, "rewards/format_reward": 0.666666679084301, "step": 339 }, { "advantage_max": 1.753526747226715, "advantage_mean": -2.3283065031520778e-08, "advantage_min": -1.0007511749863625, "advantage_std": 0.9998476505279541, "completion_length": 1841.2292098999023, "epoch": 0.38857142857142857, "grad_norm": 0.7799713015556335, "kl": 0.0418548583984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.5273298394491515e-07, "loss": 0.0017, "reward": 0.3483142200857401, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3483142200857401, "reward_after_std": 0.8003076985478401, "reward_before_mean": 0.7235615830868483, "reward_before_std": 0.7892604991793633, "reward_change_max": 0.0, "reward_change_mean": -0.3752473834902048, "reward_change_min": -0.6742827035486698, "reward_change_std": 0.2584288869984448, "reward_std": 0.8003077395260334, "rewards/cosine_scaled_reward": -0.023635881021618843, "rewards/format_reward": 0.7708333507180214, "step": 340 }, { "advantage_max": 1.723728433251381, "advantage_mean": -2.8560559917067962e-08, "advantage_min": -0.9149245694279671, "advantage_std": 0.9998441636562347, "completion_length": 1600.3333587646484, "epoch": 0.38971428571428574, "grad_norm": 0.3691047430038452, "kl": 0.03575897216796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4991416936678276e-07, "loss": 0.0014, "reward": 0.5872527491301298, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5872527491301298, "reward_after_std": 0.7880188077688217, "reward_before_mean": 1.0462564658373594, "reward_before_std": 0.7202813364565372, "reward_change_max": 0.0, "reward_change_mean": -0.4590037427842617, "reward_change_min": -0.7513881102204323, "reward_change_std": 0.2904424434527755, "reward_std": 0.7880188301205635, "rewards/cosine_scaled_reward": 0.1793782263994217, "rewards/format_reward": 0.6875000055879354, "step": 341 }, { "advantage_max": 1.7118661999702454, "advantage_mean": -9.934107703113426e-09, "advantage_min": -0.9686658307909966, "advantage_std": 0.9998463094234467, "completion_length": 1779.9791793823242, "epoch": 0.39085714285714285, "grad_norm": 0.4578934609889984, "kl": 0.052387237548828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.471051066897562e-07, "loss": 0.0021, "reward": 0.300952305784449, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.300952305784449, "reward_after_std": 0.8132330104708672, "reward_before_mean": 0.6575404182076454, "reward_before_std": 0.7985194809734821, "reward_change_max": 0.0, "reward_change_mean": -0.35658809170126915, "reward_change_min": -0.6547360792756081, "reward_change_std": 0.2531323414295912, "reward_std": 0.8132330290973186, "rewards/cosine_scaled_reward": -0.046229824889451265, "rewards/format_reward": 0.7500000074505806, "step": 342 }, { "advantage_max": 1.7737548500299454, "advantage_mean": -3.383805535772666e-08, "advantage_min": -0.8950196951627731, "advantage_std": 0.9998361840844154, "completion_length": 2097.937545776367, "epoch": 0.392, "grad_norm": 0.34791120886802673, "kl": 0.0643768310546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4430593282358777e-07, "loss": 0.0026, "reward": 0.325270289555192, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.325270289555192, "reward_after_std": 0.748699925839901, "reward_before_mean": 0.6929463082924485, "reward_before_std": 0.6853790711611509, "reward_change_max": 0.00012090057134628296, "reward_change_mean": -0.3676760010421276, "reward_change_min": -0.6739123649895191, "reward_change_std": 0.24684995412826538, "reward_std": 0.7486999519169331, "rewards/cosine_scaled_reward": -0.028526873793452978, "rewards/format_reward": 0.7500000149011612, "step": 343 }, { "advantage_max": 1.6565604954957962, "advantage_mean": -2.8560559250934148e-08, "advantage_min": -1.2782927006483078, "advantage_std": 0.9998640418052673, "completion_length": 1736.4583740234375, "epoch": 0.3931428571428571, "grad_norm": 0.5713613629341125, "kl": 0.042018890380859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4151678419606233e-07, "loss": 0.0017, "reward": 0.7220300175249577, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.7220300175249577, "reward_after_std": 0.8213397674262524, "reward_before_mean": 1.2313897691201419, "reward_before_std": 0.7971918359398842, "reward_change_max": 0.0006480291485786438, "reward_change_mean": -0.5093597397208214, "reward_change_min": -0.794022686779499, "reward_change_std": 0.31847442872822285, "reward_std": 0.8213397897779942, "rewards/cosine_scaled_reward": 0.2302781967446208, "rewards/format_reward": 0.7708333488553762, "step": 344 }, { "advantage_max": 1.7123928368091583, "advantage_mean": -1.8005570756596256e-08, "advantage_min": -1.033238098025322, "advantage_std": 0.999805174767971, "completion_length": 1808.8958740234375, "epoch": 0.3942857142857143, "grad_norm": 0.55589359998703, "kl": 0.06340408325195312, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.387377967463493e-07, "loss": 0.0025, "reward": 0.15163142536766827, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15163142536766827, "reward_after_std": 0.5693875066936016, "reward_before_mean": 0.4864295755396597, "reward_before_std": 0.5296621806919575, "reward_change_max": 0.001675918698310852, "reward_change_mean": -0.334798164665699, "reward_change_min": -0.5586585029959679, "reward_change_std": 0.221794243901968, "reward_std": 0.5693875104188919, "rewards/cosine_scaled_reward": -0.0901185441762209, "rewards/format_reward": 0.666666679084301, "step": 345 }, { "advantage_max": 1.7333679348230362, "advantage_mean": 4.346172755020916e-09, "advantage_min": -0.983534038066864, "advantage_std": 0.9998268410563469, "completion_length": 1690.958381652832, "epoch": 0.3954285714285714, "grad_norm": 0.5322929620742798, "kl": 0.042877197265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.359691059183761e-07, "loss": 0.0017, "reward": 0.19164329813793302, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19164329813793302, "reward_after_std": 0.7117663659155369, "reward_before_mean": 0.522115595638752, "reward_before_std": 0.6950028091669083, "reward_change_max": 0.0, "reward_change_mean": -0.3304723044857383, "reward_change_min": -0.5813820399343967, "reward_change_std": 0.22892522998154163, "reward_std": 0.7117663882672787, "rewards/cosine_scaled_reward": -0.12435888312757015, "rewards/format_reward": 0.7708333488553762, "step": 346 }, { "advantage_max": 1.7528623640537262, "advantage_mean": -7.761021603158724e-09, "advantage_min": -0.9384801313281059, "advantage_std": 0.9998300224542618, "completion_length": 2184.5000915527344, "epoch": 0.3965714285714286, "grad_norm": 0.3822513818740845, "kl": 0.06087493896484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.3321084665422803e-07, "loss": 0.0024, "reward": 0.23295758292078972, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.23295758292078972, "reward_after_std": 0.7338234409689903, "reward_before_mean": 0.5732180112972856, "reward_before_std": 0.7169551942497492, "reward_change_max": 0.0018673241138458252, "reward_change_mean": -0.34026043489575386, "reward_change_min": -0.6701146997511387, "reward_change_std": 0.24350756220519543, "reward_std": 0.7338234409689903, "rewards/cosine_scaled_reward": -0.14047434460371733, "rewards/format_reward": 0.8541666753590107, "step": 347 }, { "advantage_max": 1.716232493519783, "advantage_mean": 1.7695129583827907e-08, "advantage_min": -1.0631478130817413, "advantage_std": 0.9998551607131958, "completion_length": 2050.3125534057617, "epoch": 0.3977142857142857, "grad_norm": 0.721562922000885, "kl": 0.06792068481445312, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.3046315338757026e-07, "loss": 0.0027, "reward": 0.14180525578558445, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14180525578558445, "reward_after_std": 0.7749368660151958, "reward_before_mean": 0.4474893862497993, "reward_before_std": 0.7959772050380707, "reward_change_max": 0.001784391701221466, "reward_change_mean": -0.30568412970751524, "reward_change_min": -0.55716672539711, "reward_change_std": 0.2335031647235155, "reward_std": 0.774936880916357, "rewards/cosine_scaled_reward": -0.06792197469621897, "rewards/format_reward": 0.5833333432674408, "step": 348 }, { "advantage_max": 1.7815395593643188, "advantage_mean": -2.4835271617007493e-09, "advantage_min": -0.9733176231384277, "advantage_std": 0.9998257160186768, "completion_length": 1645.3333892822266, "epoch": 0.39885714285714285, "grad_norm": 0.5131498575210571, "kl": 0.048900604248046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.2772616003709616e-07, "loss": 0.002, "reward": 0.3904577139765024, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3904577139765024, "reward_after_std": 0.7350214906036854, "reward_before_mean": 0.7852839026600122, "reward_before_std": 0.665465185418725, "reward_change_max": 0.0, "reward_change_mean": -0.39482616260647774, "reward_change_min": -0.6697969511151314, "reward_change_std": 0.24852763675153255, "reward_std": 0.7350215204060078, "rewards/cosine_scaled_reward": -0.02402474172413349, "rewards/format_reward": 0.8333333544433117, "step": 349 }, { "advantage_max": 1.7450901716947556, "advantage_mean": -1.4280279680978225e-08, "advantage_min": -0.9893204569816589, "advantage_std": 0.9998525604605675, "completion_length": 1271.7917175292969, "epoch": 0.4, "grad_norm": 0.7096825242042542, "kl": 0.0373077392578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.250000000000001e-07, "loss": 0.0015, "reward": 0.41953666880726814, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.41953666880726814, "reward_after_std": 0.8247261010110378, "reward_before_mean": 0.8147061467170715, "reward_before_std": 0.793494526296854, "reward_change_max": 0.0, "reward_change_mean": -0.39516947977244854, "reward_change_min": -0.713260505348444, "reward_change_std": 0.265448372811079, "reward_std": 0.8247261047363281, "rewards/cosine_scaled_reward": -0.03014694177545607, "rewards/format_reward": 0.8750000055879354, "step": 350 }, { "advantage_max": 1.6794244647026062, "advantage_mean": -5.587935669737476e-09, "advantage_min": -1.0799910724163055, "advantage_std": 0.9998472705483437, "completion_length": 1827.1458740234375, "epoch": 0.40114285714285713, "grad_norm": 0.4725710451602936, "kl": 0.06256103515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.222848061454764e-07, "loss": 0.0025, "reward": 0.38714588712900877, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.38714588712900877, "reward_after_std": 0.8485158495604992, "reward_before_mean": 0.7705275826156139, "reward_before_std": 0.852873831987381, "reward_change_max": 0.0022610872983932495, "reward_change_mean": -0.38338171504437923, "reward_change_min": -0.7179454322904348, "reward_change_std": 0.27640335727483034, "reward_std": 0.8485158681869507, "rewards/cosine_scaled_reward": -0.00015288405120372772, "rewards/format_reward": 0.7708333507180214, "step": 351 }, { "advantage_max": 1.7583201676607132, "advantage_mean": 1.179675268581093e-08, "advantage_min": -1.0457439199090004, "advantage_std": 0.9997604191303253, "completion_length": 1613.6250381469727, "epoch": 0.4022857142857143, "grad_norm": 0.6200236082077026, "kl": 0.04233551025390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.195807108082429e-07, "loss": 0.0017, "reward": 0.34590123407542706, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.34590123407542706, "reward_after_std": 0.5814727526158094, "reward_before_mean": 0.7459990214556456, "reward_before_std": 0.5219845222309232, "reward_change_max": 0.00010109692811965942, "reward_change_mean": -0.40009779296815395, "reward_change_min": -0.6257377043366432, "reward_change_std": 0.24037265591323376, "reward_std": 0.5814727805554867, "rewards/cosine_scaled_reward": 0.05008285026997328, "rewards/format_reward": 0.645833333954215, "step": 352 }, { "advantage_max": 1.7677299678325653, "advantage_mean": -2.7318795892128378e-08, "advantage_min": -0.9796153157949448, "advantage_std": 0.9998888298869133, "completion_length": 1377.7917098999023, "epoch": 0.4034285714285714, "grad_norm": 0.5955826044082642, "kl": 0.047733306884765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.168878457820915e-07, "loss": 0.0019, "reward": 0.7051948038861156, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7051948038861156, "reward_after_std": 1.0063594207167625, "reward_before_mean": 1.1756671071052551, "reward_before_std": 0.9599154591560364, "reward_change_max": 0.0014167726039886475, "reward_change_mean": -0.4704723171889782, "reward_change_min": -0.8216895461082458, "reward_change_std": 0.3180227465927601, "reward_std": 1.0063594430685043, "rewards/cosine_scaled_reward": 0.17116688983514905, "rewards/format_reward": 0.8333333544433117, "step": 353 }, { "advantage_max": 1.8110656440258026, "advantage_mean": -4.346172111091562e-08, "advantage_min": -0.8556944318115711, "advantage_std": 0.9997976273298264, "completion_length": 997.9791793823242, "epoch": 0.4045714285714286, "grad_norm": 0.4756532907485962, "kl": 0.0196990966796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.142063423134644e-07, "loss": 0.0008, "reward": 0.49964590836316347, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.49964590836316347, "reward_after_std": 0.6052370220422745, "reward_before_mean": 0.9540312092285603, "reward_before_std": 0.5143911112099886, "reward_change_max": 0.0, "reward_change_mean": -0.4543852899223566, "reward_change_min": -0.7094008177518845, "reward_change_std": 0.2678078208118677, "reward_std": 0.6052370518445969, "rewards/cosine_scaled_reward": 0.018682243302464485, "rewards/format_reward": 0.9166666679084301, "step": 354 }, { "advantage_max": 1.713475525379181, "advantage_mean": -7.450580596923828e-09, "advantage_min": -1.042502522468567, "advantage_std": 0.9998580813407898, "completion_length": 1211.1042098999023, "epoch": 0.4057142857142857, "grad_norm": 0.46108749508857727, "kl": 0.037975311279296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.115363310950578e-07, "loss": 0.0015, "reward": 0.5299158040434122, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5299158040434122, "reward_after_std": 0.8260532990098, "reward_before_mean": 0.9658863693475723, "reward_before_std": 0.8023704327642918, "reward_change_max": 0.0, "reward_change_mean": -0.43597057089209557, "reward_change_min": -0.7615926265716553, "reward_change_std": 0.28268345445394516, "reward_std": 0.8260533064603806, "rewards/cosine_scaled_reward": 0.014193183276802301, "rewards/format_reward": 0.9375000074505806, "step": 355 }, { "advantage_max": 1.752561241388321, "advantage_mean": 4.967053712778835e-09, "advantage_min": -0.9992698207497597, "advantage_std": 0.9998455420136452, "completion_length": 1745.5208740234375, "epoch": 0.40685714285714286, "grad_norm": 0.47905710339546204, "kl": 0.06621551513671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.0887794225945143e-07, "loss": 0.0026, "reward": 0.39957124821376055, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.39957124821376055, "reward_after_std": 0.7217008732259274, "reward_before_mean": 0.8020104505121708, "reward_before_std": 0.6859807781875134, "reward_change_max": 0.00046562403440475464, "reward_change_mean": -0.40243920870125294, "reward_change_min": -0.6796171739697456, "reward_change_std": 0.2583258803933859, "reward_std": 0.7217008769512177, "rewards/cosine_scaled_reward": -0.005244780331850052, "rewards/format_reward": 0.8125000074505806, "step": 356 }, { "advantage_max": 1.801738902926445, "advantage_mean": 8.692344288796505e-09, "advantage_min": -1.0072083622217178, "advantage_std": 0.9998702257871628, "completion_length": 2294.5208892822266, "epoch": 0.408, "grad_norm": 0.5011313557624817, "kl": 0.09915924072265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.062313053727671e-07, "loss": 0.004, "reward": 0.14724520407617092, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14724520407617092, "reward_after_std": 0.9041547067463398, "reward_before_mean": 0.4296424980275333, "reward_before_std": 0.8899640142917633, "reward_change_max": 0.0029114261269569397, "reward_change_mean": -0.28239729441702366, "reward_change_min": -0.570688933134079, "reward_change_std": 0.22510281763970852, "reward_std": 0.904154721647501, "rewards/cosine_scaled_reward": -0.08726209111046046, "rewards/format_reward": 0.604166679084301, "step": 357 }, { "advantage_max": 1.6455123275518417, "advantage_mean": 3.5312648360985577e-09, "advantage_min": -1.152344323694706, "advantage_std": 0.9998896345496178, "completion_length": 1574.8125534057617, "epoch": 0.40914285714285714, "grad_norm": 0.4527164101600647, "kl": 0.0278472900390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.0359654942835247e-07, "loss": 0.0011, "reward": 0.7210206426680088, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7210206426680088, "reward_after_std": 1.0176164768636227, "reward_before_mean": 1.206741967704147, "reward_before_std": 1.0266901180148125, "reward_change_max": 0.0005607828497886658, "reward_change_mean": -0.4857212696224451, "reward_change_min": -0.8255046382546425, "reward_change_std": 0.3363110963255167, "reward_std": 1.0176164954900742, "rewards/cosine_scaled_reward": 0.15545429242774844, "rewards/format_reward": 0.8958333432674408, "step": 358 }, { "advantage_max": 1.7403311282396317, "advantage_mean": 1.2417635808503746e-09, "advantage_min": -1.0309613794088364, "advantage_std": 0.999843418598175, "completion_length": 1361.8125610351562, "epoch": 0.4102857142857143, "grad_norm": 0.5957604050636292, "kl": 0.05738067626953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.0097380284049523e-07, "loss": 0.0023, "reward": 0.1979818413965404, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1979818413965404, "reward_after_std": 0.763306338340044, "reward_before_mean": 0.5200300570577383, "reward_before_std": 0.7347928695380688, "reward_change_max": 0.001569896936416626, "reward_change_mean": -0.32204820215702057, "reward_change_min": -0.58536596596241, "reward_change_std": 0.2217138558626175, "reward_std": 0.7633063457906246, "rewards/cosine_scaled_reward": -0.15665165561949834, "rewards/format_reward": 0.8333333507180214, "step": 359 }, { "advantage_max": 1.804108589887619, "advantage_mean": 6.2088167940288486e-09, "advantage_min": -1.0246483609080315, "advantage_std": 0.9998931512236595, "completion_length": 1983.4792251586914, "epoch": 0.4114285714285714, "grad_norm": 0.8682091236114502, "kl": 0.10474014282226562, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.9836319343816397e-07, "loss": 0.0042, "reward": 0.43156973691657186, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.43156973691657186, "reward_after_std": 1.043577142059803, "reward_before_mean": 0.8001033894252032, "reward_before_std": 1.0149410367012024, "reward_change_max": 0.0015130415558815002, "reward_change_mean": -0.3685336671769619, "reward_change_min": -0.6645958721637726, "reward_change_std": 0.2618884276598692, "reward_std": 1.0435771495103836, "rewards/cosine_scaled_reward": 0.045885009691119194, "rewards/format_reward": 0.7083333432674408, "step": 360 }, { "advantage_max": 1.6712573170661926, "advantage_mean": -3.849466767569254e-08, "advantage_min": -1.1180831789970398, "advantage_std": 0.9998738914728165, "completion_length": 1789.3958740234375, "epoch": 0.4125714285714286, "grad_norm": 0.8642761707305908, "kl": 0.09290313720703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.9576484845877793e-07, "loss": 0.0037, "reward": 0.4625844731926918, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4625844731926918, "reward_after_std": 0.9455960988998413, "reward_before_mean": 0.8622227795422077, "reward_before_std": 0.9503796808421612, "reward_change_max": 0.00037945061922073364, "reward_change_mean": -0.3996383436024189, "reward_change_min": -0.7288960255682468, "reward_change_std": 0.2895132377743721, "reward_std": 0.9455961063504219, "rewards/cosine_scaled_reward": 0.014444717206060886, "rewards/format_reward": 0.833333358168602, "step": 361 }, { "advantage_max": 1.767193242907524, "advantage_mean": -4.3461716447978915e-09, "advantage_min": -0.9865794405341148, "advantage_std": 0.9997773095965385, "completion_length": 1468.770866394043, "epoch": 0.4137142857142857, "grad_norm": 0.5321914553642273, "kl": 0.09504318237304688, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.931788945420058e-07, "loss": 0.0038, "reward": 0.4529275484383106, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4529275484383106, "reward_after_std": 0.5799758862704039, "reward_before_mean": 0.8929033568128943, "reward_before_std": 0.5178761789575219, "reward_change_max": 0.0005796030163764954, "reward_change_mean": -0.4399757944047451, "reward_change_min": -0.6603477858006954, "reward_change_std": 0.26607093773782253, "reward_std": 0.5799759048968554, "rewards/cosine_scaled_reward": 0.008951665833592415, "rewards/format_reward": 0.8750000037252903, "step": 362 }, { "advantage_max": 1.7589514702558517, "advantage_mean": -5.036902950283917e-08, "advantage_min": -1.072552129626274, "advantage_std": 0.9998479261994362, "completion_length": 1127.0417022705078, "epoch": 0.41485714285714287, "grad_norm": 0.6177250146865845, "kl": 0.1038360595703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.9060545772359305e-07, "loss": 0.0041, "reward": 0.7460359609685838, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7460359609685838, "reward_after_std": 0.8154219090938568, "reward_before_mean": 1.2553061340004206, "reward_before_std": 0.7414013964589685, "reward_change_max": 0.0, "reward_change_mean": -0.5092701800167561, "reward_change_min": -0.7939641438424587, "reward_change_std": 0.31660088524222374, "reward_std": 0.815421923995018, "rewards/cosine_scaled_reward": 0.21098637953400612, "rewards/format_reward": 0.8333333395421505, "step": 363 }, { "advantage_max": 1.8528580367565155, "advantage_mean": -4.967054101356894e-09, "advantage_min": -0.8983760923147202, "advantage_std": 0.9998350441455841, "completion_length": 1412.2083740234375, "epoch": 0.416, "grad_norm": 0.7577179670333862, "kl": 0.0618743896484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.8804466342921987e-07, "loss": 0.0025, "reward": 0.09332260582596064, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09332260582596064, "reward_after_std": 0.6483968794345856, "reward_before_mean": 0.3865098184905946, "reward_before_std": 0.5817593857645988, "reward_change_max": 0.0017022117972373962, "reward_change_mean": -0.29318721406161785, "reward_change_min": -0.4892657585442066, "reward_change_std": 0.1843111701309681, "reward_std": 0.6483968980610371, "rewards/cosine_scaled_reward": -0.23382843763101846, "rewards/format_reward": 0.854166679084301, "step": 364 }, { "advantage_max": 1.7174775451421738, "advantage_mean": 9.313226023710541e-09, "advantage_min": -1.1426312774419785, "advantage_std": 0.9998360723257065, "completion_length": 2193.4792251586914, "epoch": 0.41714285714285715, "grad_norm": 0.4941312372684479, "kl": 0.1219482421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.854966364683872e-07, "loss": 0.0049, "reward": 0.3083358150906861, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3083358150906861, "reward_after_std": 0.7163702324032784, "reward_before_mean": 0.6797299887984991, "reward_before_std": 0.6961984969675541, "reward_change_max": 0.0013974756002426147, "reward_change_mean": -0.37139415368437767, "reward_change_min": -0.6369239427149296, "reward_change_std": 0.25196096766740084, "reward_std": 0.7163702547550201, "rewards/cosine_scaled_reward": 0.048198305536061525, "rewards/format_reward": 0.583333345130086, "step": 365 }, { "advantage_max": 1.7944408804178238, "advantage_mean": -2.4835268952472234e-08, "advantage_min": -0.9701410867273808, "advantage_std": 0.9998177662491798, "completion_length": 1229.0208625793457, "epoch": 0.41828571428571426, "grad_norm": 0.5207622647285461, "kl": 0.03325653076171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.829615010283344e-07, "loss": 0.0013, "reward": 0.6801526825875044, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6801526825875044, "reward_after_std": 0.6221742108464241, "reward_before_mean": 1.1953078974038363, "reward_before_std": 0.5093259969726205, "reward_change_max": 0.0, "reward_change_mean": -0.5151552185416222, "reward_change_min": -0.7402960807085037, "reward_change_std": 0.2926663924008608, "reward_std": 0.6221742257475853, "rewards/cosine_scaled_reward": 0.12890393659472466, "rewards/format_reward": 0.9375000074505806, "step": 366 }, { "advantage_max": 1.7212315797805786, "advantage_mean": 1.1102230246251565e-16, "advantage_min": -1.1223592311143875, "advantage_std": 0.9998173341155052, "completion_length": 1808.145866394043, "epoch": 0.41942857142857143, "grad_norm": 0.4319981634616852, "kl": 0.06876754760742188, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.8043938066798645e-07, "loss": 0.0028, "reward": 0.5296032973565161, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5296032973565161, "reward_after_std": 0.7023920379579067, "reward_before_mean": 0.9813962541520596, "reward_before_std": 0.6583868637681007, "reward_change_max": 0.0, "reward_change_mean": -0.4517929572612047, "reward_change_min": -0.7485668640583754, "reward_change_std": 0.2807191088795662, "reward_std": 0.7023920379579067, "rewards/cosine_scaled_reward": 0.09486476704478264, "rewards/format_reward": 0.7916666716337204, "step": 367 }, { "advantage_max": 1.6234066784381866, "advantage_mean": 2.2041300923314466e-08, "advantage_min": -1.1719984039664268, "advantage_std": 0.9998603090643883, "completion_length": 2330.0000610351562, "epoch": 0.4205714285714286, "grad_norm": 0.5970098972320557, "kl": 0.1321563720703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.7793039831193133e-07, "loss": 0.0053, "reward": 0.2834152211435139, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2834152211435139, "reward_after_std": 0.8817011006176472, "reward_before_mean": 0.6289000092074275, "reward_before_std": 0.9101042859256268, "reward_change_max": 0.00013091415166854858, "reward_change_mean": -0.3454847941175103, "reward_change_min": -0.6719449236989021, "reward_change_std": 0.27200268767774105, "reward_std": 0.8817011043429375, "rewards/cosine_scaled_reward": -0.01888333074748516, "rewards/format_reward": 0.666666679084301, "step": 368 }, { "advantage_max": 1.7020577490329742, "advantage_mean": 4.967053990334591e-09, "advantage_min": -1.0351696237921715, "advantage_std": 0.9998689591884613, "completion_length": 2127.2709007263184, "epoch": 0.4217142857142857, "grad_norm": 1.0540339946746826, "kl": 0.14821624755859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.7543467624442956e-07, "loss": 0.0059, "reward": 0.2355181914754212, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2355181914754212, "reward_after_std": 0.8739989325404167, "reward_before_mean": 0.5608813725411892, "reward_before_std": 0.8898568116128445, "reward_change_max": 0.00032555311918258667, "reward_change_mean": -0.32536316104233265, "reward_change_min": -0.6706997528672218, "reward_change_std": 0.2611721996217966, "reward_std": 0.873998936265707, "rewards/cosine_scaled_reward": -0.0737259928137064, "rewards/format_reward": 0.7083333395421505, "step": 369 }, { "advantage_max": 1.7186729609966278, "advantage_mean": 1.2107193581023523e-08, "advantage_min": -1.031410463154316, "advantage_std": 0.9997797161340714, "completion_length": 2167.0208892822266, "epoch": 0.4228571428571429, "grad_norm": 0.940868079662323, "kl": 0.1809539794921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.729523361034538e-07, "loss": 0.0072, "reward": 0.10207423567771912, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.10207423567771912, "reward_after_std": 0.49341800436377525, "reward_before_mean": 0.4260668642818928, "reward_before_std": 0.4589389022439718, "reward_change_max": 0.0, "reward_change_mean": -0.3239926230162382, "reward_change_min": -0.5411821566522121, "reward_change_std": 0.2021560426801443, "reward_std": 0.49341801181435585, "rewards/cosine_scaled_reward": -0.08904991298913956, "rewards/format_reward": 0.6041666753590107, "step": 370 }, { "advantage_max": 1.8417688310146332, "advantage_mean": -7.388492528903612e-08, "advantage_min": -0.9486106596887112, "advantage_std": 0.9998104944825172, "completion_length": 1245.5208587646484, "epoch": 0.424, "grad_norm": 1.0443849563598633, "kl": 0.08673095703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.7048349887476037e-07, "loss": 0.0035, "reward": 0.665027653798461, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.665027653798461, "reward_after_std": 0.5601641908288002, "reward_before_mean": 1.1774352552602068, "reward_before_std": 0.39603511430323124, "reward_change_max": 0.0006187185645103455, "reward_change_mean": -0.5124075748026371, "reward_change_min": -0.7063048034906387, "reward_change_std": 0.2821565680205822, "reward_std": 0.5601642094552517, "rewards/cosine_scaled_reward": 0.16163425520062447, "rewards/format_reward": 0.854166679084301, "step": 371 }, { "advantage_max": 1.6466941237449646, "advantage_mean": -1.6142923997541914e-08, "advantage_min": -1.1122152507305145, "advantage_std": 0.9998790249228477, "completion_length": 2170.979202270508, "epoch": 0.42514285714285716, "grad_norm": 1.2797470092773438, "kl": 0.143310546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.6802828488599294e-07, "loss": 0.0057, "reward": 0.5211599614704028, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5211599614704028, "reward_after_std": 0.8999143987894058, "reward_before_mean": 0.9481218121945858, "reward_before_std": 0.9148260280489922, "reward_change_max": 0.00012987852096557617, "reward_change_mean": -0.42696182802319527, "reward_change_min": -0.7499468177556992, "reward_change_std": 0.29452427197247744, "reward_std": 0.899914413690567, "rewards/cosine_scaled_reward": 0.10947755957022309, "rewards/format_reward": 0.7291666716337204, "step": 372 }, { "advantage_max": 1.8214802145957947, "advantage_mean": -8.381903504606214e-09, "advantage_min": -0.9805489256978035, "advantage_std": 0.9998204708099365, "completion_length": 1034.895851135254, "epoch": 0.42628571428571427, "grad_norm": 0.8010674715042114, "kl": 0.06592178344726562, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.655868138008171e-07, "loss": 0.0026, "reward": 0.2368478998541832, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2368478998541832, "reward_after_std": 0.6452644765377045, "reward_before_mean": 0.5821825656457804, "reward_before_std": 0.5779650695621967, "reward_change_max": 0.0002724975347518921, "reward_change_mean": -0.34533468075096607, "reward_change_min": -0.5144113972783089, "reward_change_std": 0.20268330350518227, "reward_std": 0.6452644914388657, "rewards/cosine_scaled_reward": -0.17765872552990913, "rewards/format_reward": 0.9375000074505806, "step": 373 }, { "advantage_max": 1.8111439943313599, "advantage_mean": -1.241763691872677e-09, "advantage_min": -0.9515286162495613, "advantage_std": 0.9998194202780724, "completion_length": 1608.1250495910645, "epoch": 0.42742857142857144, "grad_norm": 1.0322747230529785, "kl": 0.0944671630859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.631592046130896e-07, "loss": 0.0038, "reward": 0.3719535187119618, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3719535187119618, "reward_after_std": 0.6457658410072327, "reward_before_mean": 0.7688353797420859, "reward_before_std": 0.557947501540184, "reward_change_max": 0.0, "reward_change_mean": -0.39688185043632984, "reward_change_min": -0.6067758910357952, "reward_change_std": 0.22988414019346237, "reward_std": 0.6457658521831036, "rewards/cosine_scaled_reward": -0.03224899619817734, "rewards/format_reward": 0.8333333358168602, "step": 374 }, { "advantage_max": 1.8249159008264542, "advantage_mean": -2.421438777266971e-08, "advantage_min": -0.9235183000564575, "advantage_std": 0.9998628944158554, "completion_length": 2076.395866394043, "epoch": 0.42857142857142855, "grad_norm": 0.7585271000862122, "kl": 0.20538330078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.6074557564105724e-07, "loss": 0.0082, "reward": 0.4968459371011704, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4968459371011704, "reward_after_std": 0.8810446634888649, "reward_before_mean": 0.9040422799880616, "reward_before_std": 0.8108563907444477, "reward_change_max": 0.0005010664463043213, "reward_change_mean": -0.40719636250287294, "reward_change_min": -0.6932191513478756, "reward_change_std": 0.2573488671332598, "reward_std": 0.8810446709394455, "rewards/cosine_scaled_reward": 0.10827112477272749, "rewards/format_reward": 0.6875000018626451, "step": 375 }, { "advantage_max": 1.8238688856363297, "advantage_mean": -9.934107536579972e-09, "advantage_min": -0.8891046792268753, "advantage_std": 0.9998284503817558, "completion_length": 1665.0000228881836, "epoch": 0.4297142857142857, "grad_norm": 0.5375373363494873, "kl": 0.12718963623046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.583460445215911e-07, "loss": 0.0051, "reward": 0.3528364673256874, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3528364673256874, "reward_after_std": 0.8419087566435337, "reward_before_mean": 0.7210152223706245, "reward_before_std": 0.7949371058493853, "reward_change_max": 0.0, "reward_change_mean": -0.36817875877022743, "reward_change_min": -0.6667932122945786, "reward_change_std": 0.24621394462883472, "reward_std": 0.8419087827205658, "rewards/cosine_scaled_reward": -0.06657573767006397, "rewards/format_reward": 0.854166679084301, "step": 376 }, { "advantage_max": 1.7410006672143936, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -1.0070336163043976, "advantage_std": 0.9998903796076775, "completion_length": 2226.958396911621, "epoch": 0.4308571428571429, "grad_norm": 1.1558725833892822, "kl": 0.16888427734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.5596072820445254e-07, "loss": 0.0068, "reward": 0.09637512359768152, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09637512359768152, "reward_after_std": 1.0012357905507088, "reward_before_mean": 0.3505655601620674, "reward_before_std": 1.018935278058052, "reward_change_max": 0.0006854459643363953, "reward_change_mean": -0.2541904244571924, "reward_change_min": -0.5014745779335499, "reward_change_std": 0.22105401195585728, "reward_std": 1.0012357980012894, "rewards/cosine_scaled_reward": -0.13721722643822432, "rewards/format_reward": 0.6250000186264515, "step": 377 }, { "advantage_max": 1.7350095808506012, "advantage_mean": -2.297262402528588e-08, "advantage_min": -0.9632280319929123, "advantage_std": 0.9998912960290909, "completion_length": 1623.437551498413, "epoch": 0.432, "grad_norm": 0.8382158875465393, "kl": 0.1004180908203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.5358974294659373e-07, "loss": 0.004, "reward": 0.594434508588165, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.594434508588165, "reward_after_std": 1.0145360939204693, "reward_before_mean": 1.02805445343256, "reward_before_std": 1.0013620741665363, "reward_change_max": 1.8477439880371094e-06, "reward_change_mean": -0.43361997976899147, "reward_change_min": -0.85247603058815, "reward_change_std": 0.3046064507216215, "reward_std": 1.0145361050963402, "rewards/cosine_scaled_reward": 0.08694389602169394, "rewards/format_reward": 0.8541666828095913, "step": 378 }, { "advantage_max": 1.73798269033432, "advantage_mean": -1.7384688910659918e-08, "advantage_min": -0.9359891712665558, "advantage_std": 0.9998560920357704, "completion_length": 2098.7916870117188, "epoch": 0.43314285714285716, "grad_norm": 0.8646078705787659, "kl": 0.187713623046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.512332043064913e-07, "loss": 0.0075, "reward": 0.3658733298070729, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3658733298070729, "reward_after_std": 0.861970629543066, "reward_before_mean": 0.7396698640659451, "reward_before_std": 0.8552902564406395, "reward_change_max": 0.000509701669216156, "reward_change_mean": -0.3737965375185013, "reward_change_min": -0.7624665647745132, "reward_change_std": 0.2767182420939207, "reward_std": 0.8619706593453884, "rewards/cosine_scaled_reward": 0.005251582711935043, "rewards/format_reward": 0.7291666716337204, "step": 379 }, { "advantage_max": 1.767277181148529, "advantage_mean": -1.5211601867015645e-08, "advantage_min": -1.0232531651854515, "advantage_std": 0.9998098164796829, "completion_length": 1504.479232788086, "epoch": 0.4342857142857143, "grad_norm": 1.021233320236206, "kl": 0.1306304931640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.488912271385139e-07, "loss": 0.0052, "reward": 0.4782151123508811, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4782151123508811, "reward_after_std": 0.7126452196389437, "reward_before_mean": 0.9069797731935978, "reward_before_std": 0.6340154409408569, "reward_change_max": 0.001208581030368805, "reward_change_mean": -0.42876469157636166, "reward_change_min": -0.6624927073717117, "reward_change_std": 0.26269793696701527, "reward_std": 0.7126452382653952, "rewards/cosine_scaled_reward": 0.05765656102448702, "rewards/format_reward": 0.7916666679084301, "step": 380 }, { "advantage_max": 1.8054725229740143, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -1.0350727289915085, "advantage_std": 0.9998171553015709, "completion_length": 2012.9584045410156, "epoch": 0.43542857142857144, "grad_norm": 1.260669469833374, "kl": 0.21795654296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.465639255873246e-07, "loss": 0.0087, "reward": -0.04927718825638294, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.04927718825638294, "reward_after_std": 0.6071189232170582, "reward_before_mean": 0.19980726018548012, "reward_before_std": 0.5741494670510292, "reward_change_max": 0.0005010589957237244, "reward_change_mean": -0.2490844428539276, "reward_change_min": -0.4365546405315399, "reward_change_std": 0.17182970978319645, "reward_std": 0.6071189418435097, "rewards/cosine_scaled_reward": -0.2334297150373459, "rewards/format_reward": 0.6666666828095913, "step": 381 }, { "advantage_max": 1.7921118289232254, "advantage_mean": -1.9092112890639612e-08, "advantage_min": -0.9720060527324677, "advantage_std": 0.9998367726802826, "completion_length": 1402.0833740234375, "epoch": 0.43657142857142855, "grad_norm": 0.8463107943534851, "kl": 0.161376953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.4425141308231765e-07, "loss": 0.0065, "reward": 0.26892574690282345, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.26892574690282345, "reward_after_std": 0.758769515901804, "reward_before_mean": 0.6149431341327727, "reward_before_std": 0.7166970930993557, "reward_change_max": 0.0008708909153938293, "reward_change_mean": -0.34601740539073944, "reward_change_min": -0.5896544642746449, "reward_change_std": 0.2252532821148634, "reward_std": 0.7587695270776749, "rewards/cosine_scaled_reward": -0.11961177922785282, "rewards/format_reward": 0.8541666753590107, "step": 382 }, { "advantage_max": 1.756461575627327, "advantage_mean": 3.725290964595729e-09, "advantage_min": -0.895235151052475, "advantage_std": 0.9998020455241203, "completion_length": 2359.937530517578, "epoch": 0.4377142857142857, "grad_norm": 1.1095963716506958, "kl": 0.3541107177734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.4195380233209006e-07, "loss": 0.0142, "reward": 0.11817532475106418, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.11817532475106418, "reward_after_std": 0.7191711701452732, "reward_before_mean": 0.4183031292632222, "reward_before_std": 0.7047206647694111, "reward_change_max": 0.0018953531980514526, "reward_change_mean": -0.30012779869139194, "reward_change_min": -0.6166234388947487, "reward_change_std": 0.22765531949698925, "reward_std": 0.7191711850464344, "rewards/cosine_scaled_reward": -0.09293179586529732, "rewards/format_reward": 0.6041666716337204, "step": 383 }, { "advantage_max": 1.8194759339094162, "advantage_mean": -3.166496892470505e-08, "advantage_min": -0.9654436111450195, "advantage_std": 0.9998816549777985, "completion_length": 1483.6667404174805, "epoch": 0.43885714285714283, "grad_norm": 0.8851792216300964, "kl": 0.10132217407226562, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.3967120531894857e-07, "loss": 0.004, "reward": 0.6962929107248783, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6962929107248783, "reward_after_std": 0.9415237978100777, "reward_before_mean": 1.172610143199563, "reward_before_std": 0.8729718811810017, "reward_change_max": 1.2874603271484375e-05, "reward_change_mean": -0.4763172436505556, "reward_change_min": -0.7801134400069714, "reward_change_std": 0.3034983268007636, "reward_std": 0.9415238127112389, "rewards/cosine_scaled_reward": 0.18005506787449121, "rewards/format_reward": 0.8125000074505806, "step": 384 }, { "advantage_max": 1.7342734187841415, "advantage_mean": -3.1044086745701804e-08, "advantage_min": -1.0521792992949486, "advantage_std": 0.9998474344611168, "completion_length": 1812.7500610351562, "epoch": 0.44, "grad_norm": 0.8224566578865051, "kl": 0.17679595947265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.374037332934512e-07, "loss": 0.0071, "reward": 0.22003162489272654, "reward_advantage_correlation": 0.9999999999999994, "reward_after_mean": 0.22003162489272654, "reward_after_std": 0.79856176674366, "reward_before_mean": 0.5471120458096266, "reward_before_std": 0.7889064289629459, "reward_change_max": 0.0, "reward_change_mean": -0.3270804397761822, "reward_change_min": -0.6277721002697945, "reward_change_std": 0.23880037106573582, "reward_std": 0.7985618002712727, "rewards/cosine_scaled_reward": -0.07019397895783186, "rewards/format_reward": 0.6875000149011612, "step": 385 }, { "advantage_max": 1.6939862072467804, "advantage_mean": -1.691902762335218e-08, "advantage_min": -1.1218422651290894, "advantage_std": 0.9998356774449348, "completion_length": 1800.8125610351562, "epoch": 0.44114285714285717, "grad_norm": 1.1486692428588867, "kl": 0.23262786865234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.3515149676898552e-07, "loss": 0.0093, "reward": 0.3363962881267071, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3363962881267071, "reward_after_std": 0.8022975288331509, "reward_before_mean": 0.7059931866824627, "reward_before_std": 0.8133284114301205, "reward_change_max": 0.0005755871534347534, "reward_change_mean": -0.3695969022810459, "reward_change_min": -0.6922038830816746, "reward_change_std": 0.2751284819096327, "reward_std": 0.8022975437343121, "rewards/cosine_scaled_reward": -0.022003429010510445, "rewards/format_reward": 0.7500000186264515, "step": 386 }, { "advantage_max": 1.707328885793686, "advantage_mean": -4.470348402563218e-08, "advantage_min": -1.0347394496202469, "advantage_std": 0.9998210296034813, "completion_length": 2157.062545776367, "epoch": 0.4422857142857143, "grad_norm": 1.8616845607757568, "kl": 0.26586151123046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.3291460551638237e-07, "loss": 0.0107, "reward": 0.24127850262448192, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.24127850262448192, "reward_after_std": 0.6882153376936913, "reward_before_mean": 0.5933197988197207, "reward_before_std": 0.6659880541265011, "reward_change_max": 0.0007935240864753723, "reward_change_mean": -0.3520413190126419, "reward_change_min": -0.6123362518846989, "reward_change_std": 0.2510596886277199, "reward_std": 0.6882153712213039, "rewards/cosine_scaled_reward": -0.02625676617026329, "rewards/format_reward": 0.6458333469927311, "step": 387 }, { "advantage_max": 1.7912124544382095, "advantage_mean": -8.071462498371318e-09, "advantage_min": -0.990226574242115, "advantage_std": 0.9998493269085884, "completion_length": 1388.5833740234375, "epoch": 0.44342857142857145, "grad_norm": 0.7421445846557617, "kl": 0.11296844482421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.306931685585657e-07, "loss": 0.0045, "reward": 0.5875010080635548, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5875010080635548, "reward_after_std": 0.7431738264858723, "reward_before_mean": 1.047975342720747, "reward_before_std": 0.6550696976482868, "reward_change_max": 0.0, "reward_change_mean": -0.4604743402451277, "reward_change_min": -0.7125534936785698, "reward_change_std": 0.2757462076842785, "reward_std": 0.7431738488376141, "rewards/cosine_scaled_reward": 0.09690433740615845, "rewards/format_reward": 0.8541666828095913, "step": 388 }, { "advantage_max": 1.7267747819423676, "advantage_mean": -2.1730860666480112e-08, "advantage_min": -0.9880621880292892, "advantage_std": 0.9998234510421753, "completion_length": 1713.5833740234375, "epoch": 0.44457142857142856, "grad_norm": 0.8764031529426575, "kl": 0.19806671142578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2848729416523859e-07, "loss": 0.0079, "reward": 0.35750759206712246, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.35750759206712246, "reward_after_std": 0.7207697853446007, "reward_before_mean": 0.7473924616351724, "reward_before_std": 0.6993493568152189, "reward_change_max": 0.00026351213455200195, "reward_change_mean": -0.3898848630487919, "reward_change_min": -0.6821940205991268, "reward_change_std": 0.26345623284578323, "reward_std": 0.7207698151469231, "rewards/cosine_scaled_reward": -0.06380378128960729, "rewards/format_reward": 0.8750000149011612, "step": 389 }, { "advantage_max": 1.7296848595142365, "advantage_mean": 2.7318797113373705e-08, "advantage_min": -0.9938266947865486, "advantage_std": 0.9997981563210487, "completion_length": 1891.458381652832, "epoch": 0.44571428571428573, "grad_norm": 0.7914329767227173, "kl": 0.1772918701171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2629708984760706e-07, "loss": 0.0071, "reward": 0.15934922639280558, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.15934922639280558, "reward_after_std": 0.7329917438328266, "reward_before_mean": 0.47760644368827343, "reward_before_std": 0.7281799092888832, "reward_change_max": 0.0003632083535194397, "reward_change_mean": -0.3182572089135647, "reward_change_min": -0.5775157622992992, "reward_change_std": 0.22752394014969468, "reward_std": 0.7329917661845684, "rewards/cosine_scaled_reward": -0.1257801183965057, "rewards/format_reward": 0.729166679084301, "step": 390 }, { "advantage_max": 1.7276591509580612, "advantage_mean": 9.313226190243995e-09, "advantage_min": -1.053447738289833, "advantage_std": 0.9998690560460091, "completion_length": 1809.1459045410156, "epoch": 0.44685714285714284, "grad_norm": 1.5454634428024292, "kl": 0.3096466064453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2412266235313973e-07, "loss": 0.0124, "reward": 0.377378785982728, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.377378785982728, "reward_after_std": 0.9563984498381615, "reward_before_mean": 0.7395449681207538, "reward_before_std": 0.9543644115328789, "reward_change_max": 0.00011724233627319336, "reward_change_mean": -0.36216619424521923, "reward_change_min": -0.6636506915092468, "reward_change_std": 0.26318536326289177, "reward_std": 0.9563984870910645, "rewards/cosine_scaled_reward": 0.026022482197731733, "rewards/format_reward": 0.6875000167638063, "step": 391 }, { "advantage_max": 1.8653128445148468, "advantage_mean": -1.9868215517249155e-08, "advantage_min": -0.986283004283905, "advantage_std": 0.999817006289959, "completion_length": 1741.020896911621, "epoch": 0.448, "grad_norm": 1.0141348838806152, "kl": 0.17852020263671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2196411766036487e-07, "loss": 0.0072, "reward": 0.20314090978354216, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.20314090978354216, "reward_after_std": 0.6909726150333881, "reward_before_mean": 0.5301010186522035, "reward_before_std": 0.6087089702486992, "reward_change_max": 0.0, "reward_change_mean": -0.3269601259380579, "reward_change_min": -0.49704053439199924, "reward_change_std": 0.18584060855209827, "reward_std": 0.6909726411104202, "rewards/cosine_scaled_reward": -0.14119949005544186, "rewards/format_reward": 0.8125000055879354, "step": 392 }, { "advantage_max": 1.7371740937232971, "advantage_mean": -4.501392744660215e-09, "advantage_min": -0.9123519137501717, "advantage_std": 0.9998664557933807, "completion_length": 1658.187572479248, "epoch": 0.4491428571428571, "grad_norm": 0.791374683380127, "kl": 0.1796417236328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1982156097370557e-07, "loss": 0.0072, "reward": 0.3361839293502271, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3361839293502271, "reward_after_std": 0.9034698754549026, "reward_before_mean": 0.6911499003472272, "reward_before_std": 0.8989875055849552, "reward_change_max": 0.0005335807800292969, "reward_change_mean": -0.35496594198048115, "reward_change_min": -0.7006049901247025, "reward_change_std": 0.2625715425238013, "reward_std": 0.9034699089825153, "rewards/cosine_scaled_reward": -0.008591731544584036, "rewards/format_reward": 0.7083333414047956, "step": 393 }, { "advantage_max": 1.8380515724420547, "advantage_mean": -2.887100078452498e-08, "advantage_min": -0.8600859493017197, "advantage_std": 0.9998437762260437, "completion_length": 2161.5625610351562, "epoch": 0.4502857142857143, "grad_norm": 1.8681862354278564, "kl": 0.3366851806640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1769509671835223e-07, "loss": 0.0135, "reward": 0.005552591755986214, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.005552591755986214, "reward_after_std": 0.8122586011886597, "reward_before_mean": 0.24411384388804436, "reward_before_std": 0.7915139775723219, "reward_change_max": 0.0017316117882728577, "reward_change_mean": -0.2385612539947033, "reward_change_min": -0.48638592660427094, "reward_change_std": 0.19629649817943573, "reward_std": 0.8122586272656918, "rewards/cosine_scaled_reward": -0.18002642318606377, "rewards/format_reward": 0.6041666753590107, "step": 394 }, { "advantage_max": 1.8597172647714615, "advantage_mean": -2.1109978431965715e-08, "advantage_min": -0.9049240797758102, "advantage_std": 0.9998238682746887, "completion_length": 1527.5000343322754, "epoch": 0.4514285714285714, "grad_norm": 0.8493121862411499, "kl": 0.22707366943359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1558482853517253e-07, "loss": 0.0091, "reward": 0.3813153766095638, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3813153766095638, "reward_after_std": 0.6925483010709286, "reward_before_mean": 0.768714863806963, "reward_before_std": 0.5859698206186295, "reward_change_max": 0.0, "reward_change_mean": -0.3873995114117861, "reward_change_min": -0.6053726449608803, "reward_change_std": 0.22276469971984625, "reward_std": 0.6925483159720898, "rewards/cosine_scaled_reward": 0.009357419796288013, "rewards/format_reward": 0.7500000055879354, "step": 395 }, { "advantage_max": 1.7137230038642883, "advantage_mean": -1.2417634254191512e-08, "advantage_min": -1.0351039543747902, "advantage_std": 0.9998411983251572, "completion_length": 1632.6250305175781, "epoch": 0.45257142857142857, "grad_norm": 1.918643593788147, "kl": 0.24441146850585938, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.134908592756607e-07, "loss": 0.0098, "reward": 0.2749175587669015, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2749175587669015, "reward_after_std": 0.7350540012121201, "reward_before_mean": 0.630565570667386, "reward_before_std": 0.7222996801137924, "reward_change_max": 0.0009008049964904785, "reward_change_mean": -0.35564801655709743, "reward_change_min": -0.6620389670133591, "reward_change_std": 0.24911822751164436, "reward_std": 0.7350540310144424, "rewards/cosine_scaled_reward": -0.038883913308382034, "rewards/format_reward": 0.7083333507180214, "step": 396 }, { "advantage_max": 1.7904691249132156, "advantage_mean": 5.58793583627093e-09, "advantage_min": -1.0387514308094978, "advantage_std": 0.9997662827372551, "completion_length": 1407.0208740234375, "epoch": 0.45371428571428574, "grad_norm": 1.1686025857925415, "kl": 0.1932525634765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1141329099692406e-07, "loss": 0.0077, "reward": 0.07833226304501295, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07833226304501295, "reward_after_std": 0.637881813570857, "reward_before_mean": 0.3739766292273998, "reward_before_std": 0.6106444429606199, "reward_change_max": 0.00090036541223526, "reward_change_mean": -0.29564435686916113, "reward_change_min": -0.49300988018512726, "reward_change_std": 0.20516443625092506, "reward_std": 0.6378818284720182, "rewards/cosine_scaled_reward": -0.19842836540192366, "rewards/format_reward": 0.7708333432674408, "step": 397 }, { "advantage_max": 1.686153620481491, "advantage_mean": -1.707424740482466e-08, "advantage_min": -1.0883286967873573, "advantage_std": 0.9998270943760872, "completion_length": 1903.1250534057617, "epoch": 0.45485714285714285, "grad_norm": 1.1451284885406494, "kl": 0.36431884765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0935222495670968e-07, "loss": 0.0146, "reward": 0.29257922316901386, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.29257922316901386, "reward_after_std": 0.7144969664514065, "reward_before_mean": 0.6604115404188633, "reward_before_std": 0.7212763726711273, "reward_change_max": 0.001193992793560028, "reward_change_mean": -0.3678323123604059, "reward_change_min": -0.6373005285859108, "reward_change_std": 0.2641093973070383, "reward_std": 0.7144969888031483, "rewards/cosine_scaled_reward": -0.07604425214231014, "rewards/format_reward": 0.8125000149011612, "step": 398 }, { "advantage_max": 1.7231591194868088, "advantage_mean": -4.221995686393143e-08, "advantage_min": -0.901986688375473, "advantage_std": 0.9998647421598434, "completion_length": 1535.5417098999023, "epoch": 0.456, "grad_norm": 0.9203754663467407, "kl": 0.229644775390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0730776160846853e-07, "loss": 0.0092, "reward": 0.6963324442040175, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6963324442040175, "reward_after_std": 0.8897623643279076, "reward_before_mean": 1.183788426220417, "reward_before_std": 0.8542864043265581, "reward_change_max": 0.0, "reward_change_mean": -0.4874560348689556, "reward_change_min": -0.8889418914914131, "reward_change_std": 0.32904959097504616, "reward_std": 0.8897623680531979, "rewards/cosine_scaled_reward": 0.14397753402590752, "rewards/format_reward": 0.8958333507180214, "step": 399 }, { "advantage_max": 1.7916525453329086, "advantage_mean": -1.8626442610525373e-09, "advantage_min": -0.8803001940250397, "advantage_std": 0.9998540505766869, "completion_length": 1032.4375190734863, "epoch": 0.45714285714285713, "grad_norm": 1.1205514669418335, "kl": 0.148895263671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0528000059645995e-07, "loss": 0.006, "reward": 0.7829889804124832, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7829889804124832, "reward_after_std": 0.9042446874082088, "reward_before_mean": 1.2992039993405342, "reward_before_std": 0.828706594184041, "reward_change_max": 0.0003900453448295593, "reward_change_mean": -0.5162149891257286, "reward_change_min": -0.8750152438879013, "reward_change_std": 0.342633742839098, "reward_std": 0.9042446985840797, "rewards/cosine_scaled_reward": 0.22251864802092314, "rewards/format_reward": 0.8541666716337204, "step": 400 }, { "advantage_max": 1.7257637679576874, "advantage_mean": 3.290673195044391e-08, "advantage_min": -1.063326045870781, "advantage_std": 0.9998514279723167, "completion_length": 2271.604217529297, "epoch": 0.4582857142857143, "grad_norm": 1.258865475654602, "kl": 0.42609405517578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.032690407508949e-07, "loss": 0.0171, "reward": 0.19111876375973225, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19111876375973225, "reward_after_std": 0.8394452929496765, "reward_before_mean": 0.5038880419451743, "reward_before_std": 0.8423514142632484, "reward_change_max": 0.0009241402149200439, "reward_change_mean": -0.3127692574635148, "reward_change_min": -0.5873893164098263, "reward_change_std": 0.23612038139253855, "reward_std": 0.8394453302025795, "rewards/cosine_scaled_reward": -0.03972265589982271, "rewards/format_reward": 0.5833333376795053, "step": 401 }, { "advantage_max": 1.8229260295629501, "advantage_mean": -1.55220432618286e-08, "advantage_min": -0.9057629108428955, "advantage_std": 0.9998283013701439, "completion_length": 1543.0208587646484, "epoch": 0.4594285714285714, "grad_norm": 1.1461936235427856, "kl": 0.271240234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0127498008311922e-07, "loss": 0.0108, "reward": 0.18124675983563066, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.18124675983563066, "reward_after_std": 0.6488780155777931, "reward_before_mean": 0.5076476049143821, "reward_before_std": 0.579480305314064, "reward_change_max": 0.0012984797358512878, "reward_change_mean": -0.32640084717422724, "reward_change_min": -0.5408530719578266, "reward_change_std": 0.20357784628868103, "reward_std": 0.648878026753664, "rewards/cosine_scaled_reward": -0.121176203712821, "rewards/format_reward": 0.7500000074505806, "step": 402 }, { "advantage_max": 1.8078063428401947, "advantage_mean": -1.8936892831611374e-08, "advantage_min": -0.9291981235146523, "advantage_std": 0.9998122751712799, "completion_length": 1199.6875457763672, "epoch": 0.4605714285714286, "grad_norm": 0.9709504246711731, "kl": 0.1196441650390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.9929791578083655e-07, "loss": 0.0048, "reward": 0.4865487087517977, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4865487087517977, "reward_after_std": 0.598474521189928, "reward_before_mean": 0.9330362007021904, "reward_before_std": 0.4937343690544367, "reward_change_max": 0.00041528791189193726, "reward_change_mean": -0.4464874751865864, "reward_change_min": -0.7050358578562737, "reward_change_std": 0.2615822274237871, "reward_std": 0.5984745398163795, "rewards/cosine_scaled_reward": 0.039434750098735094, "rewards/format_reward": 0.8541666772216558, "step": 403 }, { "advantage_max": 1.6991583555936813, "advantage_mean": 6.208816905051151e-09, "advantage_min": -0.9808021113276482, "advantage_std": 0.9998401403427124, "completion_length": 1612.5833740234375, "epoch": 0.4617142857142857, "grad_norm": 1.618449091911316, "kl": 0.35077667236328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.9733794420337213e-07, "loss": 0.014, "reward": 0.17475704971002415, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.17475704971002415, "reward_after_std": 0.6976526118814945, "reward_before_mean": 0.501314964145422, "reward_before_std": 0.7025996670126915, "reward_change_max": 0.0008546337485313416, "reward_change_mean": -0.32655790634453297, "reward_change_min": -0.6092537939548492, "reward_change_std": 0.2381744971498847, "reward_std": 0.6976526454091072, "rewards/cosine_scaled_reward": -0.155592517927289, "rewards/format_reward": 0.8125000111758709, "step": 404 }, { "advantage_max": 1.7660992741584778, "advantage_mean": -8.07146260939362e-09, "advantage_min": -1.0090806558728218, "advantage_std": 0.999880850315094, "completion_length": 1408.708366394043, "epoch": 0.46285714285714286, "grad_norm": 1.7040504217147827, "kl": 0.33414459228515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.9539516087697517e-07, "loss": 0.0134, "reward": 0.6742039502132684, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6742039502132684, "reward_after_std": 0.9147906377911568, "reward_before_mean": 1.1496076360344887, "reward_before_std": 0.8650781996548176, "reward_change_max": 0.0004979074001312256, "reward_change_mean": -0.4754037018865347, "reward_change_min": -0.8488960191607475, "reward_change_std": 0.3184524718672037, "reward_std": 0.9147906750440598, "rewards/cosine_scaled_reward": 0.1581371445208788, "rewards/format_reward": 0.8333333395421505, "step": 405 }, { "advantage_max": 1.6632074266672134, "advantage_mean": 2.483526384544632e-09, "advantage_min": -1.04509849101305, "advantage_std": 0.9998893141746521, "completion_length": 1777.333381652832, "epoch": 0.464, "grad_norm": 1.4719630479812622, "kl": 0.3616943359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.934696604901642e-07, "loss": 0.0145, "reward": 0.6033707396127284, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6033707396127284, "reward_after_std": 1.058504804968834, "reward_before_mean": 1.0417891945689917, "reward_before_std": 1.0909473784267902, "reward_change_max": 0.00012195855379104614, "reward_change_mean": -0.4384184516966343, "reward_change_min": -0.8497341647744179, "reward_change_std": 0.34140376187860966, "reward_std": 1.058504857122898, "rewards/cosine_scaled_reward": 0.09381125285290182, "rewards/format_reward": 0.854166679084301, "step": 406 }, { "advantage_max": 1.6858511567115784, "advantage_mean": -2.23517425679276e-08, "advantage_min": -1.1315465793013573, "advantage_std": 0.9998411983251572, "completion_length": 1451.8333587646484, "epoch": 0.46514285714285714, "grad_norm": 1.7365717887878418, "kl": 0.27008056640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.915615368891117e-07, "loss": 0.0108, "reward": 0.4329396355897188, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4329396355897188, "reward_after_std": 0.67533865198493, "reward_before_mean": 0.854749821126461, "reward_before_std": 0.6376374363899231, "reward_change_max": 0.0025676488876342773, "reward_change_mean": -0.421810170635581, "reward_change_min": -0.6801861710846424, "reward_change_std": 0.26713269390165806, "reward_std": 0.6753386780619621, "rewards/cosine_scaled_reward": 0.010708222165703773, "rewards/format_reward": 0.8333333432674408, "step": 407 }, { "advantage_max": 1.7903211116790771, "advantage_mean": -5.587936335871291e-09, "advantage_min": -0.8724770396947861, "advantage_std": 0.9998845607042313, "completion_length": 1674.43754196167, "epoch": 0.4662857142857143, "grad_norm": 1.1553095579147339, "kl": 0.35880279541015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8967088307307e-07, "loss": 0.0143, "reward": 0.47488534450531006, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.47488534450531006, "reward_after_std": 0.936702661216259, "reward_before_mean": 0.8732836376875639, "reward_before_std": 0.886435866355896, "reward_change_max": 0.000733204185962677, "reward_change_mean": -0.3983982680365443, "reward_change_min": -0.7694457732141018, "reward_change_std": 0.28530881367623806, "reward_std": 0.9367026910185814, "rewards/cosine_scaled_reward": 0.05122512299567461, "rewards/format_reward": 0.7708333507180214, "step": 408 }, { "advantage_max": 1.7163973897695541, "advantage_mean": 1.6763806343078613e-08, "advantage_min": -1.019752785563469, "advantage_std": 0.9998348653316498, "completion_length": 2360.9583892822266, "epoch": 0.4674285714285714, "grad_norm": 1.3046019077301025, "kl": 0.510406494140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8779779118983867e-07, "loss": 0.0204, "reward": 0.017473383340984583, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.017473383340984583, "reward_after_std": 0.7720525339245796, "reward_before_mean": 0.27401461359113455, "reward_before_std": 0.7742743045091629, "reward_change_max": 0.0017567500472068787, "reward_change_mean": -0.2565412297844887, "reward_change_min": -0.5349822342395782, "reward_change_std": 0.20509593561291695, "reward_std": 0.7720525339245796, "rewards/cosine_scaled_reward": -0.13382603414356709, "rewards/format_reward": 0.541666679084301, "step": 409 }, { "advantage_max": 1.6995727270841599, "advantage_mean": -1.614292477469803e-08, "advantage_min": -1.0749888718128204, "advantage_std": 0.999834306538105, "completion_length": 1802.9375534057617, "epoch": 0.4685714285714286, "grad_norm": 1.4509356021881104, "kl": 0.461883544921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8594235253127372e-07, "loss": 0.0185, "reward": 0.33310021134093404, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.33310021134093404, "reward_after_std": 0.870648954063654, "reward_before_mean": 0.6942419772967696, "reward_before_std": 0.8829897176474333, "reward_change_max": 0.0, "reward_change_mean": -0.36114177852869034, "reward_change_min": -0.7244736291468143, "reward_change_std": 0.278875982388854, "reward_std": 0.8706489875912666, "rewards/cosine_scaled_reward": -0.0278790183365345, "rewards/format_reward": 0.7500000111758709, "step": 410 }, { "advantage_max": 1.8188308328390121, "advantage_mean": 1.490116185998147e-08, "advantage_min": -0.924636073410511, "advantage_std": 0.9998219534754753, "completion_length": 2191.9167098999023, "epoch": 0.4697142857142857, "grad_norm": 1.0741444826126099, "kl": 0.49733734130859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8410465752883758e-07, "loss": 0.0199, "reward": 0.12649364955723286, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12649364955723286, "reward_after_std": 0.7999816536903381, "reward_before_mean": 0.413819288223749, "reward_before_std": 0.7640783302485943, "reward_change_max": 0.0, "reward_change_mean": -0.2873256057500839, "reward_change_min": -0.538658631965518, "reward_change_std": 0.20251087564975023, "reward_std": 0.799981702119112, "rewards/cosine_scaled_reward": -0.1055903835222125, "rewards/format_reward": 0.6250000093132257, "step": 411 }, { "advantage_max": 1.705745816230774, "advantage_mean": -1.4280279514444771e-08, "advantage_min": -1.1013883873820305, "advantage_std": 0.9998851716518402, "completion_length": 1603.7500610351562, "epoch": 0.47085714285714286, "grad_norm": 1.9148565530776978, "kl": 0.3883056640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.822847957491922e-07, "loss": 0.0155, "reward": 0.4359737882914487, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4359737882914487, "reward_after_std": 0.9558186158537865, "reward_before_mean": 0.8221534118056297, "reward_before_std": 0.9665836282074451, "reward_change_max": 0.0, "reward_change_mean": -0.3861796110868454, "reward_change_min": -0.7052872814238071, "reward_change_std": 0.2830039132386446, "reward_std": 0.9558186233043671, "rewards/cosine_scaled_reward": 0.004826690070331097, "rewards/format_reward": 0.8125000223517418, "step": 412 }, { "advantage_max": 1.6772472113370895, "advantage_mean": -3.383805458057054e-08, "advantage_min": -1.107955940067768, "advantage_std": 0.9998527988791466, "completion_length": 1701.833381652832, "epoch": 0.472, "grad_norm": 2.199369192123413, "kl": 0.3614501953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.804828558898332e-07, "loss": 0.0145, "reward": 0.645661392249167, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.645661392249167, "reward_after_std": 0.808834757655859, "reward_before_mean": 1.124671246856451, "reward_before_std": 0.7627800777554512, "reward_change_max": 0.005616113543510437, "reward_change_mean": -0.47900988161563873, "reward_change_min": -0.8092235289514065, "reward_change_std": 0.3156248927116394, "reward_std": 0.8088348060846329, "rewards/cosine_scaled_reward": 0.16650228016078472, "rewards/format_reward": 0.7916666902601719, "step": 413 }, { "advantage_max": 1.7193413525819778, "advantage_mean": -1.179675312990014e-08, "advantage_min": -1.142937459051609, "advantage_std": 0.9998357966542244, "completion_length": 2369.041717529297, "epoch": 0.47314285714285714, "grad_norm": 2.4205071926116943, "kl": 0.5526123046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7869892577476722e-07, "loss": 0.0221, "reward": 0.006730238324962556, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.006730238324962556, "reward_after_std": 0.6561923511326313, "reward_before_mean": 0.27453258424066007, "reward_before_std": 0.65548250451684, "reward_change_max": 0.0014767125248908997, "reward_change_mean": -0.2678023539483547, "reward_change_min": -0.5061851441860199, "reward_change_std": 0.19859026744961739, "reward_std": 0.6561923697590828, "rewards/cosine_scaled_reward": -0.14398371148854494, "rewards/format_reward": 0.5625000055879354, "step": 414 }, { "advantage_max": 1.7226037830114365, "advantage_mean": 4.6566130951219975e-09, "advantage_min": -0.9619991928339005, "advantage_std": 0.9998587444424629, "completion_length": 2613.916717529297, "epoch": 0.4742857142857143, "grad_norm": 1.7470006942749023, "kl": 0.84814453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7693309235023127e-07, "loss": 0.0339, "reward": -0.09183326875790954, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09183326875790954, "reward_after_std": 0.8713497258722782, "reward_before_mean": 0.11427396442741156, "reward_before_std": 0.9172992371022701, "reward_change_max": 0.002862788736820221, "reward_change_mean": -0.20610722713172436, "reward_change_min": -0.5330719910562038, "reward_change_std": 0.21957671362906694, "reward_std": 0.8713497743010521, "rewards/cosine_scaled_reward": -0.1095296953280922, "rewards/format_reward": 0.33333333767950535, "step": 415 }, { "advantage_max": 1.7845307737588882, "advantage_mean": -5.4016710215876174e-08, "advantage_min": -1.0070279240608215, "advantage_std": 0.9998261108994484, "completion_length": 1235.5000305175781, "epoch": 0.4754285714285714, "grad_norm": 1.7401037216186523, "kl": 0.1582794189453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7518544168045524e-07, "loss": 0.0063, "reward": 0.6856055930256844, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6856055930256844, "reward_after_std": 0.7195888757705688, "reward_before_mean": 1.1873854883015156, "reward_before_std": 0.6368912644684315, "reward_change_max": 0.0, "reward_change_mean": -0.5017799139022827, "reward_change_min": -0.7971869707107544, "reward_change_std": 0.29596132785081863, "reward_std": 0.7195888832211494, "rewards/cosine_scaled_reward": 0.13535940553992987, "rewards/format_reward": 0.9166666716337204, "step": 416 }, { "advantage_max": 1.6987647861242294, "advantage_mean": 2.483527383745354e-09, "advantage_min": -1.1091318130493164, "advantage_std": 0.9998269751667976, "completion_length": 2247.68758392334, "epoch": 0.4765714285714286, "grad_norm": 1.6846369504928589, "kl": 0.6414794921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7345605894346726e-07, "loss": 0.0256, "reward": -0.027562174946069717, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.027562174946069717, "reward_after_std": 0.6717595010995865, "reward_before_mean": 0.22826975071802735, "reward_before_std": 0.6878395229578018, "reward_change_max": 0.001991450786590576, "reward_change_mean": -0.255831902846694, "reward_change_min": -0.49109113216400146, "reward_change_std": 0.19680028408765793, "reward_std": 0.6717595122754574, "rewards/cosine_scaled_reward": -0.14628181234002113, "rewards/format_reward": 0.5208333488553762, "step": 417 }, { "advantage_max": 1.7121010422706604, "advantage_mean": -8.071462498371318e-09, "advantage_min": -1.0464161559939384, "advantage_std": 0.9998231381177902, "completion_length": 1373.3125305175781, "epoch": 0.4777142857142857, "grad_norm": 1.201941967010498, "kl": 0.34503173828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7174502842694212e-07, "loss": 0.0138, "reward": 0.505875600501895, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.505875600501895, "reward_after_std": 0.8202372901141644, "reward_before_mean": 0.9382327465973503, "reward_before_std": 0.8229701146483421, "reward_change_max": 0.0, "reward_change_mean": -0.4323571305721998, "reward_change_min": -0.7873807102441788, "reward_change_std": 0.30513608641922474, "reward_std": 0.8202373161911964, "rewards/cosine_scaled_reward": 0.06286634411662817, "rewards/format_reward": 0.8125000111758709, "step": 418 }, { "advantage_max": 1.7464422434568405, "advantage_mean": -1.1175871339474952e-08, "advantage_min": -1.0971321389079094, "advantage_std": 0.9998645409941673, "completion_length": 2179.1667098999023, "epoch": 0.47885714285714287, "grad_norm": 1.4393150806427002, "kl": 0.655426025390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7005243352409333e-07, "loss": 0.0262, "reward": 0.24082693550735712, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24082693550735712, "reward_after_std": 0.933142215013504, "reward_before_mean": 0.5561147816479206, "reward_before_std": 0.9340134970843792, "reward_change_max": 0.0023950934410095215, "reward_change_mean": -0.31528783403337, "reward_change_min": -0.6195023320615292, "reward_change_std": 0.2443357529118657, "reward_std": 0.9331422485411167, "rewards/cosine_scaled_reward": -0.03444262454286218, "rewards/format_reward": 0.625000013038516, "step": 419 }, { "advantage_max": 1.7392508685588837, "advantage_mean": 1.4280280180578586e-08, "advantage_min": -1.0607174634933472, "advantage_std": 0.99983199685812, "completion_length": 1662.2083778381348, "epoch": 0.48, "grad_norm": 1.416146159172058, "kl": 0.4628753662109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6837835672960831e-07, "loss": 0.0185, "reward": 0.14904859941452742, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.14904859941452742, "reward_after_std": 0.631911464035511, "reward_before_mean": 0.4680563807487488, "reward_before_std": 0.5958464667201042, "reward_change_max": 0.0, "reward_change_mean": -0.3190077841281891, "reward_change_min": -0.5424599312245846, "reward_change_std": 0.21177070401608944, "reward_std": 0.6319114938378334, "rewards/cosine_scaled_reward": -0.10972180962562561, "rewards/format_reward": 0.6875000223517418, "step": 420 }, { "advantage_max": 1.7352849692106247, "advantage_mean": 2.359350548264416e-08, "advantage_min": -1.0686837211251259, "advantage_std": 0.9998110011219978, "completion_length": 2240.8958892822266, "epoch": 0.48114285714285715, "grad_norm": 2.037701368331909, "kl": 0.824798583984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6672287963562852e-07, "loss": 0.033, "reward": -0.016208623914280906, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.016208623914280906, "reward_after_std": 0.6088180728256702, "reward_before_mean": 0.2512338310480118, "reward_before_std": 0.5972829647362232, "reward_change_max": 0.00027758628129959106, "reward_change_mean": -0.26744244992733, "reward_change_min": -0.5033800862729549, "reward_change_std": 0.1909478772431612, "reward_std": 0.6088180877268314, "rewards/cosine_scaled_reward": -0.186883092392236, "rewards/format_reward": 0.6250000186264515, "step": 421 }, { "advantage_max": 1.6779368966817856, "advantage_mean": 1.8626452602532595e-08, "advantage_min": -1.060163713991642, "advantage_std": 0.9998297765851021, "completion_length": 2321.104217529297, "epoch": 0.48228571428571426, "grad_norm": 1.9188036918640137, "kl": 0.7562103271484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6508608292777203e-07, "loss": 0.0302, "reward": 0.07708868011832237, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07708868011832237, "reward_after_std": 0.7318853549659252, "reward_before_mean": 0.36480083176866174, "reward_before_std": 0.7410414032638073, "reward_change_max": 0.0003057420253753662, "reward_change_mean": -0.28771214932203293, "reward_change_min": -0.5595702417194843, "reward_change_std": 0.22329799737781286, "reward_std": 0.731885377317667, "rewards/cosine_scaled_reward": -0.13009959366172552, "rewards/format_reward": 0.6250000149011612, "step": 422 }, { "advantage_max": 1.6890679448843002, "advantage_mean": 7.450580485901526e-09, "advantage_min": -1.1275624111294746, "advantage_std": 0.9998152256011963, "completion_length": 2157.000030517578, "epoch": 0.48342857142857143, "grad_norm": 1.873183250427246, "kl": 0.71240234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6346804638120098e-07, "loss": 0.0285, "reward": 0.023489116691052914, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.023489116691052914, "reward_after_std": 0.6917878873646259, "reward_before_mean": 0.2896041702479124, "reward_before_std": 0.6916869804263115, "reward_change_max": 0.0018218010663986206, "reward_change_mean": -0.26611505541950464, "reward_change_min": -0.5238477308303118, "reward_change_std": 0.20184546057134867, "reward_std": 0.6917878985404968, "rewards/cosine_scaled_reward": -0.13644792622653767, "rewards/format_reward": 0.562500013038516, "step": 423 }, { "advantage_max": 1.7417199313640594, "advantage_mean": -1.5522043650406658e-08, "advantage_min": -0.9422660320997238, "advantage_std": 0.9998251348733902, "completion_length": 2407.104232788086, "epoch": 0.4845714285714286, "grad_norm": 1.691881537437439, "kl": 0.8114013671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6186884885673413e-07, "loss": 0.0324, "reward": -0.08947994629852474, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.08947994629852474, "reward_after_std": 0.7040245346724987, "reward_before_mean": 0.13528160471469164, "reward_before_std": 0.7018413096666336, "reward_change_max": 0.0008262023329734802, "reward_change_mean": -0.22476153261959553, "reward_change_min": -0.4856540150940418, "reward_change_std": 0.17472868598997593, "reward_std": 0.7040245346724987, "rewards/cosine_scaled_reward": -0.17194254975765944, "rewards/format_reward": 0.47916666977107525, "step": 424 }, { "advantage_max": 1.70308518409729, "advantage_mean": -3.6011140291947186e-08, "advantage_min": -1.1869082376360893, "advantage_std": 0.9998666122555733, "completion_length": 1821.6042175292969, "epoch": 0.4857142857142857, "grad_norm": 2.2496941089630127, "kl": 0.6309967041015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6028856829700258e-07, "loss": 0.0252, "reward": 0.47916033025830984, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.47916033025830984, "reward_after_std": 0.8907334096729755, "reward_before_mean": 0.8904343352187425, "reward_before_std": 0.8880314640700817, "reward_change_max": 0.0, "reward_change_mean": -0.41127400659024715, "reward_change_min": -0.7036528363823891, "reward_change_std": 0.2844604728743434, "reward_std": 0.8907334208488464, "rewards/cosine_scaled_reward": 0.09105047304183245, "rewards/format_reward": 0.7083333395421505, "step": 425 }, { "advantage_max": 1.6792911440134048, "advantage_mean": 3.725290520506519e-09, "advantage_min": -1.0789423137903214, "advantage_std": 0.9998480826616287, "completion_length": 1790.9375457763672, "epoch": 0.4868571428571429, "grad_norm": 2.7036373615264893, "kl": 0.7074432373046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5872728172265146e-07, "loss": 0.0283, "reward": 0.3401183672249317, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3401183672249317, "reward_after_std": 0.7311505675315857, "reward_before_mean": 0.7192423194646835, "reward_before_std": 0.7222111262381077, "reward_change_max": 0.0037378743290901184, "reward_change_mean": -0.37912395037710667, "reward_change_min": -0.6578963398933411, "reward_change_std": 0.255485400557518, "reward_std": 0.7311505749821663, "rewards/cosine_scaled_reward": -0.03621218912303448, "rewards/format_reward": 0.7916666828095913, "step": 426 }, { "advantage_max": 1.8759489059448242, "advantage_mean": -2.328304216092647e-10, "advantage_min": -0.909981869161129, "advantage_std": 0.9998509958386421, "completion_length": 2332.9375610351562, "epoch": 0.488, "grad_norm": 1.9152886867523193, "kl": 0.833984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5718506522858572e-07, "loss": 0.0333, "reward": 0.03957121632993221, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.03957121632993221, "reward_after_std": 0.7805034965276718, "reward_before_mean": 0.2943769544363022, "reward_before_std": 0.7338727153837681, "reward_change_max": 0.0, "reward_change_mean": -0.25480574183166027, "reward_change_min": -0.467791685834527, "reward_change_std": 0.17891321051865816, "reward_std": 0.780503511428833, "rewards/cosine_scaled_reward": -0.08197819907218218, "rewards/format_reward": 0.45833334885537624, "step": 427 }, { "advantage_max": 1.735244408249855, "advantage_mean": -1.0554989549049765e-08, "advantage_min": -0.990032747387886, "advantage_std": 0.9998773485422134, "completion_length": 1652.7500534057617, "epoch": 0.48914285714285716, "grad_norm": 1.4348410367965698, "kl": 0.47418212890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5566199398026147e-07, "loss": 0.019, "reward": 0.3397603491321206, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3397603491321206, "reward_after_std": 0.8883824944496155, "reward_before_mean": 0.698169541079551, "reward_before_std": 0.8790141828358173, "reward_change_max": 0.0, "reward_change_mean": -0.35840918496251106, "reward_change_min": -0.6807228215038776, "reward_change_std": 0.26186509244143963, "reward_std": 0.8883825093507767, "rewards/cosine_scaled_reward": -0.005081913201138377, "rewards/format_reward": 0.7083333469927311, "step": 428 }, { "advantage_max": 1.7370030730962753, "advantage_mean": -1.4047449092835507e-08, "advantage_min": -1.0770287439227104, "advantage_std": 0.9998385161161423, "completion_length": 1547.1250381469727, "epoch": 0.49028571428571427, "grad_norm": 1.778883934020996, "kl": 0.5862274169921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5415814221002265e-07, "loss": 0.0234, "reward": 0.3610060513019562, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3610060513019562, "reward_after_std": 0.7744550481438637, "reward_before_mean": 0.7426942111924291, "reward_before_std": 0.7563960812985897, "reward_change_max": 0.0012010559439659119, "reward_change_mean": -0.3816881626844406, "reward_change_min": -0.7105789259076118, "reward_change_std": 0.26517566479742527, "reward_std": 0.7744550555944443, "rewards/cosine_scaled_reward": -0.01406955812126398, "rewards/format_reward": 0.7708333432674408, "step": 429 }, { "advantage_max": 1.766536459326744, "advantage_mean": -4.493631644564289e-08, "advantage_min": -1.0880927294492722, "advantage_std": 0.9998660758137703, "completion_length": 1799.4792022705078, "epoch": 0.49142857142857144, "grad_norm": 1.8504732847213745, "kl": 0.654327392578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5267358321348285e-07, "loss": 0.0261, "reward": 0.4810065981000662, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4810065981000662, "reward_after_std": 0.8460737504065037, "reward_before_mean": 0.8931037466973066, "reward_before_std": 0.7966925539076328, "reward_change_max": 0.0024940744042396545, "reward_change_mean": -0.41209715604782104, "reward_change_min": -0.6931100897490978, "reward_change_std": 0.27682924270629883, "reward_std": 0.8460737504065037, "rewards/cosine_scaled_reward": 0.08196851704269648, "rewards/format_reward": 0.7291666828095913, "step": 430 }, { "advantage_max": 1.7138948887586594, "advantage_mean": -6.208817349140361e-09, "advantage_min": -1.0293898060917854, "advantage_std": 0.999839186668396, "completion_length": 1814.5834045410156, "epoch": 0.49257142857142855, "grad_norm": 1.5769057273864746, "kl": 0.7185516357421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5120838934595337e-07, "loss": 0.0288, "reward": 0.021916335448622704, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.021916335448622704, "reward_after_std": 0.6635828576982021, "reward_before_mean": 0.2936446195235476, "reward_before_std": 0.6534183248877525, "reward_change_max": 0.001545630395412445, "reward_change_mean": -0.27172828931361437, "reward_change_min": -0.48428529128432274, "reward_change_std": 0.19409907422959805, "reward_std": 0.6635828800499439, "rewards/cosine_scaled_reward": -0.19692770019173622, "rewards/format_reward": 0.6875000186264515, "step": 431 }, { "advantage_max": 1.6820464730262756, "advantage_mean": -1.9557774677547712e-08, "advantage_min": -1.0192477330565453, "advantage_std": 0.9998295232653618, "completion_length": 2242.7500610351562, "epoch": 0.4937142857142857, "grad_norm": 2.072165012359619, "kl": 0.813812255859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4976263201891613e-07, "loss": 0.0326, "reward": -0.034584891051054, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.034584891051054, "reward_after_std": 0.7970075290650129, "reward_before_mean": 0.2045463277027011, "reward_before_std": 0.8348637204617262, "reward_change_max": 0.0012067630887031555, "reward_change_mean": -0.23913125693798065, "reward_change_min": -0.5468174144625664, "reward_change_std": 0.21863396745175123, "reward_std": 0.7970075588673353, "rewards/cosine_scaled_reward": -0.12689350359141827, "rewards/format_reward": 0.4583333432674408, "step": 432 }, { "advantage_max": 1.8060698211193085, "advantage_mean": 6.208817349140361e-09, "advantage_min": -0.8564623966813087, "advantage_std": 0.9998571425676346, "completion_length": 1917.0000457763672, "epoch": 0.4948571428571429, "grad_norm": 1.9060853719711304, "kl": 0.7872314453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.483363816965435e-07, "loss": 0.0315, "reward": 0.22292231512255967, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22292231512255967, "reward_after_std": 0.7656309306621552, "reward_before_mean": 0.5522305665072054, "reward_before_std": 0.7235872074961662, "reward_change_max": 0.0015352964401245117, "reward_change_mean": -0.329308208078146, "reward_change_min": -0.6004983559250832, "reward_change_std": 0.2363244229927659, "reward_std": 0.7656309716403484, "rewards/cosine_scaled_reward": -0.046801396645605564, "rewards/format_reward": 0.6458333432674408, "step": 433 }, { "advantage_max": 1.606886312365532, "advantage_mean": 1.862645426786713e-09, "advantage_min": -1.2542832344770432, "advantage_std": 0.9998219758272171, "completion_length": 1925.3125534057617, "epoch": 0.496, "grad_norm": 2.6380958557128906, "kl": 0.804931640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.469297078922642e-07, "loss": 0.0322, "reward": -0.10575361305382103, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.10575361305382103, "reward_after_std": 0.5835559405386448, "reward_before_mean": 0.13612819153058808, "reward_before_std": 0.6133833043277264, "reward_change_max": 0.0027960538864135742, "reward_change_mean": -0.24188180826604366, "reward_change_min": -0.4476979188621044, "reward_change_std": 0.1925535798072815, "reward_std": 0.5835559517145157, "rewards/cosine_scaled_reward": -0.2756859138607979, "rewards/format_reward": 0.6875000149011612, "step": 434 }, { "advantage_max": 1.7966983765363693, "advantage_mean": -1.8626452047421083e-09, "advantage_min": -0.9924411401152611, "advantage_std": 0.9997947365045547, "completion_length": 1189.708381652832, "epoch": 0.49714285714285716, "grad_norm": 2.6071808338165283, "kl": 0.5529022216796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4554267916537495e-07, "loss": 0.0221, "reward": 0.03083255933597684, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03083255933597684, "reward_after_std": 0.5064611211419106, "reward_before_mean": 0.32153959572315216, "reward_before_std": 0.4588327668607235, "reward_change_max": 0.0, "reward_change_mean": -0.2907070368528366, "reward_change_min": -0.482327438890934, "reward_change_std": 0.17869125492870808, "reward_std": 0.506461139768362, "rewards/cosine_scaled_reward": -0.25589688308537006, "rewards/format_reward": 0.8333333395421505, "step": 435 }, { "advantage_max": 1.7848911434412003, "advantage_mean": -3.725291075618031e-09, "advantage_min": -1.0250633507966995, "advantage_std": 0.9998371303081512, "completion_length": 1660.2917137145996, "epoch": 0.4982857142857143, "grad_norm": 1.643998384475708, "kl": 0.750030517578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4417536311769885e-07, "loss": 0.03, "reward": 0.44262125343084335, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.44262125343084335, "reward_after_std": 0.7644370347261429, "reward_before_mean": 0.8519178479909897, "reward_before_std": 0.7052580006420612, "reward_change_max": 0.0, "reward_change_mean": -0.40929659083485603, "reward_change_min": -0.64281902089715, "reward_change_std": 0.2554041398689151, "reward_std": 0.7644370496273041, "rewards/cosine_scaled_reward": 0.06137557839974761, "rewards/format_reward": 0.7291666809469461, "step": 436 }, { "advantage_max": 1.7122971713542938, "advantage_mean": -6.208817182606907e-09, "advantage_min": -1.0945315062999725, "advantage_std": 0.9998469427227974, "completion_length": 1893.5416870117188, "epoch": 0.49942857142857144, "grad_norm": 2.3377344608306885, "kl": 0.7822265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4282782639029128e-07, "loss": 0.0313, "reward": 0.21610759082250297, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.21610759082250297, "reward_after_std": 0.7649851590394974, "reward_before_mean": 0.5418718312866986, "reward_before_std": 0.7529012635350227, "reward_change_max": 0.003853835165500641, "reward_change_mean": -0.32576424442231655, "reward_change_min": -0.5747802630066872, "reward_change_std": 0.2267757887020707, "reward_std": 0.7649851888418198, "rewards/cosine_scaled_reward": -0.0832307543605566, "rewards/format_reward": 0.7083333469927311, "step": 437 }, { "advantage_max": 1.6405274718999863, "advantage_mean": 4.9670545454461035e-09, "advantage_min": -1.0970692560076714, "advantage_std": 0.9997981637716293, "completion_length": 2329.437545776367, "epoch": 0.5005714285714286, "grad_norm": 1.4638139009475708, "kl": 1.0709991455078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4150013466019114e-07, "loss": 0.0429, "reward": -0.04648863337934017, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.04648863337934017, "reward_after_std": 0.6623328626155853, "reward_before_mean": 0.2077143844217062, "reward_before_std": 0.6913297958672047, "reward_change_max": 0.0009057372808456421, "reward_change_mean": -0.2542030122131109, "reward_change_min": -0.5234818086028099, "reward_change_std": 0.21704575512558222, "reward_std": 0.6623328663408756, "rewards/cosine_scaled_reward": -0.15655948291532695, "rewards/format_reward": 0.5208333395421505, "step": 438 }, { "advantage_max": 1.6812333464622498, "advantage_mean": -1.2417635808503746e-09, "advantage_min": -0.9782895967364311, "advantage_std": 0.999854676425457, "completion_length": 1743.2083587646484, "epoch": 0.5017142857142857, "grad_norm": 2.869419574737549, "kl": 0.65850830078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4019235263722034e-07, "loss": 0.0263, "reward": 0.030551514239050448, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.030551514239050448, "reward_after_std": 0.8735250420868397, "reward_before_mean": 0.2807038873434067, "reward_before_std": 0.9029458425939083, "reward_change_max": 0.0019630342721939087, "reward_change_mean": -0.2501524034887552, "reward_change_min": -0.5816636979579926, "reward_change_std": 0.22933765407651663, "reward_std": 0.8735250718891621, "rewards/cosine_scaled_reward": -0.161731387488544, "rewards/format_reward": 0.604166679084301, "step": 439 }, { "advantage_max": 1.7551601380109787, "advantage_mean": 1.490116130486996e-08, "advantage_min": -0.9574207738041878, "advantage_std": 0.9998493418097496, "completion_length": 2117.041717529297, "epoch": 0.5028571428571429, "grad_norm": 2.1518924236297607, "kl": 1.08984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3890454406082956e-07, "loss": 0.0435, "reward": 0.010695042728912085, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.010695042728912085, "reward_after_std": 0.7147711627185345, "reward_before_mean": 0.2692843030090444, "reward_before_std": 0.6956265531480312, "reward_change_max": 0.0006374120712280273, "reward_change_mean": -0.25858925841748714, "reward_change_min": -0.48591882921755314, "reward_change_std": 0.1894593071192503, "reward_std": 0.7147711850702763, "rewards/cosine_scaled_reward": -0.19869118509814143, "rewards/format_reward": 0.6666666846722364, "step": 440 }, { "advantage_max": 1.6780736148357391, "advantage_mean": -8.692344399818808e-09, "advantage_min": -0.9933064877986908, "advantage_std": 0.9998457506299019, "completion_length": 1835.645851135254, "epoch": 0.504, "grad_norm": 3.0458028316497803, "kl": 0.72802734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3763677169699217e-07, "loss": 0.0291, "reward": 0.12760269886348397, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.12760269886348397, "reward_after_std": 0.7875058762729168, "reward_before_mean": 0.4247519914060831, "reward_before_std": 0.8058272823691368, "reward_change_max": 0.0013157352805137634, "reward_change_mean": -0.2971492912620306, "reward_change_min": -0.6083884090185165, "reward_change_std": 0.23828487563878298, "reward_std": 0.7875059023499489, "rewards/cosine_scaled_reward": -0.11054066941142082, "rewards/format_reward": 0.6458333507180214, "step": 441 }, { "advantage_max": 1.6514086723327637, "advantage_mean": -1.3659397946064189e-08, "advantage_min": -1.1171049699187279, "advantage_std": 0.999860055744648, "completion_length": 1477.6042022705078, "epoch": 0.5051428571428571, "grad_norm": 1.3946844339370728, "kl": 0.51104736328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3638909733514452e-07, "loss": 0.0204, "reward": 0.3657463360577822, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3657463360577822, "reward_after_std": 0.8437108583748341, "reward_before_mean": 0.7433155763428658, "reward_before_std": 0.8619813248515129, "reward_change_max": 0.0007864609360694885, "reward_change_mean": -0.37756926007568836, "reward_change_min": -0.6919102817773819, "reward_change_std": 0.2796674408018589, "reward_std": 0.8437108770012856, "rewards/cosine_scaled_reward": 0.017491115257143974, "rewards/format_reward": 0.7083333488553762, "step": 442 }, { "advantage_max": 1.7629027962684631, "advantage_mean": 1.3659398612198004e-08, "advantage_min": -0.9923323430120945, "advantage_std": 0.9998226314783096, "completion_length": 2171.750068664551, "epoch": 0.5062857142857143, "grad_norm": 2.375197410583496, "kl": 0.855926513671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.351615817851748e-07, "loss": 0.0342, "reward": 0.020650985185056925, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.020650985185056925, "reward_after_std": 0.7707908842712641, "reward_before_mean": 0.27570805698633194, "reward_before_std": 0.7681555701419711, "reward_change_max": 0.002855271100997925, "reward_change_mean": -0.25505704688839614, "reward_change_min": -0.4462346062064171, "reward_change_std": 0.18801956600509584, "reward_std": 0.7707909177988768, "rewards/cosine_scaled_reward": -0.15381266549229622, "rewards/format_reward": 0.5833333432674408, "step": 443 }, { "advantage_max": 1.7165568172931671, "advantage_mean": -1.9247333726823967e-08, "advantage_min": -0.9324416369199753, "advantage_std": 0.9998496621847153, "completion_length": 2007.8958740234375, "epoch": 0.5074285714285715, "grad_norm": 1.917452335357666, "kl": 0.8436737060546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3395428487445914e-07, "loss": 0.0338, "reward": 0.0359902863856405, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0359902863856405, "reward_after_std": 0.7226921916007996, "reward_before_mean": 0.3072628080844879, "reward_before_std": 0.724447213113308, "reward_change_max": 0.0, "reward_change_mean": -0.27127252891659737, "reward_change_min": -0.5508077293634415, "reward_change_std": 0.20682274922728539, "reward_std": 0.7226921990513802, "rewards/cosine_scaled_reward": -0.17970194312511012, "rewards/format_reward": 0.6666666828095913, "step": 444 }, { "advantage_max": 1.8018615394830704, "advantage_mean": -1.6142924996742636e-08, "advantage_min": -1.0738493129611015, "advantage_std": 0.9998226761817932, "completion_length": 1945.5834197998047, "epoch": 0.5085714285714286, "grad_norm": 2.996227979660034, "kl": 0.95709228515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3276726544494571e-07, "loss": 0.0383, "reward": 0.0608814200386405, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0608814200386405, "reward_after_std": 0.7086494788527489, "reward_before_mean": 0.33803138858638704, "reward_before_std": 0.6751173473894596, "reward_change_max": 0.0, "reward_change_mean": -0.2771499715745449, "reward_change_min": -0.48153046146035194, "reward_change_std": 0.19007529132068157, "reward_std": 0.7086494825780392, "rewards/cosine_scaled_reward": -0.16431764629669487, "rewards/format_reward": 0.6666666828095913, "step": 445 }, { "advantage_max": 1.7290478199720383, "advantage_mean": -1.676380642634534e-08, "advantage_min": -0.9854962527751923, "advantage_std": 0.9998418763279915, "completion_length": 1711.5000762939453, "epoch": 0.5097142857142857, "grad_norm": 2.8393335342407227, "kl": 0.562469482421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.316005813502869e-07, "loss": 0.0225, "reward": 0.21680985265993513, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.21680985265993513, "reward_after_std": 0.7741325199604034, "reward_before_mean": 0.5439182122936472, "reward_before_std": 0.7603906467556953, "reward_change_max": 0.0007556229829788208, "reward_change_mean": -0.3271083664149046, "reward_change_min": -0.6477513685822487, "reward_change_std": 0.25323644652962685, "reward_std": 0.7741325572133064, "rewards/cosine_scaled_reward": -0.0405409038066864, "rewards/format_reward": 0.6250000149011612, "step": 446 }, { "advantage_max": 1.6450235098600388, "advantage_mean": 1.0865432997775315e-09, "advantage_min": -1.2040871158242226, "advantage_std": 0.9998258352279663, "completion_length": 1978.604232788086, "epoch": 0.5108571428571429, "grad_norm": 1.6465866565704346, "kl": 1.01416015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3045428945301953e-07, "loss": 0.0406, "reward": 0.1665862348745577, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1665862348745577, "reward_after_std": 0.684241134673357, "reward_before_mean": 0.49194120336323977, "reward_before_std": 0.7021683566272259, "reward_change_max": 0.0009320005774497986, "reward_change_mean": -0.32535500079393387, "reward_change_min": -0.6130348071455956, "reward_change_std": 0.24094501323997974, "reward_std": 0.6842411458492279, "rewards/cosine_scaled_reward": -0.09777940064668655, "rewards/format_reward": 0.6875000186264515, "step": 447 }, { "advantage_max": 1.7520337402820587, "advantage_mean": -2.110997865401032e-08, "advantage_min": -1.026205975562334, "advantage_std": 0.9998293519020081, "completion_length": 1531.2083892822266, "epoch": 0.512, "grad_norm": 2.4577713012695312, "kl": 0.7320556640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2932844562179352e-07, "loss": 0.0293, "reward": 0.28207549441140145, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.28207549441140145, "reward_after_std": 0.6254963763058186, "reward_before_mean": 0.648076742189005, "reward_before_std": 0.5620164261199534, "reward_change_max": 0.00311385840177536, "reward_change_mean": -0.3660012758336961, "reward_change_min": -0.5942125655710697, "reward_change_std": 0.2303004777058959, "reward_std": 0.6254964023828506, "rewards/cosine_scaled_reward": -0.05096163973212242, "rewards/format_reward": 0.750000013038516, "step": 448 }, { "advantage_max": 1.7443628013134003, "advantage_mean": 1.3659398057086491e-08, "advantage_min": -1.1052935719490051, "advantage_std": 0.999808594584465, "completion_length": 1451.145866394043, "epoch": 0.5131428571428571, "grad_norm": 4.073936939239502, "kl": 0.8507080078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2822310472864885e-07, "loss": 0.034, "reward": 0.07193030323833227, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07193030323833227, "reward_after_std": 0.5925132110714912, "reward_before_mean": 0.3685013074427843, "reward_before_std": 0.5541165396571159, "reward_change_max": 0.0006818026304244995, "reward_change_mean": -0.296570997685194, "reward_change_min": -0.48166324198246, "reward_change_std": 0.18980262894183397, "reward_std": 0.5925132371485233, "rewards/cosine_scaled_reward": -0.16991601279005408, "rewards/format_reward": 0.708333345130086, "step": 449 }, { "advantage_max": 1.742071345448494, "advantage_mean": 1.8626452713554897e-08, "advantage_min": -1.0224637016654015, "advantage_std": 0.9997702986001968, "completion_length": 1326.6042098999023, "epoch": 0.5142857142857142, "grad_norm": 1.6718668937683105, "kl": 0.46539306640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2713832064634125e-07, "loss": 0.0186, "reward": 0.2378229470923543, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2378229470923543, "reward_after_std": 0.49685801938176155, "reward_before_mean": 0.6082137902267277, "reward_before_std": 0.4265294875949621, "reward_change_max": 0.0006290972232818604, "reward_change_mean": -0.3703908286988735, "reward_change_min": -0.5807116702198982, "reward_change_std": 0.22203008830547333, "reward_std": 0.49685803428292274, "rewards/cosine_scaled_reward": -0.050059786066412926, "rewards/format_reward": 0.7083333395421505, "step": 450 }, { "advantage_max": 1.7501177936792374, "advantage_mean": -2.9647102828267435e-08, "advantage_min": -1.0650447010993958, "advantage_std": 0.9998284503817558, "completion_length": 1262.5000228881836, "epoch": 0.5154285714285715, "grad_norm": 1.5179022550582886, "kl": 0.48748779296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.260741462457165e-07, "loss": 0.0195, "reward": 0.25011507043382153, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.25011507043382153, "reward_after_std": 0.6612214110791683, "reward_before_mean": 0.6020324025303125, "reward_before_std": 0.6105714365839958, "reward_change_max": 0.0, "reward_change_mean": -0.35191732831299305, "reward_change_min": -0.5882875882089138, "reward_change_std": 0.2263214197009802, "reward_std": 0.6612214371562004, "rewards/cosine_scaled_reward": -0.09481715969741344, "rewards/format_reward": 0.7916666865348816, "step": 451 }, { "advantage_max": 1.7556055188179016, "advantage_mean": -2.483526828633842e-09, "advantage_min": -1.0078409612178802, "advantage_std": 0.9998588040471077, "completion_length": 1870.7292251586914, "epoch": 0.5165714285714286, "grad_norm": 2.929342031478882, "kl": 1.04827880859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2503063339313356e-07, "loss": 0.0419, "reward": 0.3153824971523136, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3153824971523136, "reward_after_std": 0.8747689202427864, "reward_before_mean": 0.6701357141137123, "reward_before_std": 0.8791177310049534, "reward_change_max": 0.0011571869254112244, "reward_change_mean": -0.3547531981021166, "reward_change_min": -0.6610901206731796, "reward_change_std": 0.27017839066684246, "reward_std": 0.8747689425945282, "rewards/cosine_scaled_reward": 0.04340117983520031, "rewards/format_reward": 0.5833333563059568, "step": 452 }, { "advantage_max": 1.7142881453037262, "advantage_mean": -2.1730861221591624e-08, "advantage_min": -1.1296189948916435, "advantage_std": 0.9998414218425751, "completion_length": 1626.6250534057617, "epoch": 0.5177142857142857, "grad_norm": 1.6005405187606812, "kl": 0.5292205810546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2400783294793668e-07, "loss": 0.0212, "reward": 0.2671230403939262, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2671230403939262, "reward_after_std": 0.7144428342580795, "reward_before_mean": 0.625585412606597, "reward_before_std": 0.711488913744688, "reward_change_max": 0.0005446150898933411, "reward_change_mean": -0.3584623634815216, "reward_change_min": -0.6303103789687157, "reward_change_std": 0.24988629668951035, "reward_std": 0.7144428566098213, "rewards/cosine_scaled_reward": 0.021126022562384605, "rewards/format_reward": 0.5833333358168602, "step": 453 }, { "advantage_max": 1.7049687504768372, "advantage_mean": -6.829699250587851e-09, "advantage_min": -1.1121578216552734, "advantage_std": 0.999816358089447, "completion_length": 1393.708366394043, "epoch": 0.5188571428571429, "grad_norm": 1.5501641035079956, "kl": 0.553009033203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2300579475997657e-07, "loss": 0.0221, "reward": 0.026437701657414436, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.026437701657414436, "reward_after_std": 0.6530758962035179, "reward_before_mean": 0.30219898000359535, "reward_before_std": 0.6356503963470459, "reward_change_max": 0.0002528727054595947, "reward_change_mean": -0.27576126530766487, "reward_change_min": -0.46192001551389694, "reward_change_std": 0.18601053021848202, "reward_std": 0.6530759148299694, "rewards/cosine_scaled_reward": -0.19265052117407322, "rewards/format_reward": 0.687500013038516, "step": 454 }, { "advantage_max": 1.7160159945487976, "advantage_mean": 8.381903504606214e-09, "advantage_min": -1.1429937183856964, "advantage_std": 0.9998278543353081, "completion_length": 1982.916748046875, "epoch": 0.52, "grad_norm": 2.1075704097747803, "kl": 1.01708984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.220245676671809e-07, "loss": 0.0407, "reward": 0.1318288864567876, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.1318288864567876, "reward_after_std": 0.6854713223874569, "reward_before_mean": 0.442580409348011, "reward_before_std": 0.6797358877956867, "reward_change_max": 0.000586189329624176, "reward_change_mean": -0.31075152195990086, "reward_change_min": -0.5433367304503918, "reward_change_std": 0.2230097260326147, "reward_std": 0.6854713596403599, "rewards/cosine_scaled_reward": -0.1641264744102955, "rewards/format_reward": 0.7708333469927311, "step": 455 }, { "advantage_max": 1.7900293469429016, "advantage_mean": -4.346172088887101e-09, "advantage_min": -0.9770554900169373, "advantage_std": 0.9998800680041313, "completion_length": 1844.9167404174805, "epoch": 0.5211428571428571, "grad_norm": 2.078012228012085, "kl": 0.9295883178710938, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2106419949317388e-07, "loss": 0.0371, "reward": 0.20092077553272247, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.20092077553272247, "reward_after_std": 0.926351323723793, "reward_before_mean": 0.5018338588997722, "reward_before_std": 0.9120119288563728, "reward_change_max": 0.0009452924132347107, "reward_change_mean": -0.3009130731225014, "reward_change_min": -0.5843405686318874, "reward_change_std": 0.2316697221249342, "reward_std": 0.926351360976696, "rewards/cosine_scaled_reward": -0.08241641102358699, "rewards/format_reward": 0.666666679084301, "step": 456 }, { "advantage_max": 1.6528970152139664, "advantage_mean": 4.190951474747351e-09, "advantage_min": -1.0880041047930717, "advantage_std": 0.9998245164752007, "completion_length": 1918.6458892822266, "epoch": 0.5222857142857142, "grad_norm": 1.600352168083191, "kl": 1.0048828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2012473704494537e-07, "loss": 0.0402, "reward": 0.0012568535166792572, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0012568535166792572, "reward_after_std": 0.6866478323936462, "reward_before_mean": 0.2642772579565644, "reward_before_std": 0.7098511904478073, "reward_change_max": 0.0017377585172653198, "reward_change_mean": -0.2630203999578953, "reward_change_min": -0.4926149882376194, "reward_change_std": 0.2062510922551155, "reward_std": 0.6866478398442268, "rewards/cosine_scaled_reward": -0.1282780384644866, "rewards/format_reward": 0.520833345130086, "step": 457 }, { "advantage_max": 1.7814672142267227, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.998545415699482, "advantage_std": 0.9998621270060539, "completion_length": 1406.6250228881836, "epoch": 0.5234285714285715, "grad_norm": 1.86981201171875, "kl": 0.480712890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1920622611056974e-07, "loss": 0.0192, "reward": 0.18133725272491574, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18133725272491574, "reward_after_std": 0.7995229177176952, "reward_before_mean": 0.48804993368685246, "reward_before_std": 0.7666475623846054, "reward_change_max": 0.001362636685371399, "reward_change_mean": -0.30671270191669464, "reward_change_min": -0.5621924698352814, "reward_change_std": 0.21376881934702396, "reward_std": 0.7995229437947273, "rewards/cosine_scaled_reward": -0.18305837083607912, "rewards/format_reward": 0.854166679084301, "step": 458 }, { "advantage_max": 1.7139794379472733, "advantage_mean": -2.2972624247330486e-08, "advantage_min": -0.974767379462719, "advantage_std": 0.9998707324266434, "completion_length": 1808.833396911621, "epoch": 0.5245714285714286, "grad_norm": 2.1311519145965576, "kl": 0.7674713134765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1830871145697412e-07, "loss": 0.0307, "reward": 0.35844816733151674, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.35844816733151674, "reward_after_std": 0.9494384005665779, "reward_before_mean": 0.7138593718409538, "reward_before_std": 0.9674537070095539, "reward_change_max": 0.0007973909378051758, "reward_change_mean": -0.35541119147092104, "reward_change_min": -0.6972407847642899, "reward_change_std": 0.2776274522766471, "reward_std": 0.9494384415447712, "rewards/cosine_scaled_reward": 0.03401299752295017, "rewards/format_reward": 0.6458333488553762, "step": 459 }, { "advantage_max": 1.823698416352272, "advantage_mean": -2.0489097529718947e-08, "advantage_min": -0.8651067242026329, "advantage_std": 0.999833919107914, "completion_length": 2181.4167404174805, "epoch": 0.5257142857142857, "grad_norm": 1.5250437259674072, "kl": 1.087005615234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1743223682775649e-07, "loss": 0.0435, "reward": 0.1974321774323471, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1974321774323471, "reward_after_std": 0.6917814612388611, "reward_before_mean": 0.5247167139314115, "reward_before_std": 0.6247297674417496, "reward_change_max": 0.0003608092665672302, "reward_change_mean": -0.3272845260798931, "reward_change_min": -0.5729866735637188, "reward_change_std": 0.21527586411684752, "reward_std": 0.6917815022170544, "rewards/cosine_scaled_reward": -0.050141649320721626, "rewards/format_reward": 0.6250000167638063, "step": 460 }, { "advantage_max": 1.6892506778240204, "advantage_mean": 1.5522043428362053e-08, "advantage_min": -1.122504323720932, "advantage_std": 0.9998522698879242, "completion_length": 2028.8334045410156, "epoch": 0.5268571428571428, "grad_norm": 2.2229671478271484, "kl": 0.912078857421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1657684494105386e-07, "loss": 0.0365, "reward": 0.183541796868667, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.183541796868667, "reward_after_std": 0.7807581871747971, "reward_before_mean": 0.5023196414113045, "reward_before_std": 0.7918438091874123, "reward_change_max": 0.0014778897166252136, "reward_change_mean": -0.31877779867500067, "reward_change_min": -0.5983595065772533, "reward_change_std": 0.23299168050289154, "reward_std": 0.7807582169771194, "rewards/cosine_scaled_reward": -0.040506863966584206, "rewards/format_reward": 0.5833333525806665, "step": 461 }, { "advantage_max": 1.640615850687027, "advantage_mean": -1.9557774733058864e-08, "advantage_min": -1.1540326476097107, "advantage_std": 0.999806135892868, "completion_length": 1283.458381652832, "epoch": 0.528, "grad_norm": 1.373755931854248, "kl": 0.42746734619140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1574257748745986e-07, "loss": 0.0171, "reward": 0.11185320460936055, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.11185320460936055, "reward_after_std": 0.5902014076709747, "reward_before_mean": 0.43089804239571095, "reward_before_std": 0.5930353216826916, "reward_change_max": 0.0, "reward_change_mean": -0.3190448433160782, "reward_change_min": -0.5567139647901058, "reward_change_std": 0.21738199330866337, "reward_std": 0.5902014262974262, "rewards/cosine_scaled_reward": -0.15955098532140255, "rewards/format_reward": 0.750000013038516, "step": 462 }, { "advantage_max": 1.7736329287290573, "advantage_mean": -1.7384688910659918e-08, "advantage_min": -1.073650375008583, "advantage_std": 0.9998747482895851, "completion_length": 2215.3750534057617, "epoch": 0.5291428571428571, "grad_norm": 1.54464590549469, "kl": 0.968994140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1492947512799328e-07, "loss": 0.0387, "reward": 0.17749759927392006, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17749759927392006, "reward_after_std": 0.8764305487275124, "reward_before_mean": 0.4752648784779012, "reward_before_std": 0.8719460889697075, "reward_change_max": 0.000581793487071991, "reward_change_mean": -0.2977673108689487, "reward_change_min": -0.5611111186444759, "reward_change_std": 0.22235905937850475, "reward_std": 0.876430556178093, "rewards/cosine_scaled_reward": -0.033200898906216025, "rewards/format_reward": 0.5416666809469461, "step": 463 }, { "advantage_max": 1.6812104135751724, "advantage_mean": -3.13545277519367e-08, "advantage_min": -1.1877547651529312, "advantage_std": 0.9998318925499916, "completion_length": 1286.666732788086, "epoch": 0.5302857142857142, "grad_norm": 1.444524884223938, "kl": 0.5599212646484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1413757749211602e-07, "loss": 0.0224, "reward": 0.5510764128994197, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5510764128994197, "reward_after_std": 0.6248351968824863, "reward_before_mean": 1.021908799186349, "reward_before_std": 0.5738391764461994, "reward_change_max": 0.0, "reward_change_mean": -0.47083238139748573, "reward_change_min": -0.721558503806591, "reward_change_std": 0.2833131980150938, "reward_std": 0.6248352080583572, "rewards/cosine_scaled_reward": 0.06303772330284119, "rewards/format_reward": 0.8958333432674408, "step": 464 }, { "advantage_max": 1.727026715874672, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -1.0595378205180168, "advantage_std": 0.9998615384101868, "completion_length": 2132.020896911621, "epoch": 0.5314285714285715, "grad_norm": 1.9149830341339111, "kl": 0.9992446899414062, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1336692317580158e-07, "loss": 0.04, "reward": 0.2120549610699527, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2120549610699527, "reward_after_std": 0.8104190230369568, "reward_before_mean": 0.5326290186494589, "reward_before_std": 0.7940441556274891, "reward_change_max": 0.0006371587514877319, "reward_change_mean": -0.3205740787088871, "reward_change_min": -0.5808617770671844, "reward_change_std": 0.22702185064554214, "reward_std": 0.8104190304875374, "rewards/cosine_scaled_reward": -0.09826882090419531, "rewards/format_reward": 0.7291666865348816, "step": 465 }, { "advantage_max": 1.759408637881279, "advantage_mean": -8.692343678173842e-09, "advantage_min": -0.9421725645661354, "advantage_std": 0.999868594110012, "completion_length": 1642.4167175292969, "epoch": 0.5325714285714286, "grad_norm": 1.8337674140930176, "kl": 0.6185455322265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1261754973965422e-07, "loss": 0.0247, "reward": 0.262684247456491, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.262684247456491, "reward_after_std": 0.8431472107768059, "reward_before_mean": 0.5965541880577803, "reward_before_std": 0.8095771167427301, "reward_change_max": 0.001756027340888977, "reward_change_mean": -0.33386990427970886, "reward_change_min": -0.6470221914350986, "reward_change_std": 0.25174971111118793, "reward_std": 0.8431472405791283, "rewards/cosine_scaled_reward": -0.014222933445125818, "rewards/format_reward": 0.6250000111758709, "step": 466 }, { "advantage_max": 1.8189637959003448, "advantage_mean": 1.6763806565123218e-08, "advantage_min": -1.0182795599102974, "advantage_std": 0.9998102784156799, "completion_length": 2065.500045776367, "epoch": 0.5337142857142857, "grad_norm": 2.831685781478882, "kl": 0.92041015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1188949370707787e-07, "loss": 0.0368, "reward": -0.11408897396177053, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.11408897396177053, "reward_after_std": 0.5685962811112404, "reward_before_mean": 0.11652039317414165, "reward_before_std": 0.5278760902583599, "reward_change_max": 0.00025863945484161377, "reward_change_mean": -0.23060936108231544, "reward_change_min": -0.3900768756866455, "reward_change_std": 0.15485874377191067, "reward_std": 0.5685963034629822, "rewards/cosine_scaled_reward": -0.2334064778406173, "rewards/format_reward": 0.5833333376795053, "step": 467 }, { "advantage_max": 1.7499201446771622, "advantage_mean": -4.967053657267684e-09, "advantage_min": -0.9600143134593964, "advantage_std": 0.9998721703886986, "completion_length": 1974.2917251586914, "epoch": 0.5348571428571428, "grad_norm": 1.8548977375030518, "kl": 0.8831787109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1118279056249653e-07, "loss": 0.0353, "reward": 0.10449926368892193, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10449926368892193, "reward_after_std": 0.8679427579045296, "reward_before_mean": 0.3780664289370179, "reward_before_std": 0.8734307512640953, "reward_change_max": 0.0, "reward_change_mean": -0.27356718573719263, "reward_change_min": -0.6023085713386536, "reward_change_std": 0.21668203826993704, "reward_std": 0.8679428026080132, "rewards/cosine_scaled_reward": -0.10263345763087273, "rewards/format_reward": 0.5833333507180214, "step": 468 }, { "advantage_max": 1.6583528071641922, "advantage_mean": 1.5211601978037947e-08, "advantage_min": -1.1691104620695114, "advantage_std": 0.9998335912823677, "completion_length": 1831.8542213439941, "epoch": 0.536, "grad_norm": 1.5754987001419067, "kl": 1.0714111328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1049747474962444e-07, "loss": 0.0429, "reward": 0.11293114291038364, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.11293114291038364, "reward_after_std": 0.6838933974504471, "reward_before_mean": 0.41842682659626007, "reward_before_std": 0.6889714412391186, "reward_change_max": 0.0006234273314476013, "reward_change_mean": -0.3054956942796707, "reward_change_min": -0.5350888855755329, "reward_change_std": 0.22158779203891754, "reward_std": 0.6838934272527695, "rewards/cosine_scaled_reward": -0.11370326328324154, "rewards/format_reward": 0.6458333469927311, "step": 469 }, { "advantage_max": 1.725182831287384, "advantage_mean": 0.0, "advantage_min": -1.0750950574874878, "advantage_std": 0.999839261174202, "completion_length": 2395.0834045410156, "epoch": 0.5371428571428571, "grad_norm": 2.61478328704834, "kl": 1.37646484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0983357966978745e-07, "loss": 0.055, "reward": -0.023179539013653994, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": -0.023179539013653994, "reward_after_std": 0.6517323404550552, "reward_before_mean": 0.23503208067268133, "reward_before_std": 0.6523898020386696, "reward_change_max": 0.0009254887700080872, "reward_change_mean": -0.2582116276025772, "reward_change_min": -0.4944167695939541, "reward_change_std": 0.19454544875770807, "reward_std": 0.6517323516309261, "rewards/cosine_scaled_reward": -0.1324839610606432, "rewards/format_reward": 0.5000000074505806, "step": 470 }, { "advantage_max": 1.723146378993988, "advantage_mean": -7.45058065243498e-09, "advantage_min": -1.0446399301290512, "advantage_std": 0.9998374506831169, "completion_length": 2049.3958435058594, "epoch": 0.5382857142857143, "grad_norm": 2.4537980556488037, "kl": 1.03857421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0919113768029517e-07, "loss": 0.0416, "reward": 0.23960454890038818, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.23960454890038818, "reward_after_std": 0.7334759458899498, "reward_before_mean": 0.5821069525554776, "reward_before_std": 0.7277428098022938, "reward_change_max": 0.0010803118348121643, "reward_change_mean": -0.34250241331756115, "reward_change_min": -0.6261528991162777, "reward_change_std": 0.2388583142310381, "reward_std": 0.7334759458899498, "rewards/cosine_scaled_reward": -0.021446531638503075, "rewards/format_reward": 0.625000013038516, "step": 471 }, { "advantage_max": 1.894005298614502, "advantage_mean": 1.7384688022481498e-08, "advantage_min": -0.8695674315094948, "advantage_std": 0.9998286813497543, "completion_length": 2050.3958587646484, "epoch": 0.5394285714285715, "grad_norm": 1.5686941146850586, "kl": 0.885772705078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0857018009286381e-07, "loss": 0.0355, "reward": 0.010497666895389557, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.010497666895389557, "reward_after_std": 0.7697325795888901, "reward_before_mean": 0.25853023055242375, "reward_before_std": 0.7131627630442381, "reward_change_max": 0.0, "reward_change_mean": -0.2480325698852539, "reward_change_min": -0.43661124631762505, "reward_change_std": 0.16969729028642178, "reward_std": 0.7697325870394707, "rewards/cosine_scaled_reward": -0.1936515560373664, "rewards/format_reward": 0.6458333600312471, "step": 472 }, { "advantage_max": 1.8264094293117523, "advantage_mean": 5.587935669737476e-09, "advantage_min": -1.0018049478530884, "advantage_std": 0.9998430460691452, "completion_length": 1862.7292251586914, "epoch": 0.5405714285714286, "grad_norm": 2.162597417831421, "kl": 0.7371826171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0797073717209013e-07, "loss": 0.0295, "reward": 0.0015712128952145576, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0015712128952145576, "reward_after_std": 0.7118121609091759, "reward_before_mean": 0.25474235555157065, "reward_before_std": 0.672457791864872, "reward_change_max": 0.0006057992577552795, "reward_change_mean": -0.2531711310148239, "reward_change_min": -0.4490245133638382, "reward_change_std": 0.17601436004042625, "reward_std": 0.7118121683597565, "rewards/cosine_scaled_reward": -0.2267955057322979, "rewards/format_reward": 0.7083333544433117, "step": 473 }, { "advantage_max": 1.6219315230846405, "advantage_mean": -2.421438727306935e-08, "advantage_min": -1.1856679916381836, "advantage_std": 0.9998554438352585, "completion_length": 2023.1667175292969, "epoch": 0.5417142857142857, "grad_norm": 2.245720624923706, "kl": 1.12255859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0739283813397639e-07, "loss": 0.045, "reward": 0.6076354747638106, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6076354747638106, "reward_after_std": 0.917079396545887, "reward_before_mean": 1.068249762058258, "reward_before_std": 0.9546788744628429, "reward_change_max": 0.0006736889481544495, "reward_change_mean": -0.4606142733246088, "reward_change_min": -0.8541491962969303, "reward_change_std": 0.3454452157020569, "reward_std": 0.9170794077217579, "rewards/cosine_scaled_reward": 0.21120819076895714, "rewards/format_reward": 0.6458333507180214, "step": 474 }, { "advantage_max": 1.720523789525032, "advantage_mean": -1.8626451825376478e-08, "advantage_min": -1.003638181835413, "advantage_std": 0.999807707965374, "completion_length": 1608.1875610351562, "epoch": 0.5428571428571428, "grad_norm": 2.4956586360931396, "kl": 0.50616455078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.068365111445064e-07, "loss": 0.0203, "reward": 0.21973821939900517, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21973821939900517, "reward_after_std": 0.6421426851302385, "reward_before_mean": 0.5657117059454322, "reward_before_std": 0.6174349505454302, "reward_change_max": 0.0, "reward_change_mean": -0.34597351029515266, "reward_change_min": -0.5904626809060574, "reward_change_std": 0.2287957202643156, "reward_std": 0.6421427018940449, "rewards/cosine_scaled_reward": -0.13381083216518164, "rewards/format_reward": 0.8333333507180214, "step": 475 }, { "advantage_max": 1.7275697588920593, "advantage_mean": -4.967053657267684e-09, "advantage_min": -1.0523411557078362, "advantage_std": 0.9998522475361824, "completion_length": 2025.3125686645508, "epoch": 0.544, "grad_norm": 1.240239143371582, "kl": 0.829833984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.063017833182728e-07, "loss": 0.0332, "reward": 0.2721425127238035, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2721425127238035, "reward_after_std": 0.8951118066906929, "reward_before_mean": 0.6045951023697853, "reward_before_std": 0.8891665646806359, "reward_change_max": 0.0010402053594589233, "reward_change_mean": -0.33245258405804634, "reward_change_min": -0.6085440143942833, "reward_change_std": 0.24889733083546162, "reward_std": 0.8951118364930153, "rewards/cosine_scaled_reward": -0.07270245563995559, "rewards/format_reward": 0.7500000260770321, "step": 476 }, { "advantage_max": 1.7613456547260284, "advantage_mean": -3.3306690738754696e-16, "advantage_min": -1.0036557987332344, "advantage_std": 0.9998884499073029, "completion_length": 1881.666732788086, "epoch": 0.5451428571428572, "grad_norm": 2.3956451416015625, "kl": 1.03955078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0578868071715544e-07, "loss": 0.0416, "reward": 0.49204717949032784, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.49204717949032784, "reward_after_std": 1.041341632604599, "reward_before_mean": 0.8847652245312929, "reward_before_std": 1.0459558106958866, "reward_change_max": 0.0, "reward_change_mean": -0.39271803200244904, "reward_change_min": -0.7887572608888149, "reward_change_std": 0.3039436973631382, "reward_std": 1.0413416549563408, "rewards/cosine_scaled_reward": 0.09863259876146913, "rewards/format_reward": 0.6875000186264515, "step": 477 }, { "advantage_max": 1.7318103462457657, "advantage_mean": 1.986821435151498e-08, "advantage_min": -1.0049845576286316, "advantage_std": 0.9998083412647247, "completion_length": 2230.6875762939453, "epoch": 0.5462857142857143, "grad_norm": 1.9944533109664917, "kl": 0.9873046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0529722834905125e-07, "loss": 0.0395, "reward": 0.03902403824031353, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.03902403824031353, "reward_after_std": 0.5999388583004475, "reward_before_mean": 0.32476447708904743, "reward_before_std": 0.5715632252395153, "reward_change_max": 0.0014951378107070923, "reward_change_mean": -0.2857404346577823, "reward_change_min": -0.5063689686357975, "reward_change_std": 0.1933660637587309, "reward_std": 0.5999388918280602, "rewards/cosine_scaled_reward": -0.09803445637226105, "rewards/format_reward": 0.5208333469927311, "step": 478 }, { "advantage_max": 1.6402385234832764, "advantage_mean": -2.4835268619405326e-08, "advantage_min": -1.1984301209449768, "advantage_std": 0.9997950866818428, "completion_length": 2008.1458587646484, "epoch": 0.5474285714285714, "grad_norm": 2.2904410362243652, "kl": 1.124755859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0482745016665526e-07, "loss": 0.045, "reward": 0.24780177371576428, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.24780177371576428, "reward_after_std": 0.6601499877870083, "reward_before_mean": 0.6118663307279348, "reward_before_std": 0.6820986019447446, "reward_change_max": 0.0002136528491973877, "reward_change_mean": -0.36406457982957363, "reward_change_min": -0.6367292441427708, "reward_change_std": 0.262751754373312, "reward_std": 0.6601500064134598, "rewards/cosine_scaled_reward": -0.0690668448805809, "rewards/format_reward": 0.7500000204890966, "step": 479 }, { "advantage_max": 1.7782746702432632, "advantage_mean": 1.4901161637936866e-08, "advantage_min": -1.0110074132680893, "advantage_std": 0.9998089596629143, "completion_length": 2098.0000381469727, "epoch": 0.5485714285714286, "grad_norm": 4.358585834503174, "kl": 1.267333984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0437936906629334e-07, "loss": 0.0507, "reward": 0.033728417474776506, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.033728417474776506, "reward_after_std": 0.6660411544144154, "reward_before_mean": 0.30819552578032017, "reward_before_std": 0.6395064685493708, "reward_change_max": 6.268173456192017e-05, "reward_change_mean": -0.2744671143591404, "reward_change_min": -0.4989656023681164, "reward_change_std": 0.1919045727699995, "reward_std": 0.6660411730408669, "rewards/cosine_scaled_reward": -0.2000689124688506, "rewards/format_reward": 0.7083333469927311, "step": 480 }, { "advantage_max": 1.7010060846805573, "advantage_mean": 8.071462553882469e-09, "advantage_min": -1.1162853986024857, "advantage_std": 0.9998491629958153, "completion_length": 2045.5625610351562, "epoch": 0.5497142857142857, "grad_norm": 3.8500173091888428, "kl": 0.8306350708007812, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0395300688680625e-07, "loss": 0.0332, "reward": 0.11424011806957424, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.11424011806957424, "reward_after_std": 0.7244274131953716, "reward_before_mean": 0.4157728860154748, "reward_before_std": 0.7299416549503803, "reward_change_max": 0.0, "reward_change_mean": -0.3015327639877796, "reward_change_min": -0.5946803689002991, "reward_change_std": 0.2272760048508644, "reward_std": 0.7244274392724037, "rewards/cosine_scaled_reward": -0.11503022816032171, "rewards/format_reward": 0.6458333469927311, "step": 481 }, { "advantage_max": 1.7334468513727188, "advantage_mean": -1.1331091773203461e-08, "advantage_min": -1.06451665610075, "advantage_std": 0.9998659044504166, "completion_length": 1559.5416946411133, "epoch": 0.5508571428571428, "grad_norm": 2.350365161895752, "kl": 0.6270751953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0354838440848501e-07, "loss": 0.0251, "reward": 0.3181959269568324, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3181959269568324, "reward_after_std": 0.8462742604315281, "reward_before_mean": 0.671368783339858, "reward_before_std": 0.829070370644331, "reward_change_max": 0.0, "reward_change_mean": -0.3531728610396385, "reward_change_min": -0.6642805859446526, "reward_change_std": 0.25283273681998253, "reward_std": 0.8462743014097214, "rewards/cosine_scaled_reward": 0.0023510390892624855, "rewards/format_reward": 0.666666679084301, "step": 482 }, { "advantage_max": 1.759923592209816, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -0.9412931874394417, "advantage_std": 0.999844878911972, "completion_length": 1853.1667175292969, "epoch": 0.552, "grad_norm": 1.4625320434570312, "kl": 0.7619476318359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0316552135205837e-07, "loss": 0.0305, "reward": 0.20957084116525948, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.20957084116525948, "reward_after_std": 0.7815211266279221, "reward_before_mean": 0.5329324454069138, "reward_before_std": 0.7592417672276497, "reward_change_max": 0.0004161447286605835, "reward_change_mean": -0.3233615830540657, "reward_change_min": -0.6261811926960945, "reward_change_std": 0.23435983434319496, "reward_std": 0.7815211676061153, "rewards/cosine_scaled_reward": -0.09811712522059679, "rewards/format_reward": 0.7291666772216558, "step": 483 }, { "advantage_max": 1.695325568318367, "advantage_mean": -2.9336662876744413e-08, "advantage_min": -1.1379719600081444, "advantage_std": 0.9998583048582077, "completion_length": 1765.1042137145996, "epoch": 0.5531428571428572, "grad_norm": 3.2285454273223877, "kl": 0.9775390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0280443637773163e-07, "loss": 0.0391, "reward": 0.37274694675579667, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.37274694675579667, "reward_after_std": 0.8066456466913223, "reward_before_mean": 0.7536321505904198, "reward_before_std": 0.7944464683532715, "reward_change_max": 0.0005901604890823364, "reward_change_mean": -0.38088520988821983, "reward_change_min": -0.6573539786040783, "reward_change_std": 0.2598003875464201, "reward_std": 0.8066456578671932, "rewards/cosine_scaled_reward": 0.012232714332640171, "rewards/format_reward": 0.7291666865348816, "step": 484 }, { "advantage_max": 1.6583207547664642, "advantage_mean": -6.208817571184966e-09, "advantage_min": -1.0882994085550308, "advantage_std": 0.9998541325330734, "completion_length": 2115.0833892822266, "epoch": 0.5542857142857143, "grad_norm": 2.7807843685150146, "kl": 1.16552734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0246514708427701e-07, "loss": 0.0466, "reward": 0.1614460563287139, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1614460563287139, "reward_after_std": 0.7408439144492149, "reward_before_mean": 0.4795731317717582, "reward_before_std": 0.7549587935209274, "reward_change_max": 0.0005863979458808899, "reward_change_mean": -0.3181270696222782, "reward_change_min": -0.5944512635469437, "reward_change_std": 0.2374275177717209, "reward_std": 0.7408439218997955, "rewards/cosine_scaled_reward": -0.1143801175057888, "rewards/format_reward": 0.7083333544433117, "step": 485 }, { "advantage_max": 1.7206530421972275, "advantage_mean": -6.208817238118058e-09, "advantage_min": -1.16083462536335, "advantage_std": 0.9998443946242332, "completion_length": 1422.562557220459, "epoch": 0.5554285714285714, "grad_norm": 1.674818515777588, "kl": 0.86358642578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0214767000817596e-07, "loss": 0.0346, "reward": 0.29471569415181875, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.29471569415181875, "reward_after_std": 0.6928602121770382, "reward_before_mean": 0.6584705207496881, "reward_before_std": 0.655466441065073, "reward_change_max": 0.0007828772068023682, "reward_change_mean": -0.36375483497977257, "reward_change_min": -0.5771803520619869, "reward_change_std": 0.2310185208916664, "reward_std": 0.69286023452878, "rewards/cosine_scaled_reward": -0.0770147442817688, "rewards/format_reward": 0.8125000223517418, "step": 486 }, { "advantage_max": 1.797703430056572, "advantage_mean": -8.692343955729598e-09, "advantage_min": -1.0012999922037125, "advantage_std": 0.9998306334018707, "completion_length": 1301.6875228881836, "epoch": 0.5565714285714286, "grad_norm": 2.7326741218566895, "kl": 0.5511322021484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0185202062281336e-07, "loss": 0.022, "reward": 0.4218749701976776, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4218749701976776, "reward_after_std": 0.7974163889884949, "reward_before_mean": 0.8153237756341696, "reward_before_std": 0.7312633749097586, "reward_change_max": 0.0014507323503494263, "reward_change_mean": -0.39344881381839514, "reward_change_min": -0.6271464489400387, "reward_change_std": 0.25059905648231506, "reward_std": 0.7974164076149464, "rewards/cosine_scaled_reward": 0.06391187477856874, "rewards/format_reward": 0.6875000111758709, "step": 487 }, { "advantage_max": 1.702788531780243, "advantage_mean": 3.1044086745701804e-09, "advantage_min": -1.186601109802723, "advantage_std": 0.9998382404446602, "completion_length": 1301.4375305175781, "epoch": 0.5577142857142857, "grad_norm": 1.5253108739852905, "kl": 0.4942626953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0157821333772304e-07, "loss": 0.0198, "reward": 0.23219369817525148, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23219369817525148, "reward_after_std": 0.680590383708477, "reward_before_mean": 0.5791028682142496, "reward_before_std": 0.649394080042839, "reward_change_max": 0.0021488815546035767, "reward_change_mean": -0.34690916538238525, "reward_change_min": -0.5555089823901653, "reward_change_std": 0.2237343229353428, "reward_std": 0.6805903986096382, "rewards/cosine_scaled_reward": -0.11669857613742352, "rewards/format_reward": 0.8125000149011612, "step": 488 }, { "advantage_max": 1.774372011423111, "advantage_mean": 2.9491882269638836e-09, "advantage_min": -1.0680523589253426, "advantage_std": 0.9998393431305885, "completion_length": 2031.5000534057617, "epoch": 0.5588571428571428, "grad_norm": 1.9390368461608887, "kl": 1.087890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.013262614978859e-07, "loss": 0.0435, "reward": -0.07844682363793254, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.07844682363793254, "reward_after_std": 0.7290038503706455, "reward_before_mean": 0.14605336857493967, "reward_before_std": 0.7259037904441357, "reward_change_max": 0.0019025802612304688, "reward_change_mean": -0.22450019791722298, "reward_change_min": -0.45869655162096024, "reward_change_std": 0.18692905455827713, "reward_std": 0.7290038652718067, "rewards/cosine_scaled_reward": -0.23947332985699177, "rewards/format_reward": 0.6250000149011612, "step": 489 }, { "advantage_max": 1.6033965051174164, "advantage_mean": 1.862645149230957e-09, "advantage_min": -1.2662446796894073, "advantage_std": 0.999833270907402, "completion_length": 1821.2500610351562, "epoch": 0.56, "grad_norm": 1.5168814659118652, "kl": 1.015167236328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0109617738307911e-07, "loss": 0.0406, "reward": 0.1102374754846096, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1102374754846096, "reward_after_std": 0.7076537907123566, "reward_before_mean": 0.41693200170993805, "reward_before_std": 0.7424149066209793, "reward_change_max": 0.0, "reward_change_mean": -0.30669451504945755, "reward_change_min": -0.5568870604038239, "reward_change_std": 0.23318634741008282, "reward_std": 0.7076538167893887, "rewards/cosine_scaled_reward": -0.15611735358834267, "rewards/format_reward": 0.7291666939854622, "step": 490 }, { "advantage_max": 1.7501429468393326, "advantage_mean": -6.208817460162663e-09, "advantage_min": -1.006188504397869, "advantage_std": 0.9998316392302513, "completion_length": 1951.6875762939453, "epoch": 0.5611428571428572, "grad_norm": 1.4101494550704956, "kl": 0.946044921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0088797220727779e-07, "loss": 0.0378, "reward": 0.2639427548274398, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.2639427548274398, "reward_after_std": 0.7990949675440788, "reward_before_mean": 0.6044207019731402, "reward_before_std": 0.7900963928550482, "reward_change_max": 0.0007309168577194214, "reward_change_mean": -0.34047792851924896, "reward_change_min": -0.6518944837152958, "reward_change_std": 0.2486917097121477, "reward_std": 0.79909498244524, "rewards/cosine_scaled_reward": -0.05195633601397276, "rewards/format_reward": 0.7083333507180214, "step": 491 }, { "advantage_max": 1.624852180480957, "advantage_mean": -2.173086155465853e-09, "advantage_min": -1.339008018374443, "advantage_std": 0.9998179003596306, "completion_length": 1779.270866394043, "epoch": 0.5622857142857143, "grad_norm": 2.1047329902648926, "kl": 0.950775146484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0070165611810855e-07, "loss": 0.038, "reward": 0.2094727410003543, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2094727410003543, "reward_after_std": 0.6928419955074787, "reward_before_mean": 0.5501253991387784, "reward_before_std": 0.7146821040660143, "reward_change_max": 0.00292176753282547, "reward_change_mean": -0.3406526930630207, "reward_change_min": -0.6104924082756042, "reward_change_std": 0.2492066901177168, "reward_std": 0.6928420029580593, "rewards/cosine_scaled_reward": -0.027020633220672607, "rewards/format_reward": 0.6041666772216558, "step": 492 }, { "advantage_max": 1.7295208126306534, "advantage_mean": 1.9247333615801665e-08, "advantage_min": -1.1021549180150032, "advantage_std": 0.999849334359169, "completion_length": 1852.6458892822266, "epoch": 0.5634285714285714, "grad_norm": 2.4835798740386963, "kl": 0.9590988159179688, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.005372381963547e-07, "loss": 0.0384, "reward": 0.16399981500580907, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.16399981500580907, "reward_after_std": 0.8776100538671017, "reward_before_mean": 0.4570887290756218, "reward_before_std": 0.8825380764901638, "reward_change_max": 1.4081597328186035e-06, "reward_change_mean": -0.2930889241397381, "reward_change_min": -0.6155415810644627, "reward_change_std": 0.23546235542744398, "reward_std": 0.8776100799441338, "rewards/cosine_scaled_reward": -0.08395563159137964, "rewards/format_reward": 0.6250000111758709, "step": 493 }, { "advantage_max": 1.7614665031433105, "advantage_mean": -1.8626452047421083e-08, "advantage_min": -0.9802741855382919, "advantage_std": 0.9998749867081642, "completion_length": 1577.0000457763672, "epoch": 0.5645714285714286, "grad_norm": 1.3195198774337769, "kl": 0.8506927490234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0039472645551372e-07, "loss": 0.034, "reward": 0.35878331679850817, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.35878331679850817, "reward_after_std": 0.9304808415472507, "reward_before_mean": 0.7141460664570332, "reward_before_std": 0.9197700396180153, "reward_change_max": 0.0005487203598022461, "reward_change_mean": -0.35536274686455727, "reward_change_min": -0.7285639680922031, "reward_change_std": 0.2670033797621727, "reward_std": 0.9304808564484119, "rewards/cosine_scaled_reward": -0.02834364236332476, "rewards/format_reward": 0.7708333469927311, "step": 494 }, { "advantage_max": 1.679429143667221, "advantage_mean": -1.4280280402623191e-08, "advantage_min": -1.1185985431075096, "advantage_std": 0.9998891353607178, "completion_length": 1989.5833892822266, "epoch": 0.5657142857142857, "grad_norm": 3.8484103679656982, "kl": 0.9638671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.002741278414069e-07, "loss": 0.0385, "reward": 0.4516491156537086, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4516491156537086, "reward_after_std": 0.9823117256164551, "reward_before_mean": 0.8388855177909136, "reward_before_std": 0.9996080324053764, "reward_change_max": 0.0, "reward_change_mean": -0.3872363902628422, "reward_change_min": -0.7773643136024475, "reward_change_std": 0.2994156740605831, "reward_std": 0.9823117479681969, "rewards/cosine_scaled_reward": 0.04444274306297302, "rewards/format_reward": 0.7500000186264515, "step": 495 }, { "advantage_max": 1.862813338637352, "advantage_mean": 1.1102230246251565e-16, "advantage_min": -0.899694487452507, "advantage_std": 0.9998706802725792, "completion_length": 1765.5416870117188, "epoch": 0.5668571428571428, "grad_norm": 1.6446524858474731, "kl": 0.6643447875976562, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0017544823184055e-07, "loss": 0.0266, "reward": 0.3695176690816879, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3695176690816879, "reward_after_std": 0.8783304542303085, "reward_before_mean": 0.7352924682199955, "reward_before_std": 0.820489190518856, "reward_change_max": 0.0005880072712898254, "reward_change_mean": -0.3657748221885413, "reward_change_min": -0.5810040943324566, "reward_change_std": 0.23075592331588268, "reward_std": 0.8783304914832115, "rewards/cosine_scaled_reward": 0.055146233178675175, "rewards/format_reward": 0.6250000111758709, "step": 496 }, { "advantage_max": 1.7019027471542358, "advantage_mean": -1.117587078436344e-08, "advantage_min": -1.1692993342876434, "advantage_std": 0.999820813536644, "completion_length": 1625.9375228881836, "epoch": 0.568, "grad_norm": 3.0078015327453613, "kl": 0.93243408203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0009869243631952e-07, "loss": 0.0373, "reward": 0.5913935985881835, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5913935985881835, "reward_after_std": 0.689568504691124, "reward_before_mean": 1.069495590403676, "reward_before_std": 0.6528638452291489, "reward_change_max": 0.000348411500453949, "reward_change_mean": -0.4781019575893879, "reward_change_min": -0.7377244308590889, "reward_change_std": 0.2915885467082262, "reward_std": 0.6895685158669949, "rewards/cosine_scaled_reward": 0.1389144528657198, "rewards/format_reward": 0.791666679084301, "step": 497 }, { "advantage_max": 1.744615837931633, "advantage_mean": -2.483527050678447e-09, "advantage_min": -0.9988292381167412, "advantage_std": 0.9998521506786346, "completion_length": 1727.4583740234375, "epoch": 0.5691428571428572, "grad_norm": 2.353933811187744, "kl": 0.803466796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.000438641958131e-07, "loss": 0.0322, "reward": 0.18764864862896502, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18764864862896502, "reward_after_std": 0.7317885160446167, "reward_before_mean": 0.5091952886432409, "reward_before_std": 0.7048919424414635, "reward_change_max": 0.0007078200578689575, "reward_change_mean": -0.32154661882668734, "reward_change_min": -0.5815696716308594, "reward_change_std": 0.22264028061181307, "reward_std": 0.7317885383963585, "rewards/cosine_scaled_reward": -0.09956903967395192, "rewards/format_reward": 0.7083333563059568, "step": 498 }, { "advantage_max": 1.760789692401886, "advantage_mean": 4.967053990334591e-09, "advantage_min": -1.0820249915122986, "advantage_std": 0.9998012855648994, "completion_length": 1662.145851135254, "epoch": 0.5702857142857143, "grad_norm": 1.2436089515686035, "kl": 0.7041015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0001096618257236e-07, "loss": 0.0282, "reward": 0.24078513868153095, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24078513868153095, "reward_after_std": 0.6206786334514618, "reward_before_mean": 0.5984022030606866, "reward_before_std": 0.5815402772277594, "reward_change_max": 0.0, "reward_change_mean": -0.3576170839369297, "reward_change_min": -0.5774316936731339, "reward_change_std": 0.22300436440855265, "reward_std": 0.6206786520779133, "rewards/cosine_scaled_reward": -0.07579890079796314, "rewards/format_reward": 0.7500000074505806, "step": 499 }, { "advantage_max": 1.7504061460494995, "advantage_mean": -4.967053768289986e-09, "advantage_min": -1.0285020619630814, "advantage_std": 0.9998298957943916, "completion_length": 1955.1458740234375, "epoch": 0.5714285714285714, "grad_norm": 1.864587664604187, "kl": 1.07080078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1e-07, "loss": 0.0429, "reward": 0.05267873127013445, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.05267873127013445, "reward_after_std": 0.7509809248149395, "reward_before_mean": 0.3227067431434989, "reward_before_std": 0.7360034771263599, "reward_change_max": 0.00022435933351516724, "reward_change_mean": -0.270028006285429, "reward_change_min": -0.5317719988524914, "reward_change_std": 0.20626395754516125, "reward_std": 0.7509809285402298, "rewards/cosine_scaled_reward": -0.1719799698330462, "rewards/format_reward": 0.6666666865348816, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.006779100396982623, "train_runtime": 61788.8248, "train_samples_per_second": 0.388, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }