{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.88339222614841, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 120.28125, "epoch": 0.002355712603062426, "grad_norm": 0.2532831123960639, "kl": 0.0, "learning_rate": 9.99375e-07, "loss": 0.0, "reward": 1.1007115244865417, "reward_std": 0.20355842262506485, "rewards/GDino": 0.568495362997055, "rewards/GIT": 0.2530422583222389, "rewards/HPSv2": 0.2791738510131836, "step": 1 }, { "completion_length": 118.3359375, "epoch": 0.004711425206124852, "grad_norm": 0.28162303004913347, "kl": 0.0013074721791781485, "learning_rate": 9.9875e-07, "loss": 0.0013074721791781485, "reward": 1.3375725150108337, "reward_std": 0.3001032769680023, "rewards/GDino": 0.7266526520252228, "rewards/GIT": 0.34060658514499664, "rewards/HPSv2": 0.2703132629394531, "step": 2 }, { "completion_length": 118.765625, "epoch": 0.007067137809187279, "grad_norm": 0.7324163449830371, "kl": 0.0013142654206603765, "learning_rate": 9.98125e-07, "loss": 0.0013142654206603765, "reward": 1.1894217729568481, "reward_std": 0.17880908399820328, "rewards/GDino": 0.6453262269496918, "rewards/GIT": 0.27175766229629517, "rewards/HPSv2": 0.2723379135131836, "step": 3 }, { "completion_length": 118.25, "epoch": 0.009422850412249705, "grad_norm": 0.32458499243191347, "kl": 0.0012939731241203845, "learning_rate": 9.975e-07, "loss": 0.0012939731241203845, "reward": 1.419495701789856, "reward_std": 0.22031337767839432, "rewards/GDino": 0.7810339629650116, "rewards/GIT": 0.36503659188747406, "rewards/HPSv2": 0.2734251022338867, "step": 4 }, { "completion_length": 119.9375, "epoch": 0.011778563015312132, "grad_norm": 0.24807810430844465, "kl": 0.001313178101554513, "learning_rate": 9.968749999999999e-07, "loss": 0.001313178101554513, "reward": 1.3168306350708008, "reward_std": 0.2806411385536194, "rewards/GDino": 0.7066371440887451, "rewards/GIT": 0.33652664721012115, "rewards/HPSv2": 0.2736668586730957, "step": 5 }, { "completion_length": 108.34375, "epoch": 0.014134275618374558, "grad_norm": 0.3661470385870102, "kl": 0.0013668319443240762, "learning_rate": 9.9625e-07, "loss": 0.0013668319443240762, "reward": 1.4123074412345886, "reward_std": 0.2420313060283661, "rewards/GDino": 0.7063382267951965, "rewards/GIT": 0.44137515127658844, "rewards/HPSv2": 0.26459407806396484, "step": 6 }, { "completion_length": 118.421875, "epoch": 0.016489988221436984, "grad_norm": 0.30872114529579797, "kl": 0.0013387451181188226, "learning_rate": 9.956249999999999e-07, "loss": 0.0013387451181188226, "reward": 1.3603145480155945, "reward_std": 0.18140670657157898, "rewards/GDino": 0.7587892711162567, "rewards/GIT": 0.33553021401166916, "rewards/HPSv2": 0.2659950256347656, "step": 7 }, { "completion_length": 115.859375, "epoch": 0.01884570082449941, "grad_norm": 0.25306898188683175, "kl": 0.0013217974337749183, "learning_rate": 9.95e-07, "loss": 0.0013217974337749183, "reward": 1.4592117667198181, "reward_std": 0.19207094609737396, "rewards/GDino": 0.7478815317153931, "rewards/GIT": 0.43853163719177246, "rewards/HPSv2": 0.2727985382080078, "step": 8 }, { "completion_length": 119.4765625, "epoch": 0.02120141342756184, "grad_norm": 0.5738363012671475, "kl": 0.0013029443798586726, "learning_rate": 9.94375e-07, "loss": 0.0013029443798586726, "reward": 1.5387293100357056, "reward_std": 0.27692169696092606, "rewards/GDino": 0.7695487439632416, "rewards/GIT": 0.4928849935531616, "rewards/HPSv2": 0.27629566192626953, "step": 9 }, { "completion_length": 119.6171875, "epoch": 0.023557126030624265, "grad_norm": 0.24620381759508636, "kl": 0.0013163809780962765, "learning_rate": 9.9375e-07, "loss": 0.0013163809780962765, "reward": 1.3166776299476624, "reward_std": 0.22295378893613815, "rewards/GDino": 0.6794570684432983, "rewards/GIT": 0.3746444433927536, "rewards/HPSv2": 0.2625761032104492, "step": 10 }, { "completion_length": 121.9609375, "epoch": 0.02591283863368669, "grad_norm": 0.4041269724044248, "kl": 0.0013243000721558928, "learning_rate": 9.93125e-07, "loss": 0.0013243000721558928, "reward": 1.3450157642364502, "reward_std": 0.21892967820167542, "rewards/GDino": 0.691423624753952, "rewards/GIT": 0.39213569462299347, "rewards/HPSv2": 0.2614564895629883, "step": 11 }, { "completion_length": 116.1640625, "epoch": 0.028268551236749116, "grad_norm": 0.31540566876729087, "kl": 0.0013148761936463416, "learning_rate": 9.925e-07, "loss": 0.0013148761936463416, "reward": 1.4443119764328003, "reward_std": 0.21881438046693802, "rewards/GDino": 0.748158186674118, "rewards/GIT": 0.43869322538375854, "rewards/HPSv2": 0.2574605941772461, "step": 12 }, { "completion_length": 130.8203125, "epoch": 0.030624263839811542, "grad_norm": 0.25225827895138253, "kl": 0.001297907147090882, "learning_rate": 9.91875e-07, "loss": 0.001297907147090882, "reward": 1.2929266095161438, "reward_std": 0.21275019645690918, "rewards/GDino": 0.6668811142444611, "rewards/GIT": 0.35254594683647156, "rewards/HPSv2": 0.2734994888305664, "step": 13 }, { "completion_length": 111.515625, "epoch": 0.03297997644287397, "grad_norm": 0.40763802972094876, "kl": 0.001335871173068881, "learning_rate": 9.912499999999998e-07, "loss": 0.001335871173068881, "reward": 1.567497432231903, "reward_std": 0.2216319441795349, "rewards/GDino": 0.7606804072856903, "rewards/GIT": 0.5427373573184013, "rewards/HPSv2": 0.26407957077026367, "step": 14 }, { "completion_length": 114.7109375, "epoch": 0.0353356890459364, "grad_norm": 0.2104409670789466, "kl": 0.001323580159805715, "learning_rate": 9.90625e-07, "loss": 0.001323580159805715, "reward": 1.5503121614456177, "reward_std": 0.20036745071411133, "rewards/GDino": 0.7632786333560944, "rewards/GIT": 0.5124020278453827, "rewards/HPSv2": 0.2746315002441406, "step": 15 }, { "completion_length": 116.6015625, "epoch": 0.03769140164899882, "grad_norm": 0.28624401564632823, "kl": 0.0013183911214582622, "learning_rate": 9.9e-07, "loss": 0.0013183911214582622, "reward": 1.4082192182540894, "reward_std": 0.15387295559048653, "rewards/GDino": 0.7589408159255981, "rewards/GIT": 0.3643643334507942, "rewards/HPSv2": 0.2849140167236328, "step": 16 }, { "completion_length": 122.4609375, "epoch": 0.04004711425206125, "grad_norm": 0.26403504197169275, "kl": 0.0013484787777997553, "learning_rate": 9.89375e-07, "loss": 0.0013484787777997553, "reward": 1.489366054534912, "reward_std": 0.2865874469280243, "rewards/GDino": 0.7000451982021332, "rewards/GIT": 0.530218780040741, "rewards/HPSv2": 0.25910210609436035, "step": 17 }, { "completion_length": 118.0078125, "epoch": 0.04240282685512368, "grad_norm": 0.27343627398069786, "kl": 0.0013494989834725857, "learning_rate": 9.8875e-07, "loss": 0.0013494989834725857, "reward": 1.4237905740737915, "reward_std": 0.254701167345047, "rewards/GDino": 0.7735631465911865, "rewards/GIT": 0.3892448991537094, "rewards/HPSv2": 0.2609825134277344, "step": 18 }, { "completion_length": 114.109375, "epoch": 0.0447585394581861, "grad_norm": 0.6371665044969644, "kl": 0.0013281549327075481, "learning_rate": 9.88125e-07, "loss": 0.0013281549327075481, "reward": 1.4366835355758667, "reward_std": 0.169646255671978, "rewards/GDino": 0.7496436834335327, "rewards/GIT": 0.41933776438236237, "rewards/HPSv2": 0.2677021026611328, "step": 19 }, { "completion_length": 122.53125, "epoch": 0.04711425206124853, "grad_norm": 0.2715217795650905, "kl": 0.0013815115089528263, "learning_rate": 9.875e-07, "loss": 0.0013815115089528263, "reward": 1.3204509615898132, "reward_std": 0.2136593908071518, "rewards/GDino": 0.7027584314346313, "rewards/GIT": 0.34052804112434387, "rewards/HPSv2": 0.2771644592285156, "step": 20 }, { "completion_length": 123.3203125, "epoch": 0.04946996466431095, "grad_norm": 0.45819230858629295, "kl": 0.0013606999418698251, "learning_rate": 9.86875e-07, "loss": 0.0013606999418698251, "reward": 1.34914892911911, "reward_std": 0.2118716984987259, "rewards/GDino": 0.7245356142520905, "rewards/GIT": 0.36403897404670715, "rewards/HPSv2": 0.2605743408203125, "step": 21 }, { "completion_length": 123.6171875, "epoch": 0.05182567726737338, "grad_norm": 0.7564440815399309, "kl": 0.001386830525007099, "learning_rate": 9.862499999999999e-07, "loss": 0.001386830525007099, "reward": 1.815440058708191, "reward_std": 0.19418103992938995, "rewards/GDino": 0.8534060120582581, "rewards/GIT": 0.6870721578598022, "rewards/HPSv2": 0.2749619483947754, "step": 22 }, { "completion_length": 117.8671875, "epoch": 0.05418138987043581, "grad_norm": 0.24924254331231438, "kl": 0.0012856099638156593, "learning_rate": 9.85625e-07, "loss": 0.0012856099638156593, "reward": 1.632369339466095, "reward_std": 0.1968156024813652, "rewards/GDino": 0.8499236106872559, "rewards/GIT": 0.49933211505413055, "rewards/HPSv2": 0.2831134796142578, "step": 23 }, { "completion_length": 118.921875, "epoch": 0.05653710247349823, "grad_norm": 0.34379474523388015, "kl": 0.0013843628694303334, "learning_rate": 9.849999999999999e-07, "loss": 0.0013843628694303334, "reward": 1.2086148262023926, "reward_std": 0.21922558546066284, "rewards/GDino": 0.6184915602207184, "rewards/GIT": 0.3100777342915535, "rewards/HPSv2": 0.2800455093383789, "step": 24 }, { "completion_length": 123.8359375, "epoch": 0.05889281507656066, "grad_norm": 0.2545226313114055, "kl": 0.001381821115501225, "learning_rate": 9.84375e-07, "loss": 0.001381821115501225, "reward": 1.1286445260047913, "reward_std": 0.2250244915485382, "rewards/GDino": 0.6510695517063141, "rewards/GIT": 0.19917067885398865, "rewards/HPSv2": 0.27840423583984375, "step": 25 }, { "completion_length": 121.0703125, "epoch": 0.061248527679623084, "grad_norm": 0.24335308297854166, "kl": 0.0013603830011561513, "learning_rate": 9.8375e-07, "loss": 0.0013603830011561513, "reward": 1.2952918410301208, "reward_std": 0.1503567472100258, "rewards/GDino": 0.7084189355373383, "rewards/GIT": 0.30566197633743286, "rewards/HPSv2": 0.28121089935302734, "step": 26 }, { "completion_length": 127.015625, "epoch": 0.0636042402826855, "grad_norm": 0.2300970599054755, "kl": 0.001339898502919823, "learning_rate": 9.83125e-07, "loss": 0.001339898502919823, "reward": 1.3184754848480225, "reward_std": 0.24553914368152618, "rewards/GDino": 0.7058696448802948, "rewards/GIT": 0.35126666724681854, "rewards/HPSv2": 0.2613391876220703, "step": 27 }, { "completion_length": 115.78125, "epoch": 0.06595995288574794, "grad_norm": 0.267414827940517, "kl": 0.001418650965206325, "learning_rate": 9.825e-07, "loss": 0.001418650965206325, "reward": 1.4365617036819458, "reward_std": 0.17958877980709076, "rewards/GDino": 0.7320044338703156, "rewards/GIT": 0.4284685254096985, "rewards/HPSv2": 0.2760887145996094, "step": 28 }, { "completion_length": 116.0859375, "epoch": 0.06831566548881036, "grad_norm": 0.29431337732895774, "kl": 0.0014122650027275085, "learning_rate": 9.81875e-07, "loss": 0.0014122650027275085, "reward": 1.583243727684021, "reward_std": 0.18211837112903595, "rewards/GDino": 0.7955514192581177, "rewards/GIT": 0.5137636959552765, "rewards/HPSv2": 0.2739286422729492, "step": 29 }, { "completion_length": 121.6015625, "epoch": 0.0706713780918728, "grad_norm": 0.2509708027110314, "kl": 0.0013804864138364792, "learning_rate": 9.8125e-07, "loss": 0.0013804864138364792, "reward": 1.5445727705955505, "reward_std": 0.19888446480035782, "rewards/GDino": 0.8273197412490845, "rewards/GIT": 0.44876983761787415, "rewards/HPSv2": 0.26848316192626953, "step": 30 }, { "completion_length": 122.1640625, "epoch": 0.07302709069493522, "grad_norm": 0.2985105222222475, "kl": 0.0014304147916845977, "learning_rate": 9.806249999999998e-07, "loss": 0.0014304147916845977, "reward": 1.6495965719223022, "reward_std": 0.23981638997793198, "rewards/GDino": 0.8521493375301361, "rewards/GIT": 0.5391225665807724, "rewards/HPSv2": 0.25832462310791016, "step": 31 }, { "completion_length": 120.1953125, "epoch": 0.07538280329799764, "grad_norm": 0.2197846414954403, "kl": 0.00140391755849123, "learning_rate": 9.8e-07, "loss": 0.00140391755849123, "reward": 1.2200416922569275, "reward_std": 0.23218364268541336, "rewards/GDino": 0.6266158819198608, "rewards/GIT": 0.3264303505420685, "rewards/HPSv2": 0.2669954299926758, "step": 32 }, { "completion_length": 121.6328125, "epoch": 0.07773851590106007, "grad_norm": 0.31599917638540337, "kl": 0.0014098280807957053, "learning_rate": 9.79375e-07, "loss": 0.0014098280807957053, "reward": 1.5785539746284485, "reward_std": 0.24454942345619202, "rewards/GDino": 0.7875615358352661, "rewards/GIT": 0.535168319940567, "rewards/HPSv2": 0.25582408905029297, "step": 33 }, { "completion_length": 127.9609375, "epoch": 0.0800942285041225, "grad_norm": 0.2732853355099135, "kl": 0.001419250329490751, "learning_rate": 9.7875e-07, "loss": 0.001419250329490751, "reward": 1.4098470211029053, "reward_std": 0.20230243355035782, "rewards/GDino": 0.7162584066390991, "rewards/GIT": 0.4209607243537903, "rewards/HPSv2": 0.2726278305053711, "step": 34 }, { "completion_length": 114.1640625, "epoch": 0.08244994110718493, "grad_norm": 0.6002751378820108, "kl": 0.00151003833161667, "learning_rate": 9.78125e-07, "loss": 0.00151003833161667, "reward": 1.4272226095199585, "reward_std": 0.20679940283298492, "rewards/GDino": 0.7744862139225006, "rewards/GIT": 0.3769698739051819, "rewards/HPSv2": 0.27576637268066406, "step": 35 }, { "completion_length": 121.125, "epoch": 0.08480565371024736, "grad_norm": 0.27269128976759754, "kl": 0.0014037095825187862, "learning_rate": 9.775e-07, "loss": 0.0014037095825187862, "reward": 1.4006112813949585, "reward_std": 0.2074330672621727, "rewards/GDino": 0.7891882956027985, "rewards/GIT": 0.3337648808956146, "rewards/HPSv2": 0.27765798568725586, "step": 36 }, { "completion_length": 117.875, "epoch": 0.08716136631330977, "grad_norm": 0.43820194717187727, "kl": 0.0014256125432439148, "learning_rate": 9.76875e-07, "loss": 0.0014256125432439148, "reward": 1.4517158269882202, "reward_std": 0.2250361368060112, "rewards/GDino": 0.7203561961650848, "rewards/GIT": 0.4587488919496536, "rewards/HPSv2": 0.2726106643676758, "step": 37 }, { "completion_length": 118.03125, "epoch": 0.0895170789163722, "grad_norm": 0.252308925879943, "kl": 0.0014492222107946873, "learning_rate": 9.7625e-07, "loss": 0.0014492222107946873, "reward": 1.6928882598876953, "reward_std": 0.2526443302631378, "rewards/GDino": 0.8110005259513855, "rewards/GIT": 0.6060650646686554, "rewards/HPSv2": 0.27582263946533203, "step": 38 }, { "completion_length": 122.2890625, "epoch": 0.09187279151943463, "grad_norm": 0.26963461287523094, "kl": 0.0014610645011998713, "learning_rate": 9.756249999999999e-07, "loss": 0.0014610645011998713, "reward": 1.2712687849998474, "reward_std": 0.2403176873922348, "rewards/GDino": 0.6555468440055847, "rewards/GIT": 0.34917764365673065, "rewards/HPSv2": 0.2665443420410156, "step": 39 }, { "completion_length": 113.8203125, "epoch": 0.09422850412249706, "grad_norm": 0.32563932525515776, "kl": 0.0015068492502905428, "learning_rate": 9.75e-07, "loss": 0.0015068492502905428, "reward": 1.2842183113098145, "reward_std": 0.180851049721241, "rewards/GDino": 0.6738163828849792, "rewards/GIT": 0.321700856089592, "rewards/HPSv2": 0.28870105743408203, "step": 40 }, { "completion_length": 117.2265625, "epoch": 0.09658421672555949, "grad_norm": 0.25059280744895857, "kl": 0.0014773915172554553, "learning_rate": 9.743749999999999e-07, "loss": 0.0014773915172554553, "reward": 1.6124165058135986, "reward_std": 0.16663113981485367, "rewards/GDino": 0.8273109793663025, "rewards/GIT": 0.4992368221282959, "rewards/HPSv2": 0.28586864471435547, "step": 41 }, { "completion_length": 126.09375, "epoch": 0.0989399293286219, "grad_norm": 0.30607994211529754, "kl": 0.0015374869108200073, "learning_rate": 9.7375e-07, "loss": 0.0015374869108200073, "reward": 1.468973994255066, "reward_std": 0.21939436346292496, "rewards/GDino": 0.771526426076889, "rewards/GIT": 0.4223763793706894, "rewards/HPSv2": 0.2750711441040039, "step": 42 }, { "completion_length": 121.2265625, "epoch": 0.10129564193168433, "grad_norm": 0.4151717767893309, "kl": 0.0015280075022019446, "learning_rate": 9.73125e-07, "loss": 0.0015280075022019446, "reward": 1.4618364572525024, "reward_std": 0.2286859154701233, "rewards/GDino": 0.7599323391914368, "rewards/GIT": 0.43141815066337585, "rewards/HPSv2": 0.27048587799072266, "step": 43 }, { "completion_length": 121.96875, "epoch": 0.10365135453474676, "grad_norm": 0.23514193223331203, "kl": 0.0014392572920769453, "learning_rate": 9.725e-07, "loss": 0.0014392572920769453, "reward": 1.5480297803878784, "reward_std": 0.16164406388998032, "rewards/GDino": 0.8162851929664612, "rewards/GIT": 0.4658394604921341, "rewards/HPSv2": 0.26590490341186523, "step": 44 }, { "completion_length": 124.390625, "epoch": 0.10600706713780919, "grad_norm": 0.2495319173561865, "kl": 0.0018858073744922876, "learning_rate": 9.71875e-07, "loss": 0.0018858073744922876, "reward": 1.5036916732788086, "reward_std": 0.23125918209552765, "rewards/GDino": 0.789673238992691, "rewards/GIT": 0.4532347470521927, "rewards/HPSv2": 0.26078367233276367, "step": 45 }, { "completion_length": 111.9296875, "epoch": 0.10836277974087162, "grad_norm": 0.29045987573353166, "kl": 0.0014896091306582093, "learning_rate": 9.712499999999998e-07, "loss": 0.0014896091306582093, "reward": 1.6672833561897278, "reward_std": 0.26420916616916656, "rewards/GDino": 0.8612026572227478, "rewards/GIT": 0.5491684824228287, "rewards/HPSv2": 0.2569122314453125, "step": 46 }, { "completion_length": 114.7421875, "epoch": 0.11071849234393404, "grad_norm": 0.25845281182418184, "kl": 0.0014899871894158423, "learning_rate": 9.70625e-07, "loss": 0.0014899871894158423, "reward": 1.1329429149627686, "reward_std": 0.1909269466996193, "rewards/GDino": 0.6326232552528381, "rewards/GIT": 0.22672093659639359, "rewards/HPSv2": 0.27359867095947266, "step": 47 }, { "completion_length": 119.59375, "epoch": 0.11307420494699646, "grad_norm": 0.42971755503297954, "kl": 0.0015833491925150156, "learning_rate": 9.7e-07, "loss": 0.0015833491925150156, "reward": 1.3833127617835999, "reward_std": 0.22956909239292145, "rewards/GDino": 0.7405589818954468, "rewards/GIT": 0.36825767159461975, "rewards/HPSv2": 0.27449607849121094, "step": 48 }, { "completion_length": 119.1171875, "epoch": 0.1154299175500589, "grad_norm": 0.4118633910378266, "kl": 0.0016614573542028666, "learning_rate": 9.69375e-07, "loss": 0.0016614573542028666, "reward": 1.6082695722579956, "reward_std": 0.18719755113124847, "rewards/GDino": 0.7857354581356049, "rewards/GIT": 0.5675664246082306, "rewards/HPSv2": 0.25496768951416016, "step": 49 }, { "completion_length": 120.0703125, "epoch": 0.11778563015312132, "grad_norm": 0.2628252408473769, "kl": 0.00151388457743451, "learning_rate": 9.6875e-07, "loss": 0.00151388457743451, "reward": 1.3588453531265259, "reward_std": 0.17723247408866882, "rewards/GDino": 0.6499251425266266, "rewards/GIT": 0.4350516349077225, "rewards/HPSv2": 0.2738685607910156, "step": 50 }, { "completion_length": 116.21875, "epoch": 0.12014134275618374, "grad_norm": 0.260010067525717, "kl": 0.00149545818567276, "learning_rate": 9.68125e-07, "loss": 0.00149545818567276, "reward": 1.6116271615028381, "reward_std": 0.17986326664686203, "rewards/GDino": 0.8165108561515808, "rewards/GIT": 0.5178879201412201, "rewards/HPSv2": 0.27722835540771484, "step": 51 }, { "completion_length": 116.28125, "epoch": 0.12249705535924617, "grad_norm": 0.6829659425445808, "kl": 0.0015640227356925607, "learning_rate": 9.675e-07, "loss": 0.0015640227356925607, "reward": 1.5053871870040894, "reward_std": 0.16816724836826324, "rewards/GDino": 0.7835308015346527, "rewards/GIT": 0.43679744005203247, "rewards/HPSv2": 0.28505897521972656, "step": 52 }, { "completion_length": 119.09375, "epoch": 0.1248527679623086, "grad_norm": 0.2877957602047789, "kl": 0.0015550117823295295, "learning_rate": 9.66875e-07, "loss": 0.0015550117823295295, "reward": 1.2890885472297668, "reward_std": 0.19584518671035767, "rewards/GDino": 0.7237735092639923, "rewards/GIT": 0.2836405709385872, "rewards/HPSv2": 0.2816743850708008, "step": 53 }, { "completion_length": 119.90625, "epoch": 0.127208480565371, "grad_norm": 0.28174193791908697, "kl": 0.0016366480849683285, "learning_rate": 9.6625e-07, "loss": 0.0016366480849683285, "reward": 1.432750940322876, "reward_std": 0.16682738810777664, "rewards/GDino": 0.7480170130729675, "rewards/GIT": 0.41860339045524597, "rewards/HPSv2": 0.2661304473876953, "step": 54 }, { "completion_length": 116.671875, "epoch": 0.12956419316843346, "grad_norm": 0.2904687437413608, "kl": 0.0015858595725148916, "learning_rate": 9.65625e-07, "loss": 0.0015858595725148916, "reward": 1.6456317901611328, "reward_std": 0.15695975720882416, "rewards/GDino": 0.8240379393100739, "rewards/GIT": 0.540200799703598, "rewards/HPSv2": 0.28139305114746094, "step": 55 }, { "completion_length": 120.3671875, "epoch": 0.13191990577149587, "grad_norm": 0.25471573820354615, "kl": 0.0015669609420001507, "learning_rate": 9.649999999999999e-07, "loss": 0.0015669609420001507, "reward": 1.3531073331832886, "reward_std": 0.2376112937927246, "rewards/GDino": 0.7589595317840576, "rewards/GIT": 0.324622243642807, "rewards/HPSv2": 0.26952552795410156, "step": 56 }, { "completion_length": 118.171875, "epoch": 0.13427561837455831, "grad_norm": 0.3006123067883031, "kl": 0.0016387543291784823, "learning_rate": 9.64375e-07, "loss": 0.0016387543291784823, "reward": 1.531467854976654, "reward_std": 0.2262226566672325, "rewards/GDino": 0.7637097537517548, "rewards/GIT": 0.5068003982305527, "rewards/HPSv2": 0.2609577178955078, "step": 57 }, { "completion_length": 121.328125, "epoch": 0.13663133097762073, "grad_norm": 0.26335273636132583, "kl": 0.0015602456405758858, "learning_rate": 9.637499999999999e-07, "loss": 0.0015602456405758858, "reward": 1.6095128059387207, "reward_std": 0.16171453893184662, "rewards/GDino": 0.7619011402130127, "rewards/GIT": 0.5778972506523132, "rewards/HPSv2": 0.26971435546875, "step": 58 }, { "completion_length": 116.4375, "epoch": 0.13898704358068315, "grad_norm": 0.38447264441859974, "kl": 0.0015910342917777598, "learning_rate": 9.63125e-07, "loss": 0.0015910342917777598, "reward": 1.5380921959877014, "reward_std": 0.1911901906132698, "rewards/GDino": 0.8267523050308228, "rewards/GIT": 0.4426010102033615, "rewards/HPSv2": 0.2687387466430664, "step": 59 }, { "completion_length": 123.703125, "epoch": 0.1413427561837456, "grad_norm": 0.24495504151945557, "kl": 0.001754104858264327, "learning_rate": 9.624999999999999e-07, "loss": 0.001754104858264327, "reward": 1.2330472767353058, "reward_std": 0.23950910568237305, "rewards/GDino": 0.6126895099878311, "rewards/GIT": 0.35188791900873184, "rewards/HPSv2": 0.26846981048583984, "step": 60 }, { "completion_length": 124.671875, "epoch": 0.143698468786808, "grad_norm": 0.30929678025078833, "kl": 0.0017492063343524933, "learning_rate": 9.61875e-07, "loss": 0.0017492063343524933, "reward": 1.4227551817893982, "reward_std": 0.2234400287270546, "rewards/GDino": 0.7638609111309052, "rewards/GIT": 0.3836190849542618, "rewards/HPSv2": 0.27527523040771484, "step": 61 }, { "completion_length": 118.328125, "epoch": 0.14605418138987045, "grad_norm": 0.28727541829192144, "kl": 0.0018716503982432187, "learning_rate": 9.6125e-07, "loss": 0.0018716503982432187, "reward": 1.5549749732017517, "reward_std": 0.15252643078565598, "rewards/GDino": 0.7693066895008087, "rewards/GIT": 0.5026424676179886, "rewards/HPSv2": 0.28302574157714844, "step": 62 }, { "completion_length": 121.6484375, "epoch": 0.14840989399293286, "grad_norm": 0.29139914583149135, "kl": 0.0017212032689712942, "learning_rate": 9.606249999999998e-07, "loss": 0.0017212032689712942, "reward": 1.2072110176086426, "reward_std": 0.2429390847682953, "rewards/GDino": 0.6707694232463837, "rewards/GIT": 0.26683203130960464, "rewards/HPSv2": 0.2696094512939453, "step": 63 }, { "completion_length": 119.265625, "epoch": 0.15076560659599528, "grad_norm": 0.889851083213229, "kl": 0.0017158385016955435, "learning_rate": 9.6e-07, "loss": 0.0017158385016955435, "reward": 1.2913310527801514, "reward_std": 0.19539130479097366, "rewards/GDino": 0.6838600933551788, "rewards/GIT": 0.3264012262225151, "rewards/HPSv2": 0.2810697555541992, "step": 64 }, { "completion_length": 123.4140625, "epoch": 0.15312131919905772, "grad_norm": 0.3185778015827473, "kl": 0.001679237000644207, "learning_rate": 9.59375e-07, "loss": 0.001679237000644207, "reward": 1.3336173295974731, "reward_std": 0.2520475834608078, "rewards/GDino": 0.7061086595058441, "rewards/GIT": 0.36446040868759155, "rewards/HPSv2": 0.2630481719970703, "step": 65 }, { "completion_length": 122.4921875, "epoch": 0.15547703180212014, "grad_norm": 0.29082578656740155, "kl": 0.0017390166176483035, "learning_rate": 9.5875e-07, "loss": 0.0017390166176483035, "reward": 1.323692798614502, "reward_std": 0.1798083856701851, "rewards/GDino": 0.6522715389728546, "rewards/GIT": 0.40690912306308746, "rewards/HPSv2": 0.2645120620727539, "step": 66 }, { "completion_length": 115.3359375, "epoch": 0.15783274440518258, "grad_norm": 0.4122223409285881, "kl": 0.0020997170940972865, "learning_rate": 9.58125e-07, "loss": 0.0020997170940972865, "reward": 1.307538390159607, "reward_std": 0.2175757959485054, "rewards/GDino": 0.7022336423397064, "rewards/GIT": 0.32459449768066406, "rewards/HPSv2": 0.28071022033691406, "step": 67 }, { "completion_length": 117.171875, "epoch": 0.160188457008245, "grad_norm": 0.5010811561545013, "kl": 0.0017123650759458542, "learning_rate": 9.575e-07, "loss": 0.0017123650759458542, "reward": 1.3808576464653015, "reward_std": 0.16271132975816727, "rewards/GDino": 0.7192801833152771, "rewards/GIT": 0.371448814868927, "rewards/HPSv2": 0.2901287078857422, "step": 68 }, { "completion_length": 118.5546875, "epoch": 0.1625441696113074, "grad_norm": 0.29439026066285884, "kl": 0.00178577279439196, "learning_rate": 9.56875e-07, "loss": 0.00178577279439196, "reward": 1.6233538389205933, "reward_std": 0.1496392861008644, "rewards/GDino": 0.7836182713508606, "rewards/GIT": 0.5500492751598358, "rewards/HPSv2": 0.2896862030029297, "step": 69 }, { "completion_length": 115.59375, "epoch": 0.16489988221436985, "grad_norm": 0.29472182111639117, "kl": 0.0017971571069210768, "learning_rate": 9.5625e-07, "loss": 0.0017971571069210768, "reward": 1.2831493020057678, "reward_std": 0.17382801696658134, "rewards/GDino": 0.6800273954868317, "rewards/GIT": 0.32777221500873566, "rewards/HPSv2": 0.27534961700439453, "step": 70 }, { "completion_length": 117.78125, "epoch": 0.16725559481743227, "grad_norm": 0.612071971328077, "kl": 0.00184587761759758, "learning_rate": 9.556249999999999e-07, "loss": 0.00184587761759758, "reward": 1.4166246056556702, "reward_std": 0.19866756349802017, "rewards/GDino": 0.7656495869159698, "rewards/GIT": 0.3730514198541641, "rewards/HPSv2": 0.277923583984375, "step": 71 }, { "completion_length": 117.96875, "epoch": 0.1696113074204947, "grad_norm": 0.23938641444447467, "kl": 0.0016899046604521573, "learning_rate": 9.55e-07, "loss": 0.0016899046604521573, "reward": 1.258272111415863, "reward_std": 0.18912728130817413, "rewards/GDino": 0.6118131875991821, "rewards/GIT": 0.3737824559211731, "rewards/HPSv2": 0.2726764678955078, "step": 72 }, { "completion_length": 119.4453125, "epoch": 0.17196702002355713, "grad_norm": 0.2776986640605152, "kl": 0.0018090966623276472, "learning_rate": 9.543749999999999e-07, "loss": 0.0018090966623276472, "reward": 1.4644430875778198, "reward_std": 0.19388439506292343, "rewards/GDino": 0.7252411246299744, "rewards/GIT": 0.47095245122909546, "rewards/HPSv2": 0.26824951171875, "step": 73 }, { "completion_length": 119.03125, "epoch": 0.17432273262661954, "grad_norm": 0.23539747876363853, "kl": 0.00195615051779896, "learning_rate": 9.5375e-07, "loss": 0.00195615051779896, "reward": 1.4274641871452332, "reward_std": 0.20058608055114746, "rewards/GDino": 0.7063159942626953, "rewards/GIT": 0.4419161081314087, "rewards/HPSv2": 0.2792320251464844, "step": 74 }, { "completion_length": 107.3828125, "epoch": 0.17667844522968199, "grad_norm": 0.5849407719652319, "kl": 0.002735464833676815, "learning_rate": 9.53125e-07, "loss": 0.002735464833676815, "reward": 1.502268373966217, "reward_std": 0.16728447377681732, "rewards/GDino": 0.7582340240478516, "rewards/GIT": 0.46406325697898865, "rewards/HPSv2": 0.2799711227416992, "step": 75 }, { "completion_length": 114.0546875, "epoch": 0.1790341578327444, "grad_norm": 0.2816696366059667, "kl": 0.001873756933491677, "learning_rate": 9.525e-07, "loss": 0.001873756933491677, "reward": 1.4625573754310608, "reward_std": 0.2449864000082016, "rewards/GDino": 0.7482869327068329, "rewards/GIT": 0.4507931023836136, "rewards/HPSv2": 0.2634773254394531, "step": 76 }, { "completion_length": 118.7265625, "epoch": 0.18138987043580684, "grad_norm": 0.2862396151489467, "kl": 0.0021867634495720267, "learning_rate": 9.51875e-07, "loss": 0.0021867634495720267, "reward": 1.3586730360984802, "reward_std": 0.24163472652435303, "rewards/GDino": 0.6844815313816071, "rewards/GIT": 0.40619656443595886, "rewards/HPSv2": 0.26799488067626953, "step": 77 }, { "completion_length": 118.7265625, "epoch": 0.18374558303886926, "grad_norm": 0.2402978893559342, "kl": 0.0018702655797824264, "learning_rate": 9.5125e-07, "loss": 0.0018702655797824264, "reward": 1.3333824276924133, "reward_std": 0.21071477234363556, "rewards/GDino": 0.6598013043403625, "rewards/GIT": 0.39959612488746643, "rewards/HPSv2": 0.2739849090576172, "step": 78 }, { "completion_length": 112.109375, "epoch": 0.18610129564193167, "grad_norm": 0.3140924818404655, "kl": 0.001994729565922171, "learning_rate": 9.50625e-07, "loss": 0.001994729565922171, "reward": 1.3026515245437622, "reward_std": 0.16617826372385025, "rewards/GDino": 0.7232322990894318, "rewards/GIT": 0.30121809244155884, "rewards/HPSv2": 0.2782011032104492, "step": 79 }, { "completion_length": 122.1328125, "epoch": 0.18845700824499412, "grad_norm": 0.23698162578456214, "kl": 0.0019655448850244284, "learning_rate": 9.499999999999999e-07, "loss": 0.0019655448850244284, "reward": 1.2496649026870728, "reward_std": 0.1760888248682022, "rewards/GDino": 0.6573259830474854, "rewards/GIT": 0.3232959359884262, "rewards/HPSv2": 0.26904296875, "step": 80 }, { "completion_length": 117.6171875, "epoch": 0.19081272084805653, "grad_norm": 0.40212838247482807, "kl": 0.0019596697529777884, "learning_rate": 9.493749999999999e-07, "loss": 0.0019596697529777884, "reward": 1.4696364402770996, "reward_std": 0.18646766990423203, "rewards/GDino": 0.8071052730083466, "rewards/GIT": 0.38611139357089996, "rewards/HPSv2": 0.27641963958740234, "step": 81 }, { "completion_length": 122.6875, "epoch": 0.19316843345111898, "grad_norm": 0.25892108054624163, "kl": 0.0018325882847420871, "learning_rate": 9.487499999999999e-07, "loss": 0.0018325882847420871, "reward": 1.363904058933258, "reward_std": 0.20706705003976822, "rewards/GDino": 0.7383570671081543, "rewards/GIT": 0.34423255920410156, "rewards/HPSv2": 0.2813143730163574, "step": 82 }, { "completion_length": 112.953125, "epoch": 0.1955241460541814, "grad_norm": 0.2747640199429539, "kl": 0.002058290643617511, "learning_rate": 9.481249999999999e-07, "loss": 0.002058290643617511, "reward": 1.4634281992912292, "reward_std": 0.17625326663255692, "rewards/GDino": 0.7229166328907013, "rewards/GIT": 0.47531093657016754, "rewards/HPSv2": 0.2652006149291992, "step": 83 }, { "completion_length": 122.265625, "epoch": 0.1978798586572438, "grad_norm": 0.2665451407937047, "kl": 0.002013478660956025, "learning_rate": 9.474999999999999e-07, "loss": 0.002013478660956025, "reward": 1.4016539454460144, "reward_std": 0.1818184033036232, "rewards/GDino": 0.7316404581069946, "rewards/GIT": 0.3995123952627182, "rewards/HPSv2": 0.27050113677978516, "step": 84 }, { "completion_length": 116.0078125, "epoch": 0.20023557126030625, "grad_norm": 0.3603801748464669, "kl": 0.002120223711244762, "learning_rate": 9.468749999999999e-07, "loss": 0.002120223711244762, "reward": 1.4757026433944702, "reward_std": 0.20095176994800568, "rewards/GDino": 0.7639147043228149, "rewards/GIT": 0.44087424874305725, "rewards/HPSv2": 0.27091360092163086, "step": 85 }, { "completion_length": 112.3984375, "epoch": 0.20259128386336867, "grad_norm": 0.3239176329408181, "kl": 0.002278960309922695, "learning_rate": 9.462499999999999e-07, "loss": 0.002278960309922695, "reward": 1.40516996383667, "reward_std": 0.16226524859666824, "rewards/GDino": 0.7576597630977631, "rewards/GIT": 0.35792025923728943, "rewards/HPSv2": 0.28958988189697266, "step": 86 }, { "completion_length": 120.546875, "epoch": 0.2049469964664311, "grad_norm": 0.23916548852772673, "kl": 0.0019681883277371526, "learning_rate": 9.45625e-07, "loss": 0.0019681883277371526, "reward": 1.5272275805473328, "reward_std": 0.25036415457725525, "rewards/GDino": 0.7844875454902649, "rewards/GIT": 0.4783099591732025, "rewards/HPSv2": 0.26443004608154297, "step": 87 }, { "completion_length": 128.3203125, "epoch": 0.20730270906949352, "grad_norm": 0.23620889198690914, "kl": 0.002017518738284707, "learning_rate": 9.45e-07, "loss": 0.002017518738284707, "reward": 1.2135156989097595, "reward_std": 0.19964167475700378, "rewards/GDino": 0.6259966492652893, "rewards/GIT": 0.31083229929208755, "rewards/HPSv2": 0.2766866683959961, "step": 88 }, { "completion_length": 118.75, "epoch": 0.20965842167255594, "grad_norm": 0.23890658230611023, "kl": 0.002090562367811799, "learning_rate": 9.44375e-07, "loss": 0.002090562367811799, "reward": 1.6871580481529236, "reward_std": 0.23278890550136566, "rewards/GDino": 0.8393713235855103, "rewards/GIT": 0.582229495048523, "rewards/HPSv2": 0.26555728912353516, "step": 89 }, { "completion_length": 116.7734375, "epoch": 0.21201413427561838, "grad_norm": 0.35173208257063876, "kl": 0.002108373213559389, "learning_rate": 9.4375e-07, "loss": 0.002108373213559389, "reward": 1.3342792987823486, "reward_std": 0.1977575197815895, "rewards/GDino": 0.7038924396038055, "rewards/GIT": 0.35854390263557434, "rewards/HPSv2": 0.27184295654296875, "step": 90 }, { "completion_length": 114.4296875, "epoch": 0.2143698468786808, "grad_norm": 0.2821470628331685, "kl": 0.0030870914924889803, "learning_rate": 9.43125e-07, "loss": 0.0030870914924889803, "reward": 1.5565957427024841, "reward_std": 0.15032906085252762, "rewards/GDino": 0.779148280620575, "rewards/GIT": 0.5005071759223938, "rewards/HPSv2": 0.27694034576416016, "step": 91 }, { "completion_length": 118.703125, "epoch": 0.21672555948174324, "grad_norm": 0.2625526141820863, "kl": 0.0022944058291614056, "learning_rate": 9.425e-07, "loss": 0.0022944058291614056, "reward": 1.5590879321098328, "reward_std": 0.16213352233171463, "rewards/GDino": 0.7920580506324768, "rewards/GIT": 0.48158271610736847, "rewards/HPSv2": 0.2854471206665039, "step": 92 }, { "completion_length": 119.8984375, "epoch": 0.21908127208480566, "grad_norm": 0.2677880647610033, "kl": 0.0021984108025208116, "learning_rate": 9.41875e-07, "loss": 0.0021984108025208116, "reward": 1.280658483505249, "reward_std": 0.16342563927173615, "rewards/GDino": 0.6769416928291321, "rewards/GIT": 0.3232220709323883, "rewards/HPSv2": 0.28049468994140625, "step": 93 }, { "completion_length": 112.5859375, "epoch": 0.22143698468786807, "grad_norm": 0.343231365388175, "kl": 0.0026061448734253645, "learning_rate": 9.4125e-07, "loss": 0.0026061448734253645, "reward": 1.485694169998169, "reward_std": 0.23218651115894318, "rewards/GDino": 0.7781257927417755, "rewards/GIT": 0.4318277984857559, "rewards/HPSv2": 0.2757406234741211, "step": 94 }, { "completion_length": 121.171875, "epoch": 0.22379269729093051, "grad_norm": 0.26744331939202987, "kl": 0.002241209731437266, "learning_rate": 9.40625e-07, "loss": 0.002241209731437266, "reward": 1.6466319561004639, "reward_std": 0.17462409287691116, "rewards/GDino": 0.8608879446983337, "rewards/GIT": 0.49525195360183716, "rewards/HPSv2": 0.29049205780029297, "step": 95 }, { "completion_length": 118.4609375, "epoch": 0.22614840989399293, "grad_norm": 0.30999752936672714, "kl": 0.00223827559966594, "learning_rate": 9.399999999999999e-07, "loss": 0.00223827559966594, "reward": 1.5653805136680603, "reward_std": 0.18466531485319138, "rewards/GDino": 0.7854405045509338, "rewards/GIT": 0.4837230443954468, "rewards/HPSv2": 0.2962169647216797, "step": 96 }, { "completion_length": 116.984375, "epoch": 0.22850412249705537, "grad_norm": 0.2739526972403438, "kl": 0.002347193891182542, "learning_rate": 9.393749999999999e-07, "loss": 0.002347193891182542, "reward": 1.2239395380020142, "reward_std": 0.19294629991054535, "rewards/GDino": 0.7316015958786011, "rewards/GIT": 0.20546599477529526, "rewards/HPSv2": 0.28687191009521484, "step": 97 }, { "completion_length": 128.265625, "epoch": 0.2308598351001178, "grad_norm": 0.25601698774569337, "kl": 0.002220396650955081, "learning_rate": 9.387499999999999e-07, "loss": 0.002220396650955081, "reward": 1.0058202147483826, "reward_std": 0.28802110254764557, "rewards/GDino": 0.5590215623378754, "rewards/GIT": 0.1765025332570076, "rewards/HPSv2": 0.2702960968017578, "step": 98 }, { "completion_length": 122.2109375, "epoch": 0.2332155477031802, "grad_norm": 0.3990072052655401, "kl": 0.0025561433285474777, "learning_rate": 9.381249999999999e-07, "loss": 0.0025561433285474777, "reward": 1.4678585529327393, "reward_std": 0.18522066622972488, "rewards/GDino": 0.7544408440589905, "rewards/GIT": 0.4262806624174118, "rewards/HPSv2": 0.2871370315551758, "step": 99 }, { "completion_length": 123.078125, "epoch": 0.23557126030624265, "grad_norm": 0.5682828448934358, "kl": 0.0032527694711461663, "learning_rate": 9.374999999999999e-07, "loss": 0.0032527694711461663, "reward": 1.5422398447990417, "reward_std": 0.09343259036540985, "rewards/GDino": 0.758593738079071, "rewards/GIT": 0.5059981644153595, "rewards/HPSv2": 0.2776479721069336, "step": 100 }, { "completion_length": 120.8515625, "epoch": 0.23792697290930506, "grad_norm": 0.38558338073405124, "kl": 0.0029193214140832424, "learning_rate": 9.368749999999999e-07, "loss": 0.0029193214140832424, "reward": 1.4652592539787292, "reward_std": 0.18903593719005585, "rewards/GDino": 0.7272815108299255, "rewards/GIT": 0.47037678956985474, "rewards/HPSv2": 0.26760101318359375, "step": 101 }, { "completion_length": 126.3828125, "epoch": 0.24028268551236748, "grad_norm": 0.4201134252439098, "kl": 0.002912102499976754, "learning_rate": 9.3625e-07, "loss": 0.002912102499976754, "reward": 1.3667205572128296, "reward_std": 0.17018768936395645, "rewards/GDino": 0.7228421270847321, "rewards/GIT": 0.37278786301612854, "rewards/HPSv2": 0.2710905075073242, "step": 102 }, { "completion_length": 115.6640625, "epoch": 0.24263839811542992, "grad_norm": 0.36387472412384136, "kl": 0.0028120805509388447, "learning_rate": 9.35625e-07, "loss": 0.0028120805509388447, "reward": 1.3938547372817993, "reward_std": 0.1604597344994545, "rewards/GDino": 0.7382325530052185, "rewards/GIT": 0.37633391842246056, "rewards/HPSv2": 0.27928829193115234, "step": 103 }, { "completion_length": 119.171875, "epoch": 0.24499411071849234, "grad_norm": 0.4559905694122823, "kl": 0.0033245147205889225, "learning_rate": 9.35e-07, "loss": 0.0033245147205889225, "reward": 1.3490734100341797, "reward_std": 0.15588217228651047, "rewards/GDino": 0.6809636950492859, "rewards/GIT": 0.3872620761394501, "rewards/HPSv2": 0.28084754943847656, "step": 104 }, { "completion_length": 118.9609375, "epoch": 0.24734982332155478, "grad_norm": 0.5793559809223535, "kl": 0.00485729263164103, "learning_rate": 9.34375e-07, "loss": 0.00485729263164103, "reward": 1.2819088101387024, "reward_std": 0.20281542837619781, "rewards/GDino": 0.7051200866699219, "rewards/GIT": 0.30481601506471634, "rewards/HPSv2": 0.27197265625, "step": 105 }, { "completion_length": 112.6875, "epoch": 0.2497055359246172, "grad_norm": 0.2426192310485848, "kl": 0.0025443728081882, "learning_rate": 9.3375e-07, "loss": 0.0025443728081882, "reward": 1.320365071296692, "reward_std": 0.19409708678722382, "rewards/GDino": 0.7148928642272949, "rewards/GIT": 0.3145042806863785, "rewards/HPSv2": 0.2909679412841797, "step": 106 }, { "completion_length": 111.7890625, "epoch": 0.25206124852767964, "grad_norm": 0.5987893967073395, "kl": 0.0033781271195039153, "learning_rate": 9.33125e-07, "loss": 0.0033781271195039153, "reward": 1.458251714706421, "reward_std": 0.15048499405384064, "rewards/GDino": 0.8051183223724365, "rewards/GIT": 0.3764305114746094, "rewards/HPSv2": 0.276702880859375, "step": 107 }, { "completion_length": 118.15625, "epoch": 0.254416961130742, "grad_norm": 0.2456828647467861, "kl": 0.0050746474880725145, "learning_rate": 9.325e-07, "loss": 0.0050746474880725145, "reward": 1.5426043272018433, "reward_std": 0.1781405210494995, "rewards/GDino": 0.7743792533874512, "rewards/GIT": 0.47680309414863586, "rewards/HPSv2": 0.29142189025878906, "step": 108 }, { "completion_length": 118.625, "epoch": 0.25677267373380447, "grad_norm": 0.36060432329446124, "kl": 0.003646688535809517, "learning_rate": 9.31875e-07, "loss": 0.003646688535809517, "reward": 1.2394214272499084, "reward_std": 0.19298750907182693, "rewards/GDino": 0.6611234843730927, "rewards/GIT": 0.2952626645565033, "rewards/HPSv2": 0.2830352783203125, "step": 109 }, { "completion_length": 120.625, "epoch": 0.2591283863368669, "grad_norm": 0.25853891727882533, "kl": 0.0025212473701685667, "learning_rate": 9.3125e-07, "loss": 0.0025212473701685667, "reward": 1.4342886805534363, "reward_std": 0.19594230502843857, "rewards/GDino": 0.7400390207767487, "rewards/GIT": 0.42212530970573425, "rewards/HPSv2": 0.2721242904663086, "step": 110 }, { "completion_length": 117.8671875, "epoch": 0.26148409893992935, "grad_norm": 0.2888975912582715, "kl": 0.0027900757268071175, "learning_rate": 9.30625e-07, "loss": 0.0027900757268071175, "reward": 1.3054375052452087, "reward_std": 0.16610045731067657, "rewards/GDino": 0.6589219868183136, "rewards/GIT": 0.3580652326345444, "rewards/HPSv2": 0.2884502410888672, "step": 111 }, { "completion_length": 116.1015625, "epoch": 0.26383981154299174, "grad_norm": 0.33538943702703067, "kl": 0.002599665313027799, "learning_rate": 9.3e-07, "loss": 0.002599665313027799, "reward": 1.5806245803833008, "reward_std": 0.17944349348545074, "rewards/GDino": 0.8358974158763885, "rewards/GIT": 0.46029046177864075, "rewards/HPSv2": 0.2844367027282715, "step": 112 }, { "completion_length": 124.1015625, "epoch": 0.2661955241460542, "grad_norm": 0.3387237953101647, "kl": 0.0040167884435504675, "learning_rate": 9.293749999999999e-07, "loss": 0.0040167884435504675, "reward": 1.7106415033340454, "reward_std": 0.20329046249389648, "rewards/GDino": 0.8292076289653778, "rewards/GIT": 0.6044276654720306, "rewards/HPSv2": 0.2770061492919922, "step": 113 }, { "completion_length": 120.2578125, "epoch": 0.26855123674911663, "grad_norm": 0.2356256198115586, "kl": 0.0026971568586304784, "learning_rate": 9.287499999999999e-07, "loss": 0.0026971568586304784, "reward": 1.273473858833313, "reward_std": 0.19800737500190735, "rewards/GDino": 0.7268248796463013, "rewards/GIT": 0.26298987865448, "rewards/HPSv2": 0.2836589813232422, "step": 114 }, { "completion_length": 118.8359375, "epoch": 0.270906949352179, "grad_norm": 0.2805640308110318, "kl": 0.0037218593060970306, "learning_rate": 9.281249999999999e-07, "loss": 0.0037218593060970306, "reward": 1.3687764406204224, "reward_std": 0.16527260094881058, "rewards/GDino": 0.6857761442661285, "rewards/GIT": 0.40027205646038055, "rewards/HPSv2": 0.2827281951904297, "step": 115 }, { "completion_length": 114.609375, "epoch": 0.27326266195524146, "grad_norm": 0.21826265519893165, "kl": 0.0030477073742076755, "learning_rate": 9.274999999999999e-07, "loss": 0.0030477073742076755, "reward": 1.5265187621116638, "reward_std": 0.1887928619980812, "rewards/GDino": 0.7819173038005829, "rewards/GIT": 0.4663059562444687, "rewards/HPSv2": 0.27829551696777344, "step": 116 }, { "completion_length": 118.0546875, "epoch": 0.2756183745583039, "grad_norm": 0.315507852560352, "kl": 0.0030055524548515677, "learning_rate": 9.268749999999999e-07, "loss": 0.0030055524548515677, "reward": 1.6297286748886108, "reward_std": 0.18344774842262268, "rewards/GDino": 0.7611067295074463, "rewards/GIT": 0.611541748046875, "rewards/HPSv2": 0.257080078125, "step": 117 }, { "completion_length": 123.5390625, "epoch": 0.2779740871613663, "grad_norm": 0.35156061236835173, "kl": 0.0033750240691006184, "learning_rate": 9.2625e-07, "loss": 0.0033750240691006184, "reward": 1.2054052352905273, "reward_std": 0.27922993898391724, "rewards/GDino": 0.676445722579956, "rewards/GIT": 0.25982873141765594, "rewards/HPSv2": 0.2691307067871094, "step": 118 }, { "completion_length": 119.75, "epoch": 0.28032979976442873, "grad_norm": 0.30579759341265556, "kl": 0.004724527359940112, "learning_rate": 9.25625e-07, "loss": 0.004724527359940112, "reward": 1.4698107242584229, "reward_std": 0.17547835409641266, "rewards/GDino": 0.784612238407135, "rewards/GIT": 0.39554841816425323, "rewards/HPSv2": 0.28964996337890625, "step": 119 }, { "completion_length": 120.015625, "epoch": 0.2826855123674912, "grad_norm": 0.3020405116642382, "kl": 0.007684687618166208, "learning_rate": 9.25e-07, "loss": 0.007684687618166208, "reward": 1.5769703388214111, "reward_std": 0.22644689679145813, "rewards/GDino": 0.776692658662796, "rewards/GIT": 0.5326155871152878, "rewards/HPSv2": 0.26766204833984375, "step": 120 }, { "completion_length": 116.53125, "epoch": 0.2850412249705536, "grad_norm": 0.38874532347223884, "kl": 0.0037365639582276344, "learning_rate": 9.243749999999999e-07, "loss": 0.0037365639582276344, "reward": 1.4327336549758911, "reward_std": 0.15665320679545403, "rewards/GDino": 0.7510005235671997, "rewards/GIT": 0.4021310694515705, "rewards/HPSv2": 0.27960205078125, "step": 121 }, { "completion_length": 116.8125, "epoch": 0.287396937573616, "grad_norm": 0.2869922093053935, "kl": 0.004592409357428551, "learning_rate": 9.237499999999999e-07, "loss": 0.004592409357428551, "reward": 1.6139296889305115, "reward_std": 0.1472279652953148, "rewards/GDino": 0.816271036863327, "rewards/GIT": 0.5062167048454285, "rewards/HPSv2": 0.2914419174194336, "step": 122 }, { "completion_length": 124.6484375, "epoch": 0.28975265017667845, "grad_norm": 0.28961687175345374, "kl": 0.0068622499238699675, "learning_rate": 9.23125e-07, "loss": 0.0068622499238699675, "reward": 1.201086938381195, "reward_std": 0.15015563368797302, "rewards/GDino": 0.6506968885660172, "rewards/GIT": 0.2650030702352524, "rewards/HPSv2": 0.2853870391845703, "step": 123 }, { "completion_length": 115.765625, "epoch": 0.2921083627797409, "grad_norm": 0.3028230824820869, "kl": 0.006972880102694035, "learning_rate": 9.225e-07, "loss": 0.006972880102694035, "reward": 1.568232536315918, "reward_std": 0.14724697172641754, "rewards/GDino": 0.8241820931434631, "rewards/GIT": 0.47233714163303375, "rewards/HPSv2": 0.2717132568359375, "step": 124 }, { "completion_length": 115.921875, "epoch": 0.2944640753828033, "grad_norm": 0.2513234137650961, "kl": 0.006366019602864981, "learning_rate": 9.21875e-07, "loss": 0.006366019602864981, "reward": 1.4276733994483948, "reward_std": 0.1901681050658226, "rewards/GDino": 0.7936453819274902, "rewards/GIT": 0.3538661599159241, "rewards/HPSv2": 0.28016185760498047, "step": 125 }, { "completion_length": 123.8046875, "epoch": 0.2968197879858657, "grad_norm": 0.4417733190272292, "kl": 0.0048048876924440265, "learning_rate": 9.2125e-07, "loss": 0.0048048876924440265, "reward": 1.302800476551056, "reward_std": 0.2053716480731964, "rewards/GDino": 0.6933121085166931, "rewards/GIT": 0.31638413667678833, "rewards/HPSv2": 0.2931041717529297, "step": 126 }, { "completion_length": 113.4296875, "epoch": 0.29917550058892817, "grad_norm": 0.2820829132685245, "kl": 0.003649702644906938, "learning_rate": 9.20625e-07, "loss": 0.003649702644906938, "reward": 1.7063919305801392, "reward_std": 0.12833374738693237, "rewards/GDino": 0.8571658730506897, "rewards/GIT": 0.5702161490917206, "rewards/HPSv2": 0.2790098190307617, "step": 127 }, { "completion_length": 123.28125, "epoch": 0.30153121319199055, "grad_norm": 0.5046212034946389, "kl": 0.005035737296566367, "learning_rate": 9.2e-07, "loss": 0.005035737296566367, "reward": 1.4200000166893005, "reward_std": 0.1603655368089676, "rewards/GDino": 0.7420765161514282, "rewards/GIT": 0.39525342732667923, "rewards/HPSv2": 0.2826700210571289, "step": 128 }, { "completion_length": 115.2578125, "epoch": 0.303886925795053, "grad_norm": 0.36000462486652446, "kl": 0.004047968657687306, "learning_rate": 9.19375e-07, "loss": 0.004047968657687306, "reward": 1.3250027894973755, "reward_std": 0.17621323466300964, "rewards/GDino": 0.7162688970565796, "rewards/GIT": 0.3301860988140106, "rewards/HPSv2": 0.2785477638244629, "step": 129 }, { "completion_length": 120.6640625, "epoch": 0.30624263839811544, "grad_norm": 1.6768966552943407, "kl": 0.005234519834630191, "learning_rate": 9.187499999999999e-07, "loss": 0.005234519834630191, "reward": 1.5390700101852417, "reward_std": 0.1595797836780548, "rewards/GDino": 0.7382936477661133, "rewards/GIT": 0.5352771878242493, "rewards/HPSv2": 0.2654991149902344, "step": 130 }, { "completion_length": 114.125, "epoch": 0.30859835100117783, "grad_norm": 0.25452782691685166, "kl": 0.003763939021155238, "learning_rate": 9.181249999999999e-07, "loss": 0.003763939021155238, "reward": 1.5603395700454712, "reward_std": 0.12152928113937378, "rewards/GDino": 0.7622032463550568, "rewards/GIT": 0.5218005776405334, "rewards/HPSv2": 0.2763357162475586, "step": 131 }, { "completion_length": 123.1875, "epoch": 0.31095406360424027, "grad_norm": 0.27226945702170774, "kl": 0.009177556727081537, "learning_rate": 9.174999999999999e-07, "loss": 0.009177556727081537, "reward": 1.3923259973526, "reward_std": 0.18214039504528046, "rewards/GDino": 0.774984747171402, "rewards/GIT": 0.3335907533764839, "rewards/HPSv2": 0.2837505340576172, "step": 132 }, { "completion_length": 119.78125, "epoch": 0.3133097762073027, "grad_norm": 0.2597285762772555, "kl": 0.008418319281190634, "learning_rate": 9.168749999999999e-07, "loss": 0.008418319281190634, "reward": 1.6184507012367249, "reward_std": 0.11622695252299309, "rewards/GDino": 0.8246855437755585, "rewards/GIT": 0.5030422806739807, "rewards/HPSv2": 0.2907228469848633, "step": 133 }, { "completion_length": 118.8359375, "epoch": 0.31566548881036516, "grad_norm": 0.27730872139298124, "kl": 0.004451054846867919, "learning_rate": 9.1625e-07, "loss": 0.004451054846867919, "reward": 1.266618549823761, "reward_std": 0.2054881602525711, "rewards/GDino": 0.7027294039726257, "rewards/GIT": 0.2916542813181877, "rewards/HPSv2": 0.2722349166870117, "step": 134 }, { "completion_length": 115.859375, "epoch": 0.31802120141342755, "grad_norm": 0.6882216081653162, "kl": 0.010817927308380604, "learning_rate": 9.15625e-07, "loss": 0.010817927308380604, "reward": 1.3148509860038757, "reward_std": 0.17112566530704498, "rewards/GDino": 0.6790395081043243, "rewards/GIT": 0.3558289110660553, "rewards/HPSv2": 0.2799825668334961, "step": 135 }, { "completion_length": 119.9375, "epoch": 0.32037691401649, "grad_norm": 0.34139292409819094, "kl": 0.004219690570607781, "learning_rate": 9.15e-07, "loss": 0.004219690570607781, "reward": 1.3591762781143188, "reward_std": 0.19799543917179108, "rewards/GDino": 0.7115766108036041, "rewards/GIT": 0.37749338150024414, "rewards/HPSv2": 0.27010631561279297, "step": 136 }, { "completion_length": 124.34375, "epoch": 0.32273262661955243, "grad_norm": 0.2542744748814295, "kl": 0.004624171881005168, "learning_rate": 9.14375e-07, "loss": 0.004624171881005168, "reward": 1.3979861736297607, "reward_std": 0.22949668765068054, "rewards/GDino": 0.7827940881252289, "rewards/GIT": 0.334325447678566, "rewards/HPSv2": 0.2808666229248047, "step": 137 }, { "completion_length": 120.0, "epoch": 0.3250883392226148, "grad_norm": 0.4358833366139649, "kl": 0.0043571447022259235, "learning_rate": 9.137499999999999e-07, "loss": 0.0043571447022259235, "reward": 1.2475192546844482, "reward_std": 0.20221056044101715, "rewards/GDino": 0.6932171583175659, "rewards/GIT": 0.26740727573633194, "rewards/HPSv2": 0.2868947982788086, "step": 138 }, { "completion_length": 122.4453125, "epoch": 0.32744405182567726, "grad_norm": 0.30961336937359013, "kl": 0.0037244665436446667, "learning_rate": 9.131249999999999e-07, "loss": 0.0037244665436446667, "reward": 1.1996442675590515, "reward_std": 0.18872076272964478, "rewards/GDino": 0.725640207529068, "rewards/GIT": 0.19274938479065895, "rewards/HPSv2": 0.28125476837158203, "step": 139 }, { "completion_length": 119.28125, "epoch": 0.3297997644287397, "grad_norm": 0.26677339295627156, "kl": 0.00421550776809454, "learning_rate": 9.124999999999999e-07, "loss": 0.00421550776809454, "reward": 1.327845573425293, "reward_std": 0.18080878257751465, "rewards/GDino": 0.7253636419773102, "rewards/GIT": 0.3320428282022476, "rewards/HPSv2": 0.27043914794921875, "step": 140 }, { "completion_length": 121.4609375, "epoch": 0.3321554770318021, "grad_norm": 0.3713564249111101, "kl": 0.004628100199624896, "learning_rate": 9.11875e-07, "loss": 0.004628100199624896, "reward": 1.4560731649398804, "reward_std": 0.18876664340496063, "rewards/GDino": 0.7719440460205078, "rewards/GIT": 0.410146102309227, "rewards/HPSv2": 0.2739830017089844, "step": 141 }, { "completion_length": 122.578125, "epoch": 0.33451118963486454, "grad_norm": 0.33163750308442685, "kl": 0.006284679169766605, "learning_rate": 9.1125e-07, "loss": 0.006284679169766605, "reward": 1.1361467242240906, "reward_std": 0.1411423683166504, "rewards/GDino": 0.65177321434021, "rewards/GIT": 0.20452436804771423, "rewards/HPSv2": 0.2798490524291992, "step": 142 }, { "completion_length": 112.4921875, "epoch": 0.336866902237927, "grad_norm": 0.25445203025552027, "kl": 0.0067289783619344234, "learning_rate": 9.10625e-07, "loss": 0.0067289783619344234, "reward": 1.4620287418365479, "reward_std": 0.16819049045443535, "rewards/GDino": 0.7772760093212128, "rewards/GIT": 0.4057963192462921, "rewards/HPSv2": 0.27895641326904297, "step": 143 }, { "completion_length": 117.9765625, "epoch": 0.3392226148409894, "grad_norm": 0.2718495200495857, "kl": 0.0038740215823054314, "learning_rate": 9.1e-07, "loss": 0.0038740215823054314, "reward": 1.5342840552330017, "reward_std": 0.1841500923037529, "rewards/GDino": 0.7875899970531464, "rewards/GIT": 0.47154468297958374, "rewards/HPSv2": 0.2751493453979492, "step": 144 }, { "completion_length": 115.4765625, "epoch": 0.3415783274440518, "grad_norm": 0.2585894345593492, "kl": 0.0042675541481003165, "learning_rate": 9.09375e-07, "loss": 0.0042675541481003165, "reward": 1.529801845550537, "reward_std": 0.1101372241973877, "rewards/GDino": 0.8092205822467804, "rewards/GIT": 0.4435676261782646, "rewards/HPSv2": 0.27701377868652344, "step": 145 }, { "completion_length": 121.5234375, "epoch": 0.34393404004711425, "grad_norm": 0.24874391718590672, "kl": 0.007108606398105621, "learning_rate": 9.087499999999999e-07, "loss": 0.007108606398105621, "reward": 1.7090327739715576, "reward_std": 0.18251892179250717, "rewards/GDino": 0.8413677215576172, "rewards/GIT": 0.5779501795768738, "rewards/HPSv2": 0.2897148132324219, "step": 146 }, { "completion_length": 114.6796875, "epoch": 0.3462897526501767, "grad_norm": 0.28797944955605576, "kl": 0.004397568060085177, "learning_rate": 9.081249999999999e-07, "loss": 0.004397568060085177, "reward": 1.551221489906311, "reward_std": 0.15495620667934418, "rewards/GDino": 0.7745460569858551, "rewards/GIT": 0.4947930723428726, "rewards/HPSv2": 0.28188228607177734, "step": 147 }, { "completion_length": 120.65625, "epoch": 0.3486454652532391, "grad_norm": 0.43915135211431994, "kl": 0.009355415590107441, "learning_rate": 9.074999999999999e-07, "loss": 0.009355415590107441, "reward": 1.6348460912704468, "reward_std": 0.17205246537923813, "rewards/GDino": 0.7657027542591095, "rewards/GIT": 0.5795084834098816, "rewards/HPSv2": 0.28963470458984375, "step": 148 }, { "completion_length": 113.625, "epoch": 0.3510011778563015, "grad_norm": 0.3596728521822304, "kl": 0.00461172079667449, "learning_rate": 9.068749999999999e-07, "loss": 0.00461172079667449, "reward": 1.571276307106018, "reward_std": 0.23586251586675644, "rewards/GDino": 0.7799384295940399, "rewards/GIT": 0.5188950598239899, "rewards/HPSv2": 0.2724428176879883, "step": 149 }, { "completion_length": 120.6171875, "epoch": 0.35335689045936397, "grad_norm": 1.3993474516441373, "kl": 0.007138528861105442, "learning_rate": 9.0625e-07, "loss": 0.007138528861105442, "reward": 1.4649569988250732, "reward_std": 0.19733739644289017, "rewards/GDino": 0.7414374947547913, "rewards/GIT": 0.4486619532108307, "rewards/HPSv2": 0.2748575210571289, "step": 150 }, { "completion_length": 115.0078125, "epoch": 0.35571260306242636, "grad_norm": 0.2434388614909844, "kl": 0.0038894894532859325, "learning_rate": 9.05625e-07, "loss": 0.0038894894532859325, "reward": 1.353737235069275, "reward_std": 0.1862211525440216, "rewards/GDino": 0.7606337666511536, "rewards/GIT": 0.3160610795021057, "rewards/HPSv2": 0.2770423889160156, "step": 151 }, { "completion_length": 118.0078125, "epoch": 0.3580683156654888, "grad_norm": 0.3437062463763072, "kl": 0.010416059521958232, "learning_rate": 9.05e-07, "loss": 0.010416059521958232, "reward": 1.6395174264907837, "reward_std": 0.17358093708753586, "rewards/GDino": 0.831091046333313, "rewards/GIT": 0.5193428099155426, "rewards/HPSv2": 0.28908348083496094, "step": 152 }, { "completion_length": 122.109375, "epoch": 0.36042402826855124, "grad_norm": 0.27632022418717916, "kl": 0.005517980316653848, "learning_rate": 9.04375e-07, "loss": 0.005517980316653848, "reward": 1.4383190274238586, "reward_std": 0.1696222573518753, "rewards/GDino": 0.7326882481575012, "rewards/GIT": 0.4120803028345108, "rewards/HPSv2": 0.2935504913330078, "step": 153 }, { "completion_length": 118.765625, "epoch": 0.3627797408716137, "grad_norm": 0.25333472242728916, "kl": 0.004616864025592804, "learning_rate": 9.0375e-07, "loss": 0.004616864025592804, "reward": 1.279759705066681, "reward_std": 0.18590209633111954, "rewards/GDino": 0.7085305750370026, "rewards/GIT": 0.2845574617385864, "rewards/HPSv2": 0.28667163848876953, "step": 154 }, { "completion_length": 120.0078125, "epoch": 0.3651354534746761, "grad_norm": 0.34723162087171755, "kl": 0.013976640067994595, "learning_rate": 9.031249999999999e-07, "loss": 0.013976640067994595, "reward": 1.2974292635917664, "reward_std": 0.2496296390891075, "rewards/GDino": 0.7280158400535583, "rewards/GIT": 0.29700203984975815, "rewards/HPSv2": 0.2724113464355469, "step": 155 }, { "completion_length": 115.484375, "epoch": 0.3674911660777385, "grad_norm": 0.272197882982864, "kl": 0.0070599212776869535, "learning_rate": 9.024999999999999e-07, "loss": 0.0070599212776869535, "reward": 1.4371802806854248, "reward_std": 0.18361084908246994, "rewards/GDino": 0.7640349566936493, "rewards/GIT": 0.39835166931152344, "rewards/HPSv2": 0.2747936248779297, "step": 156 }, { "completion_length": 120.765625, "epoch": 0.36984687868080096, "grad_norm": 0.38359267490215604, "kl": 0.006496853660792112, "learning_rate": 9.018749999999999e-07, "loss": 0.006496853660792112, "reward": 1.5134045481681824, "reward_std": 0.19275549799203873, "rewards/GDino": 0.7855638265609741, "rewards/GIT": 0.46035419404506683, "rewards/HPSv2": 0.267486572265625, "step": 157 }, { "completion_length": 118.0234375, "epoch": 0.37220259128386335, "grad_norm": 0.3205413739522583, "kl": 0.006743110250681639, "learning_rate": 9.0125e-07, "loss": 0.006743110250681639, "reward": 1.4751760363578796, "reward_std": 0.15944985300302505, "rewards/GDino": 0.7474680840969086, "rewards/GIT": 0.4448501020669937, "rewards/HPSv2": 0.28285789489746094, "step": 158 }, { "completion_length": 121.7734375, "epoch": 0.3745583038869258, "grad_norm": 0.5392446113874851, "kl": 0.010783521924167871, "learning_rate": 9.00625e-07, "loss": 0.010783521924167871, "reward": 1.5727298259735107, "reward_std": 0.18068891763687134, "rewards/GDino": 0.8056339025497437, "rewards/GIT": 0.4838365167379379, "rewards/HPSv2": 0.28325939178466797, "step": 159 }, { "completion_length": 121.671875, "epoch": 0.37691401648998824, "grad_norm": 0.24902324464660577, "kl": 0.011011498048901558, "learning_rate": 9e-07, "loss": 0.011011498048901558, "reward": 1.8066010475158691, "reward_std": 0.10796968266367912, "rewards/GDino": 0.8584640622138977, "rewards/GIT": 0.6545569598674774, "rewards/HPSv2": 0.2935800552368164, "step": 160 }, { "completion_length": 119.1328125, "epoch": 0.3792697290930506, "grad_norm": 1.156669940814261, "kl": 0.015345902414992452, "learning_rate": 8.99375e-07, "loss": 0.015345902414992452, "reward": 1.7207170724868774, "reward_std": 0.16005732119083405, "rewards/GDino": 0.8589991629123688, "rewards/GIT": 0.5959889888763428, "rewards/HPSv2": 0.2657289505004883, "step": 161 }, { "completion_length": 120.296875, "epoch": 0.38162544169611307, "grad_norm": 0.6293369871446819, "kl": 0.013437798479571939, "learning_rate": 8.9875e-07, "loss": 0.013437798479571939, "reward": 1.4637031555175781, "reward_std": 0.16879646480083466, "rewards/GDino": 0.7395396828651428, "rewards/GIT": 0.4386972486972809, "rewards/HPSv2": 0.28546619415283203, "step": 162 }, { "completion_length": 119.515625, "epoch": 0.3839811542991755, "grad_norm": 0.26862945729226634, "kl": 0.005311851855367422, "learning_rate": 8.981249999999999e-07, "loss": 0.005311851855367422, "reward": 1.5648610591888428, "reward_std": 0.13763657957315445, "rewards/GDino": 0.7811363041400909, "rewards/GIT": 0.5093278586864471, "rewards/HPSv2": 0.2743968963623047, "step": 163 }, { "completion_length": 120.6875, "epoch": 0.38633686690223795, "grad_norm": 0.24993937469736408, "kl": 0.007540711434558034, "learning_rate": 8.974999999999999e-07, "loss": 0.007540711434558034, "reward": 1.2253366708755493, "reward_std": 0.13047771900892258, "rewards/GDino": 0.6267434060573578, "rewards/GIT": 0.3070177882909775, "rewards/HPSv2": 0.29157543182373047, "step": 164 }, { "completion_length": 122.2265625, "epoch": 0.38869257950530034, "grad_norm": 0.7137882700443733, "kl": 0.009616367053240538, "learning_rate": 8.96875e-07, "loss": 0.009616367053240538, "reward": 1.2451866269111633, "reward_std": 0.12625165656208992, "rewards/GDino": 0.6649079322814941, "rewards/GIT": 0.28280673176050186, "rewards/HPSv2": 0.2974720001220703, "step": 165 }, { "completion_length": 123.109375, "epoch": 0.3910482921083628, "grad_norm": 0.26818757989699654, "kl": 0.01488648122176528, "learning_rate": 8.9625e-07, "loss": 0.01488648122176528, "reward": 1.7531495690345764, "reward_std": 0.14931076020002365, "rewards/GDino": 0.857462078332901, "rewards/GIT": 0.6198695451021194, "rewards/HPSv2": 0.27581787109375, "step": 166 }, { "completion_length": 119.1875, "epoch": 0.3934040047114252, "grad_norm": 0.24095014937206602, "kl": 0.006236532470211387, "learning_rate": 8.95625e-07, "loss": 0.006236532470211387, "reward": 1.510873794555664, "reward_std": 0.18022798746824265, "rewards/GDino": 0.8156704306602478, "rewards/GIT": 0.4094958007335663, "rewards/HPSv2": 0.2857074737548828, "step": 167 }, { "completion_length": 124.2890625, "epoch": 0.3957597173144876, "grad_norm": 0.3030082937442416, "kl": 0.009627037681639194, "learning_rate": 8.95e-07, "loss": 0.009627037681639194, "reward": 1.1397759318351746, "reward_std": 0.19631942361593246, "rewards/GDino": 0.6104525923728943, "rewards/GIT": 0.2442462295293808, "rewards/HPSv2": 0.2850770950317383, "step": 168 }, { "completion_length": 120.8046875, "epoch": 0.39811542991755006, "grad_norm": 0.5103288360852164, "kl": 0.012551124207675457, "learning_rate": 8.94375e-07, "loss": 0.012551124207675457, "reward": 1.4224437475204468, "reward_std": 0.16933120787143707, "rewards/GDino": 0.764372706413269, "rewards/GIT": 0.3657965362071991, "rewards/HPSv2": 0.29227447509765625, "step": 169 }, { "completion_length": 119.59375, "epoch": 0.4004711425206125, "grad_norm": 0.255123253772587, "kl": 0.021092642098665237, "learning_rate": 8.9375e-07, "loss": 0.021092642098665237, "reward": 1.4170874953269958, "reward_std": 0.15517953038215637, "rewards/GDino": 0.7557954490184784, "rewards/GIT": 0.37980160117149353, "rewards/HPSv2": 0.2814903259277344, "step": 170 }, { "completion_length": 119.375, "epoch": 0.4028268551236749, "grad_norm": 0.2823064421445881, "kl": 0.020489402115345, "learning_rate": 8.931249999999999e-07, "loss": 0.020489402115345, "reward": 1.5654370188713074, "reward_std": 0.16114944219589233, "rewards/GDino": 0.8186959624290466, "rewards/GIT": 0.46701690554618835, "rewards/HPSv2": 0.27972412109375, "step": 171 }, { "completion_length": 125.84375, "epoch": 0.40518256772673733, "grad_norm": 0.25077394292289656, "kl": 0.01169170020148158, "learning_rate": 8.924999999999999e-07, "loss": 0.01169170020148158, "reward": 1.6773833632469177, "reward_std": 0.16438720375299454, "rewards/GDino": 0.8543325960636139, "rewards/GIT": 0.5445902794599533, "rewards/HPSv2": 0.2784605026245117, "step": 172 }, { "completion_length": 124.0625, "epoch": 0.4075382803297998, "grad_norm": 0.25753018657040827, "kl": 0.006570700090378523, "learning_rate": 8.918749999999999e-07, "loss": 0.006570700090378523, "reward": 1.2073261141777039, "reward_std": 0.23937559127807617, "rewards/GDino": 0.6660653352737427, "rewards/GIT": 0.26105208694934845, "rewards/HPSv2": 0.2802085876464844, "step": 173 }, { "completion_length": 124.125, "epoch": 0.4098939929328622, "grad_norm": 0.2508039151752151, "kl": 0.006404939340427518, "learning_rate": 8.912499999999999e-07, "loss": 0.006404939340427518, "reward": 1.1012952327728271, "reward_std": 0.18487440049648285, "rewards/GDino": 0.6412220299243927, "rewards/GIT": 0.1791074424982071, "rewards/HPSv2": 0.28096580505371094, "step": 174 }, { "completion_length": 130.640625, "epoch": 0.4122497055359246, "grad_norm": 0.2601315712826511, "kl": 0.012203969992697239, "learning_rate": 8.906249999999999e-07, "loss": 0.012203969992697239, "reward": 1.3721113204956055, "reward_std": 0.16963671147823334, "rewards/GDino": 0.6951804459095001, "rewards/GIT": 0.39326897263526917, "rewards/HPSv2": 0.2836618423461914, "step": 175 }, { "completion_length": 124.6015625, "epoch": 0.41460541813898705, "grad_norm": 0.5198747743884437, "kl": 0.013308790046721697, "learning_rate": 8.9e-07, "loss": 0.013308790046721697, "reward": 1.5064972639083862, "reward_std": 0.1559619903564453, "rewards/GDino": 0.7946788966655731, "rewards/GIT": 0.4135672599077225, "rewards/HPSv2": 0.2982511520385742, "step": 176 }, { "completion_length": 128.78125, "epoch": 0.4169611307420495, "grad_norm": 0.30337784276620017, "kl": 0.007899556308984756, "learning_rate": 8.89375e-07, "loss": 0.007899556308984756, "reward": 1.4228416681289673, "reward_std": 0.182267464697361, "rewards/GDino": 0.7750592231750488, "rewards/GIT": 0.352176770567894, "rewards/HPSv2": 0.2956056594848633, "step": 177 }, { "completion_length": 123.6328125, "epoch": 0.4193168433451119, "grad_norm": 0.27832670405360443, "kl": 0.006992871640250087, "learning_rate": 8.8875e-07, "loss": 0.006992871640250087, "reward": 1.152895748615265, "reward_std": 0.16947459429502487, "rewards/GDino": 0.6190851032733917, "rewards/GIT": 0.259493887424469, "rewards/HPSv2": 0.27431678771972656, "step": 178 }, { "completion_length": 121.2734375, "epoch": 0.4216725559481743, "grad_norm": 0.4409453524544516, "kl": 0.016375430393964052, "learning_rate": 8.88125e-07, "loss": 0.016375430393964052, "reward": 1.2858899235725403, "reward_std": 0.1789165437221527, "rewards/GDino": 0.7271504402160645, "rewards/GIT": 0.27660350501537323, "rewards/HPSv2": 0.2821359634399414, "step": 179 }, { "completion_length": 125.2265625, "epoch": 0.42402826855123676, "grad_norm": 0.27132920358417056, "kl": 0.011395329609513283, "learning_rate": 8.874999999999999e-07, "loss": 0.011395329609513283, "reward": 1.5079703330993652, "reward_std": 0.2025618851184845, "rewards/GDino": 0.7669787108898163, "rewards/GIT": 0.46031367778778076, "rewards/HPSv2": 0.28067779541015625, "step": 180 }, { "completion_length": 125.265625, "epoch": 0.42638398115429915, "grad_norm": 0.28383687503363153, "kl": 0.006783008109778166, "learning_rate": 8.86875e-07, "loss": 0.006783008109778166, "reward": 1.3603765964508057, "reward_std": 0.17958229035139084, "rewards/GDino": 0.7443369626998901, "rewards/GIT": 0.3204205483198166, "rewards/HPSv2": 0.29561901092529297, "step": 181 }, { "completion_length": 119.296875, "epoch": 0.4287396937573616, "grad_norm": 0.3489436132468038, "kl": 0.006763711338862777, "learning_rate": 8.8625e-07, "loss": 0.006763711338862777, "reward": 1.3337554335594177, "reward_std": 0.1439540758728981, "rewards/GDino": 0.7396840453147888, "rewards/GIT": 0.3021048605442047, "rewards/HPSv2": 0.29196643829345703, "step": 182 }, { "completion_length": 129.859375, "epoch": 0.43109540636042404, "grad_norm": 1.8395536069554859, "kl": 0.015877308323979378, "learning_rate": 8.85625e-07, "loss": 0.015877308323979378, "reward": 1.3896361589431763, "reward_std": 0.20208755880594254, "rewards/GDino": 0.7448915243148804, "rewards/GIT": 0.3527447134256363, "rewards/HPSv2": 0.29199981689453125, "step": 183 }, { "completion_length": 124.6796875, "epoch": 0.4334511189634865, "grad_norm": 0.37457497965176884, "kl": 0.01041938504204154, "learning_rate": 8.85e-07, "loss": 0.01041938504204154, "reward": 1.569790780544281, "reward_std": 0.1646622195839882, "rewards/GDino": 0.8275903165340424, "rewards/GIT": 0.4603467583656311, "rewards/HPSv2": 0.28185367584228516, "step": 184 }, { "completion_length": 118.7578125, "epoch": 0.43580683156654887, "grad_norm": 0.5582807017019862, "kl": 0.027030501514673233, "learning_rate": 8.84375e-07, "loss": 0.027030501514673233, "reward": 1.481880247592926, "reward_std": 0.17218272387981415, "rewards/GDino": 0.7446475923061371, "rewards/GIT": 0.4585203230381012, "rewards/HPSv2": 0.27871227264404297, "step": 185 }, { "completion_length": 119.5078125, "epoch": 0.4381625441696113, "grad_norm": 0.29196424731639786, "kl": 0.010259227827191353, "learning_rate": 8.8375e-07, "loss": 0.010259227827191353, "reward": 1.4044077396392822, "reward_std": 0.210309699177742, "rewards/GDino": 0.7110077142715454, "rewards/GIT": 0.42204540967941284, "rewards/HPSv2": 0.27135467529296875, "step": 186 }, { "completion_length": 125.3046875, "epoch": 0.44051825677267376, "grad_norm": 0.5859914336036139, "kl": 0.024135492742061615, "learning_rate": 8.83125e-07, "loss": 0.024135492742061615, "reward": 1.371088445186615, "reward_std": 0.17554545402526855, "rewards/GDino": 0.7103890776634216, "rewards/GIT": 0.38125942647457123, "rewards/HPSv2": 0.27943992614746094, "step": 187 }, { "completion_length": 126.3203125, "epoch": 0.44287396937573614, "grad_norm": 0.272214168642862, "kl": 0.027817600406706333, "learning_rate": 8.824999999999999e-07, "loss": 0.027817600406706333, "reward": 1.2352468967437744, "reward_std": 0.169111430644989, "rewards/GDino": 0.7103920578956604, "rewards/GIT": 0.24646969139575958, "rewards/HPSv2": 0.2783851623535156, "step": 188 }, { "completion_length": 124.3515625, "epoch": 0.4452296819787986, "grad_norm": 0.5404404996131553, "kl": 0.028901003301143646, "learning_rate": 8.818749999999999e-07, "loss": 0.028901003301143646, "reward": 1.4789384007453918, "reward_std": 0.17879607528448105, "rewards/GDino": 0.7527338564395905, "rewards/GIT": 0.4392772912979126, "rewards/HPSv2": 0.2869272232055664, "step": 189 }, { "completion_length": 123.78125, "epoch": 0.44758539458186103, "grad_norm": 0.3088195432839607, "kl": 0.016063033835962415, "learning_rate": 8.812499999999999e-07, "loss": 0.016063033835962415, "reward": 1.4304088354110718, "reward_std": 0.1532239019870758, "rewards/GDino": 0.7483359277248383, "rewards/GIT": 0.39692990481853485, "rewards/HPSv2": 0.2851428985595703, "step": 190 }, { "completion_length": 125.3125, "epoch": 0.4499411071849234, "grad_norm": 0.3850007003116994, "kl": 0.0110264727845788, "learning_rate": 8.806249999999999e-07, "loss": 0.0110264727845788, "reward": 1.353133738040924, "reward_std": 0.21043546497821808, "rewards/GDino": 0.6460394859313965, "rewards/GIT": 0.43326856195926666, "rewards/HPSv2": 0.27382564544677734, "step": 191 }, { "completion_length": 126.8046875, "epoch": 0.45229681978798586, "grad_norm": 0.97851104572296, "kl": 0.03681589663028717, "learning_rate": 8.799999999999999e-07, "loss": 0.03681589663028717, "reward": 1.3622918725013733, "reward_std": 0.13441802188754082, "rewards/GDino": 0.6928820610046387, "rewards/GIT": 0.3867693245410919, "rewards/HPSv2": 0.2826404571533203, "step": 192 }, { "completion_length": 124.59375, "epoch": 0.4546525323910483, "grad_norm": 0.2601444222259964, "kl": 0.013458737172186375, "learning_rate": 8.793749999999999e-07, "loss": 0.013458737172186375, "reward": 1.493158221244812, "reward_std": 0.14912136644124985, "rewards/GDino": 0.7699921429157257, "rewards/GIT": 0.4409366250038147, "rewards/HPSv2": 0.2822294235229492, "step": 193 }, { "completion_length": 128.0859375, "epoch": 0.45700824499411075, "grad_norm": 0.3408820603477387, "kl": 0.010828856378793716, "learning_rate": 8.7875e-07, "loss": 0.010828856378793716, "reward": 1.3069757223129272, "reward_std": 0.1719047799706459, "rewards/GDino": 0.6878907680511475, "rewards/GIT": 0.33856354653835297, "rewards/HPSv2": 0.2805213928222656, "step": 194 }, { "completion_length": 121.8359375, "epoch": 0.45936395759717313, "grad_norm": 0.32721238993369384, "kl": 0.030472806189209223, "learning_rate": 8.78125e-07, "loss": 0.030472806189209223, "reward": 1.5238117575645447, "reward_std": 0.21171307563781738, "rewards/GDino": 0.7855172157287598, "rewards/GIT": 0.4637049585580826, "rewards/HPSv2": 0.27458953857421875, "step": 195 }, { "completion_length": 122.5, "epoch": 0.4617196702002356, "grad_norm": 1.0494893467709472, "kl": 0.02742212451994419, "learning_rate": 8.774999999999999e-07, "loss": 0.02742212451994419, "reward": 1.5904842615127563, "reward_std": 0.18075516819953918, "rewards/GDino": 0.8420775234699249, "rewards/GIT": 0.46768221259117126, "rewards/HPSv2": 0.28072452545166016, "step": 196 }, { "completion_length": 121.9765625, "epoch": 0.464075382803298, "grad_norm": 0.3268880987442627, "kl": 0.014679982326924801, "learning_rate": 8.76875e-07, "loss": 0.014679982326924801, "reward": 1.0063704252243042, "reward_std": 0.17330440133810043, "rewards/GDino": 0.6505569219589233, "rewards/GIT": 0.05698058754205704, "rewards/HPSv2": 0.29883289337158203, "step": 197 }, { "completion_length": 124.734375, "epoch": 0.4664310954063604, "grad_norm": 0.6384171835698521, "kl": 0.012524401769042015, "learning_rate": 8.7625e-07, "loss": 0.012524401769042015, "reward": 1.045524775981903, "reward_std": 0.17109376192092896, "rewards/GDino": 0.5761095285415649, "rewards/GIT": 0.1846509426832199, "rewards/HPSv2": 0.28476428985595703, "step": 198 }, { "completion_length": 124.0390625, "epoch": 0.46878680800942285, "grad_norm": 0.2557047690342097, "kl": 0.017722844146192074, "learning_rate": 8.75625e-07, "loss": 0.017722844146192074, "reward": 1.5080816745758057, "reward_std": 0.13944968953728676, "rewards/GDino": 0.7579792737960815, "rewards/GIT": 0.46891260147094727, "rewards/HPSv2": 0.2811899185180664, "step": 199 }, { "completion_length": 120.53125, "epoch": 0.4711425206124853, "grad_norm": 0.2812710162739322, "kl": 0.028845791704952717, "learning_rate": 8.75e-07, "loss": 0.028845791704952717, "reward": 1.4363234043121338, "reward_std": 0.1938735917210579, "rewards/GDino": 0.794521152973175, "rewards/GIT": 0.35969291627407074, "rewards/HPSv2": 0.28210926055908203, "step": 200 }, { "completion_length": 124.28125, "epoch": 0.4734982332155477, "grad_norm": 0.31280450306573876, "kl": 0.01206715777516365, "learning_rate": 8.74375e-07, "loss": 0.01206715777516365, "reward": 1.4517974853515625, "reward_std": 0.19092358648777008, "rewards/GDino": 0.7503921985626221, "rewards/GIT": 0.40691065788269043, "rewards/HPSv2": 0.29449462890625, "step": 201 }, { "completion_length": 125.15625, "epoch": 0.4758539458186101, "grad_norm": 0.6776366776605734, "kl": 0.021801676135510206, "learning_rate": 8.7375e-07, "loss": 0.021801676135510206, "reward": 1.4952760934829712, "reward_std": 0.17064472287893295, "rewards/GDino": 0.7744052410125732, "rewards/GIT": 0.42836178839206696, "rewards/HPSv2": 0.2925090789794922, "step": 202 }, { "completion_length": 125.5546875, "epoch": 0.47820965842167257, "grad_norm": 0.2895225299596182, "kl": 0.011909999884665012, "learning_rate": 8.73125e-07, "loss": 0.011909999884665012, "reward": 1.585558295249939, "reward_std": 0.19963888078927994, "rewards/GDino": 0.7839756309986115, "rewards/GIT": 0.5249111652374268, "rewards/HPSv2": 0.2766714096069336, "step": 203 }, { "completion_length": 122.5859375, "epoch": 0.48056537102473496, "grad_norm": 0.2771838270130009, "kl": 0.011198031716048717, "learning_rate": 8.725e-07, "loss": 0.011198031716048717, "reward": 1.5191962122917175, "reward_std": 0.19361772388219833, "rewards/GDino": 0.7657425701618195, "rewards/GIT": 0.4655936360359192, "rewards/HPSv2": 0.2878599166870117, "step": 204 }, { "completion_length": 129.296875, "epoch": 0.4829210836277974, "grad_norm": 0.36212358841012826, "kl": 0.041189681738615036, "learning_rate": 8.718749999999999e-07, "loss": 0.041189681738615036, "reward": 1.638255000114441, "reward_std": 0.12955023348331451, "rewards/GDino": 0.799127608537674, "rewards/GIT": 0.5573843121528625, "rewards/HPSv2": 0.28174304962158203, "step": 205 }, { "completion_length": 125.1640625, "epoch": 0.48527679623085984, "grad_norm": 0.26647027170913695, "kl": 0.01300528971478343, "learning_rate": 8.712499999999999e-07, "loss": 0.01300528971478343, "reward": 1.370944619178772, "reward_std": 0.184652179479599, "rewards/GDino": 0.703246682882309, "rewards/GIT": 0.3904877156019211, "rewards/HPSv2": 0.2772102355957031, "step": 206 }, { "completion_length": 126.0859375, "epoch": 0.4876325088339223, "grad_norm": 0.3178940393696595, "kl": 0.02882100874558091, "learning_rate": 8.706249999999999e-07, "loss": 0.02882100874558091, "reward": 1.3252025842666626, "reward_std": 0.14220105856657028, "rewards/GDino": 0.6684596240520477, "rewards/GIT": 0.37338922917842865, "rewards/HPSv2": 0.2833538055419922, "step": 207 }, { "completion_length": 129.234375, "epoch": 0.48998822143698467, "grad_norm": 0.9171215682252858, "kl": 0.022492485120892525, "learning_rate": 8.699999999999999e-07, "loss": 0.022492485120892525, "reward": 1.3298659324645996, "reward_std": 0.2099175974726677, "rewards/GDino": 0.7538159489631653, "rewards/GIT": 0.28752247989177704, "rewards/HPSv2": 0.2885274887084961, "step": 208 }, { "completion_length": 130.703125, "epoch": 0.4923439340400471, "grad_norm": 0.41714576833445777, "kl": 0.03650578111410141, "learning_rate": 8.693749999999999e-07, "loss": 0.03650578111410141, "reward": 1.2455011010169983, "reward_std": 0.20831462740898132, "rewards/GDino": 0.6905107498168945, "rewards/GIT": 0.2790684998035431, "rewards/HPSv2": 0.2759218215942383, "step": 209 }, { "completion_length": 123.3671875, "epoch": 0.49469964664310956, "grad_norm": 0.4815882179683319, "kl": 0.025000850670039654, "learning_rate": 8.687499999999999e-07, "loss": 0.025000850670039654, "reward": 1.335551917552948, "reward_std": 0.17937739938497543, "rewards/GDino": 0.6645743250846863, "rewards/GIT": 0.38214388489723206, "rewards/HPSv2": 0.2888336181640625, "step": 210 }, { "completion_length": 126.140625, "epoch": 0.49705535924617195, "grad_norm": 0.32134854506675037, "kl": 0.023159710690379143, "learning_rate": 8.681249999999999e-07, "loss": 0.023159710690379143, "reward": 1.346066653728485, "reward_std": 0.22909215092658997, "rewards/GDino": 0.6985798180103302, "rewards/GIT": 0.36535653471946716, "rewards/HPSv2": 0.28213024139404297, "step": 211 }, { "completion_length": 128.4453125, "epoch": 0.4994110718492344, "grad_norm": 0.9560238226088092, "kl": 0.01517411693930626, "learning_rate": 8.675000000000001e-07, "loss": 0.01517411693930626, "reward": 1.3953797221183777, "reward_std": 0.18077290803194046, "rewards/GDino": 0.723727285861969, "rewards/GIT": 0.3943287134170532, "rewards/HPSv2": 0.27732372283935547, "step": 212 }, { "completion_length": 127.3515625, "epoch": 0.5017667844522968, "grad_norm": 0.449599819958732, "kl": 0.04814598336815834, "learning_rate": 8.66875e-07, "loss": 0.04814598336815834, "reward": 1.5333417057991028, "reward_std": 0.1945018619298935, "rewards/GDino": 0.7708311080932617, "rewards/GIT": 0.4709627479314804, "rewards/HPSv2": 0.2915477752685547, "step": 213 }, { "completion_length": 121.28125, "epoch": 0.5041224970553593, "grad_norm": 0.44549092195518664, "kl": 0.029509149491786957, "learning_rate": 8.6625e-07, "loss": 0.029509149491786957, "reward": 1.4087635278701782, "reward_std": 0.13609741628170013, "rewards/GDino": 0.7613054513931274, "rewards/GIT": 0.360777884721756, "rewards/HPSv2": 0.2866802215576172, "step": 214 }, { "completion_length": 122.6875, "epoch": 0.5064782096584217, "grad_norm": 0.4470452173915334, "kl": 0.04658668860793114, "learning_rate": 8.65625e-07, "loss": 0.04658668860793114, "reward": 1.3791213035583496, "reward_std": 0.17788369208574295, "rewards/GDino": 0.7086146771907806, "rewards/GIT": 0.38278496265411377, "rewards/HPSv2": 0.2877216339111328, "step": 215 }, { "completion_length": 122.890625, "epoch": 0.508833922261484, "grad_norm": 0.5955275853583525, "kl": 0.04264957085251808, "learning_rate": 8.65e-07, "loss": 0.04264957085251808, "reward": 1.4658660888671875, "reward_std": 0.15293558686971664, "rewards/GDino": 0.7810139656066895, "rewards/GIT": 0.3958868309855461, "rewards/HPSv2": 0.28896522521972656, "step": 216 }, { "completion_length": 125.3203125, "epoch": 0.5111896348645465, "grad_norm": 0.3204234296188401, "kl": 0.02333579771220684, "learning_rate": 8.64375e-07, "loss": 0.02333579771220684, "reward": 1.3554760217666626, "reward_std": 0.2270141914486885, "rewards/GDino": 0.7457924485206604, "rewards/GIT": 0.3298707604408264, "rewards/HPSv2": 0.2798128128051758, "step": 217 }, { "completion_length": 130.4140625, "epoch": 0.5135453474676089, "grad_norm": 0.3271453889072839, "kl": 0.026644302532076836, "learning_rate": 8.6375e-07, "loss": 0.026644302532076836, "reward": 1.3951560854911804, "reward_std": 0.17737507820129395, "rewards/GDino": 0.7596355676651001, "rewards/GIT": 0.34116126596927643, "rewards/HPSv2": 0.2943592071533203, "step": 218 }, { "completion_length": 123.6484375, "epoch": 0.5159010600706714, "grad_norm": 0.33762310798328254, "kl": 0.02765314094722271, "learning_rate": 8.63125e-07, "loss": 0.02765314094722271, "reward": 1.4198485016822815, "reward_std": 0.19130456447601318, "rewards/GDino": 0.7232366502285004, "rewards/GIT": 0.422112837433815, "rewards/HPSv2": 0.27449893951416016, "step": 219 }, { "completion_length": 121.625, "epoch": 0.5182567726737338, "grad_norm": 0.27043127702533726, "kl": 0.018064215779304504, "learning_rate": 8.625e-07, "loss": 0.018064215779304504, "reward": 1.3616116642951965, "reward_std": 0.20426654815673828, "rewards/GDino": 0.7526256144046783, "rewards/GIT": 0.3335772007703781, "rewards/HPSv2": 0.2754087448120117, "step": 220 }, { "completion_length": 128.640625, "epoch": 0.5206124852767963, "grad_norm": 0.6008074918448574, "kl": 0.03221448324620724, "learning_rate": 8.618749999999999e-07, "loss": 0.03221448324620724, "reward": 1.6391831636428833, "reward_std": 0.13242150098085403, "rewards/GDino": 0.7986079454421997, "rewards/GIT": 0.5494518131017685, "rewards/HPSv2": 0.2911233901977539, "step": 221 }, { "completion_length": 129.96875, "epoch": 0.5229681978798587, "grad_norm": 0.2524246147726016, "kl": 0.03731601359322667, "learning_rate": 8.612499999999999e-07, "loss": 0.03731601359322667, "reward": 1.2126295566558838, "reward_std": 0.1531626619398594, "rewards/GDino": 0.7049686014652252, "rewards/GIT": 0.23087983578443527, "rewards/HPSv2": 0.2767810821533203, "step": 222 }, { "completion_length": 129.1875, "epoch": 0.525323910482921, "grad_norm": 0.7777576800451097, "kl": 0.062217116355895996, "learning_rate": 8.606249999999999e-07, "loss": 0.062217116355895996, "reward": 1.3349083065986633, "reward_std": 0.14203068241477013, "rewards/GDino": 0.7920622229576111, "rewards/GIT": 0.25802168250083923, "rewards/HPSv2": 0.2848243713378906, "step": 223 }, { "completion_length": 122.1171875, "epoch": 0.5276796230859835, "grad_norm": 0.5951068961975011, "kl": 0.06027352064847946, "learning_rate": 8.599999999999999e-07, "loss": 0.06027352064847946, "reward": 1.555338442325592, "reward_std": 0.15343157947063446, "rewards/GDino": 0.8091731667518616, "rewards/GIT": 0.46286290884017944, "rewards/HPSv2": 0.28330230712890625, "step": 224 }, { "completion_length": 121.4453125, "epoch": 0.5300353356890459, "grad_norm": 0.4716673591609777, "kl": 0.01675863191485405, "learning_rate": 8.593749999999999e-07, "loss": 0.01675863191485405, "reward": 1.6013509631156921, "reward_std": 0.15604154765605927, "rewards/GDino": 0.80527263879776, "rewards/GIT": 0.5050320774316788, "rewards/HPSv2": 0.291046142578125, "step": 225 }, { "completion_length": 124.96875, "epoch": 0.5323910482921084, "grad_norm": 0.30815840737993755, "kl": 0.019350371789187193, "learning_rate": 8.587499999999999e-07, "loss": 0.019350371789187193, "reward": 1.8339231610298157, "reward_std": 0.15831077843904495, "rewards/GDino": 0.8986649811267853, "rewards/GIT": 0.6601937115192413, "rewards/HPSv2": 0.27506446838378906, "step": 226 }, { "completion_length": 126.359375, "epoch": 0.5347467608951708, "grad_norm": 0.365672606129346, "kl": 0.0747486874461174, "learning_rate": 8.581249999999999e-07, "loss": 0.0747486874461174, "reward": 1.3010891675949097, "reward_std": 0.17961854487657547, "rewards/GDino": 0.7064899802207947, "rewards/GIT": 0.3180908262729645, "rewards/HPSv2": 0.2765083312988281, "step": 227 }, { "completion_length": 123.1953125, "epoch": 0.5371024734982333, "grad_norm": 0.5129661370024405, "kl": 0.03458590805530548, "learning_rate": 8.575e-07, "loss": 0.03458590805530548, "reward": 1.280578076839447, "reward_std": 0.16396326571702957, "rewards/GDino": 0.686535120010376, "rewards/GIT": 0.31886208057403564, "rewards/HPSv2": 0.2751808166503906, "step": 228 }, { "completion_length": 128.609375, "epoch": 0.5394581861012956, "grad_norm": 0.38963091663565425, "kl": 0.04388098046183586, "learning_rate": 8.568750000000001e-07, "loss": 0.04388098046183586, "reward": 1.507240891456604, "reward_std": 0.14378944039344788, "rewards/GDino": 0.7796148955821991, "rewards/GIT": 0.4495317339897156, "rewards/HPSv2": 0.2780942916870117, "step": 229 }, { "completion_length": 131.6328125, "epoch": 0.541813898704358, "grad_norm": 0.2619860561137536, "kl": 0.02171275205910206, "learning_rate": 8.5625e-07, "loss": 0.02171275205910206, "reward": 1.4074188470840454, "reward_std": 0.21173711121082306, "rewards/GDino": 0.7650101184844971, "rewards/GIT": 0.35261181741952896, "rewards/HPSv2": 0.2897968292236328, "step": 230 }, { "completion_length": 124.3359375, "epoch": 0.5441696113074205, "grad_norm": 0.3270186341889228, "kl": 0.02894231677055359, "learning_rate": 8.55625e-07, "loss": 0.02894231677055359, "reward": 1.4613901376724243, "reward_std": 0.1294674389064312, "rewards/GDino": 0.7714146077632904, "rewards/GIT": 0.4050920531153679, "rewards/HPSv2": 0.2848834991455078, "step": 231 }, { "completion_length": 120.0546875, "epoch": 0.5465253239104829, "grad_norm": 0.3378020126729096, "kl": 0.032415205612778664, "learning_rate": 8.55e-07, "loss": 0.032415205612778664, "reward": 1.682977557182312, "reward_std": 0.14798809215426445, "rewards/GDino": 0.8309596478939056, "rewards/GIT": 0.5679926872253418, "rewards/HPSv2": 0.2840251922607422, "step": 232 }, { "completion_length": 133.3125, "epoch": 0.5488810365135454, "grad_norm": 0.4368656280016812, "kl": 0.019294225610792637, "learning_rate": 8.54375e-07, "loss": 0.019294225610792637, "reward": 1.394506573677063, "reward_std": 0.1984976828098297, "rewards/GDino": 0.7269530892372131, "rewards/GIT": 0.3861250877380371, "rewards/HPSv2": 0.28142833709716797, "step": 233 }, { "completion_length": 120.3125, "epoch": 0.5512367491166078, "grad_norm": 0.3883637422262549, "kl": 0.023376581259071827, "learning_rate": 8.5375e-07, "loss": 0.023376581259071827, "reward": 1.7846816778182983, "reward_std": 0.13904983550310135, "rewards/GDino": 0.8629387021064758, "rewards/GIT": 0.6415220946073532, "rewards/HPSv2": 0.28022098541259766, "step": 234 }, { "completion_length": 122.5546875, "epoch": 0.5535924617196702, "grad_norm": 0.6254663464369029, "kl": 0.04548387974500656, "learning_rate": 8.53125e-07, "loss": 0.04548387974500656, "reward": 1.616257905960083, "reward_std": 0.17715008556842804, "rewards/GDino": 0.7983682751655579, "rewards/GIT": 0.5365471243858337, "rewards/HPSv2": 0.2813425064086914, "step": 235 }, { "completion_length": 125.4375, "epoch": 0.5559481743227326, "grad_norm": 0.4294310987504339, "kl": 0.034659081138670444, "learning_rate": 8.525e-07, "loss": 0.034659081138670444, "reward": 1.5947605967521667, "reward_std": 0.19024289399385452, "rewards/GDino": 0.7982955276966095, "rewards/GIT": 0.5182592123746872, "rewards/HPSv2": 0.27820587158203125, "step": 236 }, { "completion_length": 127.84375, "epoch": 0.558303886925795, "grad_norm": 0.3714804332967827, "kl": 0.029053447768092155, "learning_rate": 8.51875e-07, "loss": 0.029053447768092155, "reward": 1.6576268672943115, "reward_std": 0.20476368069648743, "rewards/GDino": 0.8352813422679901, "rewards/GIT": 0.54258131980896, "rewards/HPSv2": 0.27976417541503906, "step": 237 }, { "completion_length": 132.109375, "epoch": 0.5606595995288575, "grad_norm": 1.1872334117092234, "kl": 0.039507496170699596, "learning_rate": 8.512499999999999e-07, "loss": 0.039507496170699596, "reward": 1.5363277792930603, "reward_std": 0.24048058688640594, "rewards/GDino": 0.8190398812294006, "rewards/GIT": 0.43277910351753235, "rewards/HPSv2": 0.28450870513916016, "step": 238 }, { "completion_length": 127.3671875, "epoch": 0.5630153121319199, "grad_norm": 0.3085065855690255, "kl": 0.0219512190669775, "learning_rate": 8.506249999999999e-07, "loss": 0.0219512190669775, "reward": 1.2611124515533447, "reward_std": 0.17331720143556595, "rewards/GDino": 0.671147495508194, "rewards/GIT": 0.30268385261297226, "rewards/HPSv2": 0.2872810363769531, "step": 239 }, { "completion_length": 122.671875, "epoch": 0.5653710247349824, "grad_norm": 0.4610418049830832, "kl": 0.04885072633624077, "learning_rate": 8.499999999999999e-07, "loss": 0.04885072633624077, "reward": 1.5246912837028503, "reward_std": 0.1735849529504776, "rewards/GDino": 0.8312811851501465, "rewards/GIT": 0.4039059653878212, "rewards/HPSv2": 0.2895040512084961, "step": 240 }, { "completion_length": 130.0625, "epoch": 0.5677267373380448, "grad_norm": 0.5507694665836241, "kl": 0.017274947836995125, "learning_rate": 8.493749999999999e-07, "loss": 0.017274947836995125, "reward": 1.197261929512024, "reward_std": 0.14505808055400848, "rewards/GDino": 0.6793518364429474, "rewards/GIT": 0.23177339881658554, "rewards/HPSv2": 0.2861366271972656, "step": 241 }, { "completion_length": 125.921875, "epoch": 0.5700824499411072, "grad_norm": 0.2817113729249634, "kl": 0.01429289160296321, "learning_rate": 8.487499999999999e-07, "loss": 0.01429289160296321, "reward": 1.692769169807434, "reward_std": 0.16637955605983734, "rewards/GDino": 0.8168050646781921, "rewards/GIT": 0.6013965308666229, "rewards/HPSv2": 0.2745676040649414, "step": 242 }, { "completion_length": 123.0703125, "epoch": 0.5724381625441696, "grad_norm": 0.3911542274261919, "kl": 0.07110440358519554, "learning_rate": 8.481249999999999e-07, "loss": 0.07110440358519554, "reward": 1.628957748413086, "reward_std": 0.19216161966323853, "rewards/GDino": 0.7833643853664398, "rewards/GIT": 0.5600185394287109, "rewards/HPSv2": 0.28557491302490234, "step": 243 }, { "completion_length": 127.6875, "epoch": 0.574793875147232, "grad_norm": 1.0364811556763958, "kl": 0.04847117327153683, "learning_rate": 8.475e-07, "loss": 0.04847117327153683, "reward": 1.5398220419883728, "reward_std": 0.17681090533733368, "rewards/GDino": 0.7717085778713226, "rewards/GIT": 0.4881051778793335, "rewards/HPSv2": 0.28000831604003906, "step": 244 }, { "completion_length": 124.1640625, "epoch": 0.5771495877502945, "grad_norm": 0.29183313489268703, "kl": 0.04609296843409538, "learning_rate": 8.46875e-07, "loss": 0.04609296843409538, "reward": 1.7110787630081177, "reward_std": 0.14548825472593307, "rewards/GDino": 0.8245199620723724, "rewards/GIT": 0.5936710834503174, "rewards/HPSv2": 0.29288768768310547, "step": 245 }, { "completion_length": 121.6640625, "epoch": 0.5795053003533569, "grad_norm": 0.3105904241663103, "kl": 0.01769427675753832, "learning_rate": 8.462499999999999e-07, "loss": 0.01769427675753832, "reward": 1.355324149131775, "reward_std": 0.16564738750457764, "rewards/GDino": 0.776468962430954, "rewards/GIT": 0.2915026396512985, "rewards/HPSv2": 0.2873525619506836, "step": 246 }, { "completion_length": 124.3515625, "epoch": 0.5818610129564193, "grad_norm": 1.0795038481778716, "kl": 0.07374152168631554, "learning_rate": 8.45625e-07, "loss": 0.07374152168631554, "reward": 1.3112455010414124, "reward_std": 0.19275472313165665, "rewards/GDino": 0.6675810515880585, "rewards/GIT": 0.3638182282447815, "rewards/HPSv2": 0.27984619140625, "step": 247 }, { "completion_length": 121.9765625, "epoch": 0.5842167255594818, "grad_norm": 0.4143955207606253, "kl": 0.045595334842801094, "learning_rate": 8.45e-07, "loss": 0.045595334842801094, "reward": 1.4827686548233032, "reward_std": 0.17120515555143356, "rewards/GDino": 0.7815067768096924, "rewards/GIT": 0.42086443305015564, "rewards/HPSv2": 0.2803974151611328, "step": 248 }, { "completion_length": 128.7890625, "epoch": 0.5865724381625441, "grad_norm": 1.095857495406444, "kl": 0.0283422963693738, "learning_rate": 8.44375e-07, "loss": 0.0283422963693738, "reward": 1.568875789642334, "reward_std": 0.16571104526519775, "rewards/GDino": 0.8299093842506409, "rewards/GIT": 0.44674052298069, "rewards/HPSv2": 0.29222583770751953, "step": 249 }, { "completion_length": 131.7734375, "epoch": 0.5889281507656066, "grad_norm": 0.3142583053164034, "kl": 0.048525091260671616, "learning_rate": 8.4375e-07, "loss": 0.048525091260671616, "reward": 1.6577069759368896, "reward_std": 0.17285022884607315, "rewards/GDino": 0.807335376739502, "rewards/GIT": 0.5771810114383698, "rewards/HPSv2": 0.2731904983520508, "step": 250 }, { "completion_length": 134.4375, "epoch": 0.591283863368669, "grad_norm": 0.722793920191808, "kl": 0.017548350617289543, "learning_rate": 8.43125e-07, "loss": 0.017548350617289543, "reward": 1.2278771996498108, "reward_std": 0.15594539046287537, "rewards/GDino": 0.6785571277141571, "rewards/GIT": 0.2714288979768753, "rewards/HPSv2": 0.2778911590576172, "step": 251 }, { "completion_length": 125.609375, "epoch": 0.5936395759717314, "grad_norm": 0.6243427713378089, "kl": 0.0763181783258915, "learning_rate": 8.425e-07, "loss": 0.0763181783258915, "reward": 1.518773078918457, "reward_std": 0.21261358261108398, "rewards/GDino": 0.7321556806564331, "rewards/GIT": 0.5098534673452377, "rewards/HPSv2": 0.276763916015625, "step": 252 }, { "completion_length": 122.609375, "epoch": 0.5959952885747939, "grad_norm": 0.5115850951705847, "kl": 0.027194509282708168, "learning_rate": 8.41875e-07, "loss": 0.027194509282708168, "reward": 1.6575953960418701, "reward_std": 0.16597223281860352, "rewards/GDino": 0.7910813987255096, "rewards/GIT": 0.5919577777385712, "rewards/HPSv2": 0.27455615997314453, "step": 253 }, { "completion_length": 128.96875, "epoch": 0.5983510011778563, "grad_norm": 0.323790847459248, "kl": 0.025116185657680035, "learning_rate": 8.4125e-07, "loss": 0.025116185657680035, "reward": 1.5575677752494812, "reward_std": 0.15343696624040604, "rewards/GDino": 0.7786457538604736, "rewards/GIT": 0.48442354798316956, "rewards/HPSv2": 0.2944984436035156, "step": 254 }, { "completion_length": 130.640625, "epoch": 0.6007067137809188, "grad_norm": 0.2691474503231475, "kl": 0.038111958652734756, "learning_rate": 8.406249999999999e-07, "loss": 0.038111958652734756, "reward": 1.517290472984314, "reward_std": 0.17402203381061554, "rewards/GDino": 0.7649014294147491, "rewards/GIT": 0.4595947861671448, "rewards/HPSv2": 0.29279422760009766, "step": 255 }, { "completion_length": 132.203125, "epoch": 0.6030624263839811, "grad_norm": 0.46478677119161066, "kl": 0.04139411635696888, "learning_rate": 8.399999999999999e-07, "loss": 0.04139411635696888, "reward": 1.4776598811149597, "reward_std": 0.18293121084570885, "rewards/GDino": 0.7710234820842743, "rewards/GIT": 0.42940231412649155, "rewards/HPSv2": 0.2772340774536133, "step": 256 }, { "completion_length": 122.2265625, "epoch": 0.6054181389870436, "grad_norm": 0.30425273760138677, "kl": 0.017129461281001568, "learning_rate": 8.393749999999999e-07, "loss": 0.017129461281001568, "reward": 1.5007814764976501, "reward_std": 0.15645697712898254, "rewards/GDino": 0.7433582246303558, "rewards/GIT": 0.47152310609817505, "rewards/HPSv2": 0.2859001159667969, "step": 257 }, { "completion_length": 122.9609375, "epoch": 0.607773851590106, "grad_norm": 0.44424249817786293, "kl": 0.028568798676133156, "learning_rate": 8.387499999999999e-07, "loss": 0.028568798676133156, "reward": 1.4305506348609924, "reward_std": 0.19410353899002075, "rewards/GDino": 0.7182663083076477, "rewards/GIT": 0.43983571231365204, "rewards/HPSv2": 0.2724485397338867, "step": 258 }, { "completion_length": 128.8515625, "epoch": 0.6101295641931684, "grad_norm": 0.32319732090828796, "kl": 0.015055303927510977, "learning_rate": 8.38125e-07, "loss": 0.015055303927510977, "reward": 1.318528175354004, "reward_std": 0.2092847004532814, "rewards/GDino": 0.6869865953922272, "rewards/GIT": 0.3488219380378723, "rewards/HPSv2": 0.28271961212158203, "step": 259 }, { "completion_length": 117.3984375, "epoch": 0.6124852767962309, "grad_norm": 0.6308103849010092, "kl": 0.01874847151339054, "learning_rate": 8.375e-07, "loss": 0.01874847151339054, "reward": 1.6463790535926819, "reward_std": 0.0990103967487812, "rewards/GDino": 0.819106936454773, "rewards/GIT": 0.5489355176687241, "rewards/HPSv2": 0.2783365249633789, "step": 260 }, { "completion_length": 129.5703125, "epoch": 0.6148409893992933, "grad_norm": 0.2677727233847472, "kl": 0.017937992699444294, "learning_rate": 8.36875e-07, "loss": 0.017937992699444294, "reward": 1.32936292886734, "reward_std": 0.1529393121600151, "rewards/GDino": 0.690763533115387, "rewards/GIT": 0.3534802794456482, "rewards/HPSv2": 0.28511905670166016, "step": 261 }, { "completion_length": 130.5546875, "epoch": 0.6171967020023557, "grad_norm": 0.3939129487068769, "kl": 0.02008101623505354, "learning_rate": 8.3625e-07, "loss": 0.02008101623505354, "reward": 1.4631484746932983, "reward_std": 0.1491515040397644, "rewards/GDino": 0.7680418193340302, "rewards/GIT": 0.4038115590810776, "rewards/HPSv2": 0.29129505157470703, "step": 262 }, { "completion_length": 129.015625, "epoch": 0.6195524146054181, "grad_norm": 0.25302205448647075, "kl": 0.030262595042586327, "learning_rate": 8.356249999999999e-07, "loss": 0.030262595042586327, "reward": 1.3339073657989502, "reward_std": 0.24874365329742432, "rewards/GDino": 0.7429611682891846, "rewards/GIT": 0.3248061239719391, "rewards/HPSv2": 0.2661399841308594, "step": 263 }, { "completion_length": 129.2421875, "epoch": 0.6219081272084805, "grad_norm": 0.38507039766055123, "kl": 0.02732150536030531, "learning_rate": 8.349999999999999e-07, "loss": 0.02732150536030531, "reward": 1.4871517419815063, "reward_std": 0.13652398437261581, "rewards/GDino": 0.779392421245575, "rewards/GIT": 0.41419248282909393, "rewards/HPSv2": 0.2935667037963867, "step": 264 }, { "completion_length": 126.5234375, "epoch": 0.624263839811543, "grad_norm": 0.2748281538776829, "kl": 0.018198583275079727, "learning_rate": 8.34375e-07, "loss": 0.018198583275079727, "reward": 1.4633320569992065, "reward_std": 0.1814088448882103, "rewards/GDino": 0.7637252509593964, "rewards/GIT": 0.43617141246795654, "rewards/HPSv2": 0.26343536376953125, "step": 265 }, { "completion_length": 129.0234375, "epoch": 0.6266195524146054, "grad_norm": 0.47147777316193407, "kl": 0.034708138555288315, "learning_rate": 8.3375e-07, "loss": 0.034708138555288315, "reward": 1.4065152406692505, "reward_std": 0.21177636831998825, "rewards/GDino": 0.7402100563049316, "rewards/GIT": 0.38560643792152405, "rewards/HPSv2": 0.2806987762451172, "step": 266 }, { "completion_length": 131.6875, "epoch": 0.6289752650176679, "grad_norm": 0.4805748734935267, "kl": 0.03413896542042494, "learning_rate": 8.33125e-07, "loss": 0.03413896542042494, "reward": 1.4293612241744995, "reward_std": 0.1300058737397194, "rewards/GDino": 0.7890754342079163, "rewards/GIT": 0.34871795773506165, "rewards/HPSv2": 0.2915678024291992, "step": 267 }, { "completion_length": 129.6328125, "epoch": 0.6313309776207303, "grad_norm": 0.4891156470991335, "kl": 0.0318872407078743, "learning_rate": 8.325e-07, "loss": 0.0318872407078743, "reward": 1.5250497460365295, "reward_std": 0.2033407837152481, "rewards/GDino": 0.7283053696155548, "rewards/GIT": 0.5190324932336807, "rewards/HPSv2": 0.2777118682861328, "step": 268 }, { "completion_length": 125.7890625, "epoch": 0.6336866902237926, "grad_norm": 0.29253898480065377, "kl": 0.017048795707523823, "learning_rate": 8.31875e-07, "loss": 0.017048795707523823, "reward": 1.3189342021942139, "reward_std": 0.17149540781974792, "rewards/GDino": 0.6853411197662354, "rewards/GIT": 0.3642010912299156, "rewards/HPSv2": 0.2693920135498047, "step": 269 }, { "completion_length": 125.7265625, "epoch": 0.6360424028268551, "grad_norm": 0.30072941490627514, "kl": 0.054598864167928696, "learning_rate": 8.3125e-07, "loss": 0.054598864167928696, "reward": 1.140231966972351, "reward_std": 0.17673533409833908, "rewards/GDino": 0.6172873973846436, "rewards/GIT": 0.22749242186546326, "rewards/HPSv2": 0.2954521179199219, "step": 270 }, { "completion_length": 123.4921875, "epoch": 0.6383981154299175, "grad_norm": 1.12120090038295, "kl": 0.048818960785865784, "learning_rate": 8.306249999999999e-07, "loss": 0.048818960785865784, "reward": 1.3542758226394653, "reward_std": 0.15825599431991577, "rewards/GDino": 0.7784483432769775, "rewards/GIT": 0.28135665506124496, "rewards/HPSv2": 0.29447078704833984, "step": 271 }, { "completion_length": 122.0234375, "epoch": 0.64075382803298, "grad_norm": 0.30697322309072694, "kl": 0.04006337374448776, "learning_rate": 8.299999999999999e-07, "loss": 0.04006337374448776, "reward": 1.4612534046173096, "reward_std": 0.17510753870010376, "rewards/GDino": 0.7369506061077118, "rewards/GIT": 0.4453616142272949, "rewards/HPSv2": 0.27894115447998047, "step": 272 }, { "completion_length": 127.921875, "epoch": 0.6431095406360424, "grad_norm": 0.3103959932118932, "kl": 0.018673130311071873, "learning_rate": 8.293749999999999e-07, "loss": 0.018673130311071873, "reward": 1.1873598098754883, "reward_std": 0.20136252790689468, "rewards/GDino": 0.6606672704219818, "rewards/GIT": 0.239786297082901, "rewards/HPSv2": 0.28690624237060547, "step": 273 }, { "completion_length": 126.1796875, "epoch": 0.6454652532391049, "grad_norm": 0.26970431693536967, "kl": 0.012459523044526577, "learning_rate": 8.287499999999999e-07, "loss": 0.012459523044526577, "reward": 1.2887164950370789, "reward_std": 0.20671743154525757, "rewards/GDino": 0.6966780126094818, "rewards/GIT": 0.31509433686733246, "rewards/HPSv2": 0.2769441604614258, "step": 274 }, { "completion_length": 121.4296875, "epoch": 0.6478209658421673, "grad_norm": 0.361648655837427, "kl": 0.02516191452741623, "learning_rate": 8.28125e-07, "loss": 0.02516191452741623, "reward": 1.6976540088653564, "reward_std": 0.14942985773086548, "rewards/GDino": 0.8503040671348572, "rewards/GIT": 0.5559062063694, "rewards/HPSv2": 0.2914438247680664, "step": 275 }, { "completion_length": 124.7734375, "epoch": 0.6501766784452296, "grad_norm": 0.36568998702821315, "kl": 0.021649268455803394, "learning_rate": 8.275e-07, "loss": 0.021649268455803394, "reward": 1.5425922274589539, "reward_std": 0.14775551110506058, "rewards/GDino": 0.7530822157859802, "rewards/GIT": 0.5137597620487213, "rewards/HPSv2": 0.27575016021728516, "step": 276 }, { "completion_length": 126.265625, "epoch": 0.6525323910482921, "grad_norm": 0.24779157001319851, "kl": 0.02461074758321047, "learning_rate": 8.26875e-07, "loss": 0.02461074758321047, "reward": 1.3401646614074707, "reward_std": 0.14307990670204163, "rewards/GDino": 0.7169491350650787, "rewards/GIT": 0.33443616330623627, "rewards/HPSv2": 0.28877925872802734, "step": 277 }, { "completion_length": 122.3671875, "epoch": 0.6548881036513545, "grad_norm": 0.33233779166330585, "kl": 0.03161661606281996, "learning_rate": 8.2625e-07, "loss": 0.03161661606281996, "reward": 1.3815143704414368, "reward_std": 0.16440695524215698, "rewards/GDino": 0.694475382566452, "rewards/GIT": 0.40984585881233215, "rewards/HPSv2": 0.2771930694580078, "step": 278 }, { "completion_length": 124.1015625, "epoch": 0.657243816254417, "grad_norm": 0.42382819165470265, "kl": 0.022010349668562412, "learning_rate": 8.25625e-07, "loss": 0.022010349668562412, "reward": 1.176303207874298, "reward_std": 0.15697185695171356, "rewards/GDino": 0.6775559484958649, "rewards/GIT": 0.2079223245382309, "rewards/HPSv2": 0.29082489013671875, "step": 279 }, { "completion_length": 129.4609375, "epoch": 0.6595995288574794, "grad_norm": 0.3861032776143577, "kl": 0.014706038404256105, "learning_rate": 8.249999999999999e-07, "loss": 0.014706038404256105, "reward": 1.4857875108718872, "reward_std": 0.2214314043521881, "rewards/GDino": 0.7619541883468628, "rewards/GIT": 0.4389135539531708, "rewards/HPSv2": 0.28491973876953125, "step": 280 }, { "completion_length": 121.375, "epoch": 0.6619552414605419, "grad_norm": 0.29390819924312994, "kl": 0.022826142609119415, "learning_rate": 8.243749999999999e-07, "loss": 0.022826142609119415, "reward": 1.548025131225586, "reward_std": 0.11385249346494675, "rewards/GDino": 0.811306357383728, "rewards/GIT": 0.4582858681678772, "rewards/HPSv2": 0.27843284606933594, "step": 281 }, { "completion_length": 128.0, "epoch": 0.6643109540636042, "grad_norm": 0.5531971361146103, "kl": 0.014700056985020638, "learning_rate": 8.2375e-07, "loss": 0.014700056985020638, "reward": 1.3034098148345947, "reward_std": 0.17472809553146362, "rewards/GDino": 0.713548481464386, "rewards/GIT": 0.30822892487049103, "rewards/HPSv2": 0.2816324234008789, "step": 282 }, { "completion_length": 128.8828125, "epoch": 0.6666666666666666, "grad_norm": 0.37869260117966363, "kl": 0.017631348222494125, "learning_rate": 8.23125e-07, "loss": 0.017631348222494125, "reward": 1.6525904536247253, "reward_std": 0.15039529651403427, "rewards/GDino": 0.8716060519218445, "rewards/GIT": 0.4864230155944824, "rewards/HPSv2": 0.29456138610839844, "step": 283 }, { "completion_length": 130.5859375, "epoch": 0.6690223792697291, "grad_norm": 0.42064337013208775, "kl": 0.04872806742787361, "learning_rate": 8.225e-07, "loss": 0.04872806742787361, "reward": 1.3578343987464905, "reward_std": 0.1975974291563034, "rewards/GDino": 0.7317111790180206, "rewards/GIT": 0.3480231761932373, "rewards/HPSv2": 0.27810001373291016, "step": 284 }, { "completion_length": 119.8671875, "epoch": 0.6713780918727915, "grad_norm": 0.375751097460708, "kl": 0.017903264611959457, "learning_rate": 8.21875e-07, "loss": 0.017903264611959457, "reward": 1.5878105163574219, "reward_std": 0.14192687720060349, "rewards/GDino": 0.8087341487407684, "rewards/GIT": 0.49808187782764435, "rewards/HPSv2": 0.2809944152832031, "step": 285 }, { "completion_length": 124.53125, "epoch": 0.673733804475854, "grad_norm": 0.4416671510375221, "kl": 0.025637196376919746, "learning_rate": 8.2125e-07, "loss": 0.025637196376919746, "reward": 1.520736575126648, "reward_std": 0.15493131428956985, "rewards/GDino": 0.7712737321853638, "rewards/GIT": 0.4587991386651993, "rewards/HPSv2": 0.2906637191772461, "step": 286 }, { "completion_length": 126.4296875, "epoch": 0.6760895170789164, "grad_norm": 0.33983947068777504, "kl": 0.045718420296907425, "learning_rate": 8.20625e-07, "loss": 0.045718420296907425, "reward": 1.6930556297302246, "reward_std": 0.13790836930274963, "rewards/GDino": 0.8445090055465698, "rewards/GIT": 0.5575329065322876, "rewards/HPSv2": 0.2910137176513672, "step": 287 }, { "completion_length": 119.8984375, "epoch": 0.6784452296819788, "grad_norm": 0.3887473530613621, "kl": 0.03832306992262602, "learning_rate": 8.199999999999999e-07, "loss": 0.03832306992262602, "reward": 1.5662216544151306, "reward_std": 0.13692811876535416, "rewards/GDino": 0.8552664518356323, "rewards/GIT": 0.4292779564857483, "rewards/HPSv2": 0.28167724609375, "step": 288 }, { "completion_length": 130.421875, "epoch": 0.6808009422850412, "grad_norm": 0.43018640813325887, "kl": 0.028278278186917305, "learning_rate": 8.193749999999999e-07, "loss": 0.028278278186917305, "reward": 1.3263012766838074, "reward_std": 0.1696128249168396, "rewards/GDino": 0.7353622913360596, "rewards/GIT": 0.3079628497362137, "rewards/HPSv2": 0.2829761505126953, "step": 289 }, { "completion_length": 122.5859375, "epoch": 0.6831566548881036, "grad_norm": 1.0724607460032154, "kl": 0.053646642714738846, "learning_rate": 8.187499999999999e-07, "loss": 0.053646642714738846, "reward": 1.4874740839004517, "reward_std": 0.13784275949001312, "rewards/GDino": 0.7356436550617218, "rewards/GIT": 0.44985993206501007, "rewards/HPSv2": 0.3019704818725586, "step": 290 }, { "completion_length": 131.234375, "epoch": 0.6855123674911661, "grad_norm": 0.3018539534734603, "kl": 0.01635696180164814, "learning_rate": 8.18125e-07, "loss": 0.01635696180164814, "reward": 1.512361764907837, "reward_std": 0.1697377860546112, "rewards/GDino": 0.7273116111755371, "rewards/GIT": 0.5127532929182053, "rewards/HPSv2": 0.2722969055175781, "step": 291 }, { "completion_length": 128.5625, "epoch": 0.6878680800942285, "grad_norm": 0.5431524876032257, "kl": 0.058600494638085365, "learning_rate": 8.175e-07, "loss": 0.058600494638085365, "reward": 1.4146356582641602, "reward_std": 0.2222433015704155, "rewards/GDino": 0.7245434820652008, "rewards/GIT": 0.41342371702194214, "rewards/HPSv2": 0.2766685485839844, "step": 292 }, { "completion_length": 126.5234375, "epoch": 0.690223792697291, "grad_norm": 0.3315612300626071, "kl": 0.0683399960398674, "learning_rate": 8.16875e-07, "loss": 0.0683399960398674, "reward": 1.4038465023040771, "reward_std": 0.17259612679481506, "rewards/GDino": 0.7010475099086761, "rewards/GIT": 0.42341622710227966, "rewards/HPSv2": 0.27938270568847656, "step": 293 }, { "completion_length": 123.21875, "epoch": 0.6925795053003534, "grad_norm": 0.47422150208213343, "kl": 0.02748813945800066, "learning_rate": 8.1625e-07, "loss": 0.02748813945800066, "reward": 1.421975016593933, "reward_std": 0.13820461928844452, "rewards/GDino": 0.7338467836380005, "rewards/GIT": 0.40739427506923676, "rewards/HPSv2": 0.2807340621948242, "step": 294 }, { "completion_length": 125.546875, "epoch": 0.6949352179034158, "grad_norm": 0.3325057050553565, "kl": 0.03135174512863159, "learning_rate": 8.15625e-07, "loss": 0.03135174512863159, "reward": 1.4006428122520447, "reward_std": 0.15386375784873962, "rewards/GDino": 0.710864394903183, "rewards/GIT": 0.41147609800100327, "rewards/HPSv2": 0.2783021926879883, "step": 295 }, { "completion_length": 130.828125, "epoch": 0.6972909305064782, "grad_norm": 0.3826143816046585, "kl": 0.06278537772595882, "learning_rate": 8.149999999999999e-07, "loss": 0.06278537772595882, "reward": 1.4086037278175354, "reward_std": 0.17883968353271484, "rewards/GDino": 0.7359591126441956, "rewards/GIT": 0.39369482919573784, "rewards/HPSv2": 0.2789497375488281, "step": 296 }, { "completion_length": 128.390625, "epoch": 0.6996466431095406, "grad_norm": 1.0668752501973389, "kl": 0.047636556439101696, "learning_rate": 8.143749999999999e-07, "loss": 0.047636556439101696, "reward": 1.7043591737747192, "reward_std": 0.15156446769833565, "rewards/GDino": 0.8310354351997375, "rewards/GIT": 0.5944264978170395, "rewards/HPSv2": 0.2788972854614258, "step": 297 }, { "completion_length": 128.3359375, "epoch": 0.702002355712603, "grad_norm": 0.29636307335595286, "kl": 0.02888335194438696, "learning_rate": 8.137499999999999e-07, "loss": 0.02888335194438696, "reward": 1.3975404500961304, "reward_std": 0.1780788078904152, "rewards/GDino": 0.7274096608161926, "rewards/GIT": 0.38422691822052, "rewards/HPSv2": 0.2859039306640625, "step": 298 }, { "completion_length": 130.828125, "epoch": 0.7043580683156655, "grad_norm": 0.7985587958471299, "kl": 0.033760842867195606, "learning_rate": 8.131249999999999e-07, "loss": 0.033760842867195606, "reward": 1.4403738975524902, "reward_std": 0.16354098916053772, "rewards/GDino": 0.7428195178508759, "rewards/GIT": 0.4228588938713074, "rewards/HPSv2": 0.27469539642333984, "step": 299 }, { "completion_length": 124.9453125, "epoch": 0.7067137809187279, "grad_norm": 0.3040139852906523, "kl": 0.018305689096450806, "learning_rate": 8.125e-07, "loss": 0.018305689096450806, "reward": 1.5731314420700073, "reward_std": 0.17271725088357925, "rewards/GDino": 0.8145833611488342, "rewards/GIT": 0.4792007505893707, "rewards/HPSv2": 0.27934741973876953, "step": 300 }, { "completion_length": 132.6171875, "epoch": 0.7090694935217904, "grad_norm": 0.44230807433290703, "kl": 0.024274067021906376, "learning_rate": 8.11875e-07, "loss": 0.024274067021906376, "reward": 1.6243597865104675, "reward_std": 0.1482085883617401, "rewards/GDino": 0.7930316030979156, "rewards/GIT": 0.5462501496076584, "rewards/HPSv2": 0.2850780487060547, "step": 301 }, { "completion_length": 127.9296875, "epoch": 0.7114252061248527, "grad_norm": 0.6517899299761609, "kl": 0.034833348356187344, "learning_rate": 8.1125e-07, "loss": 0.034833348356187344, "reward": 1.6197678446769714, "reward_std": 0.15246256440877914, "rewards/GDino": 0.7825698554515839, "rewards/GIT": 0.5587870180606842, "rewards/HPSv2": 0.2784109115600586, "step": 302 }, { "completion_length": 127.6484375, "epoch": 0.7137809187279152, "grad_norm": 0.4150829751902694, "kl": 0.05362547188997269, "learning_rate": 8.10625e-07, "loss": 0.05362547188997269, "reward": 1.3537341356277466, "reward_std": 0.15590070933103561, "rewards/GDino": 0.6993246376514435, "rewards/GIT": 0.37451179325580597, "rewards/HPSv2": 0.27989768981933594, "step": 303 }, { "completion_length": 123.1484375, "epoch": 0.7161366313309776, "grad_norm": 1.526203565774157, "kl": 0.03947442816570401, "learning_rate": 8.1e-07, "loss": 0.03947442816570401, "reward": 1.4447040557861328, "reward_std": 0.1714784875512123, "rewards/GDino": 0.7440004348754883, "rewards/GIT": 0.42542940378189087, "rewards/HPSv2": 0.27527427673339844, "step": 304 }, { "completion_length": 126.1328125, "epoch": 0.71849234393404, "grad_norm": 0.3458559504937369, "kl": 0.03534231334924698, "learning_rate": 8.093749999999999e-07, "loss": 0.03534231334924698, "reward": 1.689622402191162, "reward_std": 0.20265092700719833, "rewards/GDino": 0.8389671146869659, "rewards/GIT": 0.5767028033733368, "rewards/HPSv2": 0.2739524841308594, "step": 305 }, { "completion_length": 124.0, "epoch": 0.7208480565371025, "grad_norm": 0.6968702501071533, "kl": 0.04150471044704318, "learning_rate": 8.087499999999999e-07, "loss": 0.04150471044704318, "reward": 1.5528291463851929, "reward_std": 0.17901183664798737, "rewards/GDino": 0.7798742651939392, "rewards/GIT": 0.48020070791244507, "rewards/HPSv2": 0.2927541732788086, "step": 306 }, { "completion_length": 122.484375, "epoch": 0.7232037691401649, "grad_norm": 1.8954832950648015, "kl": 0.056108953431248665, "learning_rate": 8.08125e-07, "loss": 0.056108953431248665, "reward": 1.7013561725616455, "reward_std": 0.08635647594928741, "rewards/GDino": 0.8702483773231506, "rewards/GIT": 0.5473934710025787, "rewards/HPSv2": 0.28371429443359375, "step": 307 }, { "completion_length": 131.9453125, "epoch": 0.7255594817432274, "grad_norm": 0.34052132578794897, "kl": 0.06209992431104183, "learning_rate": 8.075e-07, "loss": 0.06209992431104183, "reward": 1.3329508900642395, "reward_std": 0.18270864337682724, "rewards/GDino": 0.7122637033462524, "rewards/GIT": 0.3322216421365738, "rewards/HPSv2": 0.2884654998779297, "step": 308 }, { "completion_length": 122.640625, "epoch": 0.7279151943462897, "grad_norm": 0.4298971152725154, "kl": 0.039235640317201614, "learning_rate": 8.06875e-07, "loss": 0.039235640317201614, "reward": 1.4676252007484436, "reward_std": 0.15264073759317398, "rewards/GDino": 0.7305140793323517, "rewards/GIT": 0.45672985911369324, "rewards/HPSv2": 0.2803812026977539, "step": 309 }, { "completion_length": 127.4296875, "epoch": 0.7302709069493521, "grad_norm": 0.35057040505872766, "kl": 0.0198000930249691, "learning_rate": 8.0625e-07, "loss": 0.0198000930249691, "reward": 1.3383066058158875, "reward_std": 0.20110344141721725, "rewards/GDino": 0.7184711396694183, "rewards/GIT": 0.3435465395450592, "rewards/HPSv2": 0.2762889862060547, "step": 310 }, { "completion_length": 122.5390625, "epoch": 0.7326266195524146, "grad_norm": 0.24718707837774398, "kl": 0.04158630222082138, "learning_rate": 8.05625e-07, "loss": 0.04158630222082138, "reward": 1.453433096408844, "reward_std": 0.15765993297100067, "rewards/GDino": 0.7990122139453888, "rewards/GIT": 0.3700161278247833, "rewards/HPSv2": 0.2844047546386719, "step": 311 }, { "completion_length": 127.125, "epoch": 0.734982332155477, "grad_norm": 0.30584031040781096, "kl": 0.02670707367360592, "learning_rate": 8.05e-07, "loss": 0.02670707367360592, "reward": 1.733988642692566, "reward_std": 0.18975289911031723, "rewards/GDino": 0.865786224603653, "rewards/GIT": 0.569219708442688, "rewards/HPSv2": 0.2989826202392578, "step": 312 }, { "completion_length": 126.1875, "epoch": 0.7373380447585395, "grad_norm": 0.29529733641376416, "kl": 0.015993456356227398, "learning_rate": 8.043749999999999e-07, "loss": 0.015993456356227398, "reward": 1.297784149646759, "reward_std": 0.21640415489673615, "rewards/GDino": 0.7619137167930603, "rewards/GIT": 0.24188021570444107, "rewards/HPSv2": 0.2939901351928711, "step": 313 }, { "completion_length": 125.9140625, "epoch": 0.7396937573616019, "grad_norm": 0.5260827047349452, "kl": 0.018631361424922943, "learning_rate": 8.037499999999999e-07, "loss": 0.018631361424922943, "reward": 1.55229914188385, "reward_std": 0.1562209576368332, "rewards/GDino": 0.7318809330463409, "rewards/GIT": 0.5370119512081146, "rewards/HPSv2": 0.28340625762939453, "step": 314 }, { "completion_length": 121.3671875, "epoch": 0.7420494699646644, "grad_norm": 0.39927594427471297, "kl": 0.015232020057737827, "learning_rate": 8.031249999999999e-07, "loss": 0.015232020057737827, "reward": 1.400625467300415, "reward_std": 0.19370374828577042, "rewards/GDino": 0.7292725145816803, "rewards/GIT": 0.3855634480714798, "rewards/HPSv2": 0.28578948974609375, "step": 315 }, { "completion_length": 130.4765625, "epoch": 0.7444051825677267, "grad_norm": 0.4528515579435641, "kl": 0.032600946724414825, "learning_rate": 8.024999999999999e-07, "loss": 0.032600946724414825, "reward": 1.513465702533722, "reward_std": 0.11191963031888008, "rewards/GDino": 0.7798548340797424, "rewards/GIT": 0.4418989270925522, "rewards/HPSv2": 0.29171180725097656, "step": 316 }, { "completion_length": 121.1875, "epoch": 0.7467608951707891, "grad_norm": 0.35190500977815165, "kl": 0.03599115740507841, "learning_rate": 8.018749999999999e-07, "loss": 0.03599115740507841, "reward": 1.6221016645431519, "reward_std": 0.18038246780633926, "rewards/GDino": 0.8498950004577637, "rewards/GIT": 0.48536522686481476, "rewards/HPSv2": 0.28684139251708984, "step": 317 }, { "completion_length": 136.109375, "epoch": 0.7491166077738516, "grad_norm": 0.3537017173411738, "kl": 0.014967352151870728, "learning_rate": 8.0125e-07, "loss": 0.014967352151870728, "reward": 1.338509440422058, "reward_std": 0.16818224638700485, "rewards/GDino": 0.6558979153633118, "rewards/GIT": 0.3935062438249588, "rewards/HPSv2": 0.2891054153442383, "step": 318 }, { "completion_length": 124.75, "epoch": 0.751472320376914, "grad_norm": 0.3883580786656669, "kl": 0.03356989845633507, "learning_rate": 8.00625e-07, "loss": 0.03356989845633507, "reward": 1.4287025332450867, "reward_std": 0.17245692759752274, "rewards/GDino": 0.7606414556503296, "rewards/GIT": 0.38464997708797455, "rewards/HPSv2": 0.28341102600097656, "step": 319 }, { "completion_length": 125.6796875, "epoch": 0.7538280329799765, "grad_norm": 0.4589428587658747, "kl": 0.02635353058576584, "learning_rate": 8e-07, "loss": 0.02635353058576584, "reward": 1.2789124846458435, "reward_std": 0.1678721085190773, "rewards/GDino": 0.6663730442523956, "rewards/GIT": 0.3192245066165924, "rewards/HPSv2": 0.29331493377685547, "step": 320 }, { "completion_length": 130.4921875, "epoch": 0.7561837455830389, "grad_norm": 0.8713108303725687, "kl": 0.04808354936540127, "learning_rate": 7.993749999999999e-07, "loss": 0.04808354936540127, "reward": 1.1586462259292603, "reward_std": 0.1770206168293953, "rewards/GDino": 0.6388220489025116, "rewards/GIT": 0.23627091944217682, "rewards/HPSv2": 0.2835531234741211, "step": 321 }, { "completion_length": 125.609375, "epoch": 0.7585394581861012, "grad_norm": 0.38432189521486504, "kl": 0.034217922016978264, "learning_rate": 7.9875e-07, "loss": 0.034217922016978264, "reward": 1.5119401216506958, "reward_std": 0.19607888162136078, "rewards/GDino": 0.8218662738800049, "rewards/GIT": 0.3996170163154602, "rewards/HPSv2": 0.29045677185058594, "step": 322 }, { "completion_length": 128.9140625, "epoch": 0.7608951707891637, "grad_norm": 0.3443308775084797, "kl": 0.01797491032630205, "learning_rate": 7.98125e-07, "loss": 0.01797491032630205, "reward": 1.2356206178665161, "reward_std": 0.17161127924919128, "rewards/GDino": 0.685572475194931, "rewards/GIT": 0.2547934502363205, "rewards/HPSv2": 0.2952547073364258, "step": 323 }, { "completion_length": 125.4921875, "epoch": 0.7632508833922261, "grad_norm": 0.5186692483053706, "kl": 0.02349786553531885, "learning_rate": 7.975e-07, "loss": 0.02349786553531885, "reward": 1.4223875999450684, "reward_std": 0.189034566283226, "rewards/GDino": 0.7490924000740051, "rewards/GIT": 0.39170558750629425, "rewards/HPSv2": 0.2815895080566406, "step": 324 }, { "completion_length": 121.1953125, "epoch": 0.7656065959952886, "grad_norm": 1.0327983124188465, "kl": 0.048919547349214554, "learning_rate": 7.96875e-07, "loss": 0.048919547349214554, "reward": 1.4143193364143372, "reward_std": 0.14530813321471214, "rewards/GDino": 0.719795286655426, "rewards/GIT": 0.4007302224636078, "rewards/HPSv2": 0.2937936782836914, "step": 325 }, { "completion_length": 134.7578125, "epoch": 0.767962308598351, "grad_norm": 0.6272047833757223, "kl": 0.047528428956866264, "learning_rate": 7.9625e-07, "loss": 0.047528428956866264, "reward": 1.4865314364433289, "reward_std": 0.2055855393409729, "rewards/GDino": 0.7754060328006744, "rewards/GIT": 0.42640019953250885, "rewards/HPSv2": 0.2847251892089844, "step": 326 }, { "completion_length": 123.359375, "epoch": 0.7703180212014135, "grad_norm": 0.9138779953562122, "kl": 0.06359554827213287, "learning_rate": 7.95625e-07, "loss": 0.06359554827213287, "reward": 1.6315770745277405, "reward_std": 0.15016406029462814, "rewards/GDino": 0.8327396512031555, "rewards/GIT": 0.5174653679132462, "rewards/HPSv2": 0.2813720703125, "step": 327 }, { "completion_length": 129.03125, "epoch": 0.7726737338044759, "grad_norm": 0.377937662835072, "kl": 0.024591203778982162, "learning_rate": 7.95e-07, "loss": 0.024591203778982162, "reward": 1.5386335849761963, "reward_std": 0.16554662585258484, "rewards/GDino": 0.8118350505828857, "rewards/GIT": 0.44174613058567047, "rewards/HPSv2": 0.2850522994995117, "step": 328 }, { "completion_length": 126.6328125, "epoch": 0.7750294464075382, "grad_norm": 0.30132622661035485, "kl": 0.05170280672609806, "learning_rate": 7.94375e-07, "loss": 0.05170280672609806, "reward": 1.279753863811493, "reward_std": 0.2041497603058815, "rewards/GDino": 0.7125385701656342, "rewards/GIT": 0.2759450450539589, "rewards/HPSv2": 0.29127025604248047, "step": 329 }, { "completion_length": 121.796875, "epoch": 0.7773851590106007, "grad_norm": 0.3010858862449583, "kl": 0.03824125323444605, "learning_rate": 7.937499999999999e-07, "loss": 0.03824125323444605, "reward": 1.5684179067611694, "reward_std": 0.1886109784245491, "rewards/GDino": 0.7699853777885437, "rewards/GIT": 0.5146515071392059, "rewards/HPSv2": 0.2837810516357422, "step": 330 }, { "completion_length": 130.4140625, "epoch": 0.7797408716136631, "grad_norm": 0.5154264590626827, "kl": 0.023881498724222183, "learning_rate": 7.931249999999999e-07, "loss": 0.023881498724222183, "reward": 1.5115928649902344, "reward_std": 0.15255264937877655, "rewards/GDino": 0.7645035684108734, "rewards/GIT": 0.4607924073934555, "rewards/HPSv2": 0.2862968444824219, "step": 331 }, { "completion_length": 132.3671875, "epoch": 0.7820965842167256, "grad_norm": 0.4391377562665786, "kl": 0.026975923217833042, "learning_rate": 7.924999999999999e-07, "loss": 0.026975923217833042, "reward": 1.2677489817142487, "reward_std": 0.18446628004312515, "rewards/GDino": 0.653992235660553, "rewards/GIT": 0.32070299983024597, "rewards/HPSv2": 0.29305362701416016, "step": 332 }, { "completion_length": 131.8046875, "epoch": 0.784452296819788, "grad_norm": 0.6070994258489568, "kl": 0.04964772891253233, "learning_rate": 7.918749999999999e-07, "loss": 0.04964772891253233, "reward": 1.675797700881958, "reward_std": 0.18104221671819687, "rewards/GDino": 0.7930756509304047, "rewards/GIT": 0.5956908464431763, "rewards/HPSv2": 0.2870311737060547, "step": 333 }, { "completion_length": 129.75, "epoch": 0.7868080094228505, "grad_norm": 0.6063182340989366, "kl": 0.034713102504611015, "learning_rate": 7.912499999999999e-07, "loss": 0.034713102504611015, "reward": 1.3531363010406494, "reward_std": 0.18428336828947067, "rewards/GDino": 0.6750190258026123, "rewards/GIT": 0.40720702707767487, "rewards/HPSv2": 0.27091026306152344, "step": 334 }, { "completion_length": 127.625, "epoch": 0.7891637220259128, "grad_norm": 0.3653581974216871, "kl": 0.017113614827394485, "learning_rate": 7.90625e-07, "loss": 0.017113614827394485, "reward": 1.6638911962509155, "reward_std": 0.1797097846865654, "rewards/GDino": 0.8094729781150818, "rewards/GIT": 0.5822576284408569, "rewards/HPSv2": 0.27216053009033203, "step": 335 }, { "completion_length": 134.09375, "epoch": 0.7915194346289752, "grad_norm": 0.3520544723799149, "kl": 0.014115944970399141, "learning_rate": 7.9e-07, "loss": 0.014115944970399141, "reward": 1.2900086045265198, "reward_std": 0.19174642115831375, "rewards/GDino": 0.6792336106300354, "rewards/GIT": 0.3366193622350693, "rewards/HPSv2": 0.2741556167602539, "step": 336 }, { "completion_length": 128.0, "epoch": 0.7938751472320377, "grad_norm": 0.31728826679686184, "kl": 0.015436351764947176, "learning_rate": 7.893750000000001e-07, "loss": 0.015436351764947176, "reward": 1.272620439529419, "reward_std": 0.09594709426164627, "rewards/GDino": 0.6984356641769409, "rewards/GIT": 0.30101902037858963, "rewards/HPSv2": 0.2731657028198242, "step": 337 }, { "completion_length": 127.46875, "epoch": 0.7962308598351001, "grad_norm": 0.3024558281129897, "kl": 0.03094123862683773, "learning_rate": 7.8875e-07, "loss": 0.03094123862683773, "reward": 1.6679672002792358, "reward_std": 0.18477828055620193, "rewards/GDino": 0.7926647067070007, "rewards/GIT": 0.5849192142486572, "rewards/HPSv2": 0.29038333892822266, "step": 338 }, { "completion_length": 131.90625, "epoch": 0.7985865724381626, "grad_norm": 0.28892462236281286, "kl": 0.03025410696864128, "learning_rate": 7.88125e-07, "loss": 0.03025410696864128, "reward": 1.5329428911209106, "reward_std": 0.18367067724466324, "rewards/GDino": 0.7539663016796112, "rewards/GIT": 0.4960414171218872, "rewards/HPSv2": 0.28293514251708984, "step": 339 }, { "completion_length": 126.7265625, "epoch": 0.800942285041225, "grad_norm": 0.44079418596113346, "kl": 0.01944844890385866, "learning_rate": 7.875e-07, "loss": 0.01944844890385866, "reward": 1.5138062834739685, "reward_std": 0.13654865324497223, "rewards/GDino": 0.80606609582901, "rewards/GIT": 0.4316324144601822, "rewards/HPSv2": 0.2761077880859375, "step": 340 }, { "completion_length": 130.4296875, "epoch": 0.8032979976442874, "grad_norm": 0.4638108039285803, "kl": 0.02957247756421566, "learning_rate": 7.86875e-07, "loss": 0.02957247756421566, "reward": 1.7366004586219788, "reward_std": 0.18700267374515533, "rewards/GDino": 0.8382346034049988, "rewards/GIT": 0.611309826374054, "rewards/HPSv2": 0.28705596923828125, "step": 341 }, { "completion_length": 129.015625, "epoch": 0.8056537102473498, "grad_norm": 0.2872824515025552, "kl": 0.024061542004346848, "learning_rate": 7.8625e-07, "loss": 0.024061542004346848, "reward": 1.4499781727790833, "reward_std": 0.12556719779968262, "rewards/GDino": 0.7729260623455048, "rewards/GIT": 0.3833164721727371, "rewards/HPSv2": 0.2937355041503906, "step": 342 }, { "completion_length": 124.25, "epoch": 0.8080094228504122, "grad_norm": 0.3640112361750856, "kl": 0.04077521711587906, "learning_rate": 7.85625e-07, "loss": 0.04077521711587906, "reward": 1.6527122259140015, "reward_std": 0.14589634537696838, "rewards/GDino": 0.8713693022727966, "rewards/GIT": 0.49846500158309937, "rewards/HPSv2": 0.28287792205810547, "step": 343 }, { "completion_length": 131.453125, "epoch": 0.8103651354534747, "grad_norm": 0.3302201230853207, "kl": 0.02199706621468067, "learning_rate": 7.85e-07, "loss": 0.02199706621468067, "reward": 1.4969403147697449, "reward_std": 0.1987384483218193, "rewards/GDino": 0.7694066762924194, "rewards/GIT": 0.4460585117340088, "rewards/HPSv2": 0.2814750671386719, "step": 344 }, { "completion_length": 132.3515625, "epoch": 0.8127208480565371, "grad_norm": 1.0443984027492745, "kl": 0.03796289023011923, "learning_rate": 7.84375e-07, "loss": 0.03796289023011923, "reward": 1.3811039328575134, "reward_std": 0.1574697494506836, "rewards/GDino": 0.7723546624183655, "rewards/GIT": 0.32235129177570343, "rewards/HPSv2": 0.28639793395996094, "step": 345 }, { "completion_length": 138.84375, "epoch": 0.8150765606595995, "grad_norm": 0.30404649853427185, "kl": 0.021234591491520405, "learning_rate": 7.837499999999999e-07, "loss": 0.021234591491520405, "reward": 1.4353033900260925, "reward_std": 0.18285365402698517, "rewards/GDino": 0.7163802683353424, "rewards/GIT": 0.43963292241096497, "rewards/HPSv2": 0.27929019927978516, "step": 346 }, { "completion_length": 126.96875, "epoch": 0.817432273262662, "grad_norm": 0.45436050807137557, "kl": 0.05188546422868967, "learning_rate": 7.831249999999999e-07, "loss": 0.05188546422868967, "reward": 1.4756303429603577, "reward_std": 0.13485369458794594, "rewards/GDino": 0.7830187678337097, "rewards/GIT": 0.41845016181468964, "rewards/HPSv2": 0.27416133880615234, "step": 347 }, { "completion_length": 134.96875, "epoch": 0.8197879858657244, "grad_norm": 0.3157880479596075, "kl": 0.02103997766971588, "learning_rate": 7.824999999999999e-07, "loss": 0.02103997766971588, "reward": 1.4239362478256226, "reward_std": 0.11408064514398575, "rewards/GDino": 0.7248628735542297, "rewards/GIT": 0.4042954742908478, "rewards/HPSv2": 0.29477787017822266, "step": 348 }, { "completion_length": 123.984375, "epoch": 0.8221436984687868, "grad_norm": 0.23895231178756696, "kl": 0.02996033290401101, "learning_rate": 7.818749999999999e-07, "loss": 0.02996033290401101, "reward": 1.2996571063995361, "reward_std": 0.10932384058833122, "rewards/GDino": 0.7706364989280701, "rewards/GIT": 0.234572634100914, "rewards/HPSv2": 0.2944478988647461, "step": 349 }, { "completion_length": 130.6953125, "epoch": 0.8244994110718492, "grad_norm": 0.618995168805275, "kl": 0.025302913505584, "learning_rate": 7.812499999999999e-07, "loss": 0.025302913505584, "reward": 1.1487194895744324, "reward_std": 0.17484624311327934, "rewards/GDino": 0.6504124701023102, "rewards/GIT": 0.20075541734695435, "rewards/HPSv2": 0.29755163192749023, "step": 350 }, { "completion_length": 126.4765625, "epoch": 0.8268551236749117, "grad_norm": 0.26577698414938755, "kl": 0.016755123622715473, "learning_rate": 7.806249999999999e-07, "loss": 0.016755123622715473, "reward": 1.3057908415794373, "reward_std": 0.13240446150302887, "rewards/GDino": 0.6766990125179291, "rewards/GIT": 0.34303056448698044, "rewards/HPSv2": 0.28606128692626953, "step": 351 }, { "completion_length": 124.671875, "epoch": 0.8292108362779741, "grad_norm": 0.8214046276795496, "kl": 0.05983645375818014, "learning_rate": 7.799999999999999e-07, "loss": 0.05983645375818014, "reward": 1.7054147124290466, "reward_std": 0.1155538335442543, "rewards/GDino": 0.8149780035018921, "rewards/GIT": 0.6146503686904907, "rewards/HPSv2": 0.2757863998413086, "step": 352 }, { "completion_length": 127.7734375, "epoch": 0.8315665488810365, "grad_norm": 0.41102088925264657, "kl": 0.023810289800167084, "learning_rate": 7.793750000000001e-07, "loss": 0.023810289800167084, "reward": 1.6652113795280457, "reward_std": 0.15206262469291687, "rewards/GDino": 0.7857888042926788, "rewards/GIT": 0.6007837951183319, "rewards/HPSv2": 0.2786388397216797, "step": 353 }, { "completion_length": 129.7109375, "epoch": 0.833922261484099, "grad_norm": 0.3070252457160066, "kl": 0.014662251807749271, "learning_rate": 7.787500000000001e-07, "loss": 0.014662251807749271, "reward": 1.580950915813446, "reward_std": 0.15787895396351814, "rewards/GDino": 0.8141408264636993, "rewards/GIT": 0.47352562844753265, "rewards/HPSv2": 0.29328441619873047, "step": 354 }, { "completion_length": 131.390625, "epoch": 0.8362779740871613, "grad_norm": 0.9212584320828935, "kl": 0.041158477775752544, "learning_rate": 7.78125e-07, "loss": 0.041158477775752544, "reward": 1.6764832735061646, "reward_std": 0.12939919531345367, "rewards/GDino": 0.8331021964550018, "rewards/GIT": 0.5651999115943909, "rewards/HPSv2": 0.2781810760498047, "step": 355 }, { "completion_length": 125.8125, "epoch": 0.8386336866902238, "grad_norm": 0.2656180995921266, "kl": 0.01376047870144248, "learning_rate": 7.775e-07, "loss": 0.01376047870144248, "reward": 1.6351155042648315, "reward_std": 0.13147830963134766, "rewards/GDino": 0.8874495029449463, "rewards/GIT": 0.46044783294200897, "rewards/HPSv2": 0.2872180938720703, "step": 356 }, { "completion_length": 130.328125, "epoch": 0.8409893992932862, "grad_norm": 0.28585423648825703, "kl": 0.01715562166646123, "learning_rate": 7.76875e-07, "loss": 0.01715562166646123, "reward": 1.5057255029678345, "reward_std": 0.19728095829486847, "rewards/GDino": 0.7587775588035583, "rewards/GIT": 0.4610249251127243, "rewards/HPSv2": 0.2859230041503906, "step": 357 }, { "completion_length": 130.5390625, "epoch": 0.8433451118963486, "grad_norm": 0.41150243782542495, "kl": 0.015297704376280308, "learning_rate": 7.7625e-07, "loss": 0.015297704376280308, "reward": 1.3823580741882324, "reward_std": 0.14532288536429405, "rewards/GDino": 0.6917605996131897, "rewards/GIT": 0.4070347547531128, "rewards/HPSv2": 0.28356266021728516, "step": 358 }, { "completion_length": 133.1484375, "epoch": 0.8457008244994111, "grad_norm": 0.28981146665788826, "kl": 0.023817891255021095, "learning_rate": 7.75625e-07, "loss": 0.023817891255021095, "reward": 1.6232978105545044, "reward_std": 0.19310826063156128, "rewards/GDino": 0.8501787781715393, "rewards/GIT": 0.4992656409740448, "rewards/HPSv2": 0.2738533020019531, "step": 359 }, { "completion_length": 126.6484375, "epoch": 0.8480565371024735, "grad_norm": 0.3906682518317748, "kl": 0.016292096115648746, "learning_rate": 7.75e-07, "loss": 0.016292096115648746, "reward": 1.3794063329696655, "reward_std": 0.19753535091876984, "rewards/GDino": 0.6958156824111938, "rewards/GIT": 0.40037788450717926, "rewards/HPSv2": 0.28321266174316406, "step": 360 }, { "completion_length": 131.546875, "epoch": 0.850412249705536, "grad_norm": 0.3251116214657587, "kl": 0.018880583345890045, "learning_rate": 7.74375e-07, "loss": 0.018880583345890045, "reward": 1.192928433418274, "reward_std": 0.20844640582799911, "rewards/GDino": 0.7232890129089355, "rewards/GIT": 0.1797119379043579, "rewards/HPSv2": 0.28992748260498047, "step": 361 }, { "completion_length": 129.1640625, "epoch": 0.8527679623085983, "grad_norm": 0.26971902004564463, "kl": 0.013182812370359898, "learning_rate": 7.7375e-07, "loss": 0.013182812370359898, "reward": 1.5813700556755066, "reward_std": 0.10840684175491333, "rewards/GDino": 0.8078116178512573, "rewards/GIT": 0.48276305198669434, "rewards/HPSv2": 0.29079532623291016, "step": 362 }, { "completion_length": 128.8984375, "epoch": 0.8551236749116607, "grad_norm": 0.3014903990429389, "kl": 0.01973722968250513, "learning_rate": 7.731249999999999e-07, "loss": 0.01973722968250513, "reward": 1.3931081295013428, "reward_std": 0.1931626945734024, "rewards/GDino": 0.7878836393356323, "rewards/GIT": 0.30862316489219666, "rewards/HPSv2": 0.2966012954711914, "step": 363 }, { "completion_length": 123.859375, "epoch": 0.8574793875147232, "grad_norm": 1.1944858687426094, "kl": 0.04033518582582474, "learning_rate": 7.724999999999999e-07, "loss": 0.04033518582582474, "reward": 1.7376198172569275, "reward_std": 0.14475225657224655, "rewards/GDino": 0.870601236820221, "rewards/GIT": 0.5861347019672394, "rewards/HPSv2": 0.2808837890625, "step": 364 }, { "completion_length": 134.234375, "epoch": 0.8598351001177856, "grad_norm": 0.3028476086031855, "kl": 0.02389040496200323, "learning_rate": 7.718749999999999e-07, "loss": 0.02389040496200323, "reward": 1.312369704246521, "reward_std": 0.1893121749162674, "rewards/GDino": 0.7310144603252411, "rewards/GIT": 0.2872028648853302, "rewards/HPSv2": 0.29415225982666016, "step": 365 }, { "completion_length": 124.421875, "epoch": 0.8621908127208481, "grad_norm": 0.38350538088240127, "kl": 0.021909753791987896, "learning_rate": 7.712499999999999e-07, "loss": 0.021909753791987896, "reward": 1.4509356617927551, "reward_std": 0.15566155314445496, "rewards/GDino": 0.7324540913105011, "rewards/GIT": 0.43682049214839935, "rewards/HPSv2": 0.2816610336303711, "step": 366 }, { "completion_length": 133.4609375, "epoch": 0.8645465253239105, "grad_norm": 0.3248758504703164, "kl": 0.039526283740997314, "learning_rate": 7.706249999999999e-07, "loss": 0.039526283740997314, "reward": 1.474755346775055, "reward_std": 0.16109705716371536, "rewards/GDino": 0.7987775504589081, "rewards/GIT": 0.38614846765995026, "rewards/HPSv2": 0.2898292541503906, "step": 367 }, { "completion_length": 130.359375, "epoch": 0.866902237926973, "grad_norm": 0.3937459543511716, "kl": 0.022100177593529224, "learning_rate": 7.699999999999999e-07, "loss": 0.022100177593529224, "reward": 1.933620810508728, "reward_std": 0.13571255654096603, "rewards/GDino": 0.9026041924953461, "rewards/GIT": 0.7526972591876984, "rewards/HPSv2": 0.2783193588256836, "step": 368 }, { "completion_length": 123.078125, "epoch": 0.8692579505300353, "grad_norm": 0.7415423563231123, "kl": 0.024448616430163383, "learning_rate": 7.69375e-07, "loss": 0.024448616430163383, "reward": 1.396821141242981, "reward_std": 0.1994519829750061, "rewards/GDino": 0.7409936189651489, "rewards/GIT": 0.38250355422496796, "rewards/HPSv2": 0.27332401275634766, "step": 369 }, { "completion_length": 125.3046875, "epoch": 0.8716136631330977, "grad_norm": 0.28155851368625817, "kl": 0.046930212527513504, "learning_rate": 7.6875e-07, "loss": 0.046930212527513504, "reward": 1.420795977115631, "reward_std": 0.14101148396730423, "rewards/GDino": 0.7107749581336975, "rewards/GIT": 0.41248030960559845, "rewards/HPSv2": 0.29754066467285156, "step": 370 }, { "completion_length": 130.890625, "epoch": 0.8739693757361602, "grad_norm": 0.356496326899857, "kl": 0.016422544606029987, "learning_rate": 7.68125e-07, "loss": 0.016422544606029987, "reward": 1.4139507412910461, "reward_std": 0.19464239478111267, "rewards/GDino": 0.7570723593235016, "rewards/GIT": 0.375384159386158, "rewards/HPSv2": 0.281494140625, "step": 371 }, { "completion_length": 131.734375, "epoch": 0.8763250883392226, "grad_norm": 0.38109518215937, "kl": 0.03784538805484772, "learning_rate": 7.675e-07, "loss": 0.03784538805484772, "reward": 1.6553622484207153, "reward_std": 0.11519679799675941, "rewards/GDino": 0.8495681285858154, "rewards/GIT": 0.5133517235517502, "rewards/HPSv2": 0.29244232177734375, "step": 372 }, { "completion_length": 130.5703125, "epoch": 0.8786808009422851, "grad_norm": 1.0352271611130435, "kl": 0.031293200328946114, "learning_rate": 7.66875e-07, "loss": 0.031293200328946114, "reward": 1.1551111340522766, "reward_std": 0.2216140627861023, "rewards/GDino": 0.661577045917511, "rewards/GIT": 0.19550882279872894, "rewards/HPSv2": 0.29802513122558594, "step": 373 }, { "completion_length": 130.2734375, "epoch": 0.8810365135453475, "grad_norm": 0.2739317162086522, "kl": 0.01876984629780054, "learning_rate": 7.6625e-07, "loss": 0.01876984629780054, "reward": 1.2596851587295532, "reward_std": 0.21350964903831482, "rewards/GDino": 0.7151577770709991, "rewards/GIT": 0.25528355687856674, "rewards/HPSv2": 0.2892436981201172, "step": 374 }, { "completion_length": 129.84375, "epoch": 0.8833922261484098, "grad_norm": 0.28079742585641326, "kl": 0.01983358897268772, "learning_rate": 7.65625e-07, "loss": 0.01983358897268772, "reward": 1.4563385844230652, "reward_std": 0.16744723916053772, "rewards/GDino": 0.7406797409057617, "rewards/GIT": 0.4348178952932358, "rewards/HPSv2": 0.2808408737182617, "step": 375 }, { "completion_length": 133.515625, "epoch": 0.8857479387514723, "grad_norm": 0.30339973125553266, "kl": 0.019843383692204952, "learning_rate": 7.65e-07, "loss": 0.019843383692204952, "reward": 1.7782825827598572, "reward_std": 0.1066521517932415, "rewards/GDino": 0.8567739427089691, "rewards/GIT": 0.6339214742183685, "rewards/HPSv2": 0.28758716583251953, "step": 376 }, { "completion_length": 132.09375, "epoch": 0.8881036513545347, "grad_norm": 0.480211535093072, "kl": 0.02523112390190363, "learning_rate": 7.64375e-07, "loss": 0.02523112390190363, "reward": 1.5741488933563232, "reward_std": 0.15932176262140274, "rewards/GDino": 0.7766148746013641, "rewards/GIT": 0.5085754841566086, "rewards/HPSv2": 0.2889585494995117, "step": 377 }, { "completion_length": 130.0, "epoch": 0.8904593639575972, "grad_norm": 0.40308634893649065, "kl": 0.021694418974220753, "learning_rate": 7.6375e-07, "loss": 0.021694418974220753, "reward": 1.3338321447372437, "reward_std": 0.146247036755085, "rewards/GDino": 0.7306439578533173, "rewards/GIT": 0.3073231279850006, "rewards/HPSv2": 0.2958650588989258, "step": 378 }, { "completion_length": 134.1640625, "epoch": 0.8928150765606596, "grad_norm": 0.4379498132411162, "kl": 0.021961030550301075, "learning_rate": 7.63125e-07, "loss": 0.021961030550301075, "reward": 1.5851874947547913, "reward_std": 0.12407775968313217, "rewards/GDino": 0.760408878326416, "rewards/GIT": 0.5367689728736877, "rewards/HPSv2": 0.2880096435546875, "step": 379 }, { "completion_length": 127.5859375, "epoch": 0.8951707891637221, "grad_norm": 0.29692091317218894, "kl": 0.02630383614450693, "learning_rate": 7.624999999999999e-07, "loss": 0.02630383614450693, "reward": 1.4055343866348267, "reward_std": 0.12047265470027924, "rewards/GDino": 0.7828658819198608, "rewards/GIT": 0.3322860673069954, "rewards/HPSv2": 0.29038238525390625, "step": 380 }, { "completion_length": 122.9140625, "epoch": 0.8975265017667845, "grad_norm": 0.8335339763311835, "kl": 0.028995588421821594, "learning_rate": 7.618749999999999e-07, "loss": 0.028995588421821594, "reward": 1.409187912940979, "reward_std": 0.1519249975681305, "rewards/GDino": 0.7678323686122894, "rewards/GIT": 0.35779665410518646, "rewards/HPSv2": 0.28355884552001953, "step": 381 }, { "completion_length": 130.3046875, "epoch": 0.8998822143698468, "grad_norm": 0.45531949051783593, "kl": 0.09853838011622429, "learning_rate": 7.612499999999999e-07, "loss": 0.09853838011622429, "reward": 1.6228669881820679, "reward_std": 0.16181015223264694, "rewards/GDino": 0.7987302243709564, "rewards/GIT": 0.5483847260475159, "rewards/HPSv2": 0.27575206756591797, "step": 382 }, { "completion_length": 135.984375, "epoch": 0.9022379269729093, "grad_norm": 0.30514903536578064, "kl": 0.0174590852111578, "learning_rate": 7.606249999999999e-07, "loss": 0.0174590852111578, "reward": 1.482495903968811, "reward_std": 0.12343868613243103, "rewards/GDino": 0.6838803589344025, "rewards/GIT": 0.5090647339820862, "rewards/HPSv2": 0.28955078125, "step": 383 }, { "completion_length": 126.8671875, "epoch": 0.9045936395759717, "grad_norm": 0.3784083870683929, "kl": 0.020596541464328766, "learning_rate": 7.599999999999999e-07, "loss": 0.020596541464328766, "reward": 1.6287755966186523, "reward_std": 0.13346859067678452, "rewards/GDino": 0.7703124582767487, "rewards/GIT": 0.5879695415496826, "rewards/HPSv2": 0.2704935073852539, "step": 384 }, { "completion_length": 126.7265625, "epoch": 0.9069493521790342, "grad_norm": 0.3582678469238993, "kl": 0.01856532320380211, "learning_rate": 7.59375e-07, "loss": 0.01856532320380211, "reward": 1.4615419507026672, "reward_std": 0.16459150612354279, "rewards/GDino": 0.7889142036437988, "rewards/GIT": 0.3715985119342804, "rewards/HPSv2": 0.3010292053222656, "step": 385 }, { "completion_length": 139.2421875, "epoch": 0.9093050647820966, "grad_norm": 0.5653606685066469, "kl": 0.0469019990414381, "learning_rate": 7.5875e-07, "loss": 0.0469019990414381, "reward": 1.396345853805542, "reward_std": 0.21743130683898926, "rewards/GDino": 0.7450050413608551, "rewards/GIT": 0.3703777492046356, "rewards/HPSv2": 0.2809629440307617, "step": 386 }, { "completion_length": 131.046875, "epoch": 0.911660777385159, "grad_norm": 0.6099816224488825, "kl": 0.07133647054433823, "learning_rate": 7.58125e-07, "loss": 0.07133647054433823, "reward": 1.4800037741661072, "reward_std": 0.20884086936712265, "rewards/GDino": 0.7388469874858856, "rewards/GIT": 0.46255990862846375, "rewards/HPSv2": 0.2785968780517578, "step": 387 }, { "completion_length": 130.203125, "epoch": 0.9140164899882215, "grad_norm": 0.27078182965876474, "kl": 0.030368881300091743, "learning_rate": 7.575e-07, "loss": 0.030368881300091743, "reward": 1.4358269572257996, "reward_std": 0.13459337875247002, "rewards/GDino": 0.772514671087265, "rewards/GIT": 0.37817317247390747, "rewards/HPSv2": 0.2851390838623047, "step": 388 }, { "completion_length": 134.6640625, "epoch": 0.9163722025912838, "grad_norm": 1.104142654557233, "kl": 0.051616670563817024, "learning_rate": 7.56875e-07, "loss": 0.051616670563817024, "reward": 1.2872219681739807, "reward_std": 0.20270472764968872, "rewards/GDino": 0.6869372427463531, "rewards/GIT": 0.3158484622836113, "rewards/HPSv2": 0.2844362258911133, "step": 389 }, { "completion_length": 136.8828125, "epoch": 0.9187279151943463, "grad_norm": 0.2888325572860154, "kl": 0.03331358544528484, "learning_rate": 7.5625e-07, "loss": 0.03331358544528484, "reward": 1.2222061157226562, "reward_std": 0.10563910752534866, "rewards/GDino": 0.7051435708999634, "rewards/GIT": 0.21435099840164185, "rewards/HPSv2": 0.30271148681640625, "step": 390 }, { "completion_length": 128.3046875, "epoch": 0.9210836277974087, "grad_norm": 2.050775466468362, "kl": 0.03329954110085964, "learning_rate": 7.55625e-07, "loss": 0.03329954110085964, "reward": 1.3688421249389648, "reward_std": 0.1607290357351303, "rewards/GDino": 0.7604928016662598, "rewards/GIT": 0.3186508044600487, "rewards/HPSv2": 0.28969860076904297, "step": 391 }, { "completion_length": 130.46875, "epoch": 0.9234393404004712, "grad_norm": 0.3444351135031718, "kl": 0.03492344543337822, "learning_rate": 7.55e-07, "loss": 0.03492344543337822, "reward": 1.489086091518402, "reward_std": 0.16827236115932465, "rewards/GDino": 0.7601805031299591, "rewards/GIT": 0.4362829327583313, "rewards/HPSv2": 0.29262256622314453, "step": 392 }, { "completion_length": 131.7109375, "epoch": 0.9257950530035336, "grad_norm": 0.7829115880765649, "kl": 0.024348177015781403, "learning_rate": 7.54375e-07, "loss": 0.024348177015781403, "reward": 1.4259389638900757, "reward_std": 0.18727616220712662, "rewards/GDino": 0.7419649958610535, "rewards/GIT": 0.39320439100265503, "rewards/HPSv2": 0.2907695770263672, "step": 393 }, { "completion_length": 129.015625, "epoch": 0.928150765606596, "grad_norm": 0.2705047235530651, "kl": 0.022778253071010113, "learning_rate": 7.5375e-07, "loss": 0.022778253071010113, "reward": 1.3298715353012085, "reward_std": 0.15502920001745224, "rewards/GDino": 0.7035481035709381, "rewards/GIT": 0.3221995085477829, "rewards/HPSv2": 0.3041238784790039, "step": 394 }, { "completion_length": 130.796875, "epoch": 0.9305064782096584, "grad_norm": 0.45512757186445874, "kl": 0.03831307217478752, "learning_rate": 7.53125e-07, "loss": 0.03831307217478752, "reward": 1.464635193347931, "reward_std": 0.13517122715711594, "rewards/GDino": 0.7655279040336609, "rewards/GIT": 0.3975439518690109, "rewards/HPSv2": 0.3015632629394531, "step": 395 }, { "completion_length": 127.265625, "epoch": 0.9328621908127208, "grad_norm": 0.34806672316576265, "kl": 0.021874687634408474, "learning_rate": 7.524999999999999e-07, "loss": 0.021874687634408474, "reward": 1.5898691415786743, "reward_std": 0.16093335300683975, "rewards/GDino": 0.8238385319709778, "rewards/GIT": 0.4795030206441879, "rewards/HPSv2": 0.2865276336669922, "step": 396 }, { "completion_length": 127.8203125, "epoch": 0.9352179034157833, "grad_norm": 0.3870813944527315, "kl": 0.08789144828915596, "learning_rate": 7.518749999999999e-07, "loss": 0.08789144828915596, "reward": 1.4660338163375854, "reward_std": 0.1854344755411148, "rewards/GDino": 0.7939740121364594, "rewards/GIT": 0.3975513130426407, "rewards/HPSv2": 0.2745084762573242, "step": 397 }, { "completion_length": 132.5, "epoch": 0.9375736160188457, "grad_norm": 0.2795543203615517, "kl": 0.038961777463555336, "learning_rate": 7.512499999999999e-07, "loss": 0.038961777463555336, "reward": 1.4182384014129639, "reward_std": 0.14558867365121841, "rewards/GDino": 0.7302033603191376, "rewards/GIT": 0.4043808579444885, "rewards/HPSv2": 0.28365421295166016, "step": 398 }, { "completion_length": 140.015625, "epoch": 0.9399293286219081, "grad_norm": 1.0105128247037232, "kl": 0.023297715932130814, "learning_rate": 7.506249999999999e-07, "loss": 0.023297715932130814, "reward": 1.2274208664894104, "reward_std": 0.20261241495609283, "rewards/GDino": 0.7066686451435089, "rewards/GIT": 0.2365935556590557, "rewards/HPSv2": 0.28415870666503906, "step": 399 }, { "completion_length": 133.375, "epoch": 0.9422850412249706, "grad_norm": 0.8608655977392967, "kl": 0.04557744972407818, "learning_rate": 7.5e-07, "loss": 0.04557744972407818, "reward": 1.5542137026786804, "reward_std": 0.1714993268251419, "rewards/GDino": 0.7798634171485901, "rewards/GIT": 0.49163156747817993, "rewards/HPSv2": 0.2827186584472656, "step": 400 }, { "completion_length": 134.6328125, "epoch": 0.944640753828033, "grad_norm": 0.8077168863607549, "kl": 0.028582578524947166, "learning_rate": 7.49375e-07, "loss": 0.028582578524947166, "reward": 1.239660769701004, "reward_std": 0.16748231649398804, "rewards/GDino": 0.67254838347435, "rewards/GIT": 0.26967376098036766, "rewards/HPSv2": 0.2974386215209961, "step": 401 }, { "completion_length": 131.34375, "epoch": 0.9469964664310954, "grad_norm": 0.6004835733451964, "kl": 0.06767754815518856, "learning_rate": 7.4875e-07, "loss": 0.06767754815518856, "reward": 1.4188067317008972, "reward_std": 0.17988011986017227, "rewards/GDino": 0.7086147665977478, "rewards/GIT": 0.4402487874031067, "rewards/HPSv2": 0.2699432373046875, "step": 402 }, { "completion_length": 130.8671875, "epoch": 0.9493521790341578, "grad_norm": 0.460475056180434, "kl": 0.035088252276182175, "learning_rate": 7.48125e-07, "loss": 0.035088252276182175, "reward": 1.484118938446045, "reward_std": 0.13807883486151695, "rewards/GDino": 0.7424787878990173, "rewards/GIT": 0.4522048830986023, "rewards/HPSv2": 0.28943538665771484, "step": 403 }, { "completion_length": 130.515625, "epoch": 0.9517078916372202, "grad_norm": 0.3935555799072625, "kl": 0.07480474933981895, "learning_rate": 7.475e-07, "loss": 0.07480474933981895, "reward": 1.5436906218528748, "reward_std": 0.1737920641899109, "rewards/GDino": 0.7736605703830719, "rewards/GIT": 0.4907425493001938, "rewards/HPSv2": 0.27928733825683594, "step": 404 }, { "completion_length": 128.0546875, "epoch": 0.9540636042402827, "grad_norm": 0.6805516342586014, "kl": 0.04973035305738449, "learning_rate": 7.468749999999999e-07, "loss": 0.04973035305738449, "reward": 1.5167292952537537, "reward_std": 0.1317146271467209, "rewards/GDino": 0.7750570774078369, "rewards/GIT": 0.4539381116628647, "rewards/HPSv2": 0.2877340316772461, "step": 405 }, { "completion_length": 132.90625, "epoch": 0.9564193168433451, "grad_norm": 0.3971330547194055, "kl": 0.04269574210047722, "learning_rate": 7.4625e-07, "loss": 0.04269574210047722, "reward": 1.3524296283721924, "reward_std": 0.19406509399414062, "rewards/GDino": 0.7412762641906738, "rewards/GIT": 0.3199603334069252, "rewards/HPSv2": 0.29119300842285156, "step": 406 }, { "completion_length": 125.6875, "epoch": 0.9587750294464076, "grad_norm": 0.47048224713811315, "kl": 0.0614928575232625, "learning_rate": 7.45625e-07, "loss": 0.0614928575232625, "reward": 1.3906712532043457, "reward_std": 0.18406904488801956, "rewards/GDino": 0.7212272584438324, "rewards/GIT": 0.3752783089876175, "rewards/HPSv2": 0.29416561126708984, "step": 407 }, { "completion_length": 132.859375, "epoch": 0.9611307420494699, "grad_norm": 0.4264965196962213, "kl": 0.03209751471877098, "learning_rate": 7.45e-07, "loss": 0.03209751471877098, "reward": 1.5392975807189941, "reward_std": 0.17072487622499466, "rewards/GDino": 0.7723385393619537, "rewards/GIT": 0.4683341085910797, "rewards/HPSv2": 0.29862499237060547, "step": 408 }, { "completion_length": 128.46875, "epoch": 0.9634864546525324, "grad_norm": 0.3577738945278871, "kl": 0.04572402313351631, "learning_rate": 7.44375e-07, "loss": 0.04572402313351631, "reward": 1.2473695874214172, "reward_std": 0.15602751076221466, "rewards/GDino": 0.6418924033641815, "rewards/GIT": 0.32645681500434875, "rewards/HPSv2": 0.2790203094482422, "step": 409 }, { "completion_length": 130.1171875, "epoch": 0.9658421672555948, "grad_norm": 0.5243545402938767, "kl": 0.05634794011712074, "learning_rate": 7.4375e-07, "loss": 0.05634794011712074, "reward": 1.5791205167770386, "reward_std": 0.15963201224803925, "rewards/GDino": 0.813571959733963, "rewards/GIT": 0.47838863730430603, "rewards/HPSv2": 0.28715991973876953, "step": 410 }, { "completion_length": 128.8828125, "epoch": 0.9681978798586572, "grad_norm": 0.5291412271386301, "kl": 0.05485726520419121, "learning_rate": 7.43125e-07, "loss": 0.05485726520419121, "reward": 1.4821436405181885, "reward_std": 0.19848233461380005, "rewards/GDino": 0.7561786472797394, "rewards/GIT": 0.43794290721416473, "rewards/HPSv2": 0.2880220413208008, "step": 411 }, { "completion_length": 132.2265625, "epoch": 0.9705535924617197, "grad_norm": 0.26708051095761504, "kl": 0.043022263795137405, "learning_rate": 7.425e-07, "loss": 0.043022263795137405, "reward": 1.4546568393707275, "reward_std": 0.17835672199726105, "rewards/GDino": 0.7781871259212494, "rewards/GIT": 0.39533793926239014, "rewards/HPSv2": 0.2811317443847656, "step": 412 }, { "completion_length": 127.125, "epoch": 0.9729093050647821, "grad_norm": 0.45921677012983453, "kl": 0.0613701231777668, "learning_rate": 7.418749999999999e-07, "loss": 0.0613701231777668, "reward": 1.673954963684082, "reward_std": 0.1345493197441101, "rewards/GDino": 0.8299302756786346, "rewards/GIT": 0.5549812018871307, "rewards/HPSv2": 0.2890434265136719, "step": 413 }, { "completion_length": 124.34375, "epoch": 0.9752650176678446, "grad_norm": 0.6251022270225428, "kl": 0.06681890413165092, "learning_rate": 7.412499999999999e-07, "loss": 0.06681890413165092, "reward": 1.5183817148208618, "reward_std": 0.18464192748069763, "rewards/GDino": 0.8101812899112701, "rewards/GIT": 0.42358289659023285, "rewards/HPSv2": 0.28461742401123047, "step": 414 }, { "completion_length": 129.5703125, "epoch": 0.9776207302709069, "grad_norm": 0.49808087768810716, "kl": 0.05610997416079044, "learning_rate": 7.406249999999999e-07, "loss": 0.05610997416079044, "reward": 1.4574124813079834, "reward_std": 0.15828466415405273, "rewards/GDino": 0.7833256125450134, "rewards/GIT": 0.37723106145858765, "rewards/HPSv2": 0.2968559265136719, "step": 415 }, { "completion_length": 133.1171875, "epoch": 0.9799764428739693, "grad_norm": 0.519177936500165, "kl": 0.039395347237586975, "learning_rate": 7.4e-07, "loss": 0.039395347237586975, "reward": 1.5274075269699097, "reward_std": 0.16552837193012238, "rewards/GDino": 0.7731655240058899, "rewards/GIT": 0.4649972766637802, "rewards/HPSv2": 0.2892446517944336, "step": 416 }, { "completion_length": 128.6171875, "epoch": 0.9823321554770318, "grad_norm": 0.38902526903715584, "kl": 0.10039560124278069, "learning_rate": 7.39375e-07, "loss": 0.10039560124278069, "reward": 1.5876814723014832, "reward_std": 0.09029714949429035, "rewards/GDino": 0.8176366686820984, "rewards/GIT": 0.4818616509437561, "rewards/HPSv2": 0.28818321228027344, "step": 417 }, { "completion_length": 129.703125, "epoch": 0.9846878680800942, "grad_norm": 0.7850462327306735, "kl": 0.0674417857080698, "learning_rate": 7.3875e-07, "loss": 0.0674417857080698, "reward": 1.8251633048057556, "reward_std": 0.1115475744009018, "rewards/GDino": 0.8977144956588745, "rewards/GIT": 0.6253410577774048, "rewards/HPSv2": 0.3021078109741211, "step": 418 }, { "completion_length": 135.1796875, "epoch": 0.9870435806831567, "grad_norm": 0.5728583511679222, "kl": 0.06567433476448059, "learning_rate": 7.38125e-07, "loss": 0.06567433476448059, "reward": 1.1271331012248993, "reward_std": 0.14839713275432587, "rewards/GDino": 0.6479416340589523, "rewards/GIT": 0.18193881213665009, "rewards/HPSv2": 0.2972526550292969, "step": 419 }, { "completion_length": 128.3359375, "epoch": 0.9893992932862191, "grad_norm": 0.33458558818370476, "kl": 0.11089200153946877, "learning_rate": 7.375e-07, "loss": 0.11089200153946877, "reward": 1.3381257057189941, "reward_std": 0.16492439806461334, "rewards/GDino": 0.7244874238967896, "rewards/GIT": 0.3152421563863754, "rewards/HPSv2": 0.29839611053466797, "step": 420 }, { "completion_length": 130.828125, "epoch": 0.9917550058892816, "grad_norm": 0.6482951471737368, "kl": 0.07569693773984909, "learning_rate": 7.368749999999999e-07, "loss": 0.07569693773984909, "reward": 1.3215692639350891, "reward_std": 0.1642981618642807, "rewards/GDino": 0.7724869251251221, "rewards/GIT": 0.2585902512073517, "rewards/HPSv2": 0.29049205780029297, "step": 421 }, { "completion_length": 134.90625, "epoch": 0.9941107184923439, "grad_norm": 0.33469576526765016, "kl": 0.048517441377043724, "learning_rate": 7.362499999999999e-07, "loss": 0.048517441377043724, "reward": 1.1985656023025513, "reward_std": 0.19336188584566116, "rewards/GDino": 0.6617996990680695, "rewards/GIT": 0.23764502257108688, "rewards/HPSv2": 0.2991209030151367, "step": 422 }, { "completion_length": 130.8359375, "epoch": 0.9964664310954063, "grad_norm": 0.5150122942354851, "kl": 0.04238016903400421, "learning_rate": 7.356249999999999e-07, "loss": 0.04238016903400421, "reward": 1.7137736678123474, "reward_std": 0.12651315331459045, "rewards/GDino": 0.8497101962566376, "rewards/GIT": 0.5700580477714539, "rewards/HPSv2": 0.2940053939819336, "step": 423 }, { "completion_length": 135.640625, "epoch": 0.9988221436984688, "grad_norm": 0.34126283459153617, "kl": 0.08555258717387915, "learning_rate": 7.35e-07, "loss": 0.08555258717387915, "reward": 1.1070693135261536, "reward_std": 0.192812018096447, "rewards/GDino": 0.648174911737442, "rewards/GIT": 0.16878290474414825, "rewards/HPSv2": 0.2901115417480469, "step": 424 }, { "completion_length": 128.6875, "epoch": 1.0, "grad_norm": 0.34126283459153617, "kl": 0.04992780089378357, "learning_rate": 7.34375e-07, "loss": 0.04992780089378357, "reward": 1.2530328035354614, "reward_std": 0.13942262530326843, "rewards/GDino": 0.6562132239341736, "rewards/GIT": 0.3047415316104889, "rewards/HPSv2": 0.29207801818847656, "step": 425 }, { "completion_length": 130.671875, "epoch": 1.0023557126030624, "grad_norm": 0.6443987516209699, "kl": 0.09126339107751846, "learning_rate": 7.3375e-07, "loss": 0.09126339107751846, "reward": 1.4003102779388428, "reward_std": 0.20405516773462296, "rewards/GDino": 0.7249287068843842, "rewards/GIT": 0.3984135687351227, "rewards/HPSv2": 0.27696800231933594, "step": 426 }, { "completion_length": 135.6640625, "epoch": 1.0047114252061249, "grad_norm": 1.0442909705299082, "kl": 0.1511267125606537, "learning_rate": 7.33125e-07, "loss": 0.1511267125606537, "reward": 1.4958284497261047, "reward_std": 0.12213357910513878, "rewards/GDino": 0.7780944406986237, "rewards/GIT": 0.4247261732816696, "rewards/HPSv2": 0.29300785064697266, "step": 427 }, { "completion_length": 130.75, "epoch": 1.0070671378091873, "grad_norm": 0.4297118377534725, "kl": 0.08750178664922714, "learning_rate": 7.325e-07, "loss": 0.08750178664922714, "reward": 1.2018466591835022, "reward_std": 0.18819383531808853, "rewards/GDino": 0.685711145401001, "rewards/GIT": 0.2222883105278015, "rewards/HPSv2": 0.29384708404541016, "step": 428 }, { "completion_length": 128.4140625, "epoch": 1.0094228504122498, "grad_norm": 0.3690777562996555, "kl": 0.08055368810892105, "learning_rate": 7.31875e-07, "loss": 0.08055368810892105, "reward": 1.557196855545044, "reward_std": 0.19745707511901855, "rewards/GDino": 0.8270485401153564, "rewards/GIT": 0.4378852993249893, "rewards/HPSv2": 0.2922630310058594, "step": 429 }, { "completion_length": 128.3671875, "epoch": 1.0117785630153122, "grad_norm": 0.5534369666106964, "kl": 0.09029343724250793, "learning_rate": 7.312499999999999e-07, "loss": 0.09029343724250793, "reward": 1.3187685012817383, "reward_std": 0.1849435418844223, "rewards/GDino": 0.7261742353439331, "rewards/GIT": 0.2993193492293358, "rewards/HPSv2": 0.2932748794555664, "step": 430 }, { "completion_length": 128.25, "epoch": 1.0141342756183747, "grad_norm": 0.5439622650445955, "kl": 0.07110912352800369, "learning_rate": 7.306249999999999e-07, "loss": 0.07110912352800369, "reward": 1.2637090682983398, "reward_std": 0.15487349033355713, "rewards/GDino": 0.7405569851398468, "rewards/GIT": 0.2267005890607834, "rewards/HPSv2": 0.2964515686035156, "step": 431 }, { "completion_length": 128.6171875, "epoch": 1.0164899882214369, "grad_norm": 1.2430500509619704, "kl": 0.10060467571020126, "learning_rate": 7.3e-07, "loss": 0.10060467571020126, "reward": 1.7922418713569641, "reward_std": 0.08564912527799606, "rewards/GDino": 0.8702858090400696, "rewards/GIT": 0.6342000067234039, "rewards/HPSv2": 0.28775596618652344, "step": 432 }, { "completion_length": 130.953125, "epoch": 1.0188457008244993, "grad_norm": 0.9498827852022362, "kl": 0.08068659529089928, "learning_rate": 7.29375e-07, "loss": 0.08068659529089928, "reward": 1.4883978366851807, "reward_std": 0.14468002319335938, "rewards/GDino": 0.7598952353000641, "rewards/GIT": 0.44080086052417755, "rewards/HPSv2": 0.2877016067504883, "step": 433 }, { "completion_length": 126.3828125, "epoch": 1.0212014134275618, "grad_norm": 0.5606615032512683, "kl": 0.07240545563399792, "learning_rate": 7.2875e-07, "loss": 0.07240545563399792, "reward": 1.8122752904891968, "reward_std": 0.13288399577140808, "rewards/GDino": 0.9144292771816254, "rewards/GIT": 0.6088635921478271, "rewards/HPSv2": 0.2889823913574219, "step": 434 }, { "completion_length": 133.9140625, "epoch": 1.0235571260306242, "grad_norm": 0.34735355911507776, "kl": 0.06206061318516731, "learning_rate": 7.28125e-07, "loss": 0.06206061318516731, "reward": 1.641293227672577, "reward_std": 0.16366813331842422, "rewards/GDino": 0.8642386496067047, "rewards/GIT": 0.4918516278266907, "rewards/HPSv2": 0.2852029800415039, "step": 435 }, { "completion_length": 128.9453125, "epoch": 1.0259128386336867, "grad_norm": 0.5853705705867442, "kl": 0.07501314580440521, "learning_rate": 7.275e-07, "loss": 0.07501314580440521, "reward": 1.5831254124641418, "reward_std": 0.13523368537425995, "rewards/GDino": 0.8035596907138824, "rewards/GIT": 0.4886806905269623, "rewards/HPSv2": 0.29088497161865234, "step": 436 }, { "completion_length": 128.3828125, "epoch": 1.028268551236749, "grad_norm": 0.32813372584360667, "kl": 0.055249400436878204, "learning_rate": 7.26875e-07, "loss": 0.055249400436878204, "reward": 1.6748942136764526, "reward_std": 0.12469176948070526, "rewards/GDino": 0.800895094871521, "rewards/GIT": 0.5848373770713806, "rewards/HPSv2": 0.28916168212890625, "step": 437 }, { "completion_length": 131.1875, "epoch": 1.0306242638398115, "grad_norm": 0.4444791997867047, "kl": 0.06898971274495125, "learning_rate": 7.262499999999999e-07, "loss": 0.06898971274495125, "reward": 1.6129685640335083, "reward_std": 0.17029380053281784, "rewards/GDino": 0.7945185303688049, "rewards/GIT": 0.5310192853212357, "rewards/HPSv2": 0.2874307632446289, "step": 438 }, { "completion_length": 135.7109375, "epoch": 1.032979976442874, "grad_norm": 0.8347109501038955, "kl": 0.07921576872467995, "learning_rate": 7.256249999999999e-07, "loss": 0.07921576872467995, "reward": 1.6059342622756958, "reward_std": 0.1800544708967209, "rewards/GDino": 0.8188140094280243, "rewards/GIT": 0.512963593006134, "rewards/HPSv2": 0.2741565704345703, "step": 439 }, { "completion_length": 135.453125, "epoch": 1.0353356890459364, "grad_norm": 0.43052252610097524, "kl": 0.07001743838191032, "learning_rate": 7.249999999999999e-07, "loss": 0.07001743838191032, "reward": 1.2818132042884827, "reward_std": 0.20064447820186615, "rewards/GDino": 0.6976161003112793, "rewards/GIT": 0.293830931186676, "rewards/HPSv2": 0.29036617279052734, "step": 440 }, { "completion_length": 133.3671875, "epoch": 1.0376914016489989, "grad_norm": 0.5384539455916353, "kl": 0.0668974369764328, "learning_rate": 7.243749999999999e-07, "loss": 0.0668974369764328, "reward": 1.3083884119987488, "reward_std": 0.18769492954015732, "rewards/GDino": 0.6989682018756866, "rewards/GIT": 0.31958138942718506, "rewards/HPSv2": 0.2898387908935547, "step": 441 }, { "completion_length": 138.140625, "epoch": 1.0400471142520613, "grad_norm": 0.3318993693816589, "kl": 0.061493005603551865, "learning_rate": 7.2375e-07, "loss": 0.061493005603551865, "reward": 1.4271244406700134, "reward_std": 0.12231442332267761, "rewards/GDino": 0.7546918094158173, "rewards/GIT": 0.381477989256382, "rewards/HPSv2": 0.29095458984375, "step": 442 }, { "completion_length": 134.8125, "epoch": 1.0424028268551238, "grad_norm": 0.33419811067647387, "kl": 0.05583300068974495, "learning_rate": 7.23125e-07, "loss": 0.05583300068974495, "reward": 1.2668619751930237, "reward_std": 0.18620267510414124, "rewards/GDino": 0.7141441702842712, "rewards/GIT": 0.2572856470942497, "rewards/HPSv2": 0.29543209075927734, "step": 443 }, { "completion_length": 126.265625, "epoch": 1.0447585394581862, "grad_norm": 0.4145206798166035, "kl": 0.05142274312674999, "learning_rate": 7.225e-07, "loss": 0.05142274312674999, "reward": 1.8544477224349976, "reward_std": 0.1217915266752243, "rewards/GDino": 0.8569270372390747, "rewards/GIT": 0.7110131084918976, "rewards/HPSv2": 0.28650760650634766, "step": 444 }, { "completion_length": 126.671875, "epoch": 1.0471142520612484, "grad_norm": 0.3499195250150864, "kl": 0.053441716358065605, "learning_rate": 7.21875e-07, "loss": 0.053441716358065605, "reward": 1.6533610820770264, "reward_std": 0.12335006147623062, "rewards/GDino": 0.8364751040935516, "rewards/GIT": 0.5266418159008026, "rewards/HPSv2": 0.29024410247802734, "step": 445 }, { "completion_length": 129.8984375, "epoch": 1.0494699646643109, "grad_norm": 0.5402988640557597, "kl": 0.07956504449248314, "learning_rate": 7.212499999999999e-07, "loss": 0.07956504449248314, "reward": 1.8627381920814514, "reward_std": 0.11414041370153427, "rewards/GDino": 0.8889322578907013, "rewards/GIT": 0.6792616248130798, "rewards/HPSv2": 0.2945442199707031, "step": 446 }, { "completion_length": 136.546875, "epoch": 1.0518256772673733, "grad_norm": 0.3156722798499676, "kl": 0.028255420736968517, "learning_rate": 7.206249999999999e-07, "loss": 0.028255420736968517, "reward": 1.2837774157524109, "reward_std": 0.23353099822998047, "rewards/GDino": 0.6519962549209595, "rewards/GIT": 0.35398343205451965, "rewards/HPSv2": 0.2777976989746094, "step": 447 }, { "completion_length": 133.4921875, "epoch": 1.0541813898704357, "grad_norm": 0.2936994804344032, "kl": 0.0683881938457489, "learning_rate": 7.2e-07, "loss": 0.0683881938457489, "reward": 1.3552574515342712, "reward_std": 0.1425539292395115, "rewards/GDino": 0.7238002121448517, "rewards/GIT": 0.33887098729610443, "rewards/HPSv2": 0.2925863265991211, "step": 448 }, { "completion_length": 130.578125, "epoch": 1.0565371024734982, "grad_norm": 0.716237012633681, "kl": 0.069327712059021, "learning_rate": 7.19375e-07, "loss": 0.069327712059021, "reward": 1.7198835015296936, "reward_std": 0.14339616894721985, "rewards/GDino": 0.839476615190506, "rewards/GIT": 0.5866426825523376, "rewards/HPSv2": 0.2937641143798828, "step": 449 }, { "completion_length": 132.5234375, "epoch": 1.0588928150765606, "grad_norm": 0.6332477898982055, "kl": 0.05895545706152916, "learning_rate": 7.1875e-07, "loss": 0.05895545706152916, "reward": 1.374456763267517, "reward_std": 0.17674200981855392, "rewards/GDino": 0.7332712113857269, "rewards/GIT": 0.35329121351242065, "rewards/HPSv2": 0.28789424896240234, "step": 450 }, { "completion_length": 138.8671875, "epoch": 1.061248527679623, "grad_norm": 0.30401075237506175, "kl": 0.0305694118142128, "learning_rate": 7.18125e-07, "loss": 0.0305694118142128, "reward": 1.4743647575378418, "reward_std": 0.18903370201587677, "rewards/GDino": 0.7923079431056976, "rewards/GIT": 0.3892435356974602, "rewards/HPSv2": 0.2928133010864258, "step": 451 }, { "completion_length": 129.6953125, "epoch": 1.0636042402826855, "grad_norm": 0.556982169365843, "kl": 0.06523298472166061, "learning_rate": 7.175e-07, "loss": 0.06523298472166061, "reward": 1.5759711861610413, "reward_std": 0.16606174409389496, "rewards/GDino": 0.8465093970298767, "rewards/GIT": 0.44377823173999786, "rewards/HPSv2": 0.28568363189697266, "step": 452 }, { "completion_length": 127.1875, "epoch": 1.065959952885748, "grad_norm": 0.6086174027023912, "kl": 0.051728660240769386, "learning_rate": 7.16875e-07, "loss": 0.051728660240769386, "reward": 1.639402985572815, "reward_std": 0.1630081981420517, "rewards/GDino": 0.8351429402828217, "rewards/GIT": 0.5153520405292511, "rewards/HPSv2": 0.2889080047607422, "step": 453 }, { "completion_length": 137.984375, "epoch": 1.0683156654888104, "grad_norm": 0.31195915264316987, "kl": 0.04374501667916775, "learning_rate": 7.1625e-07, "loss": 0.04374501667916775, "reward": 1.2596930861473083, "reward_std": 0.20604410767555237, "rewards/GDino": 0.6958290338516235, "rewards/GIT": 0.2770426571369171, "rewards/HPSv2": 0.2868213653564453, "step": 454 }, { "completion_length": 133.203125, "epoch": 1.0706713780918728, "grad_norm": 1.7321662490344873, "kl": 0.09143341705203056, "learning_rate": 7.156249999999999e-07, "loss": 0.09143341705203056, "reward": 1.6032226085662842, "reward_std": 0.13541024923324585, "rewards/GDino": 0.770833283662796, "rewards/GIT": 0.5390476435422897, "rewards/HPSv2": 0.29334163665771484, "step": 455 }, { "completion_length": 135.0703125, "epoch": 1.0730270906949353, "grad_norm": 0.4896428135413219, "kl": 0.034353566356003284, "learning_rate": 7.149999999999999e-07, "loss": 0.034353566356003284, "reward": 1.448026955127716, "reward_std": 0.14167709276080132, "rewards/GDino": 0.7680744230747223, "rewards/GIT": 0.3955753743648529, "rewards/HPSv2": 0.2843770980834961, "step": 456 }, { "completion_length": 132.3828125, "epoch": 1.0753828032979977, "grad_norm": 1.0837916531767495, "kl": 0.07013012655079365, "learning_rate": 7.143749999999999e-07, "loss": 0.07013012655079365, "reward": 1.653678297996521, "reward_std": 0.13524790108203888, "rewards/GDino": 0.8072231709957123, "rewards/GIT": 0.5587744414806366, "rewards/HPSv2": 0.28768062591552734, "step": 457 }, { "completion_length": 130.3359375, "epoch": 1.0777385159010602, "grad_norm": 0.30457335791701756, "kl": 0.06888172402977943, "learning_rate": 7.137499999999999e-07, "loss": 0.06888172402977943, "reward": 1.3523876070976257, "reward_std": 0.15604664385318756, "rewards/GDino": 0.7104668021202087, "rewards/GIT": 0.3432852327823639, "rewards/HPSv2": 0.29863548278808594, "step": 458 }, { "completion_length": 137.7265625, "epoch": 1.0800942285041224, "grad_norm": 0.6992205034459315, "kl": 0.050083023495972157, "learning_rate": 7.13125e-07, "loss": 0.050083023495972157, "reward": 1.5397540926933289, "reward_std": 0.16648109257221222, "rewards/GDino": 0.7844061553478241, "rewards/GIT": 0.4673011004924774, "rewards/HPSv2": 0.28804683685302734, "step": 459 }, { "completion_length": 134.0546875, "epoch": 1.0824499411071848, "grad_norm": 0.36071563048205674, "kl": 0.049633923918008804, "learning_rate": 7.125e-07, "loss": 0.049633923918008804, "reward": 1.6014037728309631, "reward_std": 0.10856888443231583, "rewards/GDino": 0.7951059639453888, "rewards/GIT": 0.507873922586441, "rewards/HPSv2": 0.29842376708984375, "step": 460 }, { "completion_length": 123.1171875, "epoch": 1.0848056537102473, "grad_norm": 0.9220410132630221, "kl": 0.0671689547598362, "learning_rate": 7.11875e-07, "loss": 0.0671689547598362, "reward": 1.769094169139862, "reward_std": 0.1781492829322815, "rewards/GDino": 0.8864818513393402, "rewards/GIT": 0.5991659462451935, "rewards/HPSv2": 0.2834463119506836, "step": 461 }, { "completion_length": 138.109375, "epoch": 1.0871613663133097, "grad_norm": 1.0510418590129982, "kl": 0.08036252297461033, "learning_rate": 7.1125e-07, "loss": 0.08036252297461033, "reward": 1.3735433220863342, "reward_std": 0.17895889282226562, "rewards/GDino": 0.7283358573913574, "rewards/GIT": 0.3521995097398758, "rewards/HPSv2": 0.29300785064697266, "step": 462 }, { "completion_length": 130.828125, "epoch": 1.0895170789163722, "grad_norm": 0.2891601612442965, "kl": 0.04065127111971378, "learning_rate": 7.106249999999999e-07, "loss": 0.04065127111971378, "reward": 1.4211238026618958, "reward_std": 0.1484183967113495, "rewards/GDino": 0.7411346137523651, "rewards/GIT": 0.3852551430463791, "rewards/HPSv2": 0.29473400115966797, "step": 463 }, { "completion_length": 129.7265625, "epoch": 1.0918727915194346, "grad_norm": 0.6968812103165092, "kl": 0.07524821162223816, "learning_rate": 7.1e-07, "loss": 0.07524821162223816, "reward": 1.7636322975158691, "reward_std": 0.16066031903028488, "rewards/GDino": 0.8509392440319061, "rewards/GIT": 0.6281614601612091, "rewards/HPSv2": 0.2845315933227539, "step": 464 }, { "completion_length": 133.390625, "epoch": 1.094228504122497, "grad_norm": 0.4859050350913781, "kl": 0.0847030058503151, "learning_rate": 7.09375e-07, "loss": 0.0847030058503151, "reward": 1.4899604320526123, "reward_std": 0.15593509376049042, "rewards/GDino": 0.742542177438736, "rewards/GIT": 0.4618728905916214, "rewards/HPSv2": 0.28554534912109375, "step": 465 }, { "completion_length": 130.015625, "epoch": 1.0965842167255595, "grad_norm": 0.3967141681823971, "kl": 0.05054683983325958, "learning_rate": 7.0875e-07, "loss": 0.05054683983325958, "reward": 1.1831399202346802, "reward_std": 0.15625082701444626, "rewards/GDino": 0.6322555840015411, "rewards/GIT": 0.2582436427474022, "rewards/HPSv2": 0.29264068603515625, "step": 466 }, { "completion_length": 132.4140625, "epoch": 1.098939929328622, "grad_norm": 0.36260717354915395, "kl": 0.07738400995731354, "learning_rate": 7.08125e-07, "loss": 0.07738400995731354, "reward": 1.3341223001480103, "reward_std": 0.16778255254030228, "rewards/GDino": 0.6956815123558044, "rewards/GIT": 0.3517671599984169, "rewards/HPSv2": 0.28667354583740234, "step": 467 }, { "completion_length": 129.890625, "epoch": 1.1012956419316844, "grad_norm": 0.3607996050338778, "kl": 0.04301217943429947, "learning_rate": 7.075e-07, "loss": 0.04301217943429947, "reward": 1.7088506817817688, "reward_std": 0.1494511105120182, "rewards/GDino": 0.8414458334445953, "rewards/GIT": 0.5658550262451172, "rewards/HPSv2": 0.30154991149902344, "step": 468 }, { "completion_length": 135.21875, "epoch": 1.1036513545347468, "grad_norm": 0.4463876322668004, "kl": 0.06402565538883209, "learning_rate": 7.06875e-07, "loss": 0.06402565538883209, "reward": 1.51543128490448, "reward_std": 0.1551087275147438, "rewards/GDino": 0.793732613325119, "rewards/GIT": 0.43113407492637634, "rewards/HPSv2": 0.29056453704833984, "step": 469 }, { "completion_length": 133.4453125, "epoch": 1.1060070671378093, "grad_norm": 0.37891445880922536, "kl": 0.06917763873934746, "learning_rate": 7.0625e-07, "loss": 0.06917763873934746, "reward": 1.2394628524780273, "reward_std": 0.2044844850897789, "rewards/GDino": 0.7218807339668274, "rewards/GIT": 0.21791385114192963, "rewards/HPSv2": 0.2996683120727539, "step": 470 }, { "completion_length": 129.390625, "epoch": 1.1083627797408717, "grad_norm": 0.4654334833134267, "kl": 0.16528518497943878, "learning_rate": 7.056249999999999e-07, "loss": 0.16528518497943878, "reward": 1.7017892003059387, "reward_std": 0.13834501057863235, "rewards/GDino": 0.8393750190734863, "rewards/GIT": 0.5754163712263107, "rewards/HPSv2": 0.28699779510498047, "step": 471 }, { "completion_length": 139.1875, "epoch": 1.110718492343934, "grad_norm": 0.4175677412769539, "kl": 0.106028251349926, "learning_rate": 7.049999999999999e-07, "loss": 0.106028251349926, "reward": 1.4238123893737793, "reward_std": 0.15170610696077347, "rewards/GDino": 0.743099719285965, "rewards/GIT": 0.40301790833473206, "rewards/HPSv2": 0.2776947021484375, "step": 472 }, { "completion_length": 126.03125, "epoch": 1.1130742049469964, "grad_norm": 0.5311389039246561, "kl": 0.09199371561408043, "learning_rate": 7.043749999999999e-07, "loss": 0.09199371561408043, "reward": 1.430630385875702, "reward_std": 0.1489259898662567, "rewards/GDino": 0.7183835804462433, "rewards/GIT": 0.4395865201950073, "rewards/HPSv2": 0.2726602554321289, "step": 473 }, { "completion_length": 134.25, "epoch": 1.1154299175500588, "grad_norm": 0.42823333753447007, "kl": 0.06957821175456047, "learning_rate": 7.037499999999999e-07, "loss": 0.06957821175456047, "reward": 1.7452639937400818, "reward_std": 0.1643286496400833, "rewards/GDino": 0.8455321192741394, "rewards/GIT": 0.6092788577079773, "rewards/HPSv2": 0.2904529571533203, "step": 474 }, { "completion_length": 140.1328125, "epoch": 1.1177856301531213, "grad_norm": 0.5295022438882272, "kl": 0.044125985354185104, "learning_rate": 7.031249999999999e-07, "loss": 0.044125985354185104, "reward": 1.4622347354888916, "reward_std": 0.17294533550739288, "rewards/GDino": 0.7552322745323181, "rewards/GIT": 0.4210222065448761, "rewards/HPSv2": 0.285980224609375, "step": 475 }, { "completion_length": 141.421875, "epoch": 1.1201413427561837, "grad_norm": 0.38482183922393914, "kl": 0.047309812158346176, "learning_rate": 7.024999999999999e-07, "loss": 0.047309812158346176, "reward": 1.509515106678009, "reward_std": 0.1523008719086647, "rewards/GDino": 0.7591331601142883, "rewards/GIT": 0.46335552632808685, "rewards/HPSv2": 0.28702640533447266, "step": 476 }, { "completion_length": 132.1875, "epoch": 1.1224970553592462, "grad_norm": 0.3778086252245741, "kl": 0.07257743552327156, "learning_rate": 7.01875e-07, "loss": 0.07257743552327156, "reward": 1.4489846229553223, "reward_std": 0.179600790143013, "rewards/GDino": 0.8066836893558502, "rewards/GIT": 0.3551725223660469, "rewards/HPSv2": 0.2871284484863281, "step": 477 }, { "completion_length": 134.015625, "epoch": 1.1248527679623086, "grad_norm": 0.6003665175933015, "kl": 0.11121268570423126, "learning_rate": 7.0125e-07, "loss": 0.11121268570423126, "reward": 1.2464228868484497, "reward_std": 0.18334129080176353, "rewards/GDino": 0.704736977815628, "rewards/GIT": 0.25376107916235924, "rewards/HPSv2": 0.28792476654052734, "step": 478 }, { "completion_length": 128.96875, "epoch": 1.127208480565371, "grad_norm": 0.30585282918003204, "kl": 0.06408608332276344, "learning_rate": 7.006250000000001e-07, "loss": 0.06408608332276344, "reward": 1.479724407196045, "reward_std": 0.1730445995926857, "rewards/GDino": 0.7062499821186066, "rewards/GIT": 0.48398274183273315, "rewards/HPSv2": 0.2894916534423828, "step": 479 }, { "completion_length": 133.453125, "epoch": 1.1295641931684335, "grad_norm": 0.39516431673012026, "kl": 0.10182099044322968, "learning_rate": 7e-07, "loss": 0.10182099044322968, "reward": 1.3505151867866516, "reward_std": 0.16388875246047974, "rewards/GDino": 0.704030305147171, "rewards/GIT": 0.3585314601659775, "rewards/HPSv2": 0.28795337677001953, "step": 480 }, { "completion_length": 126.8515625, "epoch": 1.131919905771496, "grad_norm": 0.3513809689192677, "kl": 0.0824097990989685, "learning_rate": 6.99375e-07, "loss": 0.0824097990989685, "reward": 1.684275507926941, "reward_std": 0.13490892201662064, "rewards/GDino": 0.8007812201976776, "rewards/GIT": 0.5884531438350677, "rewards/HPSv2": 0.2950410842895508, "step": 481 }, { "completion_length": 130.3359375, "epoch": 1.1342756183745584, "grad_norm": 0.47298687020374136, "kl": 0.07517285272479057, "learning_rate": 6.9875e-07, "loss": 0.07517285272479057, "reward": 1.5534254312515259, "reward_std": 0.18982304632663727, "rewards/GDino": 0.7896295189857483, "rewards/GIT": 0.48702526092529297, "rewards/HPSv2": 0.27677059173583984, "step": 482 }, { "completion_length": 136.5234375, "epoch": 1.1366313309776208, "grad_norm": 0.6921946037780111, "kl": 0.10671093128621578, "learning_rate": 6.98125e-07, "loss": 0.10671093128621578, "reward": 1.3436049222946167, "reward_std": 0.15531788021326065, "rewards/GDino": 0.7522880733013153, "rewards/GIT": 0.2885642945766449, "rewards/HPSv2": 0.3027524948120117, "step": 483 }, { "completion_length": 135.3359375, "epoch": 1.1389870435806833, "grad_norm": 0.7925033602779391, "kl": 0.0704701691865921, "learning_rate": 6.975e-07, "loss": 0.0704701691865921, "reward": 1.658815085887909, "reward_std": 0.2199658900499344, "rewards/GDino": 0.8281733393669128, "rewards/GIT": 0.5552759170532227, "rewards/HPSv2": 0.27536582946777344, "step": 484 }, { "completion_length": 134.640625, "epoch": 1.1413427561837457, "grad_norm": 0.6765953013097432, "kl": 0.06282306648790836, "learning_rate": 6.96875e-07, "loss": 0.06282306648790836, "reward": 1.4804533123970032, "reward_std": 0.18150704354047775, "rewards/GDino": 0.76826611161232, "rewards/GIT": 0.430107519030571, "rewards/HPSv2": 0.28207969665527344, "step": 485 }, { "completion_length": 135.6640625, "epoch": 1.143698468786808, "grad_norm": 0.7000748944603259, "kl": 0.060856372117996216, "learning_rate": 6.9625e-07, "loss": 0.060856372117996216, "reward": 1.2610546350479126, "reward_std": 0.16629406064748764, "rewards/GDino": 0.7293891906738281, "rewards/GIT": 0.23253784328699112, "rewards/HPSv2": 0.29912757873535156, "step": 486 }, { "completion_length": 131.4375, "epoch": 1.1460541813898704, "grad_norm": 0.5543716038273439, "kl": 0.13864624500274658, "learning_rate": 6.95625e-07, "loss": 0.13864624500274658, "reward": 1.507591426372528, "reward_std": 0.12369828298687935, "rewards/GDino": 0.793907880783081, "rewards/GIT": 0.4162696450948715, "rewards/HPSv2": 0.29741382598876953, "step": 487 }, { "completion_length": 132.875, "epoch": 1.1484098939929328, "grad_norm": 0.47035588189138644, "kl": 0.09583650529384613, "learning_rate": 6.949999999999999e-07, "loss": 0.09583650529384613, "reward": 1.8542941212654114, "reward_std": 0.15772466361522675, "rewards/GDino": 0.864580363035202, "rewards/GIT": 0.7061291337013245, "rewards/HPSv2": 0.2835845947265625, "step": 488 }, { "completion_length": 133.2890625, "epoch": 1.1507656065959952, "grad_norm": 0.7164971485928564, "kl": 0.15484127402305603, "learning_rate": 6.943749999999999e-07, "loss": 0.15484127402305603, "reward": 1.5808722972869873, "reward_std": 0.15237798541784286, "rewards/GDino": 0.8669689297676086, "rewards/GIT": 0.4153965711593628, "rewards/HPSv2": 0.2985067367553711, "step": 489 }, { "completion_length": 133.7578125, "epoch": 1.1531213191990577, "grad_norm": 0.6708566791435171, "kl": 0.07866884022951126, "learning_rate": 6.937499999999999e-07, "loss": 0.07866884022951126, "reward": 1.433125913143158, "reward_std": 0.16584476083517075, "rewards/GDino": 0.7609116733074188, "rewards/GIT": 0.37439408898353577, "rewards/HPSv2": 0.2978200912475586, "step": 490 }, { "completion_length": 133.84375, "epoch": 1.1554770318021201, "grad_norm": 0.5125329632739298, "kl": 0.11946621164679527, "learning_rate": 6.931249999999999e-07, "loss": 0.11946621164679527, "reward": 1.694872260093689, "reward_std": 0.14437446743249893, "rewards/GDino": 0.8289394378662109, "rewards/GIT": 0.5686153024435043, "rewards/HPSv2": 0.2973175048828125, "step": 491 }, { "completion_length": 133.734375, "epoch": 1.1578327444051826, "grad_norm": 0.6640528616153002, "kl": 0.08713643625378609, "learning_rate": 6.924999999999999e-07, "loss": 0.08713643625378609, "reward": 1.6712709069252014, "reward_std": 0.18598241358995438, "rewards/GDino": 0.8464162349700928, "rewards/GIT": 0.5345886051654816, "rewards/HPSv2": 0.2902660369873047, "step": 492 }, { "completion_length": 131.140625, "epoch": 1.160188457008245, "grad_norm": 0.5077749071888455, "kl": 0.12001257389783859, "learning_rate": 6.918749999999999e-07, "loss": 0.12001257389783859, "reward": 1.4953066110610962, "reward_std": 0.13583268597722054, "rewards/GDino": 0.7637503147125244, "rewards/GIT": 0.44915228337049484, "rewards/HPSv2": 0.28240394592285156, "step": 493 }, { "completion_length": 133.28125, "epoch": 1.1625441696113075, "grad_norm": 0.38923259695759, "kl": 0.10472170263528824, "learning_rate": 6.9125e-07, "loss": 0.10472170263528824, "reward": 1.5433300733566284, "reward_std": 0.14395415782928467, "rewards/GDino": 0.74609375, "rewards/GIT": 0.5086535513401031, "rewards/HPSv2": 0.28858280181884766, "step": 494 }, { "completion_length": 135.9453125, "epoch": 1.16489988221437, "grad_norm": 0.4736568705891641, "kl": 0.09413471072912216, "learning_rate": 6.906250000000001e-07, "loss": 0.09413471072912216, "reward": 1.3756672143936157, "reward_std": 0.17193832993507385, "rewards/GDino": 0.7566398680210114, "rewards/GIT": 0.3270546942949295, "rewards/HPSv2": 0.29197263717651367, "step": 495 }, { "completion_length": 137.4453125, "epoch": 1.1672555948174324, "grad_norm": 1.3434227817922848, "kl": 0.13969138264656067, "learning_rate": 6.9e-07, "loss": 0.13969138264656067, "reward": 1.4789603352546692, "reward_std": 0.11306149140000343, "rewards/GDino": 0.8136036992073059, "rewards/GIT": 0.3730153292417526, "rewards/HPSv2": 0.2923412322998047, "step": 496 }, { "completion_length": 134.4296875, "epoch": 1.1696113074204948, "grad_norm": 0.6775910749939482, "kl": 0.09850773215293884, "learning_rate": 6.89375e-07, "loss": 0.09850773215293884, "reward": 1.7584534883499146, "reward_std": 0.16169389337301254, "rewards/GDino": 0.8759984970092773, "rewards/GIT": 0.6002809405326843, "rewards/HPSv2": 0.28217411041259766, "step": 497 }, { "completion_length": 132.6015625, "epoch": 1.171967020023557, "grad_norm": 0.544706905978649, "kl": 0.08148472756147385, "learning_rate": 6.8875e-07, "loss": 0.08148472756147385, "reward": 1.6649127006530762, "reward_std": 0.12193413451313972, "rewards/GDino": 0.8153144717216492, "rewards/GIT": 0.5662281513214111, "rewards/HPSv2": 0.2833700180053711, "step": 498 }, { "completion_length": 138.4453125, "epoch": 1.1743227326266195, "grad_norm": 0.4842907514916734, "kl": 0.1195157989859581, "learning_rate": 6.88125e-07, "loss": 0.1195157989859581, "reward": 1.5789727568626404, "reward_std": 0.16959313303232193, "rewards/GDino": 0.7783421576023102, "rewards/GIT": 0.5026769042015076, "rewards/HPSv2": 0.29795360565185547, "step": 499 }, { "completion_length": 132.5625, "epoch": 1.176678445229682, "grad_norm": 0.4753223817009035, "kl": 0.07905077561736107, "learning_rate": 6.875e-07, "loss": 0.07905077561736107, "reward": 1.3250168561935425, "reward_std": 0.17245954275131226, "rewards/GDino": 0.7136596441268921, "rewards/GIT": 0.3086857199668884, "rewards/HPSv2": 0.3026714324951172, "step": 500 }, { "completion_length": 135.796875, "epoch": 1.1790341578327443, "grad_norm": 0.6487307059294855, "kl": 0.09139437973499298, "learning_rate": 6.86875e-07, "loss": 0.09139437973499298, "reward": 1.380884826183319, "reward_std": 0.14910286664962769, "rewards/GDino": 0.6914523541927338, "rewards/GIT": 0.4198744297027588, "rewards/HPSv2": 0.2695579528808594, "step": 501 }, { "completion_length": 135.515625, "epoch": 1.1813898704358068, "grad_norm": 0.5639002085365394, "kl": 0.07704335823655128, "learning_rate": 6.8625e-07, "loss": 0.07704335823655128, "reward": 1.545265793800354, "reward_std": 0.1417168527841568, "rewards/GDino": 0.7653350532054901, "rewards/GIT": 0.4897075593471527, "rewards/HPSv2": 0.2902231216430664, "step": 502 }, { "completion_length": 133.890625, "epoch": 1.1837455830388692, "grad_norm": 0.7091597478020026, "kl": 0.04586506634950638, "learning_rate": 6.85625e-07, "loss": 0.04586506634950638, "reward": 1.2688724398612976, "reward_std": 0.24529920518398285, "rewards/GDino": 0.6847739517688751, "rewards/GIT": 0.3023696839809418, "rewards/HPSv2": 0.28172874450683594, "step": 503 }, { "completion_length": 134.3828125, "epoch": 1.1861012956419317, "grad_norm": 0.42703596966651397, "kl": 0.06285225972533226, "learning_rate": 6.85e-07, "loss": 0.06285225972533226, "reward": 1.193402886390686, "reward_std": 0.1375451609492302, "rewards/GDino": 0.6839216649532318, "rewards/GIT": 0.2174050584435463, "rewards/HPSv2": 0.29207611083984375, "step": 504 }, { "completion_length": 130.859375, "epoch": 1.1884570082449941, "grad_norm": 0.37813014648740995, "kl": 0.07878992520272732, "learning_rate": 6.843749999999999e-07, "loss": 0.07878992520272732, "reward": 1.612427532672882, "reward_std": 0.16499916464090347, "rewards/GDino": 0.7989850342273712, "rewards/GIT": 0.5338957160711288, "rewards/HPSv2": 0.27954673767089844, "step": 505 }, { "completion_length": 134.0859375, "epoch": 1.1908127208480566, "grad_norm": 0.3821656816853138, "kl": 0.11602576822042465, "learning_rate": 6.837499999999999e-07, "loss": 0.11602576822042465, "reward": 1.5149898529052734, "reward_std": 0.11856966093182564, "rewards/GDino": 0.7697213590145111, "rewards/GIT": 0.45404306054115295, "rewards/HPSv2": 0.2912254333496094, "step": 506 }, { "completion_length": 137.5625, "epoch": 1.193168433451119, "grad_norm": 0.8308295787025599, "kl": 0.09224450960755348, "learning_rate": 6.831249999999999e-07, "loss": 0.09224450960755348, "reward": 1.6943516731262207, "reward_std": 0.1519317552447319, "rewards/GDino": 0.8049950897693634, "rewards/GIT": 0.6128702759742737, "rewards/HPSv2": 0.2764863967895508, "step": 507 }, { "completion_length": 133.328125, "epoch": 1.1955241460541814, "grad_norm": 0.6197624253099856, "kl": 0.06368106976151466, "learning_rate": 6.824999999999999e-07, "loss": 0.06368106976151466, "reward": 1.4826791286468506, "reward_std": 0.15575309842824936, "rewards/GDino": 0.7783997356891632, "rewards/GIT": 0.41513386368751526, "rewards/HPSv2": 0.28914546966552734, "step": 508 }, { "completion_length": 139.2265625, "epoch": 1.197879858657244, "grad_norm": 0.47342213556929824, "kl": 0.08333444781601429, "learning_rate": 6.818749999999999e-07, "loss": 0.08333444781601429, "reward": 1.4167258739471436, "reward_std": 0.20706353336572647, "rewards/GDino": 0.7424542903900146, "rewards/GIT": 0.3830937147140503, "rewards/HPSv2": 0.29117774963378906, "step": 509 }, { "completion_length": 135.359375, "epoch": 1.2002355712603063, "grad_norm": 0.8054613836945023, "kl": 0.07162496447563171, "learning_rate": 6.8125e-07, "loss": 0.07162496447563171, "reward": 1.3995945453643799, "reward_std": 0.20349767059087753, "rewards/GDino": 0.7258060574531555, "rewards/GIT": 0.378688246011734, "rewards/HPSv2": 0.29510021209716797, "step": 510 }, { "completion_length": 130.84375, "epoch": 1.2025912838633688, "grad_norm": 0.3951471363667934, "kl": 0.09485295787453651, "learning_rate": 6.80625e-07, "loss": 0.09485295787453651, "reward": 1.2641770839691162, "reward_std": 0.2048080936074257, "rewards/GDino": 0.7021118104457855, "rewards/GIT": 0.26762788742780685, "rewards/HPSv2": 0.2944374084472656, "step": 511 }, { "completion_length": 131.1171875, "epoch": 1.2049469964664312, "grad_norm": 0.81734688400837, "kl": 0.11294835433363914, "learning_rate": 6.800000000000001e-07, "loss": 0.11294835433363914, "reward": 1.4205140471458435, "reward_std": 0.18680939078330994, "rewards/GDino": 0.7249797284603119, "rewards/GIT": 0.4163232743740082, "rewards/HPSv2": 0.27921104431152344, "step": 512 }, { "completion_length": 139.03125, "epoch": 1.2073027090694934, "grad_norm": 0.45653342821382675, "kl": 0.07696987316012383, "learning_rate": 6.79375e-07, "loss": 0.07696987316012383, "reward": 1.2958210706710815, "reward_std": 0.1597868949174881, "rewards/GDino": 0.7141644954681396, "rewards/GIT": 0.2825528532266617, "rewards/HPSv2": 0.2991037368774414, "step": 513 }, { "completion_length": 140.59375, "epoch": 1.2096584216725559, "grad_norm": 0.35697306686298447, "kl": 0.06453775241971016, "learning_rate": 6.7875e-07, "loss": 0.06453775241971016, "reward": 1.5084455609321594, "reward_std": 0.16371942311525345, "rewards/GDino": 0.7942606806755066, "rewards/GIT": 0.42814454436302185, "rewards/HPSv2": 0.2860403060913086, "step": 514 }, { "completion_length": 129.6640625, "epoch": 1.2120141342756183, "grad_norm": 0.40149502879532617, "kl": 0.05564852990210056, "learning_rate": 6.78125e-07, "loss": 0.05564852990210056, "reward": 1.2323238849639893, "reward_std": 0.16126789152622223, "rewards/GDino": 0.6632907390594482, "rewards/GIT": 0.2893500328063965, "rewards/HPSv2": 0.27968311309814453, "step": 515 }, { "completion_length": 134.140625, "epoch": 1.2143698468786808, "grad_norm": 0.5610845008094427, "kl": 0.0789623036980629, "learning_rate": 6.775e-07, "loss": 0.0789623036980629, "reward": 1.541166603565216, "reward_std": 0.12153387814760208, "rewards/GDino": 0.7381770312786102, "rewards/GIT": 0.5148311853408813, "rewards/HPSv2": 0.2881584167480469, "step": 516 }, { "completion_length": 139.4453125, "epoch": 1.2167255594817432, "grad_norm": 0.7644561084273881, "kl": 0.1489645391702652, "learning_rate": 6.76875e-07, "loss": 0.1489645391702652, "reward": 1.4892182350158691, "reward_std": 0.19029513746500015, "rewards/GDino": 0.8204675316810608, "rewards/GIT": 0.38366593420505524, "rewards/HPSv2": 0.28508472442626953, "step": 517 }, { "completion_length": 136.3671875, "epoch": 1.2190812720848057, "grad_norm": 0.5755993186357432, "kl": 0.09789583459496498, "learning_rate": 6.7625e-07, "loss": 0.09789583459496498, "reward": 1.6107258796691895, "reward_std": 0.14417658746242523, "rewards/GDino": 0.8040108382701874, "rewards/GIT": 0.5127955377101898, "rewards/HPSv2": 0.29391956329345703, "step": 518 }, { "completion_length": 136.984375, "epoch": 1.221436984687868, "grad_norm": 0.4349937367076073, "kl": 0.05000374838709831, "learning_rate": 6.75625e-07, "loss": 0.05000374838709831, "reward": 1.3472066521644592, "reward_std": 0.1414111852645874, "rewards/GDino": 0.7098392844200134, "rewards/GIT": 0.35516372323036194, "rewards/HPSv2": 0.28220367431640625, "step": 519 }, { "completion_length": 135.8359375, "epoch": 1.2237926972909305, "grad_norm": 0.5519564352284803, "kl": 0.09770356863737106, "learning_rate": 6.75e-07, "loss": 0.09770356863737106, "reward": 1.9635347723960876, "reward_std": 0.12439146637916565, "rewards/GDino": 0.8399739563465118, "rewards/GIT": 0.8489407300949097, "rewards/HPSv2": 0.27462005615234375, "step": 520 }, { "completion_length": 140.203125, "epoch": 1.226148409893993, "grad_norm": 0.81186830051296, "kl": 0.16700037568807602, "learning_rate": 6.743749999999999e-07, "loss": 0.16700037568807602, "reward": 1.3874614238739014, "reward_std": 0.17933734506368637, "rewards/GDino": 0.7454012632369995, "rewards/GIT": 0.34141136705875397, "rewards/HPSv2": 0.30064868927001953, "step": 521 }, { "completion_length": 136.3125, "epoch": 1.2285041224970554, "grad_norm": 0.8945472431920414, "kl": 0.09558967500925064, "learning_rate": 6.737499999999999e-07, "loss": 0.09558967500925064, "reward": 1.7090588808059692, "reward_std": 0.17525190114974976, "rewards/GDino": 0.836966872215271, "rewards/GIT": 0.5837685465812683, "rewards/HPSv2": 0.28832340240478516, "step": 522 }, { "completion_length": 140.3828125, "epoch": 1.2308598351001179, "grad_norm": 1.6803947605035419, "kl": 0.11041294783353806, "learning_rate": 6.731249999999999e-07, "loss": 0.11041294783353806, "reward": 1.328962802886963, "reward_std": 0.10928347334265709, "rewards/GDino": 0.6474496722221375, "rewards/GIT": 0.3989546597003937, "rewards/HPSv2": 0.2825584411621094, "step": 523 }, { "completion_length": 136.8203125, "epoch": 1.23321554770318, "grad_norm": 0.4010463921598425, "kl": 0.0733288824558258, "learning_rate": 6.724999999999999e-07, "loss": 0.0733288824558258, "reward": 1.2115467190742493, "reward_std": 0.17341859638690948, "rewards/GDino": 0.7103458344936371, "rewards/GIT": 0.2240992672741413, "rewards/HPSv2": 0.2771015167236328, "step": 524 }, { "completion_length": 135.296875, "epoch": 1.2355712603062425, "grad_norm": 0.45277962225833274, "kl": 0.09571744501590729, "learning_rate": 6.718749999999999e-07, "loss": 0.09571744501590729, "reward": 1.170459270477295, "reward_std": 0.13760198652744293, "rewards/GDino": 0.6662254631519318, "rewards/GIT": 0.2212890386581421, "rewards/HPSv2": 0.2829446792602539, "step": 525 }, { "completion_length": 136.1640625, "epoch": 1.237926972909305, "grad_norm": 0.8045160997797015, "kl": 0.12412884831428528, "learning_rate": 6.7125e-07, "loss": 0.12412884831428528, "reward": 1.5405339002609253, "reward_std": 0.11216038838028908, "rewards/GDino": 0.7928453385829926, "rewards/GIT": 0.4560864120721817, "rewards/HPSv2": 0.29160213470458984, "step": 526 }, { "completion_length": 136.5, "epoch": 1.2402826855123674, "grad_norm": 0.5091978735961272, "kl": 0.08212611824274063, "learning_rate": 6.70625e-07, "loss": 0.08212611824274063, "reward": 1.4714910387992859, "reward_std": 0.19381003081798553, "rewards/GDino": 0.7715637981891632, "rewards/GIT": 0.41187460720539093, "rewards/HPSv2": 0.2880525588989258, "step": 527 }, { "completion_length": 137.9375, "epoch": 1.2426383981154299, "grad_norm": 0.3995156255023577, "kl": 0.12051703780889511, "learning_rate": 6.7e-07, "loss": 0.12051703780889511, "reward": 1.3981001377105713, "reward_std": 0.15575546026229858, "rewards/GDino": 0.679175466299057, "rewards/GIT": 0.43048591911792755, "rewards/HPSv2": 0.2884387969970703, "step": 528 }, { "completion_length": 131.375, "epoch": 1.2449941107184923, "grad_norm": 0.5551119240403666, "kl": 0.1371229663491249, "learning_rate": 6.69375e-07, "loss": 0.1371229663491249, "reward": 1.3606649041175842, "reward_std": 0.1472584381699562, "rewards/GDino": 0.7472815215587616, "rewards/GIT": 0.31736570596694946, "rewards/HPSv2": 0.2960176467895508, "step": 529 }, { "completion_length": 139.125, "epoch": 1.2473498233215548, "grad_norm": 0.4137405730780375, "kl": 0.0885402150452137, "learning_rate": 6.6875e-07, "loss": 0.0885402150452137, "reward": 1.6691033840179443, "reward_std": 0.17055381834506989, "rewards/GDino": 0.8264661431312561, "rewards/GIT": 0.5464812815189362, "rewards/HPSv2": 0.2961559295654297, "step": 530 }, { "completion_length": 137.8203125, "epoch": 1.2497055359246172, "grad_norm": 0.47874954472785364, "kl": 0.06754971668124199, "learning_rate": 6.68125e-07, "loss": 0.06754971668124199, "reward": 1.7206087112426758, "reward_std": 0.12883227318525314, "rewards/GDino": 0.8369492590427399, "rewards/GIT": 0.5933543145656586, "rewards/HPSv2": 0.29030513763427734, "step": 531 }, { "completion_length": 136.0234375, "epoch": 1.2520612485276796, "grad_norm": 0.9467251629755215, "kl": 0.12289461120963097, "learning_rate": 6.675e-07, "loss": 0.12289461120963097, "reward": 1.4477567672729492, "reward_std": 0.2012491598725319, "rewards/GDino": 0.7446931302547455, "rewards/GIT": 0.4057566374540329, "rewards/HPSv2": 0.29730701446533203, "step": 532 }, { "completion_length": 138.9921875, "epoch": 1.254416961130742, "grad_norm": 1.0954573658292484, "kl": 0.1326635256409645, "learning_rate": 6.66875e-07, "loss": 0.1326635256409645, "reward": 1.3793284893035889, "reward_std": 0.16531047224998474, "rewards/GDino": 0.7204822301864624, "rewards/GIT": 0.37095293402671814, "rewards/HPSv2": 0.28789329528808594, "step": 533 }, { "completion_length": 138.6328125, "epoch": 1.2567726737338045, "grad_norm": 0.28796296593092746, "kl": 0.06455505639314651, "learning_rate": 6.6625e-07, "loss": 0.06455505639314651, "reward": 1.8569204211235046, "reward_std": 0.17192091792821884, "rewards/GDino": 0.9063403606414795, "rewards/GIT": 0.6692250967025757, "rewards/HPSv2": 0.2813549041748047, "step": 534 }, { "completion_length": 132.234375, "epoch": 1.259128386336867, "grad_norm": 0.9965608939722642, "kl": 0.11758935078978539, "learning_rate": 6.65625e-07, "loss": 0.11758935078978539, "reward": 1.463953673839569, "reward_std": 0.18660011142492294, "rewards/GDino": 0.809697836637497, "rewards/GIT": 0.36372853070497513, "rewards/HPSv2": 0.29052734375, "step": 535 }, { "completion_length": 139.4765625, "epoch": 1.2614840989399294, "grad_norm": 0.29565605977159526, "kl": 0.10540921241044998, "learning_rate": 6.65e-07, "loss": 0.10540921241044998, "reward": 1.3952195048332214, "reward_std": 0.19029314815998077, "rewards/GDino": 0.7569994032382965, "rewards/GIT": 0.36156290769577026, "rewards/HPSv2": 0.2766571044921875, "step": 536 }, { "completion_length": 140.234375, "epoch": 1.2638398115429919, "grad_norm": 0.6522150364951133, "kl": 0.11819014698266983, "learning_rate": 6.64375e-07, "loss": 0.11819014698266983, "reward": 1.6033844947814941, "reward_std": 0.16329306364059448, "rewards/GDino": 0.7566494047641754, "rewards/GIT": 0.5658388286828995, "rewards/HPSv2": 0.2808961868286133, "step": 537 }, { "completion_length": 136.9453125, "epoch": 1.2661955241460543, "grad_norm": 0.34960987476868005, "kl": 0.0937606580555439, "learning_rate": 6.637499999999999e-07, "loss": 0.0937606580555439, "reward": 1.5388636589050293, "reward_std": 0.17167873680591583, "rewards/GDino": 0.8155846297740936, "rewards/GIT": 0.42627422511577606, "rewards/HPSv2": 0.29700469970703125, "step": 538 }, { "completion_length": 135.0546875, "epoch": 1.2685512367491167, "grad_norm": 0.474313529886802, "kl": 0.1223103292286396, "learning_rate": 6.631249999999999e-07, "loss": 0.1223103292286396, "reward": 1.6490195989608765, "reward_std": 0.14562274143099785, "rewards/GDino": 0.8163503706455231, "rewards/GIT": 0.5583647787570953, "rewards/HPSv2": 0.2743043899536133, "step": 539 }, { "completion_length": 138.328125, "epoch": 1.270906949352179, "grad_norm": 0.6517935840002204, "kl": 0.14954904466867447, "learning_rate": 6.624999999999999e-07, "loss": 0.14954904466867447, "reward": 1.2878955602645874, "reward_std": 0.1987466812133789, "rewards/GDino": 0.7024535536766052, "rewards/GIT": 0.30970995873212814, "rewards/HPSv2": 0.27573204040527344, "step": 540 }, { "completion_length": 136.53125, "epoch": 1.2732626619552414, "grad_norm": 0.5643317123035897, "kl": 0.1227278895676136, "learning_rate": 6.618749999999999e-07, "loss": 0.1227278895676136, "reward": 1.4405762553215027, "reward_std": 0.13599949702620506, "rewards/GDino": 0.76610466837883, "rewards/GIT": 0.37802182137966156, "rewards/HPSv2": 0.2964496612548828, "step": 541 }, { "completion_length": 131.8828125, "epoch": 1.2756183745583038, "grad_norm": 0.7185519224920387, "kl": 0.23424260318279266, "learning_rate": 6.6125e-07, "loss": 0.23424260318279266, "reward": 1.5962799787521362, "reward_std": 0.11178594455122948, "rewards/GDino": 0.8190045952796936, "rewards/GIT": 0.4779866188764572, "rewards/HPSv2": 0.2992887496948242, "step": 542 }, { "completion_length": 139.828125, "epoch": 1.2779740871613663, "grad_norm": 1.4154808841121747, "kl": 0.15807217359542847, "learning_rate": 6.60625e-07, "loss": 0.15807217359542847, "reward": 1.568770408630371, "reward_std": 0.12805693596601486, "rewards/GDino": 0.8468250036239624, "rewards/GIT": 0.4340864568948746, "rewards/HPSv2": 0.2878589630126953, "step": 543 }, { "completion_length": 135.2890625, "epoch": 1.2803297997644287, "grad_norm": 0.5945784043573149, "kl": 0.11158892512321472, "learning_rate": 6.6e-07, "loss": 0.11158892512321472, "reward": 1.3227280974388123, "reward_std": 0.1633792594075203, "rewards/GDino": 0.674912840127945, "rewards/GIT": 0.3547072261571884, "rewards/HPSv2": 0.2931079864501953, "step": 544 }, { "completion_length": 146.28125, "epoch": 1.2826855123674912, "grad_norm": 0.4350435983658621, "kl": 0.07603197917342186, "learning_rate": 6.59375e-07, "loss": 0.07603197917342186, "reward": 1.3070669770240784, "reward_std": 0.20748768746852875, "rewards/GDino": 0.7024946808815002, "rewards/GIT": 0.3132418841123581, "rewards/HPSv2": 0.29133033752441406, "step": 545 }, { "completion_length": 137.234375, "epoch": 1.2850412249705536, "grad_norm": 1.985426100570532, "kl": 0.1434563286602497, "learning_rate": 6.587499999999999e-07, "loss": 0.1434563286602497, "reward": 1.7153125405311584, "reward_std": 0.1450836956501007, "rewards/GDino": 0.8556762039661407, "rewards/GIT": 0.5728655010461807, "rewards/HPSv2": 0.2867708206176758, "step": 546 }, { "completion_length": 138.8203125, "epoch": 1.287396937573616, "grad_norm": 0.6972916852222926, "kl": 0.11977019160985947, "learning_rate": 6.581249999999999e-07, "loss": 0.11977019160985947, "reward": 1.4533965587615967, "reward_std": 0.17517971247434616, "rewards/GDino": 0.7826249003410339, "rewards/GIT": 0.3743228316307068, "rewards/HPSv2": 0.2964487075805664, "step": 547 }, { "completion_length": 146.2421875, "epoch": 1.2897526501766785, "grad_norm": 0.8443908627693597, "kl": 0.12893804907798767, "learning_rate": 6.575e-07, "loss": 0.12893804907798767, "reward": 1.4809221029281616, "reward_std": 0.19041018933057785, "rewards/GDino": 0.8141649067401886, "rewards/GIT": 0.3694596588611603, "rewards/HPSv2": 0.29729747772216797, "step": 548 }, { "completion_length": 138.2578125, "epoch": 1.292108362779741, "grad_norm": 0.47435315680365703, "kl": 0.17353223264217377, "learning_rate": 6.56875e-07, "loss": 0.17353223264217377, "reward": 1.4687809944152832, "reward_std": 0.2309345379471779, "rewards/GDino": 0.7837478816509247, "rewards/GIT": 0.39861229062080383, "rewards/HPSv2": 0.2864208221435547, "step": 549 }, { "completion_length": 136.8671875, "epoch": 1.2944640753828032, "grad_norm": 0.46887750943654005, "kl": 0.13014454394578934, "learning_rate": 6.5625e-07, "loss": 0.13014454394578934, "reward": 1.241137444972992, "reward_std": 0.16941027343273163, "rewards/GDino": 0.7312130928039551, "rewards/GIT": 0.19871556758880615, "rewards/HPSv2": 0.31120872497558594, "step": 550 }, { "completion_length": 141.953125, "epoch": 1.2968197879858656, "grad_norm": 0.3754230162628254, "kl": 0.1079224981367588, "learning_rate": 6.55625e-07, "loss": 0.1079224981367588, "reward": 1.4913674592971802, "reward_std": 0.1940731406211853, "rewards/GDino": 0.7691451609134674, "rewards/GIT": 0.42201513051986694, "rewards/HPSv2": 0.30020713806152344, "step": 551 }, { "completion_length": 139.734375, "epoch": 1.299175500588928, "grad_norm": 0.6600657886904785, "kl": 0.09420732408761978, "learning_rate": 6.55e-07, "loss": 0.09420732408761978, "reward": 1.2613070607185364, "reward_std": 0.1745917946100235, "rewards/GDino": 0.7034119963645935, "rewards/GIT": 0.26711490750312805, "rewards/HPSv2": 0.29078006744384766, "step": 552 }, { "completion_length": 139.0625, "epoch": 1.3015312131919905, "grad_norm": 1.2862857940936234, "kl": 0.08957865834236145, "learning_rate": 6.54375e-07, "loss": 0.08957865834236145, "reward": 1.6387276649475098, "reward_std": 0.1517413929104805, "rewards/GDino": 0.8120055198669434, "rewards/GIT": 0.5217159986495972, "rewards/HPSv2": 0.3050060272216797, "step": 553 }, { "completion_length": 137.015625, "epoch": 1.303886925795053, "grad_norm": 0.40132151377808434, "kl": 0.10629618540406227, "learning_rate": 6.5375e-07, "loss": 0.10629618540406227, "reward": 1.3669897317886353, "reward_std": 0.16217180341482162, "rewards/GDino": 0.7578899264335632, "rewards/GIT": 0.3240570202469826, "rewards/HPSv2": 0.28504276275634766, "step": 554 }, { "completion_length": 137.6875, "epoch": 1.3062426383981154, "grad_norm": 0.38723723970947904, "kl": 0.12319814413785934, "learning_rate": 6.531249999999999e-07, "loss": 0.12319814413785934, "reward": 1.37913978099823, "reward_std": 0.12230361998081207, "rewards/GDino": 0.7463316321372986, "rewards/GIT": 0.3426651805639267, "rewards/HPSv2": 0.2901430130004883, "step": 555 }, { "completion_length": 138.859375, "epoch": 1.3085983510011778, "grad_norm": 0.39625782509536234, "kl": 0.11724453046917915, "learning_rate": 6.524999999999999e-07, "loss": 0.11724453046917915, "reward": 1.6070721745491028, "reward_std": 0.12813667207956314, "rewards/GDino": 0.8320958912372589, "rewards/GIT": 0.47392599284648895, "rewards/HPSv2": 0.30105018615722656, "step": 556 }, { "completion_length": 135.5546875, "epoch": 1.3109540636042403, "grad_norm": 0.3712145758420088, "kl": 0.10071728751063347, "learning_rate": 6.51875e-07, "loss": 0.10071728751063347, "reward": 1.3656325936317444, "reward_std": 0.18164127320051193, "rewards/GDino": 0.712488979101181, "rewards/GIT": 0.36163483560085297, "rewards/HPSv2": 0.29150867462158203, "step": 557 }, { "completion_length": 141.7421875, "epoch": 1.3133097762073027, "grad_norm": 0.33292080979588395, "kl": 0.07941919192671776, "learning_rate": 6.5125e-07, "loss": 0.07941919192671776, "reward": 1.474417805671692, "reward_std": 0.15681206807494164, "rewards/GDino": 0.8088288009166718, "rewards/GIT": 0.38285207748413086, "rewards/HPSv2": 0.28273677825927734, "step": 558 }, { "completion_length": 139.015625, "epoch": 1.3156654888103652, "grad_norm": 0.41790306505369146, "kl": 0.06365776993334293, "learning_rate": 6.50625e-07, "loss": 0.06365776993334293, "reward": 1.2886786460876465, "reward_std": 0.16218340396881104, "rewards/GDino": 0.7832428514957428, "rewards/GIT": 0.21513821929693222, "rewards/HPSv2": 0.2902975082397461, "step": 559 }, { "completion_length": 142.84375, "epoch": 1.3180212014134276, "grad_norm": 0.36862810953024167, "kl": 0.07621027156710625, "learning_rate": 6.5e-07, "loss": 0.07621027156710625, "reward": 1.501527488231659, "reward_std": 0.16615239530801773, "rewards/GDino": 0.7740911245346069, "rewards/GIT": 0.4320128262042999, "rewards/HPSv2": 0.2954235076904297, "step": 560 }, { "completion_length": 141.296875, "epoch": 1.32037691401649, "grad_norm": 0.3255841034311746, "kl": 0.11215062439441681, "learning_rate": 6.49375e-07, "loss": 0.11215062439441681, "reward": 1.8249411582946777, "reward_std": 0.160112664103508, "rewards/GDino": 0.8774739503860474, "rewards/GIT": 0.6568387746810913, "rewards/HPSv2": 0.29062843322753906, "step": 561 }, { "completion_length": 134.71875, "epoch": 1.3227326266195525, "grad_norm": 0.3914600177037911, "kl": 0.0971904769539833, "learning_rate": 6.4875e-07, "loss": 0.0971904769539833, "reward": 1.4245556592941284, "reward_std": 0.18521569296717644, "rewards/GDino": 0.7525670826435089, "rewards/GIT": 0.3893270790576935, "rewards/HPSv2": 0.28266143798828125, "step": 562 }, { "completion_length": 135.5234375, "epoch": 1.325088339222615, "grad_norm": 0.6643460974645673, "kl": 0.17191359400749207, "learning_rate": 6.481249999999999e-07, "loss": 0.17191359400749207, "reward": 1.6143383383750916, "reward_std": 0.10674099624156952, "rewards/GDino": 0.8557388782501221, "rewards/GIT": 0.4595586955547333, "rewards/HPSv2": 0.2990407943725586, "step": 563 }, { "completion_length": 145.4296875, "epoch": 1.3274440518256774, "grad_norm": 0.383301408392367, "kl": 0.11143064871430397, "learning_rate": 6.474999999999999e-07, "loss": 0.11143064871430397, "reward": 1.5061694383621216, "reward_std": 0.16928518563508987, "rewards/GDino": 0.7989214062690735, "rewards/GIT": 0.4287378489971161, "rewards/HPSv2": 0.27851009368896484, "step": 564 }, { "completion_length": 142.1328125, "epoch": 1.3297997644287398, "grad_norm": 0.7801912893586306, "kl": 0.10384514182806015, "learning_rate": 6.46875e-07, "loss": 0.10384514182806015, "reward": 1.2245123088359833, "reward_std": 0.19406025856733322, "rewards/GDino": 0.6608648002147675, "rewards/GIT": 0.2764064818620682, "rewards/HPSv2": 0.28724098205566406, "step": 565 }, { "completion_length": 144.109375, "epoch": 1.332155477031802, "grad_norm": 0.33482357184137895, "kl": 0.08322149142622948, "learning_rate": 6.4625e-07, "loss": 0.08322149142622948, "reward": 1.756816029548645, "reward_std": 0.14149026200175285, "rewards/GDino": 0.8687175214290619, "rewards/GIT": 0.5992610454559326, "rewards/HPSv2": 0.2888374328613281, "step": 566 }, { "completion_length": 139.625, "epoch": 1.3345111896348645, "grad_norm": 0.6268321972586731, "kl": 0.08196162432432175, "learning_rate": 6.45625e-07, "loss": 0.08196162432432175, "reward": 1.4445892572402954, "reward_std": 0.14610330760478973, "rewards/GDino": 0.7312193810939789, "rewards/GIT": 0.43684911727905273, "rewards/HPSv2": 0.2765207290649414, "step": 567 }, { "completion_length": 147.1015625, "epoch": 1.336866902237927, "grad_norm": 0.3139751881168278, "kl": 0.10401663556694984, "learning_rate": 6.45e-07, "loss": 0.10401663556694984, "reward": 1.5223528146743774, "reward_std": 0.16305619478225708, "rewards/GDino": 0.7704153656959534, "rewards/GIT": 0.4593215733766556, "rewards/HPSv2": 0.2926158905029297, "step": 568 }, { "completion_length": 143.21875, "epoch": 1.3392226148409894, "grad_norm": 0.37546589282195736, "kl": 0.12078196182847023, "learning_rate": 6.44375e-07, "loss": 0.12078196182847023, "reward": 1.4464203119277954, "reward_std": 0.12357235699892044, "rewards/GDino": 0.743325412273407, "rewards/GIT": 0.40839238464832306, "rewards/HPSv2": 0.29470252990722656, "step": 569 }, { "completion_length": 138.3515625, "epoch": 1.3415783274440518, "grad_norm": 0.5712413467849771, "kl": 0.16047362238168716, "learning_rate": 6.4375e-07, "loss": 0.16047362238168716, "reward": 1.3412472605705261, "reward_std": 0.1740206852555275, "rewards/GDino": 0.7145533859729767, "rewards/GIT": 0.34095868468284607, "rewards/HPSv2": 0.2857351303100586, "step": 570 }, { "completion_length": 142.9609375, "epoch": 1.3439340400471143, "grad_norm": 0.7650841411374711, "kl": 0.12445234134793282, "learning_rate": 6.431249999999999e-07, "loss": 0.12445234134793282, "reward": 1.610679566860199, "reward_std": 0.14543882757425308, "rewards/GDino": 0.8179568648338318, "rewards/GIT": 0.5053206384181976, "rewards/HPSv2": 0.2874021530151367, "step": 571 }, { "completion_length": 136.0078125, "epoch": 1.3462897526501767, "grad_norm": 0.6867274510853927, "kl": 0.12082434818148613, "learning_rate": 6.424999999999999e-07, "loss": 0.12082434818148613, "reward": 1.4243210554122925, "reward_std": 0.13541127368807793, "rewards/GDino": 0.7430658638477325, "rewards/GIT": 0.38691120594739914, "rewards/HPSv2": 0.2943439483642578, "step": 572 }, { "completion_length": 135.4296875, "epoch": 1.3486454652532391, "grad_norm": 0.6274651119644614, "kl": 0.1641533263027668, "learning_rate": 6.41875e-07, "loss": 0.1641533263027668, "reward": 1.5996240377426147, "reward_std": 0.14738616347312927, "rewards/GDino": 0.8476959466934204, "rewards/GIT": 0.46514394879341125, "rewards/HPSv2": 0.28678417205810547, "step": 573 }, { "completion_length": 143.7109375, "epoch": 1.3510011778563016, "grad_norm": 0.5407800059793987, "kl": 0.13391613215208054, "learning_rate": 6.4125e-07, "loss": 0.13391613215208054, "reward": 1.2837878465652466, "reward_std": 0.14496536925435066, "rewards/GDino": 0.6614941954612732, "rewards/GIT": 0.3343784809112549, "rewards/HPSv2": 0.2879152297973633, "step": 574 }, { "completion_length": 144.2109375, "epoch": 1.353356890459364, "grad_norm": 0.3189559283867708, "kl": 0.05273721180856228, "learning_rate": 6.40625e-07, "loss": 0.05273721180856228, "reward": 1.6771380305290222, "reward_std": 0.17047832906246185, "rewards/GDino": 0.8165364563465118, "rewards/GIT": 0.5806227922439575, "rewards/HPSv2": 0.27997875213623047, "step": 575 }, { "completion_length": 144.203125, "epoch": 1.3557126030624262, "grad_norm": 0.37190733156628325, "kl": 0.10164650343358517, "learning_rate": 6.4e-07, "loss": 0.10164650343358517, "reward": 1.6950152516365051, "reward_std": 0.17884056270122528, "rewards/GDino": 0.8676457107067108, "rewards/GIT": 0.5469596683979034, "rewards/HPSv2": 0.2804098129272461, "step": 576 }, { "completion_length": 137.5546875, "epoch": 1.3580683156654887, "grad_norm": 0.7417327277035697, "kl": 0.16644983738660812, "learning_rate": 6.39375e-07, "loss": 0.16644983738660812, "reward": 1.3371291756629944, "reward_std": 0.13031911849975586, "rewards/GDino": 0.737271785736084, "rewards/GIT": 0.3195076286792755, "rewards/HPSv2": 0.2803497314453125, "step": 577 }, { "completion_length": 137.6796875, "epoch": 1.3604240282685511, "grad_norm": 0.8043484628262049, "kl": 0.1299707591533661, "learning_rate": 6.3875e-07, "loss": 0.1299707591533661, "reward": 1.47442227602005, "reward_std": 0.1264227330684662, "rewards/GDino": 0.7693486511707306, "rewards/GIT": 0.4024241119623184, "rewards/HPSv2": 0.30264949798583984, "step": 578 }, { "completion_length": 148.6640625, "epoch": 1.3627797408716136, "grad_norm": 0.3964506183908197, "kl": 0.0797329843044281, "learning_rate": 6.38125e-07, "loss": 0.0797329843044281, "reward": 1.3547425270080566, "reward_std": 0.2087239772081375, "rewards/GDino": 0.7233508825302124, "rewards/GIT": 0.3433990478515625, "rewards/HPSv2": 0.2879924774169922, "step": 579 }, { "completion_length": 144.4375, "epoch": 1.365135453474676, "grad_norm": 0.3961708239621721, "kl": 0.04831145890057087, "learning_rate": 6.374999999999999e-07, "loss": 0.04831145890057087, "reward": 1.4870784878730774, "reward_std": 0.16419214755296707, "rewards/GDino": 0.7396105527877808, "rewards/GIT": 0.45302948355674744, "rewards/HPSv2": 0.29443836212158203, "step": 580 }, { "completion_length": 142.4609375, "epoch": 1.3674911660777385, "grad_norm": 0.5275190459753168, "kl": 0.11380824074149132, "learning_rate": 6.368749999999999e-07, "loss": 0.11380824074149132, "reward": 1.1405942738056183, "reward_std": 0.11747300252318382, "rewards/GDino": 0.6560497581958771, "rewards/GIT": 0.19217849522829056, "rewards/HPSv2": 0.29236602783203125, "step": 581 }, { "completion_length": 133.1484375, "epoch": 1.369846878680801, "grad_norm": 0.6031165053994866, "kl": 0.10443776473402977, "learning_rate": 6.362499999999999e-07, "loss": 0.10443776473402977, "reward": 1.6090281009674072, "reward_std": 0.1659797579050064, "rewards/GDino": 0.8128030896186829, "rewards/GIT": 0.5179933905601501, "rewards/HPSv2": 0.2782316207885742, "step": 582 }, { "completion_length": 144.2265625, "epoch": 1.3722025912838633, "grad_norm": 0.3878960138387615, "kl": 0.08805383369326591, "learning_rate": 6.35625e-07, "loss": 0.08805383369326591, "reward": 1.4870412349700928, "reward_std": 0.15723756328225136, "rewards/GDino": 0.7476297318935394, "rewards/GIT": 0.4616299867630005, "rewards/HPSv2": 0.27778148651123047, "step": 583 }, { "completion_length": 143.40625, "epoch": 1.3745583038869258, "grad_norm": 0.9306203008052882, "kl": 0.10823635011911392, "learning_rate": 6.35e-07, "loss": 0.10823635011911392, "reward": 1.529003381729126, "reward_std": 0.13275331258773804, "rewards/GDino": 0.7972338199615479, "rewards/GIT": 0.4412727952003479, "rewards/HPSv2": 0.290496826171875, "step": 584 }, { "completion_length": 147.6171875, "epoch": 1.3769140164899882, "grad_norm": 0.42047341249776765, "kl": 0.12564117461442947, "learning_rate": 6.34375e-07, "loss": 0.12564117461442947, "reward": 1.3203855752944946, "reward_std": 0.18313417583703995, "rewards/GDino": 0.7101315855979919, "rewards/GIT": 0.31755030900239944, "rewards/HPSv2": 0.29270362854003906, "step": 585 }, { "completion_length": 148.75, "epoch": 1.3792697290930507, "grad_norm": 0.31178652140458557, "kl": 0.08040138706564903, "learning_rate": 6.3375e-07, "loss": 0.08040138706564903, "reward": 1.4704681038856506, "reward_std": 0.12151794135570526, "rewards/GDino": 0.7488127648830414, "rewards/GIT": 0.4386229068040848, "rewards/HPSv2": 0.2830324172973633, "step": 586 }, { "completion_length": 146.4609375, "epoch": 1.3816254416961131, "grad_norm": 0.4317661145656204, "kl": 0.18399332463741302, "learning_rate": 6.33125e-07, "loss": 0.18399332463741302, "reward": 1.4026954770088196, "reward_std": 0.17472387850284576, "rewards/GDino": 0.7568524777889252, "rewards/GIT": 0.35103175044059753, "rewards/HPSv2": 0.2948112487792969, "step": 587 }, { "completion_length": 138.40625, "epoch": 1.3839811542991756, "grad_norm": 0.3968456902715425, "kl": 0.11074898764491081, "learning_rate": 6.324999999999999e-07, "loss": 0.11074898764491081, "reward": 1.3268905878067017, "reward_std": 0.17768090590834618, "rewards/GDino": 0.7453171014785767, "rewards/GIT": 0.2975112199783325, "rewards/HPSv2": 0.28406238555908203, "step": 588 }, { "completion_length": 146.3984375, "epoch": 1.386336866902238, "grad_norm": 0.7937316666401802, "kl": 0.1870146095752716, "learning_rate": 6.31875e-07, "loss": 0.1870146095752716, "reward": 1.681689977645874, "reward_std": 0.1543176919221878, "rewards/GDino": 0.830425888299942, "rewards/GIT": 0.5661927461624146, "rewards/HPSv2": 0.28507137298583984, "step": 589 }, { "completion_length": 134.125, "epoch": 1.3886925795053005, "grad_norm": 0.33884024320535755, "kl": 0.16669970750808716, "learning_rate": 6.3125e-07, "loss": 0.16669970750808716, "reward": 1.4979641437530518, "reward_std": 0.1640223115682602, "rewards/GDino": 0.7693493664264679, "rewards/GIT": 0.44769851863384247, "rewards/HPSv2": 0.2809162139892578, "step": 590 }, { "completion_length": 142.9765625, "epoch": 1.391048292108363, "grad_norm": 0.3347161362050823, "kl": 0.08214369602501392, "learning_rate": 6.30625e-07, "loss": 0.08214369602501392, "reward": 1.472494125366211, "reward_std": 0.1634005457162857, "rewards/GDino": 0.7610676884651184, "rewards/GIT": 0.41345272958278656, "rewards/HPSv2": 0.2979736328125, "step": 591 }, { "completion_length": 143.75, "epoch": 1.3934040047114253, "grad_norm": 0.30507082139256586, "kl": 0.07324481382966042, "learning_rate": 6.3e-07, "loss": 0.07324481382966042, "reward": 1.6254292130470276, "reward_std": 0.16757623851299286, "rewards/GDino": 0.8062499761581421, "rewards/GIT": 0.5461241751909256, "rewards/HPSv2": 0.2730550765991211, "step": 592 }, { "completion_length": 143.3046875, "epoch": 1.3957597173144876, "grad_norm": 0.44347919633588223, "kl": 0.13521021977066994, "learning_rate": 6.29375e-07, "loss": 0.13521021977066994, "reward": 1.3111346364021301, "reward_std": 0.123922448605299, "rewards/GDino": 0.7028718590736389, "rewards/GIT": 0.3143431544303894, "rewards/HPSv2": 0.29391956329345703, "step": 593 }, { "completion_length": 145.59375, "epoch": 1.39811542991755, "grad_norm": 0.6614818985392494, "kl": 0.11023616790771484, "learning_rate": 6.2875e-07, "loss": 0.11023616790771484, "reward": 1.4376718997955322, "reward_std": 0.18897195905447006, "rewards/GDino": 0.7310039699077606, "rewards/GIT": 0.4167003035545349, "rewards/HPSv2": 0.28996753692626953, "step": 594 }, { "completion_length": 138.765625, "epoch": 1.4004711425206124, "grad_norm": 0.8948088269522633, "kl": 0.160691536962986, "learning_rate": 6.28125e-07, "loss": 0.160691536962986, "reward": 1.6420353055000305, "reward_std": 0.13575248420238495, "rewards/GDino": 0.8449749052524567, "rewards/GIT": 0.4983246922492981, "rewards/HPSv2": 0.2987356185913086, "step": 595 }, { "completion_length": 138.1953125, "epoch": 1.4028268551236749, "grad_norm": 0.8169328827387338, "kl": 0.16073638945817947, "learning_rate": 6.274999999999999e-07, "loss": 0.16073638945817947, "reward": 1.6120657324790955, "reward_std": 0.1393938511610031, "rewards/GDino": 0.7908580005168915, "rewards/GIT": 0.5236804187297821, "rewards/HPSv2": 0.2975273132324219, "step": 596 }, { "completion_length": 141.3515625, "epoch": 1.4051825677267373, "grad_norm": 1.1304107049245222, "kl": 0.1446751095354557, "learning_rate": 6.268749999999999e-07, "loss": 0.1446751095354557, "reward": 1.424015760421753, "reward_std": 0.16287735104560852, "rewards/GDino": 0.7552278637886047, "rewards/GIT": 0.37412746250629425, "rewards/HPSv2": 0.2946605682373047, "step": 597 }, { "completion_length": 145.4921875, "epoch": 1.4075382803297998, "grad_norm": 0.9872459083603768, "kl": 0.09675141051411629, "learning_rate": 6.262499999999999e-07, "loss": 0.09675141051411629, "reward": 1.5900498032569885, "reward_std": 0.17751558125019073, "rewards/GDino": 0.835483968257904, "rewards/GIT": 0.456244096159935, "rewards/HPSv2": 0.2983217239379883, "step": 598 }, { "completion_length": 138.640625, "epoch": 1.4098939929328622, "grad_norm": 1.2719780418896733, "kl": 0.13226554542779922, "learning_rate": 6.256249999999999e-07, "loss": 0.13226554542779922, "reward": 1.2951699495315552, "reward_std": 0.16750632226467133, "rewards/GDino": 0.7549033463001251, "rewards/GIT": 0.24963624775409698, "rewards/HPSv2": 0.2906303405761719, "step": 599 }, { "completion_length": 144.5390625, "epoch": 1.4122497055359247, "grad_norm": 0.4453355884937973, "kl": 0.15846922248601913, "learning_rate": 6.249999999999999e-07, "loss": 0.15846922248601913, "reward": 1.4562865495681763, "reward_std": 0.15785513818264008, "rewards/GDino": 0.7852613925933838, "rewards/GIT": 0.3871220201253891, "rewards/HPSv2": 0.2839031219482422, "step": 600 }, { "completion_length": 144.8125, "epoch": 1.414605418138987, "grad_norm": 0.36822617211847203, "kl": 0.10048692673444748, "learning_rate": 6.24375e-07, "loss": 0.10048692673444748, "reward": 1.645211935043335, "reward_std": 0.1570643074810505, "rewards/GDino": 0.7955653667449951, "rewards/GIT": 0.5659913569688797, "rewards/HPSv2": 0.28365516662597656, "step": 601 }, { "completion_length": 148.5, "epoch": 1.4169611307420495, "grad_norm": 1.2596759129236592, "kl": 0.1658380851149559, "learning_rate": 6.2375e-07, "loss": 0.1658380851149559, "reward": 1.5007840394973755, "reward_std": 0.21078625321388245, "rewards/GDino": 0.7603049576282501, "rewards/GIT": 0.4532839357852936, "rewards/HPSv2": 0.28719520568847656, "step": 602 }, { "completion_length": 146.40625, "epoch": 1.4193168433451118, "grad_norm": 0.7671942070770004, "kl": 0.15320618450641632, "learning_rate": 6.23125e-07, "loss": 0.15320618450641632, "reward": 1.3951091170310974, "reward_std": 0.17252817004919052, "rewards/GDino": 0.7147566676139832, "rewards/GIT": 0.3785679042339325, "rewards/HPSv2": 0.3017845153808594, "step": 603 }, { "completion_length": 143.6640625, "epoch": 1.4216725559481742, "grad_norm": 0.7218751938486399, "kl": 0.13489320129156113, "learning_rate": 6.225000000000001e-07, "loss": 0.13489320129156113, "reward": 1.2232465147972107, "reward_std": 0.14657112956047058, "rewards/GDino": 0.7101245224475861, "rewards/GIT": 0.22221313416957855, "rewards/HPSv2": 0.2909088134765625, "step": 604 }, { "completion_length": 139.75, "epoch": 1.4240282685512367, "grad_norm": 0.49175743495248214, "kl": 0.12343315780162811, "learning_rate": 6.21875e-07, "loss": 0.12343315780162811, "reward": 1.452384889125824, "reward_std": 0.18380863964557648, "rewards/GDino": 0.7756332457065582, "rewards/GIT": 0.3912453353404999, "rewards/HPSv2": 0.2855062484741211, "step": 605 }, { "completion_length": 144.1640625, "epoch": 1.426383981154299, "grad_norm": 6.912981462325778, "kl": 1.7431976646184921, "learning_rate": 6.2125e-07, "loss": 1.7431976646184921, "reward": 1.5427829027175903, "reward_std": 0.11256594210863113, "rewards/GDino": 0.7892815172672272, "rewards/GIT": 0.4641870856285095, "rewards/HPSv2": 0.28931427001953125, "step": 606 }, { "completion_length": 146.6875, "epoch": 1.4287396937573615, "grad_norm": 0.4906748613099644, "kl": 0.1277228184044361, "learning_rate": 6.20625e-07, "loss": 0.1277228184044361, "reward": 1.3372564911842346, "reward_std": 0.23184366524219513, "rewards/GDino": 0.7412379086017609, "rewards/GIT": 0.3139512985944748, "rewards/HPSv2": 0.28206729888916016, "step": 607 }, { "completion_length": 143.7421875, "epoch": 1.431095406360424, "grad_norm": 1.1325558625551266, "kl": 0.14914269745349884, "learning_rate": 6.2e-07, "loss": 0.14914269745349884, "reward": 1.3255081176757812, "reward_std": 0.1592753604054451, "rewards/GDino": 0.7487305104732513, "rewards/GIT": 0.2760343700647354, "rewards/HPSv2": 0.30074310302734375, "step": 608 }, { "completion_length": 141.46875, "epoch": 1.4334511189634864, "grad_norm": 0.3277199415888763, "kl": 0.16788528114557266, "learning_rate": 6.19375e-07, "loss": 0.16788528114557266, "reward": 1.4301774501800537, "reward_std": 0.12119947373867035, "rewards/GDino": 0.7879155278205872, "rewards/GIT": 0.35771700739860535, "rewards/HPSv2": 0.2845449447631836, "step": 609 }, { "completion_length": 143.6796875, "epoch": 1.4358068315665489, "grad_norm": 0.3512432783374457, "kl": 0.14466553181409836, "learning_rate": 6.1875e-07, "loss": 0.14466553181409836, "reward": 1.6464316844940186, "reward_std": 0.16529709100723267, "rewards/GDino": 0.8352936208248138, "rewards/GIT": 0.5154961943626404, "rewards/HPSv2": 0.2956418991088867, "step": 610 }, { "completion_length": 144.0078125, "epoch": 1.4381625441696113, "grad_norm": 0.44119387289691353, "kl": 0.1560012772679329, "learning_rate": 6.18125e-07, "loss": 0.1560012772679329, "reward": 1.568103313446045, "reward_std": 0.15884799510240555, "rewards/GDino": 0.8155370950698853, "rewards/GIT": 0.4624745547771454, "rewards/HPSv2": 0.29009151458740234, "step": 611 }, { "completion_length": 146.0, "epoch": 1.4405182567726738, "grad_norm": 0.7711820218907935, "kl": 0.1277160458266735, "learning_rate": 6.175e-07, "loss": 0.1277160458266735, "reward": 1.4249069094657898, "reward_std": 0.23879598081111908, "rewards/GDino": 0.7715840041637421, "rewards/GIT": 0.3721873462200165, "rewards/HPSv2": 0.28113555908203125, "step": 612 }, { "completion_length": 146.6875, "epoch": 1.4428739693757362, "grad_norm": 0.4603535545716758, "kl": 0.21749602258205414, "learning_rate": 6.168749999999999e-07, "loss": 0.21749602258205414, "reward": 1.58238285779953, "reward_std": 0.152094304561615, "rewards/GDino": 0.7924623191356659, "rewards/GIT": 0.506579115986824, "rewards/HPSv2": 0.2833414077758789, "step": 613 }, { "completion_length": 141.8203125, "epoch": 1.4452296819787986, "grad_norm": 0.40601975853074873, "kl": 0.12435641884803772, "learning_rate": 6.162499999999999e-07, "loss": 0.12435641884803772, "reward": 1.7545067071914673, "reward_std": 0.11374851316213608, "rewards/GDino": 0.8808862566947937, "rewards/GIT": 0.5909828841686249, "rewards/HPSv2": 0.2826375961303711, "step": 614 }, { "completion_length": 143.8046875, "epoch": 1.447585394581861, "grad_norm": 0.7757122218650558, "kl": 0.16097911447286606, "learning_rate": 6.156249999999999e-07, "loss": 0.16097911447286606, "reward": 1.2401981353759766, "reward_std": 0.21124746650457382, "rewards/GDino": 0.6749492585659027, "rewards/GIT": 0.26937515288591385, "rewards/HPSv2": 0.29587364196777344, "step": 615 }, { "completion_length": 148.2578125, "epoch": 1.4499411071849235, "grad_norm": 0.4270195700291534, "kl": 0.18223880231380463, "learning_rate": 6.149999999999999e-07, "loss": 0.18223880231380463, "reward": 1.4430859088897705, "reward_std": 0.19172295182943344, "rewards/GDino": 0.7872471809387207, "rewards/GIT": 0.3713986724615097, "rewards/HPSv2": 0.2844400405883789, "step": 616 }, { "completion_length": 149.3125, "epoch": 1.452296819787986, "grad_norm": 0.6352742331034914, "kl": 0.2417542114853859, "learning_rate": 6.143749999999999e-07, "loss": 0.2417542114853859, "reward": 1.4620324969291687, "reward_std": 0.13916456699371338, "rewards/GDino": 0.7279389500617981, "rewards/GIT": 0.44671331346035004, "rewards/HPSv2": 0.2873802185058594, "step": 617 }, { "completion_length": 147.265625, "epoch": 1.4546525323910484, "grad_norm": 0.597596144930165, "kl": 0.15659081935882568, "learning_rate": 6.1375e-07, "loss": 0.15659081935882568, "reward": 1.4351047277450562, "reward_std": 0.16509682685136795, "rewards/GDino": 0.7432261109352112, "rewards/GIT": 0.4078686535358429, "rewards/HPSv2": 0.2840099334716797, "step": 618 }, { "completion_length": 145.03125, "epoch": 1.4570082449941109, "grad_norm": 0.41173810910198777, "kl": 0.218051016330719, "learning_rate": 6.13125e-07, "loss": 0.218051016330719, "reward": 1.5173739790916443, "reward_std": 0.1316210925579071, "rewards/GDino": 0.7331440448760986, "rewards/GIT": 0.5014892220497131, "rewards/HPSv2": 0.28274059295654297, "step": 619 }, { "completion_length": 143.84375, "epoch": 1.459363957597173, "grad_norm": 0.5007148010037674, "kl": 0.1768430843949318, "learning_rate": 6.125000000000001e-07, "loss": 0.1768430843949318, "reward": 1.1980259418487549, "reward_std": 0.21489939093589783, "rewards/GDino": 0.6740459501743317, "rewards/GIT": 0.2325657457113266, "rewards/HPSv2": 0.2914142608642578, "step": 620 }, { "completion_length": 141.7109375, "epoch": 1.4617196702002355, "grad_norm": 0.5986805739485457, "kl": 0.19059933349490166, "learning_rate": 6.11875e-07, "loss": 0.19059933349490166, "reward": 1.3100901246070862, "reward_std": 0.1625596061348915, "rewards/GDino": 0.6921733319759369, "rewards/GIT": 0.3247296214103699, "rewards/HPSv2": 0.29318714141845703, "step": 621 }, { "completion_length": 147.53125, "epoch": 1.464075382803298, "grad_norm": 0.7003130436183359, "kl": 0.13856308907270432, "learning_rate": 6.1125e-07, "loss": 0.13856308907270432, "reward": 1.5725849866867065, "reward_std": 0.1775805503129959, "rewards/GDino": 0.7730468809604645, "rewards/GIT": 0.5192951858043671, "rewards/HPSv2": 0.280242919921875, "step": 622 }, { "completion_length": 143.8515625, "epoch": 1.4664310954063604, "grad_norm": 0.6951601757274577, "kl": 0.23178649693727493, "learning_rate": 6.10625e-07, "loss": 0.23178649693727493, "reward": 1.4961116909980774, "reward_std": 0.16893180459737778, "rewards/GDino": 0.8082059025764465, "rewards/GIT": 0.39622631669044495, "rewards/HPSv2": 0.29167938232421875, "step": 623 }, { "completion_length": 139.1953125, "epoch": 1.4687868080094229, "grad_norm": 1.0958579165332691, "kl": 0.2617073133587837, "learning_rate": 6.1e-07, "loss": 0.2617073133587837, "reward": 1.7358908653259277, "reward_std": 0.11800562590360641, "rewards/GDino": 0.9137258231639862, "rewards/GIT": 0.539880245923996, "rewards/HPSv2": 0.2822847366333008, "step": 624 }, { "completion_length": 143.4140625, "epoch": 1.4711425206124853, "grad_norm": 0.4536680753908206, "kl": 0.2020931914448738, "learning_rate": 6.09375e-07, "loss": 0.2020931914448738, "reward": 1.5223735570907593, "reward_std": 0.13264815509319305, "rewards/GDino": 0.7913032472133636, "rewards/GIT": 0.42996950447559357, "rewards/HPSv2": 0.3011007308959961, "step": 625 }, { "completion_length": 144.7109375, "epoch": 1.4734982332155477, "grad_norm": 0.5226356118971156, "kl": 0.27917638421058655, "learning_rate": 6.0875e-07, "loss": 0.27917638421058655, "reward": 1.3333322405815125, "reward_std": 0.1326553151011467, "rewards/GDino": 0.7181925475597382, "rewards/GIT": 0.3250042572617531, "rewards/HPSv2": 0.29013538360595703, "step": 626 }, { "completion_length": 145.359375, "epoch": 1.4758539458186102, "grad_norm": 0.7899885864342896, "kl": 0.24759289622306824, "learning_rate": 6.08125e-07, "loss": 0.24759289622306824, "reward": 1.3404106497764587, "reward_std": 0.18406283110380173, "rewards/GDino": 0.726594090461731, "rewards/GIT": 0.31998464465141296, "rewards/HPSv2": 0.29383182525634766, "step": 627 }, { "completion_length": 138.6875, "epoch": 1.4782096584216726, "grad_norm": 0.8220594668361484, "kl": 0.24582141637802124, "learning_rate": 6.075e-07, "loss": 0.24582141637802124, "reward": 1.6067034006118774, "reward_std": 0.13249973952770233, "rewards/GDino": 0.8507494032382965, "rewards/GIT": 0.4566041827201843, "rewards/HPSv2": 0.2993497848510742, "step": 628 }, { "completion_length": 148.171875, "epoch": 1.4805653710247348, "grad_norm": 0.5679105278068263, "kl": 0.18751254677772522, "learning_rate": 6.06875e-07, "loss": 0.18751254677772522, "reward": 1.3649287223815918, "reward_std": 0.15957856178283691, "rewards/GDino": 0.7422395646572113, "rewards/GIT": 0.33009523898363113, "rewards/HPSv2": 0.29259395599365234, "step": 629 }, { "completion_length": 146.796875, "epoch": 1.4829210836277973, "grad_norm": 0.8235311847753964, "kl": 0.2180636301636696, "learning_rate": 6.062499999999999e-07, "loss": 0.2180636301636696, "reward": 1.767469048500061, "reward_std": 0.14168590307235718, "rewards/GDino": 0.8665418922901154, "rewards/GIT": 0.6114479303359985, "rewards/HPSv2": 0.28947925567626953, "step": 630 }, { "completion_length": 143.59375, "epoch": 1.4852767962308597, "grad_norm": 0.6298893495457831, "kl": 0.20722679048776627, "learning_rate": 6.056249999999999e-07, "loss": 0.20722679048776627, "reward": 1.3358083963394165, "reward_std": 0.22792376577854156, "rewards/GDino": 0.7251602113246918, "rewards/GIT": 0.32659338414669037, "rewards/HPSv2": 0.2840547561645508, "step": 631 }, { "completion_length": 151.453125, "epoch": 1.4876325088339222, "grad_norm": 0.5842645314483879, "kl": 0.17188318818807602, "learning_rate": 6.049999999999999e-07, "loss": 0.17188318818807602, "reward": 1.3851039409637451, "reward_std": 0.17056434601545334, "rewards/GDino": 0.7260416150093079, "rewards/GIT": 0.37517812848091125, "rewards/HPSv2": 0.28388404846191406, "step": 632 }, { "completion_length": 138.859375, "epoch": 1.4899882214369846, "grad_norm": 0.8356195159082686, "kl": 0.26964421570301056, "learning_rate": 6.043749999999999e-07, "loss": 0.26964421570301056, "reward": 1.5330078601837158, "reward_std": 0.12464854493737221, "rewards/GDino": 0.7962620854377747, "rewards/GIT": 0.4410095065832138, "rewards/HPSv2": 0.29573631286621094, "step": 633 }, { "completion_length": 147.296875, "epoch": 1.492343934040047, "grad_norm": 0.5231787808708354, "kl": 0.2030675783753395, "learning_rate": 6.037499999999999e-07, "loss": 0.2030675783753395, "reward": 1.6336352229118347, "reward_std": 0.1350320801138878, "rewards/GDino": 0.8043981790542603, "rewards/GIT": 0.5389423966407776, "rewards/HPSv2": 0.2902946472167969, "step": 634 }, { "completion_length": 146.5625, "epoch": 1.4946996466431095, "grad_norm": 0.38566350926764426, "kl": 0.1283363178372383, "learning_rate": 6.031249999999999e-07, "loss": 0.1283363178372383, "reward": 1.3202742338180542, "reward_std": 0.1528320536017418, "rewards/GDino": 0.6729138493537903, "rewards/GIT": 0.36049503087997437, "rewards/HPSv2": 0.286865234375, "step": 635 }, { "completion_length": 146.71875, "epoch": 1.497055359246172, "grad_norm": 0.51825481167192, "kl": 0.24028943479061127, "learning_rate": 6.025000000000001e-07, "loss": 0.24028943479061127, "reward": 1.2090998888015747, "reward_std": 0.16162611544132233, "rewards/GDino": 0.689848393201828, "rewards/GIT": 0.22058729827404022, "rewards/HPSv2": 0.2986640930175781, "step": 636 }, { "completion_length": 148.6875, "epoch": 1.4994110718492344, "grad_norm": 0.505102163324135, "kl": 0.21514446288347244, "learning_rate": 6.018750000000001e-07, "loss": 0.21514446288347244, "reward": 1.3358541131019592, "reward_std": 0.15613651275634766, "rewards/GDino": 0.7130942940711975, "rewards/GIT": 0.33475401997566223, "rewards/HPSv2": 0.2880058288574219, "step": 637 }, { "completion_length": 148.6796875, "epoch": 1.5017667844522968, "grad_norm": 0.8510809115528237, "kl": 0.2803666293621063, "learning_rate": 6.0125e-07, "loss": 0.2803666293621063, "reward": 1.357600748538971, "reward_std": 0.13313094899058342, "rewards/GDino": 0.733298659324646, "rewards/GIT": 0.32847222685813904, "rewards/HPSv2": 0.29582977294921875, "step": 638 }, { "completion_length": 139.25, "epoch": 1.5041224970553593, "grad_norm": 0.6104915382417897, "kl": 0.20271800458431244, "learning_rate": 6.00625e-07, "loss": 0.20271800458431244, "reward": 1.4089332222938538, "reward_std": 0.16766171157360077, "rewards/GDino": 0.7345650494098663, "rewards/GIT": 0.39102960377931595, "rewards/HPSv2": 0.2833385467529297, "step": 639 }, { "completion_length": 150.4375, "epoch": 1.5064782096584217, "grad_norm": 0.5355745612566732, "kl": 0.27019722759723663, "learning_rate": 6e-07, "loss": 0.27019722759723663, "reward": 1.4916706681251526, "reward_std": 0.15887250006198883, "rewards/GDino": 0.7292697727680206, "rewards/GIT": 0.476245254278183, "rewards/HPSv2": 0.28615570068359375, "step": 640 }, { "completion_length": 148.84375, "epoch": 1.5088339222614842, "grad_norm": 0.5968638579875587, "kl": 0.25272345542907715, "learning_rate": 5.99375e-07, "loss": 0.25272345542907715, "reward": 1.1573012471199036, "reward_std": 0.18937885761260986, "rewards/GDino": 0.6691060960292816, "rewards/GIT": 0.20053548365831375, "rewards/HPSv2": 0.2876596450805664, "step": 641 }, { "completion_length": 148.84375, "epoch": 1.5111896348645466, "grad_norm": 0.30926612762147887, "kl": 0.15333933755755424, "learning_rate": 5.9875e-07, "loss": 0.15333933755755424, "reward": 1.4640073776245117, "reward_std": 0.12242891266942024, "rewards/GDino": 0.6949999928474426, "rewards/GIT": 0.4813476502895355, "rewards/HPSv2": 0.2876596450805664, "step": 642 }, { "completion_length": 146.78125, "epoch": 1.513545347467609, "grad_norm": 0.4427677831716055, "kl": 0.22889608889818192, "learning_rate": 5.98125e-07, "loss": 0.22889608889818192, "reward": 1.50440514087677, "reward_std": 0.2273886650800705, "rewards/GDino": 0.793643593788147, "rewards/GIT": 0.4131503105163574, "rewards/HPSv2": 0.2976112365722656, "step": 643 }, { "completion_length": 147.6015625, "epoch": 1.5159010600706715, "grad_norm": 0.8430289075559098, "kl": 0.1749095469713211, "learning_rate": 5.975e-07, "loss": 0.1749095469713211, "reward": 1.6506565809249878, "reward_std": 0.11212264746427536, "rewards/GDino": 0.7914026081562042, "rewards/GIT": 0.5641031563282013, "rewards/HPSv2": 0.2951507568359375, "step": 644 }, { "completion_length": 148.3125, "epoch": 1.518256772673734, "grad_norm": 0.5464455339880651, "kl": 0.24205896258354187, "learning_rate": 5.96875e-07, "loss": 0.24205896258354187, "reward": 1.7761095762252808, "reward_std": 0.12200397998094559, "rewards/GDino": 0.8673996031284332, "rewards/GIT": 0.6132845878601074, "rewards/HPSv2": 0.2954254150390625, "step": 645 }, { "completion_length": 152.2109375, "epoch": 1.5206124852767964, "grad_norm": 0.6945212015061046, "kl": 0.23683404177427292, "learning_rate": 5.962499999999999e-07, "loss": 0.23683404177427292, "reward": 1.2970621585845947, "reward_std": 0.20320986211299896, "rewards/GDino": 0.7282150387763977, "rewards/GIT": 0.28646891564130783, "rewards/HPSv2": 0.2823781967163086, "step": 646 }, { "completion_length": 151.03125, "epoch": 1.5229681978798588, "grad_norm": 0.5062133422847779, "kl": 0.1756066158413887, "learning_rate": 5.956249999999999e-07, "loss": 0.1756066158413887, "reward": 1.5077489018440247, "reward_std": 0.17452461272478104, "rewards/GDino": 0.7639693021774292, "rewards/GIT": 0.4608653783798218, "rewards/HPSv2": 0.2829141616821289, "step": 647 }, { "completion_length": 156.3359375, "epoch": 1.525323910482921, "grad_norm": 0.4079832792118037, "kl": 0.20341691374778748, "learning_rate": 5.949999999999999e-07, "loss": 0.20341691374778748, "reward": 1.4296574592590332, "reward_std": 0.12731751799583435, "rewards/GDino": 0.7056635320186615, "rewards/GIT": 0.43695227801799774, "rewards/HPSv2": 0.28704166412353516, "step": 648 }, { "completion_length": 149.1953125, "epoch": 1.5276796230859835, "grad_norm": 0.5621011458497843, "kl": 0.14613120257854462, "learning_rate": 5.943749999999999e-07, "loss": 0.14613120257854462, "reward": 1.2649133205413818, "reward_std": 0.10324588790535927, "rewards/GDino": 0.7250050902366638, "rewards/GIT": 0.2447374016046524, "rewards/HPSv2": 0.29517078399658203, "step": 649 }, { "completion_length": 146.8125, "epoch": 1.530035335689046, "grad_norm": 0.5127160014787308, "kl": 0.22407453507184982, "learning_rate": 5.937499999999999e-07, "loss": 0.22407453507184982, "reward": 1.6856442093849182, "reward_std": 0.16052275151014328, "rewards/GDino": 0.8710328638553619, "rewards/GIT": 0.5276516377925873, "rewards/HPSv2": 0.2869596481323242, "step": 650 }, { "completion_length": 151.65625, "epoch": 1.5323910482921084, "grad_norm": 0.3755865160561873, "kl": 0.23991651833057404, "learning_rate": 5.93125e-07, "loss": 0.23991651833057404, "reward": 1.3300756216049194, "reward_std": 0.1565769612789154, "rewards/GDino": 0.694370448589325, "rewards/GIT": 0.3480873852968216, "rewards/HPSv2": 0.28761768341064453, "step": 651 }, { "completion_length": 141.1484375, "epoch": 1.5347467608951708, "grad_norm": 1.348803129545543, "kl": 0.2779812812805176, "learning_rate": 5.925e-07, "loss": 0.2779812812805176, "reward": 1.087645798921585, "reward_std": 0.19887207448482513, "rewards/GDino": 0.633162796497345, "rewards/GIT": 0.16305819153785706, "rewards/HPSv2": 0.2914247512817383, "step": 652 }, { "completion_length": 153.0078125, "epoch": 1.5371024734982333, "grad_norm": 0.6683638912227681, "kl": 0.26233136653900146, "learning_rate": 5.91875e-07, "loss": 0.26233136653900146, "reward": 1.2191742062568665, "reward_std": 0.16853978484869003, "rewards/GDino": 0.7405899167060852, "rewards/GIT": 0.1754598245024681, "rewards/HPSv2": 0.30312442779541016, "step": 653 }, { "completion_length": 147.3359375, "epoch": 1.5394581861012955, "grad_norm": 0.7318231040068807, "kl": 0.24636422842741013, "learning_rate": 5.912500000000001e-07, "loss": 0.24636422842741013, "reward": 1.067196637392044, "reward_std": 0.2121383100748062, "rewards/GDino": 0.60377037525177, "rewards/GIT": 0.1827780418097973, "rewards/HPSv2": 0.28064823150634766, "step": 654 }, { "completion_length": 143.71875, "epoch": 1.541813898704358, "grad_norm": 0.7031817775959021, "kl": 0.16931962966918945, "learning_rate": 5.90625e-07, "loss": 0.16931962966918945, "reward": 1.4774904251098633, "reward_std": 0.19255462288856506, "rewards/GDino": 0.7358091175556183, "rewards/GIT": 0.4556094706058502, "rewards/HPSv2": 0.28607177734375, "step": 655 }, { "completion_length": 150.15625, "epoch": 1.5441696113074204, "grad_norm": 0.5637821659863518, "kl": 0.21773915737867355, "learning_rate": 5.9e-07, "loss": 0.21773915737867355, "reward": 1.2507258653640747, "reward_std": 0.16196085512638092, "rewards/GDino": 0.7384036481380463, "rewards/GIT": 0.2158515751361847, "rewards/HPSv2": 0.29647064208984375, "step": 656 }, { "completion_length": 147.6171875, "epoch": 1.5465253239104828, "grad_norm": 0.5565583369625409, "kl": 0.3344259560108185, "learning_rate": 5.89375e-07, "loss": 0.3344259560108185, "reward": 1.4936338663101196, "reward_std": 0.19412066042423248, "rewards/GDino": 0.8054620921611786, "rewards/GIT": 0.3991836756467819, "rewards/HPSv2": 0.2889881134033203, "step": 657 }, { "completion_length": 146.6328125, "epoch": 1.5488810365135453, "grad_norm": 1.5461772190992513, "kl": 0.18915174901485443, "learning_rate": 5.8875e-07, "loss": 0.18915174901485443, "reward": 1.505102276802063, "reward_std": 0.12176040560007095, "rewards/GDino": 0.7529442310333252, "rewards/GIT": 0.4632289558649063, "rewards/HPSv2": 0.2889289855957031, "step": 658 }, { "completion_length": 145.015625, "epoch": 1.5512367491166077, "grad_norm": 0.8385790611368877, "kl": 0.2351914867758751, "learning_rate": 5.88125e-07, "loss": 0.2351914867758751, "reward": 1.6264426112174988, "reward_std": 0.061047958210110664, "rewards/GDino": 0.7990481853485107, "rewards/GIT": 0.5263919234275818, "rewards/HPSv2": 0.30100250244140625, "step": 659 }, { "completion_length": 150.25, "epoch": 1.5535924617196701, "grad_norm": 0.5327355382737348, "kl": 0.13680776208639145, "learning_rate": 5.875e-07, "loss": 0.13680776208639145, "reward": 1.3599443435668945, "reward_std": 0.18995028734207153, "rewards/GDino": 0.7253125309944153, "rewards/GIT": 0.3404594957828522, "rewards/HPSv2": 0.2941722869873047, "step": 660 }, { "completion_length": 149.578125, "epoch": 1.5559481743227326, "grad_norm": 0.43944290079561593, "kl": 0.24134273827075958, "learning_rate": 5.86875e-07, "loss": 0.24134273827075958, "reward": 1.2283468842506409, "reward_std": 0.14058047533035278, "rewards/GDino": 0.7544448375701904, "rewards/GIT": 0.1659310646355152, "rewards/HPSv2": 0.3079710006713867, "step": 661 }, { "completion_length": 152.84375, "epoch": 1.558303886925795, "grad_norm": 0.7267474320340694, "kl": 0.35460107028484344, "learning_rate": 5.8625e-07, "loss": 0.35460107028484344, "reward": 1.6130093932151794, "reward_std": 0.1293291337788105, "rewards/GDino": 0.8360603451728821, "rewards/GIT": 0.48097623884677887, "rewards/HPSv2": 0.2959728240966797, "step": 662 }, { "completion_length": 145.703125, "epoch": 1.5606595995288575, "grad_norm": 0.5022932867278086, "kl": 0.30909785628318787, "learning_rate": 5.856249999999999e-07, "loss": 0.30909785628318787, "reward": 1.6094895601272583, "reward_std": 0.16967538744211197, "rewards/GDino": 0.8364659547805786, "rewards/GIT": 0.4742698222398758, "rewards/HPSv2": 0.2987537384033203, "step": 663 }, { "completion_length": 144.6328125, "epoch": 1.56301531213192, "grad_norm": 0.7459787746284788, "kl": 0.21996129304170609, "learning_rate": 5.849999999999999e-07, "loss": 0.21996129304170609, "reward": 1.4932969808578491, "reward_std": 0.1620258241891861, "rewards/GDino": 0.7730682790279388, "rewards/GIT": 0.4275670200586319, "rewards/HPSv2": 0.2926616668701172, "step": 664 }, { "completion_length": 148.7890625, "epoch": 1.5653710247349824, "grad_norm": 0.6666611488486536, "kl": 0.32259272038936615, "learning_rate": 5.843749999999999e-07, "loss": 0.32259272038936615, "reward": 1.661873698234558, "reward_std": 0.1325434297323227, "rewards/GDino": 0.7951178848743439, "rewards/GIT": 0.5755027234554291, "rewards/HPSv2": 0.29125308990478516, "step": 665 }, { "completion_length": 141.78125, "epoch": 1.5677267373380448, "grad_norm": 0.3938681473060903, "kl": 0.2175481915473938, "learning_rate": 5.837499999999999e-07, "loss": 0.2175481915473938, "reward": 1.5628275275230408, "reward_std": 0.148651622235775, "rewards/GDino": 0.7888315916061401, "rewards/GIT": 0.49445389211177826, "rewards/HPSv2": 0.2795419692993164, "step": 666 }, { "completion_length": 142.3515625, "epoch": 1.5700824499411072, "grad_norm": 0.42454105017082966, "kl": 0.26753559708595276, "learning_rate": 5.83125e-07, "loss": 0.26753559708595276, "reward": 1.3783289790153503, "reward_std": 0.17193155735731125, "rewards/GDino": 0.734967440366745, "rewards/GIT": 0.347412571310997, "rewards/HPSv2": 0.29594898223876953, "step": 667 }, { "completion_length": 150.8984375, "epoch": 1.5724381625441697, "grad_norm": 0.4000298560280398, "kl": 4.4095064252614975, "learning_rate": 5.825e-07, "loss": 4.4095064252614975, "reward": 1.166717529296875, "reward_std": 0.20512177795171738, "rewards/GDino": 0.7045232057571411, "rewards/GIT": 0.1792372763156891, "rewards/HPSv2": 0.2829570770263672, "step": 668 }, { "completion_length": 145.1875, "epoch": 1.5747938751472321, "grad_norm": 157.56239335833405, "kl": 0.22526895999908447, "learning_rate": 5.81875e-07, "loss": 0.22526895999908447, "reward": 1.6322451829910278, "reward_std": 0.10642168670892715, "rewards/GDino": 0.8343103229999542, "rewards/GIT": 0.4957926869392395, "rewards/HPSv2": 0.3021421432495117, "step": 669 }, { "completion_length": 144.515625, "epoch": 1.5771495877502946, "grad_norm": 0.982666173133395, "kl": 0.2920727878808975, "learning_rate": 5.8125e-07, "loss": 0.2920727878808975, "reward": 1.4252486824989319, "reward_std": 0.17745383083820343, "rewards/GDino": 0.7377952933311462, "rewards/GIT": 0.3939734399318695, "rewards/HPSv2": 0.29347991943359375, "step": 670 }, { "completion_length": 143.4140625, "epoch": 1.579505300353357, "grad_norm": 0.9924474626734707, "kl": 0.14356781542301178, "learning_rate": 5.806249999999999e-07, "loss": 0.14356781542301178, "reward": 1.2152029275894165, "reward_std": 0.18286050111055374, "rewards/GDino": 0.6699445247650146, "rewards/GIT": 0.25220175087451935, "rewards/HPSv2": 0.2930564880371094, "step": 671 }, { "completion_length": 146.7890625, "epoch": 1.5818610129564195, "grad_norm": 0.8253226971986687, "kl": 0.2665731757879257, "learning_rate": 5.8e-07, "loss": 0.2665731757879257, "reward": 1.7677819728851318, "reward_std": 0.07647029869258404, "rewards/GDino": 0.8669135868549347, "rewards/GIT": 0.6058635413646698, "rewards/HPSv2": 0.29500484466552734, "step": 672 }, { "completion_length": 144.0078125, "epoch": 1.584216725559482, "grad_norm": 0.567560001429693, "kl": 0.2550884485244751, "learning_rate": 5.79375e-07, "loss": 0.2550884485244751, "reward": 1.5183981657028198, "reward_std": 0.14513403922319412, "rewards/GDino": 0.7745106220245361, "rewards/GIT": 0.4409719407558441, "rewards/HPSv2": 0.3029155731201172, "step": 673 }, { "completion_length": 145.8359375, "epoch": 1.5865724381625441, "grad_norm": 0.5971730852615481, "kl": 0.2684609293937683, "learning_rate": 5.7875e-07, "loss": 0.2684609293937683, "reward": 1.4849683046340942, "reward_std": 0.1912762150168419, "rewards/GDino": 0.7926382124423981, "rewards/GIT": 0.3997246325016022, "rewards/HPSv2": 0.2926054000854492, "step": 674 }, { "completion_length": 144.015625, "epoch": 1.5889281507656066, "grad_norm": 0.45081100822078257, "kl": 0.2558326870203018, "learning_rate": 5.78125e-07, "loss": 0.2558326870203018, "reward": 1.7010666131973267, "reward_std": 0.1563047096133232, "rewards/GDino": 0.8482165932655334, "rewards/GIT": 0.5591088235378265, "rewards/HPSv2": 0.29374122619628906, "step": 675 }, { "completion_length": 149.046875, "epoch": 1.591283863368669, "grad_norm": 1.6782004669536372, "kl": 0.1975540742278099, "learning_rate": 5.775e-07, "loss": 0.1975540742278099, "reward": 1.461195945739746, "reward_std": 0.16369875520467758, "rewards/GDino": 0.7937262058258057, "rewards/GIT": 0.38132165372371674, "rewards/HPSv2": 0.2861480712890625, "step": 676 }, { "completion_length": 151.9296875, "epoch": 1.5936395759717314, "grad_norm": 0.5198668573713812, "kl": 0.1711014211177826, "learning_rate": 5.76875e-07, "loss": 0.1711014211177826, "reward": 1.4548780918121338, "reward_std": 0.1738310381770134, "rewards/GDino": 0.747763603925705, "rewards/GIT": 0.41992491483688354, "rewards/HPSv2": 0.2871894836425781, "step": 677 }, { "completion_length": 147.46875, "epoch": 1.595995288574794, "grad_norm": 0.43218884561903465, "kl": 0.20424865931272507, "learning_rate": 5.7625e-07, "loss": 0.20424865931272507, "reward": 1.5319575667381287, "reward_std": 0.14320454001426697, "rewards/GDino": 0.7707098722457886, "rewards/GIT": 0.48420722782611847, "rewards/HPSv2": 0.2770404815673828, "step": 678 }, { "completion_length": 146.3671875, "epoch": 1.5983510011778563, "grad_norm": 0.5228601049246934, "kl": 0.16813233494758606, "learning_rate": 5.75625e-07, "loss": 0.16813233494758606, "reward": 1.4017256498336792, "reward_std": 0.20108681917190552, "rewards/GDino": 0.753173828125, "rewards/GIT": 0.3521011620759964, "rewards/HPSv2": 0.2964506149291992, "step": 679 }, { "completion_length": 140.71875, "epoch": 1.6007067137809188, "grad_norm": 0.7931411271987714, "kl": 0.20374689251184464, "learning_rate": 5.749999999999999e-07, "loss": 0.20374689251184464, "reward": 1.5668033957481384, "reward_std": 0.17860055342316628, "rewards/GDino": 0.7883747220039368, "rewards/GIT": 0.48159557580947876, "rewards/HPSv2": 0.2968330383300781, "step": 680 }, { "completion_length": 147.15625, "epoch": 1.603062426383981, "grad_norm": 0.5325603198233646, "kl": 0.3020494803786278, "learning_rate": 5.743749999999999e-07, "loss": 0.3020494803786278, "reward": 1.6193737983703613, "reward_std": 0.15347905457019806, "rewards/GDino": 0.8321252167224884, "rewards/GIT": 0.49463169276714325, "rewards/HPSv2": 0.2926168441772461, "step": 681 }, { "completion_length": 149.328125, "epoch": 1.6054181389870434, "grad_norm": 0.7756685816197665, "kl": 0.23055081069469452, "learning_rate": 5.737499999999999e-07, "loss": 0.23055081069469452, "reward": 1.3443875312805176, "reward_std": 0.2037675529718399, "rewards/GDino": 0.6906379461288452, "rewards/GIT": 0.37052153050899506, "rewards/HPSv2": 0.28322792053222656, "step": 682 }, { "completion_length": 148.4609375, "epoch": 1.6077738515901059, "grad_norm": 0.7195504251287094, "kl": 0.21399816125631332, "learning_rate": 5.73125e-07, "loss": 0.21399816125631332, "reward": 1.4563969373703003, "reward_std": 0.18327418714761734, "rewards/GDino": 0.7302732467651367, "rewards/GIT": 0.4431694895029068, "rewards/HPSv2": 0.28295421600341797, "step": 683 }, { "completion_length": 150.6953125, "epoch": 1.6101295641931683, "grad_norm": 0.5638028642985631, "kl": 0.34575335681438446, "learning_rate": 5.725e-07, "loss": 0.34575335681438446, "reward": 1.492543339729309, "reward_std": 0.17182528972625732, "rewards/GDino": 0.7827164828777313, "rewards/GIT": 0.4165004640817642, "rewards/HPSv2": 0.29332637786865234, "step": 684 }, { "completion_length": 150.4140625, "epoch": 1.6124852767962308, "grad_norm": 0.984984687622662, "kl": 0.2633599489927292, "learning_rate": 5.71875e-07, "loss": 0.2633599489927292, "reward": 1.5790128707885742, "reward_std": 0.1589416190981865, "rewards/GDino": 0.8308237791061401, "rewards/GIT": 0.44683269411325455, "rewards/HPSv2": 0.30135631561279297, "step": 685 }, { "completion_length": 154.9609375, "epoch": 1.6148409893992932, "grad_norm": 1.7387149519692933, "kl": 0.3048952519893646, "learning_rate": 5.7125e-07, "loss": 0.3048952519893646, "reward": 1.3864032626152039, "reward_std": 0.18756862729787827, "rewards/GDino": 0.7708549499511719, "rewards/GIT": 0.33134666085243225, "rewards/HPSv2": 0.28420162200927734, "step": 686 }, { "completion_length": 144.9296875, "epoch": 1.6171967020023557, "grad_norm": 1.0107383091202482, "kl": 0.3818390518426895, "learning_rate": 5.70625e-07, "loss": 0.3818390518426895, "reward": 1.0837903022766113, "reward_std": 0.18534737825393677, "rewards/GDino": 0.6450067460536957, "rewards/GIT": 0.1513555645942688, "rewards/HPSv2": 0.2874279022216797, "step": 687 }, { "completion_length": 147.2890625, "epoch": 1.619552414605418, "grad_norm": 0.7237557772388166, "kl": 0.27083105593919754, "learning_rate": 5.699999999999999e-07, "loss": 0.27083105593919754, "reward": 1.467542290687561, "reward_std": 0.1648060530424118, "rewards/GDino": 0.7980161309242249, "rewards/GIT": 0.3790808469057083, "rewards/HPSv2": 0.29044532775878906, "step": 688 }, { "completion_length": 144.3203125, "epoch": 1.6219081272084805, "grad_norm": 0.7831485290365249, "kl": 0.3454933762550354, "learning_rate": 5.69375e-07, "loss": 0.3454933762550354, "reward": 1.4930553436279297, "reward_std": 0.18061397969722748, "rewards/GDino": 0.7897568345069885, "rewards/GIT": 0.41508084535598755, "rewards/HPSv2": 0.28821754455566406, "step": 689 }, { "completion_length": 151.5390625, "epoch": 1.624263839811543, "grad_norm": 0.8496004156501061, "kl": 0.35659798979759216, "learning_rate": 5.6875e-07, "loss": 0.35659798979759216, "reward": 1.2406991720199585, "reward_std": 0.17899729311466217, "rewards/GDino": 0.6606912016868591, "rewards/GIT": 0.28868328779935837, "rewards/HPSv2": 0.2913246154785156, "step": 690 }, { "completion_length": 150.125, "epoch": 1.6266195524146054, "grad_norm": 0.5886713171819347, "kl": 0.20841021090745926, "learning_rate": 5.68125e-07, "loss": 0.20841021090745926, "reward": 1.4032571911811829, "reward_std": 0.16045674681663513, "rewards/GDino": 0.7167186737060547, "rewards/GIT": 0.3970028758049011, "rewards/HPSv2": 0.2895355224609375, "step": 691 }, { "completion_length": 144.8046875, "epoch": 1.6289752650176679, "grad_norm": 0.6188624346447901, "kl": 0.23115477710962296, "learning_rate": 5.675e-07, "loss": 0.23115477710962296, "reward": 1.4723437428474426, "reward_std": 0.12653468549251556, "rewards/GDino": 0.7739541232585907, "rewards/GIT": 0.402134507894516, "rewards/HPSv2": 0.29625511169433594, "step": 692 }, { "completion_length": 144.984375, "epoch": 1.6313309776207303, "grad_norm": 0.5484498317160068, "kl": 0.23968181014060974, "learning_rate": 5.66875e-07, "loss": 0.23968181014060974, "reward": 1.5082906484603882, "reward_std": 0.11921254172921181, "rewards/GDino": 0.7748215198516846, "rewards/GIT": 0.4554940089583397, "rewards/HPSv2": 0.27797508239746094, "step": 693 }, { "completion_length": 143.34375, "epoch": 1.6336866902237928, "grad_norm": 0.5374686633755369, "kl": 0.3120548725128174, "learning_rate": 5.6625e-07, "loss": 0.3120548725128174, "reward": 1.587378203868866, "reward_std": 0.1251237727701664, "rewards/GDino": 0.8169122636318207, "rewards/GIT": 0.4751386344432831, "rewards/HPSv2": 0.29532718658447266, "step": 694 }, { "completion_length": 145.0546875, "epoch": 1.6360424028268552, "grad_norm": 0.8023676350798692, "kl": 0.27962692826986313, "learning_rate": 5.65625e-07, "loss": 0.27962692826986313, "reward": 1.8008582592010498, "reward_std": 0.13113077357411385, "rewards/GDino": 0.9324856400489807, "rewards/GIT": 0.5788475871086121, "rewards/HPSv2": 0.28952503204345703, "step": 695 }, { "completion_length": 152.8046875, "epoch": 1.6383981154299176, "grad_norm": 0.8114391770262829, "kl": 0.2659113109111786, "learning_rate": 5.649999999999999e-07, "loss": 0.2659113109111786, "reward": 1.4860165119171143, "reward_std": 0.14663895964622498, "rewards/GDino": 0.7465550601482391, "rewards/GIT": 0.4490523040294647, "rewards/HPSv2": 0.2904090881347656, "step": 696 }, { "completion_length": 143.203125, "epoch": 1.64075382803298, "grad_norm": 0.5311881328229695, "kl": 0.24420329928398132, "learning_rate": 5.643749999999999e-07, "loss": 0.24420329928398132, "reward": 1.5353789925575256, "reward_std": 0.1638953611254692, "rewards/GDino": 0.7916119396686554, "rewards/GIT": 0.4568931460380554, "rewards/HPSv2": 0.28687381744384766, "step": 697 }, { "completion_length": 151.6875, "epoch": 1.6431095406360425, "grad_norm": 0.9788600463214788, "kl": 0.23309168219566345, "learning_rate": 5.637499999999999e-07, "loss": 0.23309168219566345, "reward": 1.0765413641929626, "reward_std": 0.19364579021930695, "rewards/GDino": 0.5667525231838226, "rewards/GIT": 0.22248012572526932, "rewards/HPSv2": 0.2873086929321289, "step": 698 }, { "completion_length": 149.3046875, "epoch": 1.645465253239105, "grad_norm": 0.5117072293311139, "kl": 0.26753148436546326, "learning_rate": 5.63125e-07, "loss": 0.26753148436546326, "reward": 1.5085643529891968, "reward_std": 0.1601584181189537, "rewards/GDino": 0.776499480009079, "rewards/GIT": 0.44559819996356964, "rewards/HPSv2": 0.2864665985107422, "step": 699 }, { "completion_length": 146.8828125, "epoch": 1.6478209658421674, "grad_norm": 0.947824749114317, "kl": 0.33237072825431824, "learning_rate": 5.625e-07, "loss": 0.33237072825431824, "reward": 1.3191026449203491, "reward_std": 0.17891275882720947, "rewards/GDino": 0.7753120362758636, "rewards/GIT": 0.24332691729068756, "rewards/HPSv2": 0.3004636764526367, "step": 700 }, { "completion_length": 147.0546875, "epoch": 1.6501766784452296, "grad_norm": 0.8117764254992506, "kl": 0.3436366319656372, "learning_rate": 5.61875e-07, "loss": 0.3436366319656372, "reward": 1.4838327765464783, "reward_std": 0.17398478835821152, "rewards/GDino": 0.8031507730484009, "rewards/GIT": 0.393727108836174, "rewards/HPSv2": 0.2869548797607422, "step": 701 }, { "completion_length": 154.6328125, "epoch": 1.652532391048292, "grad_norm": 0.49533356463209327, "kl": 0.25683166086673737, "learning_rate": 5.6125e-07, "loss": 0.25683166086673737, "reward": 1.5655171871185303, "reward_std": 0.1688331738114357, "rewards/GDino": 0.8122017085552216, "rewards/GIT": 0.4641127288341522, "rewards/HPSv2": 0.2892026901245117, "step": 702 }, { "completion_length": 150.8515625, "epoch": 1.6548881036513545, "grad_norm": 0.6703259914844085, "kl": 0.3402216136455536, "learning_rate": 5.60625e-07, "loss": 0.3402216136455536, "reward": 1.4636093378067017, "reward_std": 0.14561191946268082, "rewards/GDino": 0.7237472534179688, "rewards/GIT": 0.4511982947587967, "rewards/HPSv2": 0.2886638641357422, "step": 703 }, { "completion_length": 139.3125, "epoch": 1.657243816254417, "grad_norm": 0.8215437924746695, "kl": 0.2883714735507965, "learning_rate": 5.6e-07, "loss": 0.2883714735507965, "reward": 1.9477152228355408, "reward_std": 0.1210954524576664, "rewards/GDino": 0.9410896897315979, "rewards/GIT": 0.7262537479400635, "rewards/HPSv2": 0.28037166595458984, "step": 704 }, { "completion_length": 147.1640625, "epoch": 1.6595995288574794, "grad_norm": 0.8113616620553012, "kl": 0.25445906817913055, "learning_rate": 5.593749999999999e-07, "loss": 0.25445906817913055, "reward": 1.5394752621650696, "reward_std": 0.13789402693510056, "rewards/GDino": 0.7813801467418671, "rewards/GIT": 0.464712455868721, "rewards/HPSv2": 0.2933826446533203, "step": 705 }, { "completion_length": 140.4921875, "epoch": 1.6619552414605419, "grad_norm": 0.4318625460443212, "kl": 0.24965877830982208, "learning_rate": 5.587499999999999e-07, "loss": 0.24965877830982208, "reward": 1.4328694939613342, "reward_std": 0.15452712029218674, "rewards/GDino": 0.7838310599327087, "rewards/GIT": 0.3569423258304596, "rewards/HPSv2": 0.2920961380004883, "step": 706 }, { "completion_length": 152.5546875, "epoch": 1.664310954063604, "grad_norm": 0.9667546949955127, "kl": 0.24883799254894257, "learning_rate": 5.58125e-07, "loss": 0.24883799254894257, "reward": 1.4091433882713318, "reward_std": 0.1565462201833725, "rewards/GDino": 0.7653892040252686, "rewards/GIT": 0.3512461185455322, "rewards/HPSv2": 0.2925081253051758, "step": 707 }, { "completion_length": 151.5390625, "epoch": 1.6666666666666665, "grad_norm": 0.36320276473398766, "kl": 0.2456768900156021, "learning_rate": 5.575e-07, "loss": 0.2456768900156021, "reward": 1.2555897235870361, "reward_std": 0.14170579612255096, "rewards/GDino": 0.670099526643753, "rewards/GIT": 0.28921881690621376, "rewards/HPSv2": 0.29627132415771484, "step": 708 }, { "completion_length": 143.9140625, "epoch": 1.669022379269729, "grad_norm": 0.5447971380188708, "kl": 0.2918473035097122, "learning_rate": 5.56875e-07, "loss": 0.2918473035097122, "reward": 1.4091033935546875, "reward_std": 0.151528000831604, "rewards/GDino": 0.7273585200309753, "rewards/GIT": 0.38334202766418457, "rewards/HPSv2": 0.2984027862548828, "step": 709 }, { "completion_length": 148.6328125, "epoch": 1.6713780918727914, "grad_norm": 0.4419515403363054, "kl": 0.21376416087150574, "learning_rate": 5.5625e-07, "loss": 0.21376416087150574, "reward": 1.357375979423523, "reward_std": 0.20736957341432571, "rewards/GDino": 0.6973326504230499, "rewards/GIT": 0.3803897947072983, "rewards/HPSv2": 0.27965354919433594, "step": 710 }, { "completion_length": 144.5078125, "epoch": 1.6737338044758538, "grad_norm": 0.6740761100125193, "kl": 0.2545318529009819, "learning_rate": 5.55625e-07, "loss": 0.2545318529009819, "reward": 1.4848544001579285, "reward_std": 0.16078054904937744, "rewards/GDino": 0.8301281929016113, "rewards/GIT": 0.362106516957283, "rewards/HPSv2": 0.2926197052001953, "step": 711 }, { "completion_length": 143.7421875, "epoch": 1.6760895170789163, "grad_norm": 0.42993999691818957, "kl": 0.1277817264199257, "learning_rate": 5.55e-07, "loss": 0.1277817264199257, "reward": 1.7335904836654663, "reward_std": 0.12933790683746338, "rewards/GDino": 0.828410416841507, "rewards/GIT": 0.6252309083938599, "rewards/HPSv2": 0.2799491882324219, "step": 712 }, { "completion_length": 137.6953125, "epoch": 1.6784452296819787, "grad_norm": 0.4781213621041494, "kl": 0.2894400358200073, "learning_rate": 5.543749999999999e-07, "loss": 0.2894400358200073, "reward": 1.3023838996887207, "reward_std": 0.1399068981409073, "rewards/GDino": 0.7449455857276917, "rewards/GIT": 0.26757944375276566, "rewards/HPSv2": 0.2898588180541992, "step": 713 }, { "completion_length": 150.78125, "epoch": 1.6808009422850412, "grad_norm": 0.9044564909668796, "kl": 0.404668927192688, "learning_rate": 5.5375e-07, "loss": 0.404668927192688, "reward": 1.6465827226638794, "reward_std": 0.16654925048351288, "rewards/GDino": 0.8409066796302795, "rewards/GIT": 0.5072513222694397, "rewards/HPSv2": 0.29842472076416016, "step": 714 }, { "completion_length": 144.7421875, "epoch": 1.6831566548881036, "grad_norm": 1.185510753590385, "kl": 0.2425072118639946, "learning_rate": 5.53125e-07, "loss": 0.2425072118639946, "reward": 1.392101526260376, "reward_std": 0.16805925220251083, "rewards/GDino": 0.7618416845798492, "rewards/GIT": 0.3337853103876114, "rewards/HPSv2": 0.2964744567871094, "step": 715 }, { "completion_length": 153.4765625, "epoch": 1.685512367491166, "grad_norm": 0.43527219880135304, "kl": 0.3133867606520653, "learning_rate": 5.525e-07, "loss": 0.3133867606520653, "reward": 1.496354341506958, "reward_std": 0.1519993618130684, "rewards/GDino": 0.7475116848945618, "rewards/GIT": 0.4615178257226944, "rewards/HPSv2": 0.2873249053955078, "step": 716 }, { "completion_length": 148.734375, "epoch": 1.6878680800942285, "grad_norm": 0.6987605701892583, "kl": 0.2973002791404724, "learning_rate": 5.51875e-07, "loss": 0.2973002791404724, "reward": 1.7600281238555908, "reward_std": 0.15817105770111084, "rewards/GDino": 0.852638453245163, "rewards/GIT": 0.6163864135742188, "rewards/HPSv2": 0.2910032272338867, "step": 717 }, { "completion_length": 150.3671875, "epoch": 1.690223792697291, "grad_norm": 0.5295579471583289, "kl": 0.22751359641551971, "learning_rate": 5.5125e-07, "loss": 0.22751359641551971, "reward": 1.5366231203079224, "reward_std": 0.12266659736633301, "rewards/GDino": 0.8529405295848846, "rewards/GIT": 0.3968116268515587, "rewards/HPSv2": 0.28687095642089844, "step": 718 }, { "completion_length": 152.78125, "epoch": 1.6925795053003534, "grad_norm": 0.4868451336686298, "kl": 0.23184525966644287, "learning_rate": 5.50625e-07, "loss": 0.23184525966644287, "reward": 1.6062045693397522, "reward_std": 0.15043797343969345, "rewards/GDino": 0.7535960674285889, "rewards/GIT": 0.5707261860370636, "rewards/HPSv2": 0.28188228607177734, "step": 719 }, { "completion_length": 143.078125, "epoch": 1.6949352179034158, "grad_norm": 0.4329753032408274, "kl": 0.32949791848659515, "learning_rate": 5.5e-07, "loss": 0.32949791848659515, "reward": 1.6123161315917969, "reward_std": 0.16874507069587708, "rewards/GDino": 0.8133229613304138, "rewards/GIT": 0.515778511762619, "rewards/HPSv2": 0.2832145690917969, "step": 720 }, { "completion_length": 152.40625, "epoch": 1.6972909305064783, "grad_norm": 0.7915024140236221, "kl": 0.38944125175476074, "learning_rate": 5.493749999999999e-07, "loss": 0.38944125175476074, "reward": 1.515310525894165, "reward_std": 0.15677162632346153, "rewards/GDino": 0.8079315423965454, "rewards/GIT": 0.41923201084136963, "rewards/HPSv2": 0.28814697265625, "step": 721 }, { "completion_length": 147.4375, "epoch": 1.6996466431095407, "grad_norm": 0.79628046804541, "kl": 0.41336265206336975, "learning_rate": 5.487499999999999e-07, "loss": 0.41336265206336975, "reward": 1.3749783635139465, "reward_std": 0.10327174514532089, "rewards/GDino": 0.7035132050514221, "rewards/GIT": 0.36731353402137756, "rewards/HPSv2": 0.3041515350341797, "step": 722 }, { "completion_length": 148.8125, "epoch": 1.7020023557126032, "grad_norm": 0.47199288471565193, "kl": 0.3123195022344589, "learning_rate": 5.481249999999999e-07, "loss": 0.3123195022344589, "reward": 1.6171230673789978, "reward_std": 0.1926277130842209, "rewards/GDino": 0.7713229358196259, "rewards/GIT": 0.5579525530338287, "rewards/HPSv2": 0.28784751892089844, "step": 723 }, { "completion_length": 145.03125, "epoch": 1.7043580683156656, "grad_norm": 0.4968956910335371, "kl": 0.33517251908779144, "learning_rate": 5.474999999999999e-07, "loss": 0.33517251908779144, "reward": 1.4876941442489624, "reward_std": 0.12305799871683121, "rewards/GDino": 0.7359239757061005, "rewards/GIT": 0.45871175825595856, "rewards/HPSv2": 0.2930583953857422, "step": 724 }, { "completion_length": 151.3984375, "epoch": 1.706713780918728, "grad_norm": 0.6514218744990719, "kl": 0.2958174794912338, "learning_rate": 5.46875e-07, "loss": 0.2958174794912338, "reward": 1.405617117881775, "reward_std": 0.1527165248990059, "rewards/GDino": 0.712875097990036, "rewards/GIT": 0.4131227731704712, "rewards/HPSv2": 0.2796192169189453, "step": 725 }, { "completion_length": 148.09375, "epoch": 1.7090694935217905, "grad_norm": 0.5214429109652751, "kl": 0.4441043436527252, "learning_rate": 5.4625e-07, "loss": 0.4441043436527252, "reward": 1.5399360656738281, "reward_std": 0.20678698271512985, "rewards/GDino": 0.7997840940952301, "rewards/GIT": 0.45174551010131836, "rewards/HPSv2": 0.2884063720703125, "step": 726 }, { "completion_length": 143.0703125, "epoch": 1.7114252061248527, "grad_norm": 0.5087394592600938, "kl": 0.36602216958999634, "learning_rate": 5.45625e-07, "loss": 0.36602216958999634, "reward": 1.6391754150390625, "reward_std": 0.07969829440116882, "rewards/GDino": 0.8301855325698853, "rewards/GIT": 0.5155595242977142, "rewards/HPSv2": 0.2934303283691406, "step": 727 }, { "completion_length": 153.4921875, "epoch": 1.7137809187279152, "grad_norm": 0.5900307942539837, "kl": 0.32635924220085144, "learning_rate": 5.45e-07, "loss": 0.32635924220085144, "reward": 1.497054934501648, "reward_std": 0.17460747063159943, "rewards/GDino": 0.8042311072349548, "rewards/GIT": 0.40138570219278336, "rewards/HPSv2": 0.29143810272216797, "step": 728 }, { "completion_length": 146.1796875, "epoch": 1.7161366313309776, "grad_norm": 0.5921989291733234, "kl": 0.2431763932108879, "learning_rate": 5.44375e-07, "loss": 0.2431763932108879, "reward": 1.5534833669662476, "reward_std": 0.15913428366184235, "rewards/GDino": 0.7694720327854156, "rewards/GIT": 0.5034622102975845, "rewards/HPSv2": 0.2805490493774414, "step": 729 }, { "completion_length": 150.9375, "epoch": 1.71849234393404, "grad_norm": 0.494953133573256, "kl": 0.2908814400434494, "learning_rate": 5.4375e-07, "loss": 0.2908814400434494, "reward": 1.5199139714241028, "reward_std": 0.16904722899198532, "rewards/GDino": 0.7878062725067139, "rewards/GIT": 0.43623024225234985, "rewards/HPSv2": 0.29587745666503906, "step": 730 }, { "completion_length": 155.8046875, "epoch": 1.7208480565371025, "grad_norm": 0.5669263400553495, "kl": 0.26159391552209854, "learning_rate": 5.43125e-07, "loss": 0.26159391552209854, "reward": 1.4162370562553406, "reward_std": 0.16993867605924606, "rewards/GDino": 0.7244028449058533, "rewards/GIT": 0.3912551701068878, "rewards/HPSv2": 0.3005790710449219, "step": 731 }, { "completion_length": 147.640625, "epoch": 1.723203769140165, "grad_norm": 0.5268126981167038, "kl": 0.3641798794269562, "learning_rate": 5.425e-07, "loss": 0.3641798794269562, "reward": 1.3349887132644653, "reward_std": 0.21330712735652924, "rewards/GDino": 0.6925823092460632, "rewards/GIT": 0.3529385030269623, "rewards/HPSv2": 0.28946781158447266, "step": 732 }, { "completion_length": 145.7265625, "epoch": 1.7255594817432274, "grad_norm": 0.6528764885334967, "kl": 0.4696078896522522, "learning_rate": 5.41875e-07, "loss": 0.4696078896522522, "reward": 1.2917814254760742, "reward_std": 0.18004102259874344, "rewards/GDino": 0.7077189981937408, "rewards/GIT": 0.2999417334794998, "rewards/HPSv2": 0.2841205596923828, "step": 733 }, { "completion_length": 149.359375, "epoch": 1.7279151943462896, "grad_norm": 0.8167120448725362, "kl": 0.28613704442977905, "learning_rate": 5.4125e-07, "loss": 0.28613704442977905, "reward": 1.484497308731079, "reward_std": 0.17016590386629105, "rewards/GDino": 0.7830067873001099, "rewards/GIT": 0.4156228005886078, "rewards/HPSv2": 0.28586769104003906, "step": 734 }, { "completion_length": 148.1640625, "epoch": 1.730270906949352, "grad_norm": 0.5817220267013877, "kl": 0.3406076431274414, "learning_rate": 5.40625e-07, "loss": 0.3406076431274414, "reward": 1.5423716306686401, "reward_std": 0.15003535151481628, "rewards/GDino": 0.8452584445476532, "rewards/GIT": 0.41551973670721054, "rewards/HPSv2": 0.28159332275390625, "step": 735 }, { "completion_length": 146.234375, "epoch": 1.7326266195524145, "grad_norm": 0.5998344765576338, "kl": 0.5255603492259979, "learning_rate": 5.4e-07, "loss": 0.5255603492259979, "reward": 1.5528455972671509, "reward_std": 0.12306249141693115, "rewards/GDino": 0.7608321011066437, "rewards/GIT": 0.49640877544879913, "rewards/HPSv2": 0.2956047058105469, "step": 736 }, { "completion_length": 147.703125, "epoch": 1.734982332155477, "grad_norm": 0.4914416950529299, "kl": 0.40423350036144257, "learning_rate": 5.39375e-07, "loss": 0.40423350036144257, "reward": 1.2774619460105896, "reward_std": 0.16610150784254074, "rewards/GDino": 0.7018203139305115, "rewards/GIT": 0.2895153760910034, "rewards/HPSv2": 0.28612613677978516, "step": 737 }, { "completion_length": 154.28125, "epoch": 1.7373380447585394, "grad_norm": 0.44248725317368814, "kl": 0.3482944369316101, "learning_rate": 5.387499999999999e-07, "loss": 0.3482944369316101, "reward": 1.4140409231185913, "reward_std": 0.19440804421901703, "rewards/GDino": 0.7592449188232422, "rewards/GIT": 0.3601907789707184, "rewards/HPSv2": 0.2946052551269531, "step": 738 }, { "completion_length": 146.765625, "epoch": 1.7396937573616018, "grad_norm": 0.6412608177399566, "kl": 0.3597259968519211, "learning_rate": 5.381249999999999e-07, "loss": 0.3597259968519211, "reward": 1.5869477987289429, "reward_std": 0.09491639956831932, "rewards/GDino": 0.7593518197536469, "rewards/GIT": 0.5433848202228546, "rewards/HPSv2": 0.2842111587524414, "step": 739 }, { "completion_length": 151.8125, "epoch": 1.7420494699646643, "grad_norm": 0.44541815190776474, "kl": 0.31226298213005066, "learning_rate": 5.374999999999999e-07, "loss": 0.31226298213005066, "reward": 1.244201898574829, "reward_std": 0.16555405408143997, "rewards/GDino": 0.7072892785072327, "rewards/GIT": 0.24934068322181702, "rewards/HPSv2": 0.28757190704345703, "step": 740 }, { "completion_length": 157.7734375, "epoch": 1.7444051825677267, "grad_norm": 0.767481001240583, "kl": 0.33487121760845184, "learning_rate": 5.368749999999999e-07, "loss": 0.33487121760845184, "reward": 1.2975819110870361, "reward_std": 0.1397399827837944, "rewards/GDino": 0.7548685073852539, "rewards/GIT": 0.24641622602939606, "rewards/HPSv2": 0.2962970733642578, "step": 741 }, { "completion_length": 144.5, "epoch": 1.7467608951707891, "grad_norm": 0.4747185218439371, "kl": 0.3630921393632889, "learning_rate": 5.3625e-07, "loss": 0.3630921393632889, "reward": 1.6411885023117065, "reward_std": 0.1555585376918316, "rewards/GDino": 0.8040129542350769, "rewards/GIT": 0.5548993945121765, "rewards/HPSv2": 0.2822761535644531, "step": 742 }, { "completion_length": 153.4609375, "epoch": 1.7491166077738516, "grad_norm": 0.6597322043054521, "kl": 0.39601680636405945, "learning_rate": 5.35625e-07, "loss": 0.39601680636405945, "reward": 1.6717033982276917, "reward_std": 0.1511596441268921, "rewards/GDino": 0.8351849317550659, "rewards/GIT": 0.5486195087432861, "rewards/HPSv2": 0.2878990173339844, "step": 743 }, { "completion_length": 144.4296875, "epoch": 1.751472320376914, "grad_norm": 0.4372222331224635, "kl": 0.3508177250623703, "learning_rate": 5.35e-07, "loss": 0.3508177250623703, "reward": 1.679397165775299, "reward_std": 0.15499665588140488, "rewards/GDino": 0.8297215402126312, "rewards/GIT": 0.5592788457870483, "rewards/HPSv2": 0.29039669036865234, "step": 744 }, { "completion_length": 145.1640625, "epoch": 1.7538280329799765, "grad_norm": 0.7371330309913664, "kl": 0.34201811254024506, "learning_rate": 5.343750000000001e-07, "loss": 0.34201811254024506, "reward": 1.272645354270935, "reward_std": 0.14339382201433182, "rewards/GDino": 0.7587890625, "rewards/GIT": 0.2175840586423874, "rewards/HPSv2": 0.29627227783203125, "step": 745 }, { "completion_length": 147.2578125, "epoch": 1.756183745583039, "grad_norm": 0.6630421984175565, "kl": 0.29182107746601105, "learning_rate": 5.3375e-07, "loss": 0.29182107746601105, "reward": 1.5832467079162598, "reward_std": 0.13641303777694702, "rewards/GDino": 0.7619234919548035, "rewards/GIT": 0.5223750025033951, "rewards/HPSv2": 0.2989482879638672, "step": 746 }, { "completion_length": 141.359375, "epoch": 1.7585394581861014, "grad_norm": 0.7760032299392308, "kl": 0.34378623962402344, "learning_rate": 5.33125e-07, "loss": 0.34378623962402344, "reward": 1.4943626523017883, "reward_std": 0.18585099279880524, "rewards/GDino": 0.8228445053100586, "rewards/GIT": 0.3846767097711563, "rewards/HPSv2": 0.28684139251708984, "step": 747 }, { "completion_length": 156.71875, "epoch": 1.7608951707891638, "grad_norm": 0.47043959587750295, "kl": 0.4073641300201416, "learning_rate": 5.325e-07, "loss": 0.4073641300201416, "reward": 1.4890388250350952, "reward_std": 0.15322282165288925, "rewards/GDino": 0.7842492163181305, "rewards/GIT": 0.39874592423439026, "rewards/HPSv2": 0.3060436248779297, "step": 748 }, { "completion_length": 150.1015625, "epoch": 1.7632508833922262, "grad_norm": 0.5797111736795582, "kl": 0.3200274407863617, "learning_rate": 5.31875e-07, "loss": 0.3200274407863617, "reward": 1.3128186464309692, "reward_std": 0.19365352392196655, "rewards/GDino": 0.7452206611633301, "rewards/GIT": 0.2711816802620888, "rewards/HPSv2": 0.2964162826538086, "step": 749 }, { "completion_length": 148.0234375, "epoch": 1.7656065959952887, "grad_norm": 0.49167425211882076, "kl": 0.25423865020275116, "learning_rate": 5.3125e-07, "loss": 0.25423865020275116, "reward": 1.2717047333717346, "reward_std": 0.18969321250915527, "rewards/GDino": 0.6392331719398499, "rewards/GIT": 0.34000344574451447, "rewards/HPSv2": 0.2924680709838867, "step": 750 }, { "completion_length": 145.390625, "epoch": 1.7679623085983511, "grad_norm": 0.4247896858178306, "kl": 0.22804933786392212, "learning_rate": 5.30625e-07, "loss": 0.22804933786392212, "reward": 1.6418799757957458, "reward_std": 0.18876148015260696, "rewards/GDino": 0.7945851385593414, "rewards/GIT": 0.5761795341968536, "rewards/HPSv2": 0.2711153030395508, "step": 751 }, { "completion_length": 145.578125, "epoch": 1.7703180212014136, "grad_norm": 1.6720091954015446, "kl": 0.3817945867776871, "learning_rate": 5.3e-07, "loss": 0.3817945867776871, "reward": 1.4110633730888367, "reward_std": 0.1390567347407341, "rewards/GDino": 0.7775918841362, "rewards/GIT": 0.3442879021167755, "rewards/HPSv2": 0.2891836166381836, "step": 752 }, { "completion_length": 139.71875, "epoch": 1.772673733804476, "grad_norm": 0.6916103947831778, "kl": 0.3394780158996582, "learning_rate": 5.29375e-07, "loss": 0.3394780158996582, "reward": 1.437959909439087, "reward_std": 0.18220099806785583, "rewards/GDino": 0.7526431083679199, "rewards/GIT": 0.3959224224090576, "rewards/HPSv2": 0.2893943786621094, "step": 753 }, { "completion_length": 149.8125, "epoch": 1.7750294464075382, "grad_norm": 0.7561264268031648, "kl": 0.39778995513916016, "learning_rate": 5.2875e-07, "loss": 0.39778995513916016, "reward": 1.5532872676849365, "reward_std": 0.14921041205525398, "rewards/GDino": 0.7543669939041138, "rewards/GIT": 0.49524369835853577, "rewards/HPSv2": 0.3036766052246094, "step": 754 }, { "completion_length": 145.734375, "epoch": 1.7773851590106007, "grad_norm": 0.6316777570260029, "kl": 0.2907618433237076, "learning_rate": 5.281249999999999e-07, "loss": 0.2907618433237076, "reward": 1.641988456249237, "reward_std": 0.12138235196471214, "rewards/GDino": 0.8451609313488007, "rewards/GIT": 0.49896636605262756, "rewards/HPSv2": 0.29786109924316406, "step": 755 }, { "completion_length": 155.796875, "epoch": 1.7797408716136631, "grad_norm": 0.5274459105210972, "kl": 0.348466694355011, "learning_rate": 5.274999999999999e-07, "loss": 0.348466694355011, "reward": 1.3681582808494568, "reward_std": 0.14449243247509003, "rewards/GDino": 0.7270989418029785, "rewards/GIT": 0.334899365901947, "rewards/HPSv2": 0.30615997314453125, "step": 756 }, { "completion_length": 151.328125, "epoch": 1.7820965842167256, "grad_norm": 0.8875222893698976, "kl": 0.28324800729751587, "learning_rate": 5.268749999999999e-07, "loss": 0.28324800729751587, "reward": 1.5705856084823608, "reward_std": 0.14869976043701172, "rewards/GDino": 0.7974264919757843, "rewards/GIT": 0.47749805450439453, "rewards/HPSv2": 0.29566097259521484, "step": 757 }, { "completion_length": 142.28125, "epoch": 1.784452296819788, "grad_norm": 1.556126223806373, "kl": 0.3136792480945587, "learning_rate": 5.262499999999999e-07, "loss": 0.3136792480945587, "reward": 1.5467990636825562, "reward_std": 0.15794160217046738, "rewards/GDino": 0.7790595889091492, "rewards/GIT": 0.4836112856864929, "rewards/HPSv2": 0.28412818908691406, "step": 758 }, { "completion_length": 142.2734375, "epoch": 1.7868080094228505, "grad_norm": 0.7803578747283523, "kl": 0.277873158454895, "learning_rate": 5.256249999999999e-07, "loss": 0.277873158454895, "reward": 1.379017949104309, "reward_std": 0.22451142966747284, "rewards/GDino": 0.710568368434906, "rewards/GIT": 0.38462844491004944, "rewards/HPSv2": 0.28382110595703125, "step": 759 }, { "completion_length": 147.8046875, "epoch": 1.7891637220259127, "grad_norm": 0.7341671036223971, "kl": 0.29699069261550903, "learning_rate": 5.25e-07, "loss": 0.29699069261550903, "reward": 1.7511455416679382, "reward_std": 0.14536598324775696, "rewards/GDino": 0.8749453127384186, "rewards/GIT": 0.587434321641922, "rewards/HPSv2": 0.28876590728759766, "step": 760 }, { "completion_length": 144.6171875, "epoch": 1.7915194346289751, "grad_norm": 0.44251627484014333, "kl": 0.37299397587776184, "learning_rate": 5.243750000000001e-07, "loss": 0.37299397587776184, "reward": 1.1961843967437744, "reward_std": 0.1853116825222969, "rewards/GDino": 0.635791152715683, "rewards/GIT": 0.2832535430788994, "rewards/HPSv2": 0.27713966369628906, "step": 761 }, { "completion_length": 151.0546875, "epoch": 1.7938751472320376, "grad_norm": 0.5310343132270884, "kl": 0.34552957117557526, "learning_rate": 5.237500000000001e-07, "loss": 0.34552957117557526, "reward": 1.3477033376693726, "reward_std": 0.16614580154418945, "rewards/GDino": 0.6994572281837463, "rewards/GIT": 0.36023448407649994, "rewards/HPSv2": 0.2880115509033203, "step": 762 }, { "completion_length": 145.109375, "epoch": 1.7962308598351, "grad_norm": 1.2927114591451319, "kl": 0.331984743475914, "learning_rate": 5.23125e-07, "loss": 0.331984743475914, "reward": 1.505416452884674, "reward_std": 0.1654127985239029, "rewards/GDino": 0.7666008770465851, "rewards/GIT": 0.44138164818286896, "rewards/HPSv2": 0.29743385314941406, "step": 763 }, { "completion_length": 153.4765625, "epoch": 1.7985865724381624, "grad_norm": 0.47813726828530195, "kl": 0.3542918562889099, "learning_rate": 5.225e-07, "loss": 0.3542918562889099, "reward": 1.6525586247444153, "reward_std": 0.16975512355566025, "rewards/GDino": 0.8068446218967438, "rewards/GIT": 0.5560325533151627, "rewards/HPSv2": 0.28968143463134766, "step": 764 }, { "completion_length": 150.0625, "epoch": 1.8009422850412249, "grad_norm": 0.7995041838126764, "kl": 0.30523232370615005, "learning_rate": 5.21875e-07, "loss": 0.30523232370615005, "reward": 1.49308580160141, "reward_std": 0.15078215301036835, "rewards/GDino": 0.8132773637771606, "rewards/GIT": 0.39323121309280396, "rewards/HPSv2": 0.2865772247314453, "step": 765 }, { "completion_length": 146.328125, "epoch": 1.8032979976442873, "grad_norm": 0.4854806968497567, "kl": 0.3498879671096802, "learning_rate": 5.2125e-07, "loss": 0.3498879671096802, "reward": 1.2915522456169128, "reward_std": 0.12646470218896866, "rewards/GDino": 0.723405122756958, "rewards/GIT": 0.2881224900484085, "rewards/HPSv2": 0.28002452850341797, "step": 766 }, { "completion_length": 149.859375, "epoch": 1.8056537102473498, "grad_norm": 0.523180735655295, "kl": 0.26267801970243454, "learning_rate": 5.20625e-07, "loss": 0.26267801970243454, "reward": 1.5742886662483215, "reward_std": 0.1660643219947815, "rewards/GDino": 0.8180676400661469, "rewards/GIT": 0.471451073884964, "rewards/HPSv2": 0.28477001190185547, "step": 767 }, { "completion_length": 152.0078125, "epoch": 1.8080094228504122, "grad_norm": 0.451039220201636, "kl": 0.2770536318421364, "learning_rate": 5.2e-07, "loss": 0.2770536318421364, "reward": 1.656054973602295, "reward_std": 0.14969055354595184, "rewards/GDino": 0.8276512622833252, "rewards/GIT": 0.5471402704715729, "rewards/HPSv2": 0.2812633514404297, "step": 768 }, { "completion_length": 144.921875, "epoch": 1.8103651354534747, "grad_norm": 1.066231680680484, "kl": 0.39289119839668274, "learning_rate": 5.19375e-07, "loss": 0.39289119839668274, "reward": 1.2789015173912048, "reward_std": 0.11148998141288757, "rewards/GDino": 0.702972799539566, "rewards/GIT": 0.27795974165201187, "rewards/HPSv2": 0.29796886444091797, "step": 769 }, { "completion_length": 151.4765625, "epoch": 1.812720848056537, "grad_norm": 0.880366919070202, "kl": 0.3235255479812622, "learning_rate": 5.1875e-07, "loss": 0.3235255479812622, "reward": 1.3903911113739014, "reward_std": 0.16693946719169617, "rewards/GDino": 0.7086868286132812, "rewards/GIT": 0.3854835778474808, "rewards/HPSv2": 0.2962207794189453, "step": 770 }, { "completion_length": 148.2890625, "epoch": 1.8150765606595995, "grad_norm": 0.46100917186154566, "kl": 0.37810826301574707, "learning_rate": 5.181249999999999e-07, "loss": 0.37810826301574707, "reward": 1.6356542706489563, "reward_std": 0.127709299325943, "rewards/GDino": 0.8177496492862701, "rewards/GIT": 0.5154925435781479, "rewards/HPSv2": 0.3024120330810547, "step": 771 }, { "completion_length": 153.203125, "epoch": 1.817432273262662, "grad_norm": 0.7564143478126781, "kl": 0.3477868288755417, "learning_rate": 5.174999999999999e-07, "loss": 0.3477868288755417, "reward": 1.4120174646377563, "reward_std": 0.18721068650484085, "rewards/GDino": 0.7303121387958527, "rewards/GIT": 0.3888434171676636, "rewards/HPSv2": 0.2928619384765625, "step": 772 }, { "completion_length": 150.46875, "epoch": 1.8197879858657244, "grad_norm": 0.9481121048599709, "kl": 0.26942410320043564, "learning_rate": 5.168749999999999e-07, "loss": 0.26942410320043564, "reward": 1.5720882415771484, "reward_std": 0.16988325864076614, "rewards/GDino": 0.7571074962615967, "rewards/GIT": 0.5280658900737762, "rewards/HPSv2": 0.2869148254394531, "step": 773 }, { "completion_length": 145.0859375, "epoch": 1.8221436984687869, "grad_norm": 0.6726333934797197, "kl": 0.3529961407184601, "learning_rate": 5.162499999999999e-07, "loss": 0.3529961407184601, "reward": 1.2400310039520264, "reward_std": 0.21847903728485107, "rewards/GDino": 0.7063978016376495, "rewards/GIT": 0.2522220015525818, "rewards/HPSv2": 0.28141117095947266, "step": 774 }, { "completion_length": 147.3828125, "epoch": 1.8244994110718493, "grad_norm": 0.7083937239465433, "kl": 0.37787361443042755, "learning_rate": 5.156249999999999e-07, "loss": 0.37787361443042755, "reward": 1.3181783556938171, "reward_std": 0.17650160938501358, "rewards/GDino": 0.7060568928718567, "rewards/GIT": 0.3242567926645279, "rewards/HPSv2": 0.28786468505859375, "step": 775 }, { "completion_length": 151.6171875, "epoch": 1.8268551236749118, "grad_norm": 0.5646514625254404, "kl": 0.31989115476608276, "learning_rate": 5.149999999999999e-07, "loss": 0.31989115476608276, "reward": 1.691009759902954, "reward_std": 0.0795685425400734, "rewards/GDino": 0.828497588634491, "rewards/GIT": 0.5577320456504822, "rewards/HPSv2": 0.3047800064086914, "step": 776 }, { "completion_length": 148.6875, "epoch": 1.8292108362779742, "grad_norm": 1.3094957065336794, "kl": 0.439039945602417, "learning_rate": 5.14375e-07, "loss": 0.439039945602417, "reward": 1.7782069444656372, "reward_std": 0.12409776449203491, "rewards/GDino": 0.8425591886043549, "rewards/GIT": 0.6352689564228058, "rewards/HPSv2": 0.30037879943847656, "step": 777 }, { "completion_length": 149.90625, "epoch": 1.8315665488810366, "grad_norm": 0.377120088260515, "kl": 0.2916112393140793, "learning_rate": 5.137500000000001e-07, "loss": 0.2916112393140793, "reward": 1.5898423194885254, "reward_std": 0.1803339384496212, "rewards/GDino": 0.8279119431972504, "rewards/GIT": 0.47264666855335236, "rewards/HPSv2": 0.28928375244140625, "step": 778 }, { "completion_length": 146.0, "epoch": 1.833922261484099, "grad_norm": 0.7146717946496414, "kl": 0.40500932931900024, "learning_rate": 5.131250000000001e-07, "loss": 0.40500932931900024, "reward": 1.3662488460540771, "reward_std": 0.19136038422584534, "rewards/GDino": 0.7169347405433655, "rewards/GIT": 0.3598920591175556, "rewards/HPSv2": 0.28942203521728516, "step": 779 }, { "completion_length": 147.78125, "epoch": 1.8362779740871613, "grad_norm": 0.6378140526602524, "kl": 0.2615543529391289, "learning_rate": 5.125e-07, "loss": 0.2615543529391289, "reward": 1.552364706993103, "reward_std": 0.183646559715271, "rewards/GDino": 0.7853807508945465, "rewards/GIT": 0.47649097442626953, "rewards/HPSv2": 0.2904930114746094, "step": 780 }, { "completion_length": 145.421875, "epoch": 1.8386336866902238, "grad_norm": 0.9068718609353228, "kl": 0.3563908338546753, "learning_rate": 5.11875e-07, "loss": 0.3563908338546753, "reward": 1.4075452089309692, "reward_std": 0.1417853757739067, "rewards/GDino": 0.7756104469299316, "rewards/GIT": 0.3420463800430298, "rewards/HPSv2": 0.2898883819580078, "step": 781 }, { "completion_length": 149.0078125, "epoch": 1.8409893992932862, "grad_norm": 0.40850149976032685, "kl": 0.25013142824172974, "learning_rate": 5.1125e-07, "loss": 0.25013142824172974, "reward": 1.1977518200874329, "reward_std": 0.19787029922008514, "rewards/GDino": 0.626783549785614, "rewards/GIT": 0.2814203351736069, "rewards/HPSv2": 0.2895479202270508, "step": 782 }, { "completion_length": 147.9375, "epoch": 1.8433451118963486, "grad_norm": 1.0457304848811304, "kl": 0.36823713779449463, "learning_rate": 5.10625e-07, "loss": 0.36823713779449463, "reward": 1.73883056640625, "reward_std": 0.12424224987626076, "rewards/GDino": 0.8772646188735962, "rewards/GIT": 0.5628446042537689, "rewards/HPSv2": 0.2987213134765625, "step": 783 }, { "completion_length": 146.5, "epoch": 1.845700824499411, "grad_norm": 1.0707548131852191, "kl": 0.2959247827529907, "learning_rate": 5.1e-07, "loss": 0.2959247827529907, "reward": 1.2687984108924866, "reward_std": 0.22141724824905396, "rewards/GDino": 0.7019115686416626, "rewards/GIT": 0.27423377335071564, "rewards/HPSv2": 0.29265308380126953, "step": 784 }, { "completion_length": 143.8515625, "epoch": 1.8480565371024735, "grad_norm": 0.7149479855608373, "kl": 0.2697216793894768, "learning_rate": 5.09375e-07, "loss": 0.2697216793894768, "reward": 1.25291508436203, "reward_std": 0.14959585666656494, "rewards/GDino": 0.6685865521430969, "rewards/GIT": 0.2899378314614296, "rewards/HPSv2": 0.2943906784057617, "step": 785 }, { "completion_length": 151.7890625, "epoch": 1.850412249705536, "grad_norm": 0.39060635757212836, "kl": 0.3404082953929901, "learning_rate": 5.0875e-07, "loss": 0.3404082953929901, "reward": 1.3502548336982727, "reward_std": 0.2004503831267357, "rewards/GDino": 0.723473995923996, "rewards/GIT": 0.3311046361923218, "rewards/HPSv2": 0.29567623138427734, "step": 786 }, { "completion_length": 144.5390625, "epoch": 1.8527679623085982, "grad_norm": 0.49178294492598873, "kl": 0.33087214827537537, "learning_rate": 5.08125e-07, "loss": 0.33087214827537537, "reward": 1.6852235794067383, "reward_std": 0.09686646237969398, "rewards/GDino": 0.8007104694843292, "rewards/GIT": 0.5897897183895111, "rewards/HPSv2": 0.2947235107421875, "step": 787 }, { "completion_length": 139.265625, "epoch": 1.8551236749116606, "grad_norm": 0.4795522736365559, "kl": 0.35538021475076675, "learning_rate": 5.074999999999999e-07, "loss": 0.35538021475076675, "reward": 1.3755954504013062, "reward_std": 0.19119691848754883, "rewards/GDino": 0.7269530892372131, "rewards/GIT": 0.3553550988435745, "rewards/HPSv2": 0.2932872772216797, "step": 788 }, { "completion_length": 143.8828125, "epoch": 1.857479387514723, "grad_norm": 0.6475255347427058, "kl": 0.4011817127466202, "learning_rate": 5.068749999999999e-07, "loss": 0.4011817127466202, "reward": 1.652111291885376, "reward_std": 0.1580354869365692, "rewards/GDino": 0.777370810508728, "rewards/GIT": 0.5842331945896149, "rewards/HPSv2": 0.29050731658935547, "step": 789 }, { "completion_length": 149.375, "epoch": 1.8598351001177855, "grad_norm": 0.8999369253323775, "kl": 0.30541878938674927, "learning_rate": 5.062499999999999e-07, "loss": 0.30541878938674927, "reward": 1.3656489253044128, "reward_std": 0.12582620978355408, "rewards/GDino": 0.7145903706550598, "rewards/GIT": 0.360462449491024, "rewards/HPSv2": 0.29059600830078125, "step": 790 }, { "completion_length": 148.5625, "epoch": 1.862190812720848, "grad_norm": 0.4777584319937551, "kl": 0.43203870952129364, "learning_rate": 5.056249999999999e-07, "loss": 0.43203870952129364, "reward": 1.5752392411231995, "reward_std": 0.11080099642276764, "rewards/GDino": 0.8353421986103058, "rewards/GIT": 0.4483311250805855, "rewards/HPSv2": 0.2915658950805664, "step": 791 }, { "completion_length": 146.1484375, "epoch": 1.8645465253239104, "grad_norm": 0.7314785850231685, "kl": 0.28693392127752304, "learning_rate": 5.049999999999999e-07, "loss": 0.28693392127752304, "reward": 1.2102081775665283, "reward_std": 0.14836997538805008, "rewards/GDino": 0.6405426859855652, "rewards/GIT": 0.2736230790615082, "rewards/HPSv2": 0.29604244232177734, "step": 792 }, { "completion_length": 147.2109375, "epoch": 1.8669022379269729, "grad_norm": 0.6838352595131361, "kl": 0.47693654894828796, "learning_rate": 5.04375e-07, "loss": 0.47693654894828796, "reward": 1.8696069717407227, "reward_std": 0.0947577990591526, "rewards/GDino": 0.8991016447544098, "rewards/GIT": 0.6598573625087738, "rewards/HPSv2": 0.31064796447753906, "step": 793 }, { "completion_length": 149.046875, "epoch": 1.8692579505300353, "grad_norm": 0.6250045378071601, "kl": 0.33329685032367706, "learning_rate": 5.0375e-07, "loss": 0.33329685032367706, "reward": 1.1905521154403687, "reward_std": 0.15430769324302673, "rewards/GDino": 0.7338141202926636, "rewards/GIT": 0.1609242744743824, "rewards/HPSv2": 0.29581356048583984, "step": 794 }, { "completion_length": 151.3984375, "epoch": 1.8716136631330977, "grad_norm": 0.49903348111789725, "kl": 0.386860653758049, "learning_rate": 5.031250000000001e-07, "loss": 0.386860653758049, "reward": 1.5662407279014587, "reward_std": 0.15920208394527435, "rewards/GDino": 0.766704648733139, "rewards/GIT": 0.5035107731819153, "rewards/HPSv2": 0.29602527618408203, "step": 795 }, { "completion_length": 145.328125, "epoch": 1.8739693757361602, "grad_norm": 1.1197552820302097, "kl": 0.28512275218963623, "learning_rate": 5.025e-07, "loss": 0.28512275218963623, "reward": 1.3823754787445068, "reward_std": 0.11056404933333397, "rewards/GDino": 0.7317709028720856, "rewards/GIT": 0.3660539537668228, "rewards/HPSv2": 0.28455066680908203, "step": 796 }, { "completion_length": 148.609375, "epoch": 1.8763250883392226, "grad_norm": 0.6780517574814277, "kl": 0.3793477714061737, "learning_rate": 5.01875e-07, "loss": 0.3793477714061737, "reward": 1.5249319076538086, "reward_std": 0.14384550973773003, "rewards/GDino": 0.7965268790721893, "rewards/GIT": 0.4237193912267685, "rewards/HPSv2": 0.3046855926513672, "step": 797 }, { "completion_length": 151.8671875, "epoch": 1.878680800942285, "grad_norm": 0.5901504414776721, "kl": 0.35585595667362213, "learning_rate": 5.0125e-07, "loss": 0.35585595667362213, "reward": 1.2783191204071045, "reward_std": 0.17013496160507202, "rewards/GDino": 0.6917300522327423, "rewards/GIT": 0.29758284986019135, "rewards/HPSv2": 0.28900623321533203, "step": 798 }, { "completion_length": 145.7109375, "epoch": 1.8810365135453475, "grad_norm": 0.8550649304118674, "kl": 0.2930138260126114, "learning_rate": 5.00625e-07, "loss": 0.2930138260126114, "reward": 1.801570177078247, "reward_std": 0.14727991074323654, "rewards/GDino": 0.8367154598236084, "rewards/GIT": 0.6776653230190277, "rewards/HPSv2": 0.2871894836425781, "step": 799 }, { "completion_length": 146.1015625, "epoch": 1.88339222614841, "grad_norm": 0.7832109593268759, "kl": 0.32603316009044647, "learning_rate": 5e-07, "loss": 0.32603316009044647, "reward": 1.456203043460846, "reward_std": 0.16690320521593094, "rewards/GDino": 0.754707396030426, "rewards/GIT": 0.4094891846179962, "rewards/HPSv2": 0.2920064926147461, "step": 800 } ], "logging_steps": 1.0, "max_steps": 1600, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }